// optirs_gpu/backends.rs

1//! GPU Backend Abstraction Layer
2//!
3//! This module provides a unified interface for different GPU backends,
4//! supporting CUDA, ROCm, Metal, WebGPU, and CPU fallback.
5
6use std::sync::Arc;
7use thiserror::Error;
8
/// GPU backend types supported by the optimizer
///
/// `Hash` is derived (alongside `Eq`/`Copy`) so the backend kind can be used
/// directly as a `HashMap`/`HashSet` key, e.g. for per-backend caches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuBackend {
    /// NVIDIA CUDA backend
    Cuda,
    /// AMD ROCm backend
    Rocm,
    /// Apple Metal backend
    Metal,
    /// WebGPU backend (cross-platform)
    Wgpu,
    /// CPU fallback (no GPU acceleration)
    Cpu,
}
23
24impl Default for GpuBackend {
25    fn default() -> Self {
26        #[cfg(target_os = "macos")]
27        return Self::Metal;
28
29        #[cfg(not(target_os = "macos"))]
30        return Self::Cuda;
31    }
32}
33
/// Errors that can occur with GPU backends
#[derive(Debug, Error)]
pub enum BackendError {
    /// The requested backend is not compiled in or not usable on this system.
    #[error("Backend not available: {backend:?}")]
    NotAvailable { backend: GpuBackend },

    /// Backend start-up failed; `reason` carries backend-specific detail.
    #[error("Backend initialization failed: {reason}")]
    InitializationFailed { reason: String },

    /// The backend cannot perform the named operation.
    #[error("Operation not supported by backend: {operation}")]
    UnsupportedOperation { operation: String },

    /// Catch-all for errors specific to a single backend implementation.
    #[error("Backend error: {message}")]
    BackendSpecific { message: String },

    /// An error tied to a particular device ordinal.
    #[error("Device error: {device_id}")]
    DeviceError { device_id: u32 },
}
52
/// GPU device capabilities
///
/// A snapshot of a device's reported limits and feature flags. `Default`
/// yields an all-zero/empty capability set (useful for tests and as a
/// conservative placeholder); `PartialEq`/`Eq` allow direct comparison.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DeviceCapabilities {
    /// Device name
    pub name: String,

    /// Total memory in bytes
    pub total_memory: usize,

    /// Available memory in bytes
    pub available_memory: usize,

    /// Supports half precision (f16)
    pub supports_f16: bool,

    /// Supports bfloat16
    pub supports_bf16: bool,

    /// Supports tensor cores
    pub supports_tensor_cores: bool,

    /// Maximum threads per block
    pub max_threads_per_block: u32,

    /// Maximum shared memory per block
    pub max_shared_memory_per_block: usize,

    /// Number of streaming multiprocessors
    pub multiprocessor_count: u32,

    /// Compute capability (major, minor); (0, 0) for non-CUDA backends
    pub compute_capability: (u32, u32),
}
86
87/// GPU backend factory
88pub struct BackendFactory;
89
90impl BackendFactory {
91    /// Create a backend instance
92    pub fn create_backend(backend_type: GpuBackend) -> Result<Box<dyn Backend>, BackendError> {
93        match backend_type {
94            GpuBackend::Cuda => Ok(Box::new(CudaBackend::new()?)),
95            GpuBackend::Rocm => Ok(Box::new(RocmBackend::new()?)),
96            GpuBackend::Metal => Ok(Box::new(MetalBackend::new()?)),
97            GpuBackend::Wgpu => Ok(Box::new(WgpuBackend::new()?)),
98            GpuBackend::Cpu => Ok(Box::new(CpuBackend::new()?)),
99        }
100    }
101
102    /// Get available backends on the current system
103    pub fn available_backends() -> Vec<GpuBackend> {
104        let mut backends = Vec::new();
105
106        // Check CUDA availability
107        #[cfg(feature = "cuda")]
108        if CudaBackend::is_available() {
109            backends.push(GpuBackend::Cuda);
110        }
111
112        // ROCm support not yet implemented
113        // if RocmBackend::is_available() {
114        //     backends.push(GpuBackend::Rocm);
115        // }
116
117        // Check Metal availability
118        #[cfg(target_os = "macos")]
119        if MetalBackend::is_available() {
120            backends.push(GpuBackend::Metal);
121        }
122
123        // WebGPU should be available on most platforms
124        #[cfg(feature = "wgpu")]
125        if WgpuBackend::is_available() {
126            backends.push(GpuBackend::Wgpu);
127        }
128
129        // CPU fallback is always available
130        backends.push(GpuBackend::Cpu);
131
132        backends
133    }
134
135    /// Get the best available backend for the current system
136    pub fn get_best_backend() -> GpuBackend {
137        let available = Self::available_backends();
138
139        // Priority order: CUDA -> Metal -> ROCm -> WebGPU -> CPU
140        for &backend in &[
141            GpuBackend::Cuda,
142            GpuBackend::Metal,
143            GpuBackend::Rocm,
144            GpuBackend::Wgpu,
145            GpuBackend::Cpu,
146        ] {
147            if available.contains(&backend) {
148                return backend;
149            }
150        }
151
152        GpuBackend::Cpu
153    }
154}
155
/// Trait for GPU backend implementations
///
/// Object-safe interface shared by all backends; `BackendFactory::create_backend`
/// hands out `Box<dyn Backend>`. Implementations must be `Send + Sync` so a
/// backend handle can be shared across threads.
pub trait Backend: Send + Sync {
    /// Get backend type
    fn backend_type(&self) -> GpuBackend;

    /// Initialize the backend
    ///
    /// NOTE(review): the placeholder impls in this file only set a flag and do
    /// not enforce that `initialize` runs before other operations — confirm
    /// the intended contract before relying on it.
    fn initialize(&mut self) -> Result<(), BackendError>;

    /// Get device count
    fn device_count(&self) -> Result<u32, BackendError>;

    /// Get device capabilities for the device with ordinal `device_id`
    fn device_capabilities(&self, device_id: u32) -> Result<DeviceCapabilities, BackendError>;

    /// Set active device
    fn set_device(&mut self, device_id: u32) -> Result<(), BackendError>;

    /// Allocate `size` bytes of memory on the device
    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError>;

    /// Free device memory (consumes the handle)
    fn deallocate(&self, memory: DeviceMemory) -> Result<(), BackendError>;

    /// Copy memory from host to device
    fn copy_to_device(&self, src: &[u8], dst: &DeviceMemory) -> Result<(), BackendError>;

    /// Copy memory from device to host
    fn copy_to_host(&self, src: &DeviceMemory, dst: &mut [u8]) -> Result<(), BackendError>;

    /// Synchronize device (wait for outstanding work to complete)
    fn synchronize(&self) -> Result<(), BackendError>;

    /// Launch kernel with the given arguments
    fn launch_kernel(
        &self,
        kernel: &CompiledKernel,
        args: &[KernelArg],
    ) -> Result<(), BackendError>;
}
195
/// Device memory handle
///
/// Opaque handle to a backend allocation. In the current placeholder
/// backends `ptr` is always 0; a real backend would store a device address
/// or buffer handle here.
#[derive(Debug)]
pub struct DeviceMemory {
    /// Backend-specific pointer/handle value
    pub ptr: usize,
    /// Size of the allocation in bytes
    pub size: usize,
    /// Backend that owns this allocation
    pub backend: GpuBackend,
}
203
/// Compiled kernel representation
///
/// A kernel compiled for one specific backend; the `binary` payload format
/// is backend-defined (not fixed by this module).
#[derive(Debug)]
pub struct CompiledKernel {
    /// Kernel entry-point name
    pub name: String,
    /// Backend this kernel was compiled for
    pub backend: GpuBackend,
    /// Backend-specific compiled binary payload
    pub binary: Vec<u8>,
}
211
/// Kernel argument types
///
/// Arguments passed to `Backend::launch_kernel`.
#[derive(Debug)]
pub enum KernelArg {
    /// A device-resident buffer argument
    Buffer(DeviceMemory),
    /// A scalar argument serialized to raw bytes
    /// (NOTE(review): byte encoding/endianness convention is not shown in
    /// this file — confirm against the kernel-launch implementation)
    Scalar(Vec<u8>),
}
218
219/// CUDA backend implementation
220pub struct CudaBackend {
221    initialized: bool,
222    current_device: u32,
223}
224
225impl CudaBackend {
226    pub fn new() -> Result<Self, BackendError> {
227        Ok(Self {
228            initialized: false,
229            current_device: 0,
230        })
231    }
232
233    pub fn is_available() -> bool {
234        // In a real implementation, this would check for CUDA runtime
235        #[cfg(feature = "cuda")]
236        return true;
237
238        #[cfg(not(feature = "cuda"))]
239        return false;
240    }
241}
242
243impl Backend for CudaBackend {
244    fn backend_type(&self) -> GpuBackend {
245        GpuBackend::Cuda
246    }
247
248    fn initialize(&mut self) -> Result<(), BackendError> {
249        if !Self::is_available() {
250            return Err(BackendError::NotAvailable {
251                backend: GpuBackend::Cuda,
252            });
253        }
254
255        self.initialized = true;
256        Ok(())
257    }
258
259    fn device_count(&self) -> Result<u32, BackendError> {
260        // Placeholder implementation
261        Ok(1)
262    }
263
264    fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
265        Ok(DeviceCapabilities {
266            name: "CUDA Device".to_string(),
267            total_memory: 8 * 1024 * 1024 * 1024,     // 8GB
268            available_memory: 6 * 1024 * 1024 * 1024, // 6GB
269            supports_f16: true,
270            supports_bf16: true,
271            supports_tensor_cores: true,
272            max_threads_per_block: 1024,
273            max_shared_memory_per_block: 49152,
274            multiprocessor_count: 72,
275            compute_capability: (8, 6),
276        })
277    }
278
279    fn set_device(&mut self, device_id: u32) -> Result<(), BackendError> {
280        self.current_device = device_id;
281        Ok(())
282    }
283
284    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
285        Ok(DeviceMemory {
286            ptr: 0, // Placeholder
287            size,
288            backend: GpuBackend::Cuda,
289        })
290    }
291
292    fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
293        Ok(())
294    }
295
296    fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
297        Ok(())
298    }
299
300    fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
301        Ok(())
302    }
303
304    fn synchronize(&self) -> Result<(), BackendError> {
305        Ok(())
306    }
307
308    fn launch_kernel(
309        &self,
310        _kernel: &CompiledKernel,
311        _args: &[KernelArg],
312    ) -> Result<(), BackendError> {
313        Ok(())
314    }
315}
316
317/// ROCm backend implementation
318pub struct RocmBackend {
319    initialized: bool,
320    current_device: u32,
321}
322
323impl RocmBackend {
324    pub fn new() -> Result<Self, BackendError> {
325        Ok(Self {
326            initialized: false,
327            current_device: 0,
328        })
329    }
330
331    pub fn is_available() -> bool {
332        // ROCm support not yet implemented
333        false
334    }
335}
336
337impl Backend for RocmBackend {
338    fn backend_type(&self) -> GpuBackend {
339        GpuBackend::Rocm
340    }
341
342    fn initialize(&mut self) -> Result<(), BackendError> {
343        if !Self::is_available() {
344            return Err(BackendError::NotAvailable {
345                backend: GpuBackend::Rocm,
346            });
347        }
348
349        self.initialized = true;
350        Ok(())
351    }
352
353    fn device_count(&self) -> Result<u32, BackendError> {
354        Ok(1)
355    }
356
357    fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
358        Ok(DeviceCapabilities {
359            name: "ROCm Device".to_string(),
360            total_memory: 16 * 1024 * 1024 * 1024,     // 16GB
361            available_memory: 14 * 1024 * 1024 * 1024, // 14GB
362            supports_f16: true,
363            supports_bf16: true,
364            supports_tensor_cores: false,
365            max_threads_per_block: 1024,
366            max_shared_memory_per_block: 65536,
367            multiprocessor_count: 60,
368            compute_capability: (0, 0), // ROCm doesn't use CUDA compute capability
369        })
370    }
371
372    fn set_device(&mut self, device_id: u32) -> Result<(), BackendError> {
373        self.current_device = device_id;
374        Ok(())
375    }
376
377    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
378        Ok(DeviceMemory {
379            ptr: 0,
380            size,
381            backend: GpuBackend::Rocm,
382        })
383    }
384
385    fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
386        Ok(())
387    }
388
389    fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
390        Ok(())
391    }
392
393    fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
394        Ok(())
395    }
396
397    fn synchronize(&self) -> Result<(), BackendError> {
398        Ok(())
399    }
400
401    fn launch_kernel(
402        &self,
403        _kernel: &CompiledKernel,
404        _args: &[KernelArg],
405    ) -> Result<(), BackendError> {
406        Ok(())
407    }
408}
409
410/// Metal backend implementation
411pub struct MetalBackend {
412    initialized: bool,
413}
414
415impl MetalBackend {
416    pub fn new() -> Result<Self, BackendError> {
417        Ok(Self { initialized: false })
418    }
419
420    pub fn is_available() -> bool {
421        #[cfg(target_os = "macos")]
422        return true;
423
424        #[cfg(not(target_os = "macos"))]
425        return false;
426    }
427}
428
429impl Backend for MetalBackend {
430    fn backend_type(&self) -> GpuBackend {
431        GpuBackend::Metal
432    }
433
434    fn initialize(&mut self) -> Result<(), BackendError> {
435        if !Self::is_available() {
436            return Err(BackendError::NotAvailable {
437                backend: GpuBackend::Metal,
438            });
439        }
440
441        self.initialized = true;
442        Ok(())
443    }
444
445    fn device_count(&self) -> Result<u32, BackendError> {
446        Ok(1)
447    }
448
449    fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
450        Ok(DeviceCapabilities {
451            name: "Metal GPU".to_string(),
452            total_memory: 8 * 1024 * 1024 * 1024, // 8GB unified memory
453            available_memory: 6 * 1024 * 1024 * 1024, // 6GB
454            supports_f16: true,
455            supports_bf16: false,
456            supports_tensor_cores: false,
457            max_threads_per_block: 1024,
458            max_shared_memory_per_block: 32768,
459            multiprocessor_count: 1,
460            compute_capability: (0, 0),
461        })
462    }
463
464    fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
465        Ok(())
466    }
467
468    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
469        Ok(DeviceMemory {
470            ptr: 0,
471            size,
472            backend: GpuBackend::Metal,
473        })
474    }
475
476    fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
477        Ok(())
478    }
479
480    fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
481        Ok(())
482    }
483
484    fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
485        Ok(())
486    }
487
488    fn synchronize(&self) -> Result<(), BackendError> {
489        Ok(())
490    }
491
492    fn launch_kernel(
493        &self,
494        _kernel: &CompiledKernel,
495        _args: &[KernelArg],
496    ) -> Result<(), BackendError> {
497        Ok(())
498    }
499}
500
501/// WebGPU backend implementation
502pub struct WgpuBackend {
503    initialized: bool,
504}
505
506impl WgpuBackend {
507    pub fn new() -> Result<Self, BackendError> {
508        Ok(Self { initialized: false })
509    }
510
511    pub fn is_available() -> bool {
512        #[cfg(feature = "wgpu")]
513        return true;
514
515        #[cfg(not(feature = "wgpu"))]
516        return false;
517    }
518}
519
520impl Backend for WgpuBackend {
521    fn backend_type(&self) -> GpuBackend {
522        GpuBackend::Wgpu
523    }
524
525    fn initialize(&mut self) -> Result<(), BackendError> {
526        if !Self::is_available() {
527            return Err(BackendError::NotAvailable {
528                backend: GpuBackend::Wgpu,
529            });
530        }
531
532        self.initialized = true;
533        Ok(())
534    }
535
536    fn device_count(&self) -> Result<u32, BackendError> {
537        Ok(1)
538    }
539
540    fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
541        Ok(DeviceCapabilities {
542            name: "WebGPU Device".to_string(),
543            total_memory: 4 * 1024 * 1024 * 1024,     // 4GB
544            available_memory: 3 * 1024 * 1024 * 1024, // 3GB
545            supports_f16: false,
546            supports_bf16: false,
547            supports_tensor_cores: false,
548            max_threads_per_block: 256,
549            max_shared_memory_per_block: 16384,
550            multiprocessor_count: 1,
551            compute_capability: (0, 0),
552        })
553    }
554
555    fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
556        Ok(())
557    }
558
559    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
560        Ok(DeviceMemory {
561            ptr: 0,
562            size,
563            backend: GpuBackend::Wgpu,
564        })
565    }
566
567    fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
568        Ok(())
569    }
570
571    fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
572        Ok(())
573    }
574
575    fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
576        Ok(())
577    }
578
579    fn synchronize(&self) -> Result<(), BackendError> {
580        Ok(())
581    }
582
583    fn launch_kernel(
584        &self,
585        _kernel: &CompiledKernel,
586        _args: &[KernelArg],
587    ) -> Result<(), BackendError> {
588        Ok(())
589    }
590}
591
592/// CPU backend (fallback implementation)
593pub struct CpuBackend {
594    initialized: bool,
595}
596
597impl CpuBackend {
598    pub fn new() -> Result<Self, BackendError> {
599        Ok(Self { initialized: false })
600    }
601
602    pub fn is_available() -> bool {
603        true // CPU is always available
604    }
605}
606
607impl Backend for CpuBackend {
608    fn backend_type(&self) -> GpuBackend {
609        GpuBackend::Cpu
610    }
611
612    fn initialize(&mut self) -> Result<(), BackendError> {
613        self.initialized = true;
614        Ok(())
615    }
616
617    fn device_count(&self) -> Result<u32, BackendError> {
618        Ok(1)
619    }
620
621    fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
622        Ok(DeviceCapabilities {
623            name: "CPU Device".to_string(),
624            total_memory: 16 * 1024 * 1024 * 1024, // 16GB RAM
625            available_memory: 12 * 1024 * 1024 * 1024, // 12GB available
626            supports_f16: false,
627            supports_bf16: false,
628            supports_tensor_cores: false,
629            max_threads_per_block: 1,
630            max_shared_memory_per_block: 0,
631            multiprocessor_count: 1,
632            compute_capability: (0, 0),
633        })
634    }
635
636    fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
637        Ok(())
638    }
639
640    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
641        Ok(DeviceMemory {
642            ptr: 0,
643            size,
644            backend: GpuBackend::Cpu,
645        })
646    }
647
648    fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
649        Ok(())
650    }
651
652    fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
653        // For CPU backend, this is essentially a memcpy
654        Ok(())
655    }
656
657    fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
658        // For CPU backend, this is essentially a memcpy
659        Ok(())
660    }
661
662    fn synchronize(&self) -> Result<(), BackendError> {
663        // CPU operations are synchronous by nature
664        Ok(())
665    }
666
667    fn launch_kernel(
668        &self,
669        _kernel: &CompiledKernel,
670        _args: &[KernelArg],
671    ) -> Result<(), BackendError> {
672        // CPU backend would execute the kernel function directly
673        Ok(())
674    }
675}