1use std::sync::Arc;
7use thiserror::Error;
8
/// Identifies which compute backend a value belongs to or should be created for.
///
/// `BackendFactory::create_backend` maps each variant to its concrete
/// implementation type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuBackend {
    /// Served by `CudaBackend`.
    Cuda,
    /// Served by `RocmBackend`.
    Rocm,
    /// Served by `MetalBackend`.
    Metal,
    /// Served by `WgpuBackend`.
    Wgpu,
    /// Served by `CpuBackend`.
    Cpu,
}

impl Default for GpuBackend {
    /// Picks a default backend that the current build can actually use.
    ///
    /// The choice is made entirely at compile time:
    ///
    /// 1. `Metal` on macOS,
    /// 2. `Cuda` when the `cuda` feature is enabled,
    /// 3. `Wgpu` when the `wgpu` feature is enabled,
    /// 4. `Cpu` otherwise.
    ///
    /// Earlier revisions returned `Cuda` on every non-macOS target, which named
    /// a backend that is reported as unavailable when the `cuda` feature is off.
    fn default() -> Self {
        if cfg!(target_os = "macos") {
            Self::Metal
        } else if cfg!(feature = "cuda") {
            Self::Cuda
        } else if cfg!(feature = "wgpu") {
            Self::Wgpu
        } else {
            // Always-present fallback: never hand out a backend that is compiled out.
            Self::Cpu
        }
    }
}
33
/// Error type returned by the fallible operations of `Backend` and `BackendFactory`.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Debug, Error)]
pub enum BackendError {
    /// The backend named by `backend` cannot be used in this build or on this host.
    #[error("Backend not available: {backend:?}")]
    NotAvailable { backend: GpuBackend },

    /// Backend initialization failed; `reason` is interpolated into the message.
    #[error("Backend initialization failed: {reason}")]
    InitializationFailed { reason: String },

    /// The backend does not support the operation described by `operation`.
    #[error("Operation not supported by backend: {operation}")]
    UnsupportedOperation { operation: String },

    /// Free-form backend failure; `message` is interpolated into the message.
    #[error("Backend error: {message}")]
    BackendSpecific { message: String },

    /// A failure associated with the device index `device_id`.
    #[error("Device error: {device_id}")]
    DeviceError { device_id: u32 },
}
52
/// Description of a single device as reported by `Backend::device_capabilities`.
#[derive(Debug, Clone)]
pub struct DeviceCapabilities {
    /// Human-readable device name.
    pub name: String,

    /// Total device memory (presumably in bytes; the built-in backends report
    /// multiples of 1024^3 — TODO confirm the unit).
    pub total_memory: usize,

    /// Memory currently available, in the same unit as `total_memory`.
    pub available_memory: usize,

    /// Whether the device reports f16 support.
    pub supports_f16: bool,

    /// Whether the device reports bf16 support.
    pub supports_bf16: bool,

    /// Whether the device reports tensor-core support.
    pub supports_tensor_cores: bool,

    /// Upper bound on threads per block.
    pub max_threads_per_block: u32,

    /// Upper bound on shared memory per block (presumably bytes — TODO confirm).
    pub max_shared_memory_per_block: usize,

    /// Number of multiprocessors on the device.
    pub multiprocessor_count: u32,

    /// Compute capability pair (presumably `(major, minor)` — TODO confirm).
    /// Every built-in backend other than CUDA reports `(0, 0)`.
    pub compute_capability: (u32, u32),
}
86
87pub struct BackendFactory;
89
90impl BackendFactory {
91 pub fn create_backend(backend_type: GpuBackend) -> Result<Box<dyn Backend>, BackendError> {
93 match backend_type {
94 GpuBackend::Cuda => Ok(Box::new(CudaBackend::new()?)),
95 GpuBackend::Rocm => Ok(Box::new(RocmBackend::new()?)),
96 GpuBackend::Metal => Ok(Box::new(MetalBackend::new()?)),
97 GpuBackend::Wgpu => Ok(Box::new(WgpuBackend::new()?)),
98 GpuBackend::Cpu => Ok(Box::new(CpuBackend::new()?)),
99 }
100 }
101
102 pub fn available_backends() -> Vec<GpuBackend> {
104 let mut backends = Vec::new();
105
106 #[cfg(feature = "cuda")]
108 if CudaBackend::is_available() {
109 backends.push(GpuBackend::Cuda);
110 }
111
112 #[cfg(target_os = "macos")]
119 if MetalBackend::is_available() {
120 backends.push(GpuBackend::Metal);
121 }
122
123 #[cfg(feature = "wgpu")]
125 if WgpuBackend::is_available() {
126 backends.push(GpuBackend::Wgpu);
127 }
128
129 backends.push(GpuBackend::Cpu);
131
132 backends
133 }
134
135 pub fn get_best_backend() -> GpuBackend {
137 let available = Self::available_backends();
138
139 for &backend in &[
141 GpuBackend::Cuda,
142 GpuBackend::Metal,
143 GpuBackend::Rocm,
144 GpuBackend::Wgpu,
145 GpuBackend::Cpu,
146 ] {
147 if available.contains(&backend) {
148 return backend;
149 }
150 }
151
152 GpuBackend::Cpu
153 }
154}
155
/// Common interface implemented by every compute backend.
///
/// Implementations must be `Send + Sync`.
pub trait Backend: Send + Sync {
    /// Returns the [`GpuBackend`] variant that identifies this implementation.
    fn backend_type(&self) -> GpuBackend;

    /// Prepares the backend for use.
    fn initialize(&mut self) -> Result<(), BackendError>;

    /// Returns the number of devices the backend exposes.
    fn device_count(&self) -> Result<u32, BackendError>;

    /// Returns the capabilities of the device with index `device_id`.
    fn device_capabilities(&self, device_id: u32) -> Result<DeviceCapabilities, BackendError>;

    /// Selects the device with index `device_id`.
    fn set_device(&mut self, device_id: u32) -> Result<(), BackendError>;

    /// Allocates `size` units of device memory (presumably bytes — TODO confirm)
    /// and returns a [`DeviceMemory`] describing the allocation.
    fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError>;

    /// Releases an allocation; takes `memory` by value.
    fn deallocate(&self, memory: DeviceMemory) -> Result<(), BackendError>;

    /// Copies the host bytes in `src` into the device allocation `dst`.
    fn copy_to_device(&self, src: &[u8], dst: &DeviceMemory) -> Result<(), BackendError>;

    /// Copies from the device allocation `src` into the host buffer `dst`.
    fn copy_to_host(&self, src: &DeviceMemory, dst: &mut [u8]) -> Result<(), BackendError>;

    /// Synchronizes with the backend (presumably waits for outstanding work;
    /// exact semantics are implementation-defined — TODO confirm).
    fn synchronize(&self) -> Result<(), BackendError>;

    /// Launches `kernel` with the arguments in `args`.
    fn launch_kernel(
        &self,
        kernel: &CompiledKernel,
        args: &[KernelArg],
    ) -> Result<(), BackendError>;
}
195
/// Descriptor for a device allocation returned by `Backend::allocate`.
#[derive(Debug)]
pub struct DeviceMemory {
    /// Address or handle of the allocation stored as an integer.
    /// The built-in backends in this file always set it to `0`.
    pub ptr: usize,
    /// Size that was requested from `Backend::allocate`.
    pub size: usize,
    /// Backend that produced this allocation.
    pub backend: GpuBackend,
}
203
/// A kernel in the form accepted by `Backend::launch_kernel`.
#[derive(Debug)]
pub struct CompiledKernel {
    /// Name of the kernel.
    pub name: String,
    /// Backend this kernel is associated with.
    pub backend: GpuBackend,
    /// Raw kernel bytes; the format is not specified in this file.
    pub binary: Vec<u8>,
}
211
/// A single argument passed to `Backend::launch_kernel`.
#[derive(Debug)]
pub enum KernelArg {
    /// A device allocation, held by value.
    Buffer(DeviceMemory),
    /// A scalar supplied as raw bytes; the encoding is not specified in this file.
    Scalar(Vec<u8>),
}
218
219pub struct CudaBackend {
221 initialized: bool,
222 current_device: u32,
223}
224
225impl CudaBackend {
226 pub fn new() -> Result<Self, BackendError> {
227 Ok(Self {
228 initialized: false,
229 current_device: 0,
230 })
231 }
232
233 pub fn is_available() -> bool {
234 #[cfg(feature = "cuda")]
236 return true;
237
238 #[cfg(not(feature = "cuda"))]
239 return false;
240 }
241}
242
243impl Backend for CudaBackend {
244 fn backend_type(&self) -> GpuBackend {
245 GpuBackend::Cuda
246 }
247
248 fn initialize(&mut self) -> Result<(), BackendError> {
249 if !Self::is_available() {
250 return Err(BackendError::NotAvailable {
251 backend: GpuBackend::Cuda,
252 });
253 }
254
255 self.initialized = true;
256 Ok(())
257 }
258
259 fn device_count(&self) -> Result<u32, BackendError> {
260 Ok(1)
262 }
263
264 fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
265 Ok(DeviceCapabilities {
266 name: "CUDA Device".to_string(),
267 total_memory: 8 * 1024 * 1024 * 1024, available_memory: 6 * 1024 * 1024 * 1024, supports_f16: true,
270 supports_bf16: true,
271 supports_tensor_cores: true,
272 max_threads_per_block: 1024,
273 max_shared_memory_per_block: 49152,
274 multiprocessor_count: 72,
275 compute_capability: (8, 6),
276 })
277 }
278
279 fn set_device(&mut self, device_id: u32) -> Result<(), BackendError> {
280 self.current_device = device_id;
281 Ok(())
282 }
283
284 fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
285 Ok(DeviceMemory {
286 ptr: 0, size,
288 backend: GpuBackend::Cuda,
289 })
290 }
291
292 fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
293 Ok(())
294 }
295
296 fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
297 Ok(())
298 }
299
300 fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
301 Ok(())
302 }
303
304 fn synchronize(&self) -> Result<(), BackendError> {
305 Ok(())
306 }
307
308 fn launch_kernel(
309 &self,
310 _kernel: &CompiledKernel,
311 _args: &[KernelArg],
312 ) -> Result<(), BackendError> {
313 Ok(())
314 }
315}
316
317pub struct RocmBackend {
319 initialized: bool,
320 current_device: u32,
321}
322
323impl RocmBackend {
324 pub fn new() -> Result<Self, BackendError> {
325 Ok(Self {
326 initialized: false,
327 current_device: 0,
328 })
329 }
330
331 pub fn is_available() -> bool {
332 false
334 }
335}
336
337impl Backend for RocmBackend {
338 fn backend_type(&self) -> GpuBackend {
339 GpuBackend::Rocm
340 }
341
342 fn initialize(&mut self) -> Result<(), BackendError> {
343 if !Self::is_available() {
344 return Err(BackendError::NotAvailable {
345 backend: GpuBackend::Rocm,
346 });
347 }
348
349 self.initialized = true;
350 Ok(())
351 }
352
353 fn device_count(&self) -> Result<u32, BackendError> {
354 Ok(1)
355 }
356
357 fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
358 Ok(DeviceCapabilities {
359 name: "ROCm Device".to_string(),
360 total_memory: 16 * 1024 * 1024 * 1024, available_memory: 14 * 1024 * 1024 * 1024, supports_f16: true,
363 supports_bf16: true,
364 supports_tensor_cores: false,
365 max_threads_per_block: 1024,
366 max_shared_memory_per_block: 65536,
367 multiprocessor_count: 60,
368 compute_capability: (0, 0), })
370 }
371
372 fn set_device(&mut self, device_id: u32) -> Result<(), BackendError> {
373 self.current_device = device_id;
374 Ok(())
375 }
376
377 fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
378 Ok(DeviceMemory {
379 ptr: 0,
380 size,
381 backend: GpuBackend::Rocm,
382 })
383 }
384
385 fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
386 Ok(())
387 }
388
389 fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
390 Ok(())
391 }
392
393 fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
394 Ok(())
395 }
396
397 fn synchronize(&self) -> Result<(), BackendError> {
398 Ok(())
399 }
400
401 fn launch_kernel(
402 &self,
403 _kernel: &CompiledKernel,
404 _args: &[KernelArg],
405 ) -> Result<(), BackendError> {
406 Ok(())
407 }
408}
409
410pub struct MetalBackend {
412 initialized: bool,
413}
414
415impl MetalBackend {
416 pub fn new() -> Result<Self, BackendError> {
417 Ok(Self { initialized: false })
418 }
419
420 pub fn is_available() -> bool {
421 #[cfg(target_os = "macos")]
422 return true;
423
424 #[cfg(not(target_os = "macos"))]
425 return false;
426 }
427}
428
429impl Backend for MetalBackend {
430 fn backend_type(&self) -> GpuBackend {
431 GpuBackend::Metal
432 }
433
434 fn initialize(&mut self) -> Result<(), BackendError> {
435 if !Self::is_available() {
436 return Err(BackendError::NotAvailable {
437 backend: GpuBackend::Metal,
438 });
439 }
440
441 self.initialized = true;
442 Ok(())
443 }
444
445 fn device_count(&self) -> Result<u32, BackendError> {
446 Ok(1)
447 }
448
449 fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
450 Ok(DeviceCapabilities {
451 name: "Metal GPU".to_string(),
452 total_memory: 8 * 1024 * 1024 * 1024, available_memory: 6 * 1024 * 1024 * 1024, supports_f16: true,
455 supports_bf16: false,
456 supports_tensor_cores: false,
457 max_threads_per_block: 1024,
458 max_shared_memory_per_block: 32768,
459 multiprocessor_count: 1,
460 compute_capability: (0, 0),
461 })
462 }
463
464 fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
465 Ok(())
466 }
467
468 fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
469 Ok(DeviceMemory {
470 ptr: 0,
471 size,
472 backend: GpuBackend::Metal,
473 })
474 }
475
476 fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
477 Ok(())
478 }
479
480 fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
481 Ok(())
482 }
483
484 fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
485 Ok(())
486 }
487
488 fn synchronize(&self) -> Result<(), BackendError> {
489 Ok(())
490 }
491
492 fn launch_kernel(
493 &self,
494 _kernel: &CompiledKernel,
495 _args: &[KernelArg],
496 ) -> Result<(), BackendError> {
497 Ok(())
498 }
499}
500
501pub struct WgpuBackend {
503 initialized: bool,
504}
505
506impl WgpuBackend {
507 pub fn new() -> Result<Self, BackendError> {
508 Ok(Self { initialized: false })
509 }
510
511 pub fn is_available() -> bool {
512 #[cfg(feature = "wgpu")]
513 return true;
514
515 #[cfg(not(feature = "wgpu"))]
516 return false;
517 }
518}
519
520impl Backend for WgpuBackend {
521 fn backend_type(&self) -> GpuBackend {
522 GpuBackend::Wgpu
523 }
524
525 fn initialize(&mut self) -> Result<(), BackendError> {
526 if !Self::is_available() {
527 return Err(BackendError::NotAvailable {
528 backend: GpuBackend::Wgpu,
529 });
530 }
531
532 self.initialized = true;
533 Ok(())
534 }
535
536 fn device_count(&self) -> Result<u32, BackendError> {
537 Ok(1)
538 }
539
540 fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
541 Ok(DeviceCapabilities {
542 name: "WebGPU Device".to_string(),
543 total_memory: 4 * 1024 * 1024 * 1024, available_memory: 3 * 1024 * 1024 * 1024, supports_f16: false,
546 supports_bf16: false,
547 supports_tensor_cores: false,
548 max_threads_per_block: 256,
549 max_shared_memory_per_block: 16384,
550 multiprocessor_count: 1,
551 compute_capability: (0, 0),
552 })
553 }
554
555 fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
556 Ok(())
557 }
558
559 fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
560 Ok(DeviceMemory {
561 ptr: 0,
562 size,
563 backend: GpuBackend::Wgpu,
564 })
565 }
566
567 fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
568 Ok(())
569 }
570
571 fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
572 Ok(())
573 }
574
575 fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
576 Ok(())
577 }
578
579 fn synchronize(&self) -> Result<(), BackendError> {
580 Ok(())
581 }
582
583 fn launch_kernel(
584 &self,
585 _kernel: &CompiledKernel,
586 _args: &[KernelArg],
587 ) -> Result<(), BackendError> {
588 Ok(())
589 }
590}
591
592pub struct CpuBackend {
594 initialized: bool,
595}
596
597impl CpuBackend {
598 pub fn new() -> Result<Self, BackendError> {
599 Ok(Self { initialized: false })
600 }
601
602 pub fn is_available() -> bool {
603 true }
605}
606
607impl Backend for CpuBackend {
608 fn backend_type(&self) -> GpuBackend {
609 GpuBackend::Cpu
610 }
611
612 fn initialize(&mut self) -> Result<(), BackendError> {
613 self.initialized = true;
614 Ok(())
615 }
616
617 fn device_count(&self) -> Result<u32, BackendError> {
618 Ok(1)
619 }
620
621 fn device_capabilities(&self, _device_id: u32) -> Result<DeviceCapabilities, BackendError> {
622 Ok(DeviceCapabilities {
623 name: "CPU Device".to_string(),
624 total_memory: 16 * 1024 * 1024 * 1024, available_memory: 12 * 1024 * 1024 * 1024, supports_f16: false,
627 supports_bf16: false,
628 supports_tensor_cores: false,
629 max_threads_per_block: 1,
630 max_shared_memory_per_block: 0,
631 multiprocessor_count: 1,
632 compute_capability: (0, 0),
633 })
634 }
635
636 fn set_device(&mut self, _device_id: u32) -> Result<(), BackendError> {
637 Ok(())
638 }
639
640 fn allocate(&self, size: usize) -> Result<DeviceMemory, BackendError> {
641 Ok(DeviceMemory {
642 ptr: 0,
643 size,
644 backend: GpuBackend::Cpu,
645 })
646 }
647
648 fn deallocate(&self, _memory: DeviceMemory) -> Result<(), BackendError> {
649 Ok(())
650 }
651
652 fn copy_to_device(&self, _src: &[u8], _dst: &DeviceMemory) -> Result<(), BackendError> {
653 Ok(())
655 }
656
657 fn copy_to_host(&self, _src: &DeviceMemory, _dst: &mut [u8]) -> Result<(), BackendError> {
658 Ok(())
660 }
661
662 fn synchronize(&self) -> Result<(), BackendError> {
663 Ok(())
665 }
666
667 fn launch_kernel(
668 &self,
669 _kernel: &CompiledKernel,
670 _args: &[KernelArg],
671 ) -> Result<(), BackendError> {
672 Ok(())
674 }
675}