1use crate::traits::SimdError;
7
8#[cfg(feature = "no-std")]
9use alloc::{
10 boxed::Box,
11 format,
12 string::{String, ToString},
13 vec::Vec,
14};
15
16#[cfg(feature = "no-std")]
17use core::any::Any;
18#[cfg(not(feature = "no-std"))]
19use std::any::Any;
20
21#[cfg(feature = "no-std")]
22use spin::Mutex;
23#[cfg(not(feature = "no-std"))]
24use std::sync::Mutex;
25
/// Identifies which GPU compute API a device belongs to.
///
/// Only [`GpuBackend::Cuda`] and [`GpuBackend::OpenCL`] are produced by the
/// code in this file; the remaining variants are declared but not
/// constructed here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuBackend {
    /// CUDA backend (see the `cuda` module).
    Cuda,
    /// OpenCL backend (see the `opencl` module).
    OpenCL,
    /// Metal backend.
    Metal,
    /// Vulkan backend.
    Vulkan,
}
34
35#[derive(Debug, Clone)]
37pub struct GpuDevice {
38 pub id: u32,
39 pub name: String,
40 pub backend: GpuBackend,
41 pub compute_units: u32,
42 pub memory_mb: u64,
43 pub supports_f64: bool,
44 pub supports_f16: bool,
45}
46
47#[derive(Debug)]
49pub struct GpuBuffer<T> {
50 pub ptr: *mut T,
51 pub size: usize,
52 pub device: GpuDevice,
53 #[allow(dead_code)]
54 backend_handle: Option<Box<dyn Any + Send + Sync>>,
56}
57
58unsafe impl<T: Send> Send for GpuBuffer<T> {}
59unsafe impl<T: Sync> Sync for GpuBuffer<T> {}
60
61impl<T> Drop for GpuBuffer<T> {
62 fn drop(&mut self) {
63 }
66}
67
68pub struct GpuContext {
70 pub device: GpuDevice,
71 pub streams: Vec<GpuStream>,
72 #[allow(dead_code)] backend_context: Option<Box<dyn Any + Send + Sync>>,
74}
75
/// A command stream (queue) on a GPU device.
#[derive(Debug)]
pub struct GpuStream {
    /// Identifier of this stream.
    pub id: u32,
    /// Identifier of the device the stream belongs to.
    pub device_id: u32,
    /// Opaque backend-specific stream object; never read in this file.
    #[allow(dead_code)]
    backend_stream: Option<Box<dyn Any + Send + Sync>>,
}
84
/// Launch parameters passed to [`GpuOperations::launch_kernel`].
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// Grid dimensions as a 3-tuple (presumably `(x, y, z)` — confirm).
    pub grid_size: (u32, u32, u32),
    /// Block / work-group dimensions as a 3-tuple (presumably `(x, y, z)`).
    pub block_size: (u32, u32, u32),
    /// Amount of shared memory requested for the launch
    /// (presumably in bytes — confirm against the backend).
    pub shared_memory: u32,
    /// Optional stream selector (presumably a [`GpuStream::id`]); `None`
    /// leaves the choice to the backend.
    pub stream: Option<u32>,
}

impl Default for KernelConfig {
    /// A single `256 x 1 x 1` block, no shared memory, no explicit stream.
    fn default() -> Self {
        const SINGLE_BLOCK_GRID: (u32, u32, u32) = (1, 1, 1);
        const DEFAULT_BLOCK: (u32, u32, u32) = (256, 1, 1);

        KernelConfig {
            grid_size: SINGLE_BLOCK_GRID,
            block_size: DEFAULT_BLOCK,
            shared_memory: 0,
            stream: None,
        }
    }
}
104
105pub trait GpuOperations {
107 fn allocate<T>(&self, size: usize) -> Result<GpuBuffer<T>, SimdError>;
109
110 fn copy_to_device<T>(
112 &self,
113 host_data: &[T],
114 gpu_buffer: &mut GpuBuffer<T>,
115 ) -> Result<(), SimdError>;
116
117 fn copy_to_host<T>(
119 &self,
120 gpu_buffer: &GpuBuffer<T>,
121 host_data: &mut [T],
122 ) -> Result<(), SimdError>;
123
124 fn launch_kernel(
126 &self,
127 kernel: &str,
128 config: &KernelConfig,
129 args: &[&dyn Any],
130 ) -> Result<(), SimdError>;
131
132 fn synchronize(&self) -> Result<(), SimdError>;
134}
135
136pub mod cuda {
138 use super::*;
139
/// Handle to a CUDA device.
///
/// Derives `Debug` so the handle can be logged and so containing types can
/// derive `Debug` as well; both field types already implement it.
#[derive(Debug)]
pub struct CudaDevice {
    /// Identifier the handle was created with; not read in this file.
    #[allow(dead_code)]
    device_id: u32,
    /// Opaque CUDA context object; always `None` in this stub.
    #[allow(dead_code)]
    context: Option<Box<dyn Any + Send + Sync>>,
}
147
148 impl CudaDevice {
149 pub fn new(device_id: u32) -> Result<Self, SimdError> {
150 Ok(Self {
152 device_id,
153 context: None,
154 })
155 }
156
157 pub fn get_device_count() -> Result<u32, SimdError> {
158 Err(SimdError::UnsupportedOperation(
160 "CUDA not available".to_string(),
161 ))
162 }
163
164 pub fn get_device_info(device_id: u32) -> Result<GpuDevice, SimdError> {
165 Ok(GpuDevice {
167 id: device_id,
168 name: format!("CUDA Device {}", device_id),
169 backend: GpuBackend::Cuda,
170 compute_units: 80,
171 memory_mb: 8192,
172 supports_f64: true,
173 supports_f16: true,
174 })
175 }
176 }
177
178 impl GpuOperations for CudaDevice {
179 fn allocate<T>(&self, size: usize) -> Result<GpuBuffer<T>, SimdError> {
180 let _ = size;
182 Err(SimdError::UnsupportedOperation(
183 "CUDA not available".to_string(),
184 ))
185 }
186
187 fn copy_to_device<T>(
188 &self,
189 _host_data: &[T],
190 _gpu_buffer: &mut GpuBuffer<T>,
191 ) -> Result<(), SimdError> {
192 Err(SimdError::UnsupportedOperation(
194 "CUDA not available".to_string(),
195 ))
196 }
197
198 fn copy_to_host<T>(
199 &self,
200 _gpu_buffer: &GpuBuffer<T>,
201 _host_data: &mut [T],
202 ) -> Result<(), SimdError> {
203 Err(SimdError::UnsupportedOperation(
205 "CUDA not available".to_string(),
206 ))
207 }
208
209 fn launch_kernel(
210 &self,
211 _kernel: &str,
212 _config: &KernelConfig,
213 _args: &[&dyn Any],
214 ) -> Result<(), SimdError> {
215 Err(SimdError::UnsupportedOperation(
217 "CUDA not available".to_string(),
218 ))
219 }
220
221 fn synchronize(&self) -> Result<(), SimdError> {
222 Err(SimdError::UnsupportedOperation(
224 "CUDA not available".to_string(),
225 ))
226 }
227 }
228
/// CUDA C source strings for a handful of `float` kernels.
///
/// The strings are plain constants; nothing in this file compiles or
/// launches them.
pub mod kernels {
    /// `c[i] = a[i] + b[i]` for `i < n`, one element per thread.
    pub const VECTOR_ADD_KERNEL: &str = r#"
        extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                c[idx] = a[idx] + b[idx];
            }
        }
    "#;

    /// Dot product of `a` and `b`, accumulated into `*result` with
    /// `atomicAdd`.
    ///
    /// Launch requirements that follow from the source:
    /// * `*result` must be zeroed by the caller (each block only adds to it);
    /// * `blockDim.x` must not exceed 256, the size of the static shared
    ///   array;
    /// * the halving reduction only folds every element when `blockDim.x` is
    ///   a power of two.
    pub const DOT_PRODUCT_KERNEL: &str = r#"
        extern "C" __global__ void dot_product(float* a, float* b, float* result, int n) {
            __shared__ float shared[256];
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            int tid = threadIdx.x;

            float sum = 0.0f;
            while (idx < n) {
                sum += a[idx] * b[idx];
                idx += blockDim.x * gridDim.x;
            }

            shared[tid] = sum;
            __syncthreads();

            // Reduction in shared memory
            for (int s = blockDim.x / 2; s > 0; s >>= 1) {
                if (tid < s) {
                    shared[tid] += shared[tid + s];
                }
                __syncthreads();
            }

            if (tid == 0) {
                atomicAdd(result, shared[0]);
            }
        }
    "#;

    /// Row-major matrix product `c (m x n) = a (m x k) * b (k x n)`, one
    /// output element per thread (`x` indexes columns, `y` indexes rows).
    pub const MATRIX_MUL_KERNEL: &str = r#"
        extern "C" __global__ void matrix_mul(float* a, float* b, float* c, int m, int n, int k) {
            int row = blockIdx.y * blockDim.y + threadIdx.y;
            int col = blockIdx.x * blockDim.x + threadIdx.x;

            if (row < m && col < n) {
                float sum = 0.0f;
                for (int i = 0; i < k; i++) {
                    sum += a[row * k + i] * b[i * n + col];
                }
                c[row * n + col] = sum;
            }
        }
    "#;

    /// `output[i] = max(0, input[i])` for `i < n`, one element per thread.
    pub const RELU_KERNEL: &str = r#"
        extern "C" __global__ void relu(float* input, float* output, int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                output[idx] = fmaxf(0.0f, input[idx]);
            }
        }
    "#;

    /// Numerically stabilised softmax.
    ///
    /// Launch requirements that follow from the source:
    /// * dynamic shared memory for `blockDim.x` floats must be supplied at
    ///   launch (`extern __shared__`);
    /// * the max and sum reductions run over per-block shared memory, so the
    ///   normalisation is per block; the result is a softmax over all `n`
    ///   inputs only when a single block covers them;
    /// * the halving reductions assume `blockDim.x` is a power of two.
    ///
    /// A barrier separates reading `shared[0]` as the maximum from reusing
    /// `shared` for the exponentials. Without it, a thread could overwrite
    /// `shared[0]` before another thread has read the maximum.
    pub const SOFTMAX_KERNEL: &str = r#"
        extern "C" __global__ void softmax(float* input, float* output, int n) {
            extern __shared__ float shared[];
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            int tid = threadIdx.x;

            // Find maximum for numerical stability
            float max_val = (idx < n) ? input[idx] : -INFINITY;
            shared[tid] = max_val;
            __syncthreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1) {
                if (tid < s) {
                    shared[tid] = fmaxf(shared[tid], shared[tid + s]);
                }
                __syncthreads();
            }

            float global_max = shared[0];
            // Every thread must have read the maximum before shared[] is reused
            __syncthreads();

            // Compute exp and sum
            float exp_val = (idx < n) ? expf(input[idx] - global_max) : 0.0f;
            shared[tid] = exp_val;
            __syncthreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1) {
                if (tid < s) {
                    shared[tid] += shared[tid + s];
                }
                __syncthreads();
            }

            float sum = shared[0];

            if (idx < n) {
                output[idx] = exp_val / sum;
            }
        }
    "#;
}
339}
340
341pub mod opencl {
343 use super::*;
344
/// Handle to an OpenCL device.
///
/// Derives `Debug` so the handle can be logged and so containing types can
/// derive `Debug` as well; all field types already implement it.
#[derive(Debug)]
pub struct OpenCLDevice {
    /// Identifier the handle was created with; not read in this file.
    #[allow(dead_code)]
    device_id: u32,
    /// Opaque OpenCL context object; always `None` in this stub.
    #[allow(dead_code)]
    context: Option<Box<dyn Any + Send + Sync>>,
    /// Opaque OpenCL command queue object; always `None` in this stub.
    #[allow(dead_code)]
    command_queue: Option<Box<dyn Any + Send + Sync>>,
}
354
355 impl OpenCLDevice {
356 pub fn new(device_id: u32) -> Result<Self, SimdError> {
357 Ok(Self {
358 device_id,
359 context: None,
360 command_queue: None,
361 })
362 }
363
364 pub fn get_platforms() -> Result<Vec<String>, SimdError> {
365 Err(SimdError::UnsupportedOperation(
367 "OpenCL not available".to_string(),
368 ))
369 }
370
371 pub fn get_devices(platform_id: u32) -> Result<Vec<GpuDevice>, SimdError> {
372 let _ = platform_id;
374 Err(SimdError::UnsupportedOperation(
375 "OpenCL not available".to_string(),
376 ))
377 }
378 }
379
380 impl GpuOperations for OpenCLDevice {
381 fn allocate<T>(&self, size: usize) -> Result<GpuBuffer<T>, SimdError> {
382 let _ = size;
384 Err(SimdError::UnsupportedOperation(
385 "OpenCL not available".to_string(),
386 ))
387 }
388
389 fn copy_to_device<T>(
390 &self,
391 _host_data: &[T],
392 _gpu_buffer: &mut GpuBuffer<T>,
393 ) -> Result<(), SimdError> {
394 Err(SimdError::UnsupportedOperation(
396 "OpenCL not available".to_string(),
397 ))
398 }
399
400 fn copy_to_host<T>(
401 &self,
402 _gpu_buffer: &GpuBuffer<T>,
403 _host_data: &mut [T],
404 ) -> Result<(), SimdError> {
405 Err(SimdError::UnsupportedOperation(
407 "OpenCL not available".to_string(),
408 ))
409 }
410
411 fn launch_kernel(
412 &self,
413 _kernel: &str,
414 _config: &KernelConfig,
415 _args: &[&dyn Any],
416 ) -> Result<(), SimdError> {
417 Err(SimdError::UnsupportedOperation(
419 "OpenCL not available".to_string(),
420 ))
421 }
422
423 fn synchronize(&self) -> Result<(), SimdError> {
424 Err(SimdError::UnsupportedOperation(
426 "OpenCL not available".to_string(),
427 ))
428 }
429 }
430
/// OpenCL C source strings for a handful of `float` kernels.
///
/// The strings are plain constants; nothing in this file compiles or
/// enqueues them.
pub mod kernels {
    /// `c[i] = a[i] + b[i]` for `i < n`, one element per work-item.
    pub const VECTOR_ADD_KERNEL: &str = r#"
        __kernel void vector_add(__global float* a, __global float* b, __global float* c, int n) {
            int idx = get_global_id(0);
            if (idx < n) {
                c[idx] = a[idx] + b[idx];
            }
        }
    "#;

    /// Dot product of `a` and `b`, accumulated into `*result`.
    ///
    /// OpenCL C has no built-in float atomic add, and the
    /// `atomic_add_global` the kernel calls was not defined anywhere in the
    /// program, so the source could not build. The helper is now part of the
    /// source string and implements the add as a compare-and-swap loop over
    /// the float's bit pattern.
    ///
    /// Launch requirements that follow from the source:
    /// * `*result` must be zeroed by the caller (each work-group only adds
    ///   to it);
    /// * the work-group size must not exceed 256, the size of `local_sum`;
    /// * the halving reduction only folds every element when the work-group
    ///   size is a power of two;
    /// * each work-item handles a single element, so the global size must
    ///   cover `n`.
    pub const DOT_PRODUCT_KERNEL: &str = r#"
        // Float atomic add built from a 32-bit compare-and-swap
        inline void atomic_add_global(volatile __global float* addr, float val) {
            union { unsigned int bits; float value; } expected, desired;
            do {
                expected.value = *addr;
                desired.value = expected.value + val;
            } while (atomic_cmpxchg((volatile __global unsigned int*)addr,
                                    expected.bits, desired.bits) != expected.bits);
        }

        __kernel void dot_product(__global float* a, __global float* b, __global float* result, int n) {
            __local float local_sum[256];
            int idx = get_global_id(0);
            int lid = get_local_id(0);

            float sum = 0.0f;
            if (idx < n) {
                sum = a[idx] * b[idx];
            }

            local_sum[lid] = sum;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Reduction
            for (int s = get_local_size(0) / 2; s > 0; s >>= 1) {
                if (lid < s) {
                    local_sum[lid] += local_sum[lid + s];
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            if (lid == 0) {
                atomic_add_global(result, local_sum[0]);
            }
        }
    "#;

    /// Row-major matrix product `c (m x n) = a (m x k) * b (k x n)`, one
    /// output element per work-item (dimension 0 indexes columns,
    /// dimension 1 indexes rows).
    pub const MATRIX_MUL_KERNEL: &str = r#"
        __kernel void matrix_mul(__global float* a, __global float* b, __global float* c, int m, int n, int k) {
            int row = get_global_id(1);
            int col = get_global_id(0);

            if (row < m && col < n) {
                float sum = 0.0f;
                for (int i = 0; i < k; i++) {
                    sum += a[row * k + i] * b[i * n + col];
                }
                c[row * n + col] = sum;
            }
        }
    "#;
}
488}
489
490pub struct GpuManager {
492 cuda_devices: Vec<cuda::CudaDevice>,
493 opencl_devices: Vec<opencl::OpenCLDevice>,
494 preferred_backend: Option<GpuBackend>,
495}
496
497impl GpuManager {
498 pub fn new() -> Self {
499 Self {
500 cuda_devices: Vec::new(),
501 opencl_devices: Vec::new(),
502 preferred_backend: None,
503 }
504 }
505
506 pub fn initialize(&mut self) -> Result<(), SimdError> {
508 if let Ok(count) = cuda::CudaDevice::get_device_count() {
510 for i in 0..count {
511 if let Ok(device) = cuda::CudaDevice::new(i) {
512 self.cuda_devices.push(device);
513 }
514 }
515 }
516
517 if let Ok(platforms) = opencl::OpenCLDevice::get_platforms() {
519 for (platform_id, _platform) in platforms.iter().enumerate() {
520 if let Ok(devices) = opencl::OpenCLDevice::get_devices(platform_id as u32) {
521 for device in devices {
522 if let Ok(opencl_device) = opencl::OpenCLDevice::new(device.id) {
523 self.opencl_devices.push(opencl_device);
524 }
525 }
526 }
527 }
528 }
529
530 if !self.cuda_devices.is_empty() {
532 self.preferred_backend = Some(GpuBackend::Cuda);
533 } else if !self.opencl_devices.is_empty() {
534 self.preferred_backend = Some(GpuBackend::OpenCL);
535 }
536
537 Ok(())
538 }
539
540 pub fn get_devices(&self) -> Vec<GpuDevice> {
542 let mut devices = Vec::new();
543
544 for (i, _) in self.cuda_devices.iter().enumerate() {
545 if let Ok(device) = cuda::CudaDevice::get_device_info(i as u32) {
546 devices.push(device);
547 }
548 }
549
550 for (i, _) in self.opencl_devices.iter().enumerate() {
552 devices.push(GpuDevice {
553 id: i as u32,
554 name: format!("OpenCL Device {}", i),
555 backend: GpuBackend::OpenCL,
556 compute_units: 16,
557 memory_mb: 4096,
558 supports_f64: true,
559 supports_f16: false,
560 });
561 }
562
563 devices
564 }
565
566 pub fn get_best_device(&self) -> Option<GpuDevice> {
568 let devices = self.get_devices();
569 devices
570 .into_iter()
571 .max_by_key(|d| d.compute_units * (d.memory_mb / 1024) as u32)
572 }
573
574 pub fn is_available(&self) -> bool {
576 !self.cuda_devices.is_empty() || !self.opencl_devices.is_empty()
577 }
578}
579
580impl Default for GpuManager {
581 fn default() -> Self {
582 Self::new()
583 }
584}
585
586use once_cell::sync::Lazy;
588pub static GPU_MANAGER: Lazy<Mutex<GpuManager>> = Lazy::new(|| Mutex::new(GpuManager::new()));
589
590pub fn initialize_gpu() -> Result<(), SimdError> {
592 #[cfg(not(feature = "no-std"))]
593 let mut manager = GPU_MANAGER
594 .lock()
595 .map_err(|_| SimdError::ExternalLibraryError("Failed to lock GPU manager".to_string()))?;
596 #[cfg(feature = "no-std")]
597 let mut manager = GPU_MANAGER.lock();
598 manager.initialize()
599}
600
601pub fn is_gpu_available() -> bool {
603 #[cfg(not(feature = "no-std"))]
604 {
605 if let Ok(manager) = GPU_MANAGER.lock() {
606 manager.is_available()
607 } else {
608 false
609 }
610 }
611 #[cfg(feature = "no-std")]
612 {
613 let manager = GPU_MANAGER.lock();
614 manager.is_available()
615 }
616}
617
618pub fn get_gpu_devices() -> Vec<GpuDevice> {
620 #[cfg(not(feature = "no-std"))]
621 {
622 if let Ok(manager) = GPU_MANAGER.lock() {
623 manager.get_devices()
624 } else {
625 Vec::new()
626 }
627 }
628 #[cfg(feature = "no-std")]
629 {
630 let manager = GPU_MANAGER.lock();
631 manager.get_devices()
632 }
633}
634
// Unit tests. They only build with the standard library, so the former
// `no-std`-gated `alloc` import (which could never be active inside this
// module) has been removed.
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;

    /// A fresh manager is completely empty.
    #[test]
    fn test_gpu_manager_creation() {
        let manager = GpuManager::new();
        assert_eq!(manager.cuda_devices.len(), 0);
        assert_eq!(manager.opencl_devices.len(), 0);
        assert!(manager.preferred_backend.is_none());
        assert!(!manager.is_available());
        assert!(manager.get_devices().is_empty());
        assert!(manager.get_best_device().is_none());
    }

    /// `GpuDevice` is plain data and keeps what it was given.
    #[test]
    fn test_gpu_device_creation() {
        let device = GpuDevice {
            id: 0,
            name: "Test Device".to_string(),
            backend: GpuBackend::Cuda,
            compute_units: 80,
            memory_mb: 8192,
            supports_f64: true,
            supports_f16: true,
        };

        assert_eq!(device.id, 0);
        assert_eq!(device.name, "Test Device");
        assert_eq!(device.backend, GpuBackend::Cuda);
        assert!(device.supports_f64);
    }

    /// The default launch configuration is one 256-wide block.
    #[test]
    fn test_kernel_config_default() {
        let config = KernelConfig::default();
        assert_eq!(config.grid_size, (1, 1, 1));
        assert_eq!(config.block_size, (256, 1, 1));
        assert_eq!(config.shared_memory, 0);
        assert!(config.stream.is_none());
    }

    /// Device creation is only exercised when the backend reports devices.
    #[test]
    fn test_cuda_device_creation() {
        if cuda::CudaDevice::get_device_count().is_ok() {
            match cuda::CudaDevice::new(0) {
                Ok(_device) => {}
                Err(SimdError::UnsupportedOperation(_)) => {}
                Err(_) => panic!("Unexpected error type"),
            }
        }
    }

    /// Platform listing either succeeds with content or is unsupported.
    #[test]
    fn test_opencl_platforms() {
        match opencl::OpenCLDevice::get_platforms() {
            Ok(platforms) => {
                assert!(!platforms.is_empty());
            }
            Err(SimdError::UnsupportedOperation(_)) => {}
            Err(_) => panic!("Unexpected error type"),
        }
    }

    /// Global initialization succeeds or reports the backend as unsupported.
    #[test]
    fn test_gpu_initialization() {
        let result = initialize_gpu();
        assert!(result.is_ok() || matches!(result, Err(SimdError::UnsupportedOperation(_))));
    }

    /// Smoke test: the availability query must not panic.
    #[test]
    fn test_gpu_availability_check() {
        let _available = is_gpu_available();
    }

    /// Smoke test: the device query must not panic.
    #[test]
    fn test_get_devices() {
        let _devices = get_gpu_devices();
    }
}