rustframes/array/gpu.rs

#[cfg(feature = "cuda")]
pub mod cuda {
    use super::super::Array;
    use cudarc::driver::*;
    use cudarc::nvrtc::compile_ptx;
    use std::collections::HashMap;
    use std::sync::Arc;

    pub struct GpuArray {
        pub data: CudaSlice<f32>,
        pub shape: Vec<usize>,
        pub device: Arc<CudaDevice>,
    }

    impl GpuArray {
        /// Create a GPU array by copying a CPU `Array<f64>` to the device (values are narrowed to `f32`).
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            let device = CudaDevice::new(0)?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();

            let gpu_data = device.htod_sync_copy(&f32_data)?;

            Ok(GpuArray {
                data: gpu_data,
                shape: cpu_array.shape.clone(),
                device: Arc::new(device),
            })
        }

        /// Transfer back to CPU
        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let cpu_data: Vec<f32> = self.device.dtoh_sync_copy(&self.data)?;
            let f64_data: Vec<f64> = cpu_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

        /// GPU matrix multiplication using cuBLAS
        pub fn matmul_gpu(&self, other: &GpuArray) -> Result<GpuArray, Box<dyn std::error::Error>> {
            use cudarc::cublas::{CudaBlas, GemmConfig};

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            let blas = CudaBlas::new(self.device.clone())?;
            let mut result = self.device.alloc_zeros::<f32>(m * n)?;

            let config = GemmConfig {
                transa: cudarc::cublas::Transpose::NoTrans,
                transb: cudarc::cublas::Transpose::NoTrans,
                m: m as i32,
                n: n as i32,
                k: k as i32,
                alpha: 1.0,
                lda: k as i32,
                ldb: n as i32,
                beta: 0.0,
                ldc: n as i32,
            };

            unsafe {
                blas.gemm(config, &self.data, &other.data, &mut result)?;
            }

            Ok(GpuArray {
                data: result,
                shape: vec![m, n],
                device: self.device.clone(),
            })
        }

        /// Element-wise addition using a custom CUDA kernel
        pub fn add_gpu(&self, other: &GpuArray) -> Result<GpuArray, Box<dyn std::error::Error>> {
            assert_eq!(self.shape, other.shape, "Arrays must have same shape");

            let kernel_src = r#"
            extern "C" __global__ void add_kernel(float* a, float* b, float* c, int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                    c[idx] = a[idx] + b[idx];
                }
            }
            "#;

            let ptx = compile_ptx(kernel_src)?;
            self.device.load_ptx(ptx, "add_kernel", &["add_kernel"])?;

            let kernel = self
                .device
                .get_func("add_kernel", "add_kernel")
                .ok_or("add_kernel not found after load_ptx")?;
            let mut result = self.device.alloc_zeros::<f32>(self.data.len())?;

            let n = self.data.len();
            let block_size = 256;
            // Ceiling division so every element is covered by exactly one thread.
            let grid_size = (n + block_size - 1) / block_size;

            unsafe {
                kernel.launch(
                    LaunchConfig {
                        grid_dim: (grid_size as u32, 1, 1),
                        block_dim: (block_size as u32, 1, 1),
                        shared_mem_bytes: 0,
                    },
                    (&self.data, &other.data, &mut result, n as i32),
                )?;
            }

            self.device.synchronize()?;

            Ok(GpuArray {
                data: result,
                shape: self.shape.clone(),
                device: self.device.clone(),
            })
        }

        /// CUDA-accelerated reduction operations
        pub fn sum_gpu(&self) -> Result<f32, Box<dyn std::error::Error>> {
            let kernel_src = r#"
            extern "C" __global__ void sum_reduce_kernel(float* input, float* output, int n) {
                extern __shared__ float sdata[];

                int tid = threadIdx.x;
                int idx = blockIdx.x * blockDim.x + threadIdx.x;

                // Load data into shared memory
                sdata[tid] = (idx < n) ? input[idx] : 0.0f;
                __syncthreads();

                // Reduction in shared memory
                for (int s = blockDim.x / 2; s > 0; s >>= 1) {
                    if (tid < s) {
                        sdata[tid] += sdata[tid + s];
                    }
                    __syncthreads();
                }

                // Write result of this block to global memory
                if (tid == 0) {
                    output[blockIdx.x] = sdata[0];
                }
            }
            "#;

            let ptx = compile_ptx(kernel_src)?;
            self.device
                .load_ptx(ptx, "sum_reduce", &["sum_reduce_kernel"])?;

            let kernel = self
                .device
                .get_func("sum_reduce", "sum_reduce_kernel")
                .ok_or("sum_reduce_kernel not found after load_ptx")?;

            let n = self.data.len();
            let block_size = 256;
            // Ceiling division: one block per 256 elements, one partial sum per block.
            let grid_size = (n + block_size - 1) / block_size;

            let mut partial_sums = self.device.alloc_zeros::<f32>(grid_size)?;

            unsafe {
                kernel.launch(
                    LaunchConfig {
                        grid_dim: (grid_size as u32, 1, 1),
                        block_dim: (block_size as u32, 1, 1),
                        shared_mem_bytes: (block_size * std::mem::size_of::<f32>()) as u32,
                    },
                    (&self.data, &mut partial_sums, n as i32),
                )?;
            }

            self.device.synchronize()?;

            // Each block wrote one partial sum; finish the reduction on the host.
            let cpu_partial: Vec<f32> = self.device.dtoh_sync_copy(&partial_sums)?;
            Ok(cpu_partial.iter().sum())
        }

        /// GPU-accelerated 2D convolution via cuDNN (both arrays are treated as single-channel NCHW tensors)
        pub fn conv2d_gpu(
            &self,
            kernel: &GpuArray,
        ) -> Result<GpuArray, Box<dyn std::error::Error>> {
            use cudarc::cudnn::*;

            let cudnn = CudnnHandle::new(self.device.clone())?;

            let input_desc = TensorDescriptor::new(
                DataType::Float,
                TensorFormat::NCHW,
                &[1, 1, self.shape[0] as i32, self.shape[1] as i32],
            )?;

            let kernel_desc = FilterDescriptor::new(
                DataType::Float,
                TensorFormat::NCHW,
                &[1, 1, kernel.shape[0] as i32, kernel.shape[1] as i32],
            )?;

            let conv_desc = ConvolutionDescriptor::new(
                &[0, 0], // padding
                &[1, 1], // stride
                &[1, 1], // dilation
                ConvolutionMode::CrossCorrelation,
                DataType::Float,
            )?;

            let mut output_dims = [0i32; 4];
            cudnn.get_convolution_nd_forward_output_dim(
                &conv_desc,
                &input_desc,
                &kernel_desc,
                &mut output_dims,
            )?;

            let output_desc =
                TensorDescriptor::new(DataType::Float, TensorFormat::NCHW, &output_dims)?;

            let output_size = output_dims.iter().product::<i32>() as usize;
            let mut output = self.device.alloc_zeros::<f32>(output_size)?;

            let algo = cudnn.find_convolution_forward_algorithm(
                &input_desc,
                &self.data,
                &kernel_desc,
                &kernel.data,
                &conv_desc,
                &output_desc,
                &mut output,
            )?;

            cudnn.convolution_forward(
                1.0,
                &input_desc,
                &self.data,
                &kernel_desc,
                &kernel.data,
                &conv_desc,
                algo,
                0.0,
                &output_desc,
                &mut output,
            )?;

            Ok(GpuArray {
                data: output,
                shape: vec![output_dims[2] as usize, output_dims[3] as usize],
                device: self.device.clone(),
            })
        }
    }

    /// GPU-accelerated array operations
    impl Array<f64> {
        /// Send array to GPU for computation
        pub fn to_gpu(&self) -> Result<GpuArray, Box<dyn std::error::Error>> {
            GpuArray::from_cpu_array(self)
        }

        /// Perform GPU-accelerated matrix multiplication
        pub fn matmul_gpu(
            &self,
            other: &Array<f64>,
        ) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let gpu_a = self.to_gpu()?;
            let gpu_b = other.to_gpu()?;
            let gpu_result = gpu_a.matmul_gpu(&gpu_b)?;
            gpu_result.to_cpu_array()
        }

        /// GPU-accelerated element-wise addition
        pub fn add_gpu(
            &self,
            other: &Array<f64>,
        ) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let gpu_a = self.to_gpu()?;
            let gpu_b = other.to_gpu()?;
            let gpu_result = gpu_a.add_gpu(&gpu_b)?;
            gpu_result.to_cpu_array()
        }

        /// GPU-accelerated sum reduction
        pub fn sum_gpu(&self) -> Result<f64, Box<dyn std::error::Error>> {
            let gpu_array = self.to_gpu()?;
            let result = gpu_array.sum_gpu()?;
            Ok(result as f64)
        }
    }
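
    // Usage sketch (not part of the library surface): assumes a CUDA-capable
    // device 0 and that `Array::from_vec(data, shape)` builds a row-major
    // array, as it is used elsewhere in this module. The hypothetical helper
    // below just shows the round trip CPU -> GPU -> CPU through the methods above.
    #[allow(dead_code)]
    fn example_gpu_roundtrip() -> Result<(), Box<dyn std::error::Error>> {
        let a = Array::from_vec(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2]);
        let b = Array::from_vec(vec![5.0, 6.0, 7.0, 8.0], vec![2, 2]);

        // Matrix product computed on the device via cuBLAS, then copied back.
        let c = a.matmul_gpu(&b)?;
        assert_eq!(c.shape, vec![2usize, 2]);

        // Element-wise addition and sum reduction via the custom kernels.
        let _d = a.add_gpu(&b)?;
        let total = a.sum_gpu()?; // 1 + 2 + 3 + 4 = 10 (up to f32 rounding)
        assert!((total - 10.0).abs() < 1e-3);

        Ok(())
    }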

    // Memory pool for efficient GPU memory management
    pub struct GpuMemoryPool {
        device: Arc<CudaDevice>,
        free_buffers: HashMap<usize, Vec<CudaSlice<f32>>>,
        used_buffers: Vec<CudaSlice<f32>>,
    }

    impl GpuMemoryPool {
        pub fn new(device: Arc<CudaDevice>) -> Self {
            GpuMemoryPool {
                device,
                free_buffers: HashMap::new(),
                used_buffers: Vec::new(),
            }
        }

        pub fn allocate(
            &mut self,
            size: usize,
        ) -> Result<CudaSlice<f32>, Box<dyn std::error::Error>> {
            if let Some(buffers) = self.free_buffers.get_mut(&size) {
                if let Some(buffer) = buffers.pop() {
                    self.used_buffers.push(buffer.clone());
                    return Ok(buffer);
                }
            }

            // Allocate new buffer
            let buffer = self.device.alloc_zeros::<f32>(size)?;
            self.used_buffers.push(buffer.clone());
            Ok(buffer)
        }

        pub fn deallocate(&mut self, buffer: CudaSlice<f32>) {
            let size = buffer.len();
            if let Some(pos) = self
                .used_buffers
                .iter()
                .position(|b| std::ptr::eq(b.as_ptr(), buffer.as_ptr()))
            {
                self.used_buffers.remove(pos);
                self.free_buffers
                    .entry(size)
                    .or_insert_with(Vec::new)
                    .push(buffer);
            }
        }

        pub fn clear(&mut self) {
            self.free_buffers.clear();
            self.used_buffers.clear();
        }
    }
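
    // Usage sketch for the pool (hypothetical caller, not part of the API):
    // buffers are keyed by element count, so releasing a buffer and then
    // requesting the same size again is served from the free list instead of
    // a fresh device allocation.
    #[allow(dead_code)]
    fn example_pool_reuse(device: Arc<CudaDevice>) -> Result<(), Box<dyn std::error::Error>> {
        let mut pool = GpuMemoryPool::new(device);

        let buf = pool.allocate(1024)?; // fresh allocation of 1024 f32 values
        pool.deallocate(buf); // returned to the free list for size 1024
        let _reused = pool.allocate(1024)?; // served from the free list, no new allocation

        pool.clear(); // drop all pooled buffers
        Ok(())
    }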
}

// ROCm/HIP support for AMD GPUs
#[cfg(feature = "rocm")]
pub mod rocm {
    use super::super::Array;
    use hip_rs::*;

    pub struct HipArray {
        pub data: DeviceBuffer<f32>,
        pub shape: Vec<usize>,
        pub device: HipDevice,
    }

    impl HipArray {
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            let device = HipDevice::new(0)?;
            device.set_current()?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();
            let gpu_data = device.alloc_and_copy(&f32_data)?;

            Ok(HipArray {
                data: gpu_data,
                shape: cpu_array.shape.clone(),
                device,
            })
        }

        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            self.device.set_current()?;
            let cpu_data: Vec<f32> = self.device.copy_to_host(&self.data)?;
            let f64_data: Vec<f64> = cpu_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

        pub fn matmul_rocm(
            &self,
            other: &HipArray,
        ) -> Result<HipArray, Box<dyn std::error::Error>> {
            use rocblas_rs::*;

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            self.device.set_current()?;
            let rocblas_handle = RocblasHandle::new()?;
            let mut result = self.device.alloc_zeros::<f32>(m * n)?;

            rocblas_handle.sgemm(
                RocblasOperation::None,
                RocblasOperation::None,
                m as i32,
                n as i32,
                k as i32,
                1.0, // alpha
                &self.data,
                k as i32, // lda
                &other.data,
                n as i32, // ldb
                0.0,      // beta
                &mut result,
                n as i32, // ldc
            )?;

            Ok(HipArray {
                data: result,
                shape: vec![m, n],
                device: self.device.clone(),
            })
        }
    }
}

// Metal Performance Shaders for Apple Silicon
#[cfg(feature = "metal")]
pub mod metal {
    use super::super::Array;
    use metal::*;
    use objc::rc::autoreleasepool;

    pub struct MetalArray {
        pub buffer: Buffer,
        pub shape: Vec<usize>,
        pub device: Device,
    }

    impl MetalArray {
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            let device = Device::system_default().ok_or("No Metal device found")?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();
            let buffer = device.new_buffer_with_data(
                f32_data.as_ptr() as *const std::ffi::c_void,
                (f32_data.len() * std::mem::size_of::<f32>()) as u64,
                MTLResourceOptions::StorageModeShared,
            );

            Ok(MetalArray {
                buffer,
                shape: cpu_array.shape.clone(),
                device,
            })
        }

        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let ptr = self.buffer.contents() as *const f32;
            let len = self.buffer.length() as usize / std::mem::size_of::<f32>();

            let f32_data = unsafe { std::slice::from_raw_parts(ptr, len) };
            let f64_data: Vec<f64> = f32_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

        pub fn matmul_metal(
            &self,
            other: &MetalArray,
        ) -> Result<MetalArray, Box<dyn std::error::Error>> {
            use metal_performance_shaders::*;

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            autoreleasepool(|| {
                let command_queue = self.device.new_command_queue();
                let command_buffer = command_queue.new_command_buffer();

                // Create matrix descriptors
                let desc_a = MPSMatrixDescriptor::matrix_descriptor(
                    m as NSUInteger,
                    k as NSUInteger,
                    k as NSUInteger,
                    MPSDataType::Float32,
                );

                let desc_b = MPSMatrixDescriptor::matrix_descriptor(
                    k as NSUInteger,
                    n as NSUInteger,
                    n as NSUInteger,
                    MPSDataType::Float32,
                );

                let desc_c = MPSMatrixDescriptor::matrix_descriptor(
                    m as NSUInteger,
                    n as NSUInteger,
                    n as NSUInteger,
                    MPSDataType::Float32,
                );

                // Create result buffer
                let result_buffer = self.device.new_buffer(
                    (m * n * std::mem::size_of::<f32>()) as u64,
                    MTLResourceOptions::StorageModeShared,
                );

                // Create MPS matrices
                let matrix_a = MPSMatrix::new(&self.buffer, &desc_a);
                let matrix_b = MPSMatrix::new(&other.buffer, &desc_b);
                let matrix_c = MPSMatrix::new(&result_buffer, &desc_c);

                // Perform matrix multiplication
                let matmul =
                    MPSMatrixMultiplication::new(&self.device, false, false, m, n, k, 1.0, 0.0);
                matmul.encode_to_command_buffer(&command_buffer, &matrix_a, &matrix_b, &matrix_c);

                command_buffer.commit();
                command_buffer.wait_until_completed();

                Ok(MetalArray {
                    buffer: result_buffer,
                    shape: vec![m, n],
                    device: self.device.clone(),
                })
            })
        }
    }
}

// Unified GPU interface
pub enum GpuBackend {
    #[cfg(feature = "cuda")]
    Cuda(cuda::GpuArray),
    #[cfg(feature = "rocm")]
    Rocm(rocm::HipArray),
    #[cfg(feature = "metal")]
    Metal(metal::MetalArray),
    Cpu, // Fallback to CPU
}

impl GpuBackend {
    pub fn detect_best_backend() -> Self {
        #[cfg(feature = "cuda")]
        {
            if cudarc::driver::CudaDevice::new(0).is_ok() {
                return GpuBackend::Cpu; // Placeholder - would create CUDA backend
            }
        }

        #[cfg(feature = "rocm")]
        {
            if hip_rs::HipDevice::new(0).is_ok() {
                return GpuBackend::Cpu; // Placeholder - would create ROCm backend
            }
        }

        #[cfg(feature = "metal")]
        {
            // `::metal` refers to the external crate, not the local `metal` module above.
            if ::metal::Device::system_default().is_some() {
                return GpuBackend::Cpu; // Placeholder - would create Metal backend
            }
        }

        GpuBackend::Cpu
    }

    pub fn matmul(&self, other: &GpuBackend) -> Result<GpuBackend, Box<dyn std::error::Error>> {
        match (self, other) {
            #[cfg(feature = "cuda")]
            (GpuBackend::Cuda(a), GpuBackend::Cuda(b)) => Ok(GpuBackend::Cuda(a.matmul_gpu(b)?)),
            #[cfg(feature = "rocm")]
            (GpuBackend::Rocm(a), GpuBackend::Rocm(b)) => Ok(GpuBackend::Rocm(a.matmul_rocm(b)?)),
            #[cfg(feature = "metal")]
            (GpuBackend::Metal(a), GpuBackend::Metal(b)) => {
                Ok(GpuBackend::Metal(a.matmul_metal(b)?))
            }
            _ => Err("Incompatible GPU backends or CPU fallback needed".into()),
        }
    }
}
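
// Usage sketch (hypothetical caller): `detect_best_backend` currently returns
// the CPU fallback even when a device is found (see the placeholders above),
// so callers should expect `matmul` to fail until the detected backend wraps
// an actual device array.
#[allow(dead_code)]
fn example_backend_dispatch(a: &GpuBackend, b: &GpuBackend) {
    let _backend = GpuBackend::detect_best_backend();
    match a.matmul(b) {
        Ok(_product) => { /* both operands lived on the same GPU backend */ }
        Err(e) => eprintln!("GPU matmul unavailable, CPU fallback needed: {e}"),
    }
}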

pub struct GpuTuner {
    device_info: DeviceInfo,
    optimal_configs: std::collections::HashMap<String, TuningConfig>,
}

#[derive(Debug, Clone)]
pub struct DeviceInfo {
    pub name: String,
    pub memory_gb: f32,
    pub compute_capability: String,
    pub max_threads_per_block: u32,
    pub max_shared_memory: u32,
}

#[derive(Debug, Clone)]
pub struct TuningConfig {
    pub block_size: u32,
    pub grid_size_multiplier: f32,
    pub shared_memory_usage: f32,
    pub register_usage: f32,
}

impl Default for GpuTuner {
    fn default() -> Self {
        Self::new()
    }
}

impl GpuTuner {
    pub fn new() -> Self {
        GpuTuner {
            device_info: Self::query_device_info(),
            optimal_configs: std::collections::HashMap::new(),
        }
    }

    fn query_device_info() -> DeviceInfo {
        #[cfg(feature = "cuda")]
        {
            if let Ok(device) = cudarc::driver::CudaDevice::new(0) {
                return DeviceInfo {
                    name: device.name().unwrap_or_default(),
                    memory_gb: device.total_memory().unwrap_or(0) as f32 / 1e9,
                    compute_capability: format!(
                        "{}.{}",
                        device.compute_capability().0,
                        device.compute_capability().1
                    ),
                    max_threads_per_block: 1024, // Common for modern GPUs
                    max_shared_memory: 48 * 1024, // 48KB for modern GPUs
                };
            }
        }

        // Fallback device info
        DeviceInfo {
            name: "Unknown".to_string(),
            memory_gb: 0.0,
            compute_capability: "0.0".to_string(),
            max_threads_per_block: 256,
            max_shared_memory: 16 * 1024,
        }
    }

    pub fn tune_matmul(&mut self, m: usize, n: usize, k: usize) -> TuningConfig {
        let key = format!("matmul_{}x{}x{}", m, n, k);

        if let Some(config) = self.optimal_configs.get(&key) {
            return config.clone();
        }

        // Auto-tune based on matrix size and device capabilities
        let block_size = if m * n > 1_000_000 {
            self.device_info.max_threads_per_block
        } else if m * n > 100_000 {
            512
        } else {
            256
        };

        let config = TuningConfig {
            block_size,
            grid_size_multiplier: 1.0,
            shared_memory_usage: 0.8, // Use 80% of available shared memory
            register_usage: 0.7,      // Conservative register usage
        };

        self.optimal_configs.insert(key, config.clone());
        config
    }

    pub fn benchmark_configuration(
        &self,
        _config: &TuningConfig,
        _operation: &str,
    ) -> Result<f64, Box<dyn std::error::Error>> {
        // Placeholder for actual benchmarking
        Ok(1.0) // milliseconds
    }
}
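
// Usage sketch (hypothetical caller): the tuner caches one launch
// configuration per problem size, so a repeated (m, n, k) query is served
// from the cache rather than re-derived.
#[allow(dead_code)]
fn example_tuning() {
    let mut tuner = GpuTuner::new();

    let first = tuner.tune_matmul(2048, 2048, 2048); // large problem -> largest block size
    let second = tuner.tune_matmul(2048, 2048, 2048); // cached result
    assert_eq!(first.block_size, second.block_size);

    let small = tuner.tune_matmul(64, 64, 64); // small problem -> 256-thread blocks
    assert_eq!(small.block_size, 256);
}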