#[cfg(feature = "cuda")]
pub mod cuda {
    use super::super::Array;
    use cudarc::driver::*;
    use cudarc::nvrtc::compile_ptx;
    use std::collections::HashMap;
    use std::sync::Arc;

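    /// A dense array resident on a single CUDA device. Values are stored as
    /// f32 in row-major order; `shape` records the logical dimensions, and
    /// the `Arc` lets results share one device handle with their inputs.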
    pub struct GpuArray {
        pub data: CudaSlice<f32>,
        pub shape: Vec<usize>,
        pub device: Arc<CudaDevice>,
    }

    impl GpuArray {
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            // CudaDevice::new returns an Arc<CudaDevice> in recent cudarc
            // releases, so the handle can be stored directly.
            let device = CudaDevice::new(0)?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();

            let gpu_data = device.htod_sync_copy(&f32_data)?;

            Ok(GpuArray {
                data: gpu_data,
                shape: cpu_array.shape.clone(),
                device,
            })
        }

        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let cpu_data: Vec<f32> = self.device.dtoh_sync_copy(&self.data)?;
            let f64_data: Vec<f64> = cpu_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

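        /// Multiplies two 2-D arrays with cuBLAS SGEMM. cuBLAS expects
        /// column-major storage, so the call below computes C^T = B^T * A^T
        /// with swapped operands and dimensions, which leaves the row-major
        /// m x n product in the output buffer without an explicit transpose.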
        pub fn matmul_gpu(&self, other: &GpuArray) -> Result<GpuArray, Box<dyn std::error::Error>> {
            use cudarc::cublas::{CudaBlas, Gemm, GemmConfig};

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            let blas = CudaBlas::new(self.device.clone())?;
            let mut result = self.device.alloc_zeros::<f32>(m * n)?;

            // Swap the roles of A and B (and of m and n) so the column-major
            // SGEMM yields a row-major result; see the doc comment above.
            let config = GemmConfig {
                transa: cudarc::cublas::Transpose::NoTrans,
                transb: cudarc::cublas::Transpose::NoTrans,
                m: n as i32,
                n: m as i32,
                k: k as i32,
                alpha: 1.0,
                lda: n as i32,
                ldb: k as i32,
                beta: 0.0,
                ldc: n as i32,
            };

            unsafe {
                blas.gemm(config, &other.data, &self.data, &mut result)?;
            }

            Ok(GpuArray {
                data: result,
                shape: vec![m, n],
                device: self.device.clone(),
            })
        }

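        /// Element-wise addition via a kernel JIT-compiled with NVRTC. The
        /// PTX is recompiled and reloaded on every call for simplicity; a
        /// production version would compile once and cache the module.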
        pub fn add_gpu(&self, other: &GpuArray) -> Result<GpuArray, Box<dyn std::error::Error>> {
            assert_eq!(self.shape, other.shape, "Arrays must have same shape");

            let kernel_src = r#"
                extern "C" __global__ void add_kernel(float* a, float* b, float* c, int n) {
                    int idx = blockIdx.x * blockDim.x + threadIdx.x;
                    if (idx < n) {
                        c[idx] = a[idx] + b[idx];
                    }
                }
            "#;

            let ptx = compile_ptx(kernel_src)?;
            self.device.load_ptx(ptx, "add_kernel", &["add_kernel"])?;

            let kernel = self
                .device
                .get_func("add_kernel", "add_kernel")
                .ok_or("add_kernel not loaded")?;
            let mut result = self.device.alloc_zeros::<f32>(self.data.len())?;

            let n = self.data.len();
            let block_size = 256;
            // Ceiling division: enough blocks to cover all n elements.
            let grid_size = (n + block_size - 1) / block_size;

            unsafe {
                kernel.launch(
                    LaunchConfig {
                        grid_dim: (grid_size as u32, 1, 1),
                        block_dim: (block_size as u32, 1, 1),
                        shared_mem_bytes: 0,
                    },
                    (&self.data, &other.data, &mut result, n as i32),
                )?;
            }

            self.device.synchronize()?;

            Ok(GpuArray {
                data: result,
                shape: self.shape.clone(),
                device: self.device.clone(),
            })
        }

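        /// Sums all elements with a two-stage reduction: each block folds its
        /// slice in shared memory (a tree reduction halving the stride each
        /// step), then the per-block partial sums are copied back and
        /// finished on the host.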
        pub fn sum_gpu(&self) -> Result<f32, Box<dyn std::error::Error>> {
            let kernel_src = r#"
                extern "C" __global__ void sum_reduce_kernel(float* input, float* output, int n) {
                    extern __shared__ float sdata[];

                    int tid = threadIdx.x;
                    int idx = blockIdx.x * blockDim.x + threadIdx.x;

                    // Load data into shared memory
                    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
                    __syncthreads();

                    // Tree reduction in shared memory (blockDim.x must be a power of two)
                    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
                        if (tid < s) {
                            sdata[tid] += sdata[tid + s];
                        }
                        __syncthreads();
                    }

                    // Write this block's result to global memory
                    if (tid == 0) {
                        output[blockIdx.x] = sdata[0];
                    }
                }
            "#;

            let ptx = compile_ptx(kernel_src)?;
            self.device
                .load_ptx(ptx, "sum_reduce", &["sum_reduce_kernel"])?;

            let kernel = self
                .device
                .get_func("sum_reduce", "sum_reduce_kernel")
                .ok_or("sum_reduce_kernel not loaded")?;

            let n = self.data.len();
            let block_size = 256;
            // Ceiling division: one partial sum per block.
            let grid_size = (n + block_size - 1) / block_size;

            let mut partial_sums = self.device.alloc_zeros::<f32>(grid_size)?;

            unsafe {
                kernel.launch(
                    LaunchConfig {
                        grid_dim: (grid_size as u32, 1, 1),
                        block_dim: (block_size as u32, 1, 1),
                        shared_mem_bytes: (block_size * std::mem::size_of::<f32>()) as u32,
                    },
                    (&self.data, &mut partial_sums, n as i32),
                )?;
            }

            self.device.synchronize()?;

            let cpu_partial: Vec<f32> = self.device.dtoh_sync_copy(&partial_sums)?;
            Ok(cpu_partial.iter().sum())
        }

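        /// 2-D convolution through cuDNN. This is a sketch of the
        /// descriptor-style cuDNN workflow (tensor/filter/convolution
        /// descriptors, algorithm search, then the forward call); the exact
        /// type and method names vary between cuDNN binding versions, so
        /// expect to adapt them to the binding in use.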
        pub fn conv2d_gpu(
            &self,
            kernel: &GpuArray,
        ) -> Result<GpuArray, Box<dyn std::error::Error>> {
            use cudarc::cudnn::*;

            let cudnn = CudnnHandle::new(self.device.clone())?;

            let input_desc = TensorDescriptor::new(
                DataType::Float,
                TensorFormat::NCHW,
                &[1, 1, self.shape[0] as i32, self.shape[1] as i32],
            )?;

            let kernel_desc = FilterDescriptor::new(
                DataType::Float,
                TensorFormat::NCHW,
                &[1, 1, kernel.shape[0] as i32, kernel.shape[1] as i32],
            )?;

            let conv_desc = ConvolutionDescriptor::new(
                &[0, 0],
                &[1, 1],
                &[1, 1],
                ConvolutionMode::CrossCorrelation,
                DataType::Float,
            )?;

            let mut output_dims = [0i32; 4];
            cudnn.get_convolution_nd_forward_output_dim(
                &conv_desc,
                &input_desc,
                &kernel_desc,
                &mut output_dims,
            )?;

            let output_desc =
                TensorDescriptor::new(DataType::Float, TensorFormat::NCHW, &output_dims)?;

            let output_size = output_dims.iter().product::<i32>() as usize;
            let mut output = self.device.alloc_zeros::<f32>(output_size)?;

            let algo = cudnn.find_convolution_forward_algorithm(
                &input_desc,
                &self.data,
                &kernel_desc,
                &kernel.data,
                &conv_desc,
                &output_desc,
                &mut output,
            )?;

            cudnn.convolution_forward(
                1.0,
                &input_desc,
                &self.data,
                &kernel_desc,
                &kernel.data,
                &conv_desc,
                algo,
                0.0,
                &output_desc,
                &mut output,
            )?;

            Ok(GpuArray {
                data: output,
                shape: vec![output_dims[2] as usize, output_dims[3] as usize],
                device: self.device.clone(),
            })
        }
    }

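    // Convenience wrappers that round-trip Array<f64> through the GPU. Each
    // call pays host<->device transfer and f64<->f32 conversion costs, so
    // these only pay off for reasonably large arrays.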
    impl Array<f64> {
        pub fn to_gpu(&self) -> Result<GpuArray, Box<dyn std::error::Error>> {
            GpuArray::from_cpu_array(self)
        }

        pub fn matmul_gpu(
            &self,
            other: &Array<f64>,
        ) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let gpu_a = self.to_gpu()?;
            let gpu_b = other.to_gpu()?;
            let gpu_result = gpu_a.matmul_gpu(&gpu_b)?;
            gpu_result.to_cpu_array()
        }

        pub fn add_gpu(
            &self,
            other: &Array<f64>,
        ) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let gpu_a = self.to_gpu()?;
            let gpu_b = other.to_gpu()?;
            let gpu_result = gpu_a.add_gpu(&gpu_b)?;
            gpu_result.to_cpu_array()
        }

        pub fn sum_gpu(&self) -> Result<f64, Box<dyn std::error::Error>> {
            let gpu_array = self.to_gpu()?;
            let result = gpu_array.sum_gpu()?;
            Ok(result as f64)
        }
    }

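    /// A size-bucketed free list for device buffers: `allocate` reuses a
    /// cached buffer of the exact requested length when one exists, avoiding
    /// repeated device allocations in tight loops.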
    pub struct GpuMemoryPool {
        device: Arc<CudaDevice>,
        free_buffers: HashMap<usize, Vec<CudaSlice<f32>>>,
    }

    impl GpuMemoryPool {
        pub fn new(device: Arc<CudaDevice>) -> Self {
            GpuMemoryPool {
                device,
                free_buffers: HashMap::new(),
            }
        }

        pub fn allocate(
            &mut self,
            size: usize,
        ) -> Result<CudaSlice<f32>, Box<dyn std::error::Error>> {
            // Reuse a cached buffer of this exact size when one is available.
            // Ownership moves to the caller until `deallocate` returns it, so
            // no separate in-use bookkeeping is needed. Note that recycled
            // buffers keep their previous contents.
            if let Some(buffer) = self.free_buffers.get_mut(&size).and_then(|v| v.pop()) {
                return Ok(buffer);
            }

            Ok(self.device.alloc_zeros::<f32>(size)?)
        }

        pub fn deallocate(&mut self, buffer: CudaSlice<f32>) {
            // Taking the buffer by value hands ownership back to the pool.
            self.free_buffers
                .entry(buffer.len())
                .or_insert_with(Vec::new)
                .push(buffer);
        }

        pub fn clear(&mut self) {
            self.free_buffers.clear();
        }
    }
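
    // A minimal smoke test for the CPU<->GPU round trip. It assumes a CUDA
    // device 0 is present and that `Array::from_vec` and the public `data`
    // field behave as they are used throughout this module.
    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn roundtrip_and_add() -> Result<(), Box<dyn std::error::Error>> {
            let a = Array::from_vec(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2]);
            let b = Array::from_vec(vec![5.0, 6.0, 7.0, 8.0], vec![2, 2]);

            // add_gpu uploads both operands, launches add_kernel, and copies back.
            let sum = a.add_gpu(&b)?;
            assert_eq!(sum.data, vec![6.0, 8.0, 10.0, 12.0]);

            // sum_gpu exercises the shared-memory reduction path.
            assert_eq!(a.sum_gpu()?, 10.0);
            Ok(())
        }
    }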
}

#[cfg(feature = "rocm")]
pub mod rocm {
    use super::super::Array;
    use hip_rs::*;

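    /// f32 array on an AMD GPU. Mirrors the CUDA `GpuArray`; the `hip_rs`
    /// and `rocblas_rs` bindings are the ones this module assumes.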
    pub struct HipArray {
        pub data: DeviceBuffer<f32>,
        pub shape: Vec<usize>,
        pub device: HipDevice,
    }

    impl HipArray {
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            let device = HipDevice::new(0)?;
            device.set_current()?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();
            let gpu_data = device.alloc_and_copy(&f32_data)?;

            Ok(HipArray {
                data: gpu_data,
                shape: cpu_array.shape.clone(),
                device,
            })
        }

        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            self.device.set_current()?;
            let cpu_data: Vec<f32> = self.device.copy_to_host(&self.data)?;
            let f64_data: Vec<f64> = cpu_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

        pub fn matmul_rocm(
            &self,
            other: &HipArray,
        ) -> Result<HipArray, Box<dyn std::error::Error>> {
            use rocblas_rs::*;

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            self.device.set_current()?;
            let rocblas_handle = RocblasHandle::new()?;
            let mut result = self.device.alloc_zeros::<f32>(m * n)?;

            // rocBLAS is column-major like cuBLAS, so swap the operands (and
            // m/n) to get a row-major m x n product; see matmul_gpu above.
            rocblas_handle.sgemm(
                RocblasOperation::None,
                RocblasOperation::None,
                n as i32,
                m as i32,
                k as i32,
                1.0,
                &other.data,
                n as i32,
                &self.data,
                k as i32,
                0.0,
                &mut result,
                n as i32,
            )?;

            Ok(HipArray {
                data: result,
                shape: vec![m, n],
                device: self.device.clone(),
            })
        }
    }
}

#[cfg(feature = "metal")]
pub mod metal {
    use super::super::Array;
    use metal::*;
    use objc::rc::autoreleasepool;

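    /// f32 array in a Metal buffer. StorageModeShared keeps the buffer
    /// host-visible, so transfers back to the CPU are plain memory reads.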
    pub struct MetalArray {
        pub buffer: Buffer,
        pub shape: Vec<usize>,
        pub device: Device,
    }

    impl MetalArray {
        pub fn from_cpu_array(cpu_array: &Array<f64>) -> Result<Self, Box<dyn std::error::Error>> {
            let device = Device::system_default().ok_or("No Metal device found")?;

            let f32_data: Vec<f32> = cpu_array.data.iter().map(|&x| x as f32).collect();
            let buffer = device.new_buffer_with_data(
                f32_data.as_ptr() as *const std::ffi::c_void,
                (f32_data.len() * std::mem::size_of::<f32>()) as u64,
                MTLResourceOptions::StorageModeShared,
            );

            Ok(MetalArray {
                buffer,
                shape: cpu_array.shape.clone(),
                device,
            })
        }

        pub fn to_cpu_array(&self) -> Result<Array<f64>, Box<dyn std::error::Error>> {
            let ptr = self.buffer.contents() as *const f32;
            let len = self.buffer.length() as usize / std::mem::size_of::<f32>();

            let f32_data = unsafe { std::slice::from_raw_parts(ptr, len) };
            let f64_data: Vec<f64> = f32_data.iter().map(|&x| x as f64).collect();

            Ok(Array::from_vec(f64_data, self.shape.clone()))
        }

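        /// Matrix multiply via Metal Performance Shaders
        /// (MPSMatrixMultiplication). Note that MPS matrix descriptors take
        /// the row stride in *bytes* (rowBytes), not in elements, hence the
        /// size_of::<f32>() factor below.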
        pub fn matmul_metal(
            &self,
            other: &MetalArray,
        ) -> Result<MetalArray, Box<dyn std::error::Error>> {
            use metal_performance_shaders::*;

            assert_eq!(self.shape.len(), 2, "Left matrix must be 2D");
            assert_eq!(other.shape.len(), 2, "Right matrix must be 2D");
            assert_eq!(
                self.shape[1], other.shape[0],
                "Matrix dimensions incompatible"
            );

            let (m, k) = (self.shape[0], self.shape[1]);
            let n = other.shape[1];

            autoreleasepool(|| {
                let command_queue = self.device.new_command_queue();
                let command_buffer = command_queue.new_command_buffer();

                // Row strides must be given in bytes, not elements.
                let elem = std::mem::size_of::<f32>() as NSUInteger;

                let desc_a = MPSMatrixDescriptor::matrix_descriptor(
                    m as NSUInteger,
                    k as NSUInteger,
                    k as NSUInteger * elem,
                    MPSDataType::Float32,
                );

                let desc_b = MPSMatrixDescriptor::matrix_descriptor(
                    k as NSUInteger,
                    n as NSUInteger,
                    n as NSUInteger * elem,
                    MPSDataType::Float32,
                );

                let desc_c = MPSMatrixDescriptor::matrix_descriptor(
                    m as NSUInteger,
                    n as NSUInteger,
                    n as NSUInteger * elem,
                    MPSDataType::Float32,
                );

                let result_buffer = self.device.new_buffer(
                    (m * n * std::mem::size_of::<f32>()) as u64,
                    MTLResourceOptions::StorageModeShared,
                );

                let matrix_a = MPSMatrix::new(&self.buffer, &desc_a);
                let matrix_b = MPSMatrix::new(&other.buffer, &desc_b);
                let matrix_c = MPSMatrix::new(&result_buffer, &desc_c);

                let matmul =
                    MPSMatrixMultiplication::new(&self.device, false, false, m, n, k, 1.0, 0.0);
                matmul.encode_to_command_buffer(&command_buffer, &matrix_a, &matrix_b, &matrix_c);

                command_buffer.commit();
                command_buffer.wait_until_completed();

                Ok(MetalArray {
                    buffer: result_buffer,
                    shape: vec![m, n],
                    device: self.device.clone(),
                })
            })
        }
    }
}

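/// Runtime-polymorphic handle over whichever GPU array type was compiled in.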
pub enum GpuBackend {
    #[cfg(feature = "cuda")]
    Cuda(cuda::GpuArray),
    #[cfg(feature = "rocm")]
    Rocm(rocm::HipArray),
    #[cfg(feature = "metal")]
    Metal(metal::MetalArray),
    Cpu,
}

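/// Data-free companion to `GpuBackend`: the variants above carry live device
/// buffers, so backend *detection* reports one of these tags instead. (This
/// helper type is an addition made so `detect_best_backend` can return a
/// meaningful answer without constructing an array.)
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackendKind {
    #[cfg(feature = "cuda")]
    Cuda,
    #[cfg(feature = "rocm")]
    Rocm,
    #[cfg(feature = "metal")]
    Metal,
    Cpu,
}
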
impl GpuBackend {
    /// Probes backends in priority order (CUDA, then ROCm, then Metal) and
    /// reports the first whose runtime is usable, falling back to CPU.
    pub fn detect_best_backend() -> BackendKind {
        #[cfg(feature = "cuda")]
        {
            if cudarc::driver::CudaDevice::new(0).is_ok() {
                return BackendKind::Cuda;
            }
        }

        #[cfg(feature = "rocm")]
        {
            if hip_rs::HipDevice::new(0).is_ok() {
                return BackendKind::Rocm;
            }
        }

        #[cfg(feature = "metal")]
        {
            // `::metal` names the crate; a bare `metal` would hit the sibling module.
            if ::metal::Device::system_default().is_some() {
                return BackendKind::Metal;
            }
        }

        BackendKind::Cpu
    }

    pub fn matmul(&self, other: &GpuBackend) -> Result<GpuBackend, Box<dyn std::error::Error>> {
        match (self, other) {
            #[cfg(feature = "cuda")]
            (GpuBackend::Cuda(a), GpuBackend::Cuda(b)) => Ok(GpuBackend::Cuda(a.matmul_gpu(b)?)),
            #[cfg(feature = "rocm")]
            (GpuBackend::Rocm(a), GpuBackend::Rocm(b)) => Ok(GpuBackend::Rocm(a.matmul_rocm(b)?)),
            #[cfg(feature = "metal")]
            (GpuBackend::Metal(a), GpuBackend::Metal(b)) => {
                Ok(GpuBackend::Metal(a.matmul_metal(b)?))
            }
            _ => Err("Incompatible GPU backends or CPU fallback needed".into()),
        }
    }
}

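/// Picks launch parameters per operation and memoizes them by problem size.
/// The heuristics below are static; `benchmark_configuration` is the hook
/// where real on-device measurements would go.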
pub struct GpuTuner {
    device_info: DeviceInfo,
    optimal_configs: std::collections::HashMap<String, TuningConfig>,
}

#[derive(Debug, Clone)]
pub struct DeviceInfo {
    pub name: String,
    pub memory_gb: f32,
    pub compute_capability: String,
    pub max_threads_per_block: u32,
    pub max_shared_memory: u32,
}

#[derive(Debug, Clone)]
pub struct TuningConfig {
    pub block_size: u32,
    pub grid_size_multiplier: f32,
    pub shared_memory_usage: f32,
    pub register_usage: f32,
}

impl Default for GpuTuner {
    fn default() -> Self {
        Self::new()
    }
}

impl GpuTuner {
    pub fn new() -> Self {
        GpuTuner {
            device_info: Self::query_device_info(),
            optimal_configs: std::collections::HashMap::new(),
        }
    }

    fn query_device_info() -> DeviceInfo {
        #[cfg(feature = "cuda")]
        {
            if let Ok(device) = cudarc::driver::CudaDevice::new(0) {
                // These accessors assume the cudarc release this crate
                // targets; names vary somewhat between versions.
                return DeviceInfo {
                    name: device.name().unwrap_or_default(),
                    memory_gb: device.total_memory().unwrap_or(0) as f32 / 1e9,
                    compute_capability: format!(
                        "{}.{}",
                        device.compute_capability().0,
                        device.compute_capability().1
                    ),
                    // Conservative constants rather than queried limits.
                    max_threads_per_block: 1024,
                    max_shared_memory: 48 * 1024,
                };
            }
        }

        // Fallback when no GPU is visible.
        DeviceInfo {
            name: "Unknown".to_string(),
            memory_gb: 0.0,
            compute_capability: "0.0".to_string(),
            max_threads_per_block: 256,
            max_shared_memory: 16 * 1024,
        }
    }

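    /// Chooses a block size from the output-element count: larger problems
    /// get wider blocks, capped by the device limit. Results are cached per
    /// (m, n, k) key so repeated shapes skip the heuristic.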
    pub fn tune_matmul(&mut self, m: usize, n: usize, k: usize) -> TuningConfig {
        let key = format!("matmul_{}x{}x{}", m, n, k);

        if let Some(config) = self.optimal_configs.get(&key) {
            return config.clone();
        }

        let block_size = if m * n > 1_000_000 {
            self.device_info.max_threads_per_block
        } else if m * n > 100_000 {
            512
        } else {
            256
        };

        let config = TuningConfig {
            block_size,
            grid_size_multiplier: 1.0,
            shared_memory_usage: 0.8,
            register_usage: 0.7,
        };

        self.optimal_configs.insert(key, config.clone());
        config
    }

    pub fn benchmark_configuration(
        &self,
        _config: &TuningConfig,
        _operation: &str,
    ) -> Result<f64, Box<dyn std::error::Error>> {
        // Placeholder: a real implementation would time the operation on the
        // device under the given configuration and return the elapsed time.
        Ok(1.0)
    }
}