ghostflow_ml/
gpu.rs

//! GPU Acceleration - CUDA/OpenCL Support
//!
//! This module provides GPU acceleration utilities for tensor operations.
//! Note: This is a foundational implementation. Full GPU support requires
//! external libraries like wgpu, vulkan, or CUDA bindings.

7use ghostflow_core::Tensor;
8use std::sync::{Arc, Mutex};
9
/// GPU device type (compute backend) a context or tensor is bound to.
//
// Derives `Eq` and `Hash` in addition to `PartialEq`: a fieldless Copy enum
// supports both for free, and it allows `DeviceType` to key maps/sets.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DeviceType {
    CPU,
    CUDA,
    OpenCL,
    Metal,
    Vulkan,
}

/// Static description of a compute device.
///
/// For non-CPU devices the capability and memory figures are placeholder
/// values until real runtime queries (CUDA driver API, OpenCL, ...) are
/// wired in.
#[derive(Clone, Debug, PartialEq)]
pub struct DeviceInfo {
    pub device_type: DeviceType,
    pub device_id: usize,
    pub name: String,
    /// (major, minor) compute capability; (0, 0) means "not applicable".
    pub compute_capability: (u32, u32),
    /// Total device memory in bytes (0 = unknown / host-managed).
    pub total_memory: usize,
    /// Currently free device memory in bytes (0 = unknown / host-managed).
    pub available_memory: usize,
}

impl DeviceInfo {
    /// Host CPU device. Memory figures are reported as 0 because host
    /// memory is managed by the OS, not by this crate.
    pub fn cpu() -> Self {
        DeviceInfo {
            device_type: DeviceType::CPU,
            device_id: 0,
            name: "CPU".to_string(),
            compute_capability: (0, 0),
            total_memory: 0,
            available_memory: 0,
        }
    }

    /// CUDA device stub for `device_id`.
    ///
    /// NOTE(review): capability and memory sizes are hard-coded example
    /// values (RTX-2080-class, 8 GiB); replace with a real
    /// `cudaGetDeviceProperties` query once CUDA bindings exist.
    pub fn cuda(device_id: usize) -> Self {
        const GIB: usize = 1024 * 1024 * 1024;
        DeviceInfo {
            device_type: DeviceType::CUDA,
            device_id,
            name: format!("CUDA Device {}", device_id),
            compute_capability: (7, 5),
            total_memory: 8 * GIB,
            available_memory: 7 * GIB,
        }
    }
}
54
/// GPU context manager: owns the device description plus (future) stream
/// and memory-pool state for one logical device.
pub struct GPUContext {
    // Device this context is bound to; exposed via `device_info()`.
    device: DeviceInfo,
    // Compute-stream handle; unused until a real GPU backend exists.
    #[allow(dead_code)]
    stream: Option<usize>,
    // Host-side stand-in for a device memory pool of f32 buffers; wrapped
    // in Arc<Mutex<..>> so the pool can be shared across threads later.
    #[allow(dead_code)]
    memory_pool: Arc<Mutex<Vec<Vec<f32>>>>,
}
63
64impl GPUContext {
65    pub fn new(device: DeviceInfo) -> Self {
66        GPUContext {
67            device,
68            stream: None,
69            memory_pool: Arc::new(Mutex::new(Vec::new())),
70        }
71    }
72
73    pub fn cpu() -> Self {
74        Self::new(DeviceInfo::cpu())
75    }
76
77    pub fn cuda(device_id: usize) -> Result<Self, String> {
78        // In a real implementation, this would initialize CUDA
79        if Self::is_cuda_available() {
80            Ok(Self::new(DeviceInfo::cuda(device_id)))
81        } else {
82            Err("CUDA not available".to_string())
83        }
84    }
85
86    pub fn is_cuda_available() -> bool {
87        // Placeholder - would check for CUDA runtime
88        false
89    }
90
91    pub fn is_opencl_available() -> bool {
92        // Placeholder - would check for OpenCL runtime
93        false
94    }
95
96    pub fn device_count() -> usize {
97        // Placeholder - would query available devices
98        if Self::is_cuda_available() { 1 } else { 0 }
99    }
100
101    pub fn device_info(&self) -> &DeviceInfo {
102        &self.device
103    }
104
105    pub fn synchronize(&self) {
106        // Placeholder - would synchronize GPU operations
107    }
108
109    pub fn allocate(&self, size: usize) -> Vec<f32> {
110        // Simplified memory allocation
111        vec![0.0f32; size]
112    }
113
114    pub fn deallocate(&self, _buffer: Vec<f32>) {
115        // Placeholder - would free GPU memory
116    }
117}
118
/// GPU tensor wrapper: dense f32 storage plus shape, tagged with the
/// device it logically lives on and the context that owns its allocations.
pub struct GPUTensor {
    // Row-major element storage (currently always host memory).
    data: Vec<f32>,
    // Dimension sizes, outermost first.
    dims: Vec<usize>,
    // Device the data is considered resident on.
    device: DeviceType,
    // Owning context, shared so derived tensors inherit it cheaply.
    context: Arc<GPUContext>,
}
126
127impl GPUTensor {
128    pub fn new(data: Vec<f32>, dims: Vec<usize>, context: Arc<GPUContext>) -> Self {
129        GPUTensor {
130            data,
131            dims,
132            device: context.device.device_type,
133            context,
134        }
135    }
136
137    pub fn from_tensor(tensor: &Tensor, context: Arc<GPUContext>) -> Self {
138        GPUTensor::new(
139            tensor.data_f32().to_vec(),
140            tensor.dims().to_vec(),
141            context,
142        )
143    }
144
145    pub fn to_tensor(&self) -> Tensor {
146        Tensor::from_slice(&self.data, &self.dims).unwrap()
147    }
148
149    pub fn to_device(&mut self, device: DeviceType) {
150        if self.device == device {
151            return;
152        }
153
154        // Placeholder - would transfer data between devices
155        self.device = device;
156    }
157
158    pub fn dims(&self) -> &[usize] {
159        &self.dims
160    }
161
162    pub fn device(&self) -> DeviceType {
163        self.device
164    }
165
166    /// Matrix multiplication on GPU
167    pub fn matmul(&self, other: &GPUTensor) -> GPUTensor {
168        assert_eq!(self.dims.len(), 2);
169        assert_eq!(other.dims.len(), 2);
170        assert_eq!(self.dims[1], other.dims[0]);
171
172        let m = self.dims[0];
173        let k = self.dims[1];
174        let n = other.dims[1];
175
176        let mut result = vec![0.0f32; m * n];
177
178        // CPU fallback (GPU implementation would use cuBLAS or similar)
179        for i in 0..m {
180            for j in 0..n {
181                let mut sum = 0.0f32;
182                for p in 0..k {
183                    sum += self.data[i * k + p] * other.data[p * n + j];
184                }
185                result[i * n + j] = sum;
186            }
187        }
188
189        GPUTensor::new(result, vec![m, n], self.context.clone())
190    }
191
192    /// Element-wise addition
193    pub fn add(&self, other: &GPUTensor) -> GPUTensor {
194        assert_eq!(self.dims, other.dims);
195
196        let result: Vec<f32> = self.data.iter()
197            .zip(other.data.iter())
198            .map(|(&a, &b)| a + b)
199            .collect();
200
201        GPUTensor::new(result, self.dims.clone(), self.context.clone())
202    }
203
204    /// Element-wise multiplication
205    pub fn mul(&self, other: &GPUTensor) -> GPUTensor {
206        assert_eq!(self.dims, other.dims);
207
208        let result: Vec<f32> = self.data.iter()
209            .zip(other.data.iter())
210            .map(|(&a, &b)| a * b)
211            .collect();
212
213        GPUTensor::new(result, self.dims.clone(), self.context.clone())
214    }
215
216    /// Scalar multiplication
217    pub fn scale(&self, scalar: f32) -> GPUTensor {
218        let result: Vec<f32> = self.data.iter()
219            .map(|&x| x * scalar)
220            .collect();
221
222        GPUTensor::new(result, self.dims.clone(), self.context.clone())
223    }
224
225    /// ReLU activation
226    pub fn relu(&self) -> GPUTensor {
227        let result: Vec<f32> = self.data.iter()
228            .map(|&x| x.max(0.0))
229            .collect();
230
231        GPUTensor::new(result, self.dims.clone(), self.context.clone())
232    }
233
234    /// Softmax activation
235    pub fn softmax(&self) -> GPUTensor {
236        assert_eq!(self.dims.len(), 2);
237        let batch_size = self.dims[0];
238        let features = self.dims[1];
239
240        let mut result = vec![0.0f32; self.data.len()];
241
242        for b in 0..batch_size {
243            let start = b * features;
244            let end = start + features;
245            let batch_data = &self.data[start..end];
246
247            // Find max for numerical stability
248            let max_val = batch_data.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
249
250            // Compute exp and sum
251            let mut sum = 0.0f32;
252            for i in 0..features {
253                let exp_val = (batch_data[i] - max_val).exp();
254                result[start + i] = exp_val;
255                sum += exp_val;
256            }
257
258            // Normalize
259            for i in 0..features {
260                result[start + i] /= sum;
261            }
262        }
263
264        GPUTensor::new(result, self.dims.clone(), self.context.clone())
265    }
266
267    /// Sum reduction
268    pub fn sum(&self) -> f32 {
269        self.data.iter().sum()
270    }
271
272    /// Mean reduction
273    pub fn mean(&self) -> f32 {
274        self.sum() / self.data.len() as f32
275    }
276}
277
/// GPU-accelerated operations: bundles neural-network primitives
/// (conv2d, batch norm, max pooling) that run against a shared context.
pub struct GPUOps {
    // Context whose device and allocator produced tensors inherit.
    context: Arc<GPUContext>,
}
282
283impl GPUOps {
284    pub fn new(context: Arc<GPUContext>) -> Self {
285        GPUOps { context }
286    }
287
288    /// Convolution 2D on GPU
289    pub fn conv2d(
290        &self,
291        input: &GPUTensor,
292        kernel: &GPUTensor,
293        stride: (usize, usize),
294        padding: (usize, usize),
295    ) -> GPUTensor {
296        // Simplified 2D convolution
297        // Real implementation would use cuDNN or similar
298
299        let input_dims = input.dims();
300        let kernel_dims = kernel.dims();
301
302        assert_eq!(input_dims.len(), 4); // [batch, channels, height, width]
303        assert_eq!(kernel_dims.len(), 4); // [out_ch, in_ch, kh, kw]
304
305        let batch = input_dims[0];
306        let _in_channels = input_dims[1];
307        let in_h = input_dims[2];
308        let in_w = input_dims[3];
309
310        let out_channels = kernel_dims[0];
311        let kh = kernel_dims[2];
312        let kw = kernel_dims[3];
313
314        let out_h = (in_h + 2 * padding.0 - kh) / stride.0 + 1;
315        let out_w = (in_w + 2 * padding.1 - kw) / stride.1 + 1;
316
317        let output_size = batch * out_channels * out_h * out_w;
318        let output = vec![0.0f32; output_size];
319
320        // CPU fallback implementation
321        // GPU version would launch CUDA kernels
322
323        GPUTensor::new(
324            output,
325            vec![batch, out_channels, out_h, out_w],
326            self.context.clone(),
327        )
328    }
329
330    /// Batch normalization on GPU
331    pub fn batch_norm(
332        &self,
333        input: &GPUTensor,
334        gamma: &GPUTensor,
335        beta: &GPUTensor,
336        running_mean: &GPUTensor,
337        running_var: &GPUTensor,
338        eps: f32,
339    ) -> GPUTensor {
340        let dims = input.dims();
341        let channels = dims[1];
342        let spatial_size: usize = dims[2..].iter().product();
343
344        let mut output = input.data.clone();
345
346        // Normalize
347        for c in 0..channels {
348            let mean = running_mean.data[c];
349            let var = running_var.data[c];
350            let std = (var + eps).sqrt();
351
352            for b in 0..dims[0] {
353                for s in 0..spatial_size {
354                    let idx = (b * channels + c) * spatial_size + s;
355                    output[idx] = (output[idx] - mean) / std;
356                    output[idx] = gamma.data[c] * output[idx] + beta.data[c];
357                }
358            }
359        }
360
361        GPUTensor::new(output, dims.to_vec(), self.context.clone())
362    }
363
364    /// Max pooling 2D on GPU
365    pub fn max_pool2d(
366        &self,
367        input: &GPUTensor,
368        kernel_size: (usize, usize),
369        stride: (usize, usize),
370    ) -> GPUTensor {
371        let dims = input.dims();
372        assert_eq!(dims.len(), 4);
373
374        let batch = dims[0];
375        let channels = dims[1];
376        let in_h = dims[2];
377        let in_w = dims[3];
378
379        let out_h = (in_h - kernel_size.0) / stride.0 + 1;
380        let out_w = (in_w - kernel_size.1) / stride.1 + 1;
381
382        let mut output = vec![f32::NEG_INFINITY; batch * channels * out_h * out_w];
383
384        for b in 0..batch {
385            for c in 0..channels {
386                for oh in 0..out_h {
387                    for ow in 0..out_w {
388                        let mut max_val = f32::NEG_INFINITY;
389
390                        for kh in 0..kernel_size.0 {
391                            for kw in 0..kernel_size.1 {
392                                let ih = oh * stride.0 + kh;
393                                let iw = ow * stride.1 + kw;
394
395                                if ih < in_h && iw < in_w {
396                                    let in_idx = ((b * channels + c) * in_h + ih) * in_w + iw;
397                                    max_val = max_val.max(input.data[in_idx]);
398                                }
399                            }
400                        }
401
402                        let out_idx = ((b * channels + c) * out_h + oh) * out_w + ow;
403                        output[out_idx] = max_val;
404                    }
405                }
406            }
407        }
408
409        GPUTensor::new(
410            output,
411            vec![batch, channels, out_h, out_w],
412            self.context.clone(),
413        )
414    }
415}
416
/// Memory management for GPU: byte-level accounting of allocations made
/// through a context, tracking the current total and a high-water mark.
pub struct GPUMemoryManager {
    // Context that performs the underlying (de)allocations.
    context: Arc<GPUContext>,
    // Currently allocated bytes.
    allocated: Arc<Mutex<usize>>,
    // Peak allocated bytes since creation (or the last reset_peak()).
    peak: Arc<Mutex<usize>>,
}
423
424impl GPUMemoryManager {
425    pub fn new(context: Arc<GPUContext>) -> Self {
426        GPUMemoryManager {
427            context,
428            allocated: Arc::new(Mutex::new(0)),
429            peak: Arc::new(Mutex::new(0)),
430        }
431    }
432
433    pub fn allocate(&self, size: usize) -> Vec<f32> {
434        let mut allocated = self.allocated.lock().unwrap();
435        *allocated += size * std::mem::size_of::<f32>();
436
437        let mut peak = self.peak.lock().unwrap();
438        *peak = (*peak).max(*allocated);
439
440        self.context.allocate(size)
441    }
442
443    pub fn deallocate(&self, buffer: Vec<f32>) {
444        let size = buffer.len() * std::mem::size_of::<f32>();
445        let mut allocated = self.allocated.lock().unwrap();
446        *allocated = allocated.saturating_sub(size);
447
448        self.context.deallocate(buffer);
449    }
450
451    pub fn allocated_memory(&self) -> usize {
452        *self.allocated.lock().unwrap()
453    }
454
455    pub fn peak_memory(&self) -> usize {
456        *self.peak.lock().unwrap()
457    }
458
459    pub fn reset_peak(&self) {
460        let mut peak = self.peak.lock().unwrap();
461        *peak = *self.allocated.lock().unwrap();
462    }
463}
464
/// Automatic Mixed Precision (AMP) dynamic loss scaling.
///
/// Half-precision gradients can underflow to zero; scaling the loss (and
/// unscaling the gradients after backprop) keeps them representable. The
/// scale grows after `growth_interval` overflow-free steps and backs off
/// whenever an inf/NaN gradient is observed.
pub struct AutoMixedPrecision {
    // Whether scaling is active; when false all operations are identity.
    enabled: bool,
    // Current loss-scale factor.
    loss_scale: f32,
    // Multiplier applied after `growth_interval` clean steps.
    growth_factor: f32,
    // Multiplier applied when an inf/NaN gradient is found.
    backoff_factor: f32,
    // Clean steps required between growth attempts.
    growth_interval: usize,
    // Clean steps since the last scale change.
    iterations: usize,
}

impl AutoMixedPrecision {
    /// Floor for the loss scale. Fix: repeated overflows previously could
    /// drive the scale to 0, silently zeroing every gradient thereafter.
    const MIN_LOSS_SCALE: f32 = 1.0;

    /// Disabled scaler with PyTorch-like defaults (initial scale 2^16,
    /// grow x2 every 2000 clean steps, halve on overflow).
    pub fn new() -> Self {
        AutoMixedPrecision {
            enabled: false,
            loss_scale: 65536.0,
            growth_factor: 2.0,
            backoff_factor: 0.5,
            growth_interval: 2000,
            iterations: 0,
        }
    }

    /// Builder-style switch-on.
    pub fn enable(mut self) -> Self {
        self.enabled = true;
        self
    }

    /// Current loss-scale factor.
    pub fn loss_scale(&self) -> f32 {
        self.loss_scale
    }

    /// Scale a loss value before backprop (identity when disabled).
    /// Takes `&self`: no state is mutated (was needlessly `&mut self`).
    pub fn scale_loss(&self, loss: f32) -> f32 {
        if self.enabled {
            loss * self.loss_scale
        } else {
            loss
        }
    }

    /// Divide gradients by the loss scale in place (no-op when disabled).
    /// Takes `&self`: only the slice is mutated, not the scaler.
    pub fn unscale_gradients(&self, gradients: &mut [f32]) {
        if self.enabled {
            for grad in gradients {
                *grad /= self.loss_scale;
            }
        }
    }

    /// Adjust the scale after an optimizer step. `found_inf` reports
    /// whether any unscaled gradient was inf/NaN this step.
    pub fn update_scale(&mut self, found_inf: bool) {
        if !self.enabled {
            return;
        }

        self.iterations += 1;

        if found_inf {
            // Back off, clamped so the scale can never decay to zero.
            self.loss_scale =
                (self.loss_scale * self.backoff_factor).max(Self::MIN_LOSS_SCALE);
            self.iterations = 0;
        } else if self.iterations >= self.growth_interval {
            self.loss_scale *= self.growth_factor;
            self.iterations = 0;
        }
    }
}

impl Default for AutoMixedPrecision {
    fn default() -> Self {
        Self::new()
    }
}
528
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_context() {
        // A CPU context must report the CPU device type.
        let context = GPUContext::cpu();
        assert_eq!(context.device_info().device_type, DeviceType::CPU);
    }

    #[test]
    fn test_gpu_tensor() {
        // Basic construction plus the two scalar reductions.
        let context = Arc::new(GPUContext::cpu());
        let tensor = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context);

        assert_eq!(tensor.dims(), &[2, 2]);
        assert_eq!(tensor.sum(), 10.0);
        assert_eq!(tensor.mean(), 2.5);
    }

    #[test]
    fn test_gpu_tensor_ops() {
        // Element-wise add and scalar scale.
        let context = Arc::new(GPUContext::cpu());

        let lhs = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context.clone());
        let rhs = GPUTensor::new(vec![2.0, 2.0, 2.0, 2.0], vec![2, 2], context.clone());

        let summed = lhs.add(&rhs);
        assert_eq!(summed.data, vec![3.0, 4.0, 5.0, 6.0]);

        let doubled = lhs.scale(2.0);
        assert_eq!(doubled.data, vec![2.0, 4.0, 6.0, 8.0]);
    }

    #[test]
    fn test_gpu_matmul() {
        // Multiplying by the identity must keep the shape.
        let context = Arc::new(GPUContext::cpu());

        let matrix = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context.clone());
        let identity = GPUTensor::new(vec![1.0, 0.0, 0.0, 1.0], vec![2, 2], context.clone());

        let product = matrix.matmul(&identity);
        assert_eq!(product.dims(), &[2, 2]);
    }

    #[test]
    fn test_gpu_relu() {
        // Negatives clamp to zero, positives pass through.
        let context = Arc::new(GPUContext::cpu());
        let tensor = GPUTensor::new(vec![-1.0, 2.0, -3.0, 4.0], vec![2, 2], context);

        let activated = tensor.relu();
        assert_eq!(activated.data, vec![0.0, 2.0, 0.0, 4.0]);
    }

    #[test]
    fn test_memory_manager() {
        // Allocation is accounted for, and deallocation accepts the buffer.
        let context = Arc::new(GPUContext::cpu());
        let manager = GPUMemoryManager::new(context);

        let buffer = manager.allocate(100);
        assert!(manager.allocated_memory() > 0);

        manager.deallocate(buffer);
    }

    #[test]
    fn test_amp() {
        // Enabled scaler magnifies the loss and shrinks gradients.
        let mut amp = AutoMixedPrecision::new().enable();

        let loss = 1.0;
        let scaled = amp.scale_loss(loss);
        assert!(scaled > loss);

        let mut grads = vec![1.0, 2.0, 3.0];
        amp.unscale_gradients(&mut grads);
        assert!(grads[0] < 1.0);
    }
}
608
609