1use ghostflow_core::Tensor;
8use std::sync::{Arc, Mutex};
9
/// The hardware backend a device or tensor is associated with.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the type
/// can be used as a `HashMap`/`HashSet` key (a fieldless enum always
/// has total equality; deriving only `PartialEq` trips clippy's
/// `derive_partial_eq_without_eq`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DeviceType {
    CPU,
    CUDA,
    OpenCL,
    Metal,
    Vulkan,
}
19
/// Static description of a single compute device.
#[derive(Clone, Debug)]
pub struct DeviceInfo {
    /// Backend this device belongs to.
    pub device_type: DeviceType,
    /// Ordinal of the device within its backend.
    pub device_id: usize,
    /// Human-readable device name.
    pub name: String,
    /// Capability version as (major, minor); (0, 0) when the concept
    /// does not apply (e.g. CPU).
    pub compute_capability: (u32, u32),
    /// Total device memory in bytes; 0 when unknown.
    pub total_memory: usize,
    /// Free device memory in bytes; 0 when unknown.
    pub available_memory: usize,
}
30
31impl DeviceInfo {
32 pub fn cpu() -> Self {
33 DeviceInfo {
34 device_type: DeviceType::CPU,
35 device_id: 0,
36 name: "CPU".to_string(),
37 compute_capability: (0, 0),
38 total_memory: 0,
39 available_memory: 0,
40 }
41 }
42
43 pub fn cuda(device_id: usize) -> Self {
44 DeviceInfo {
45 device_type: DeviceType::CUDA,
46 device_id,
47 name: format!("CUDA Device {}", device_id),
48 compute_capability: (7, 5), total_memory: 8 * 1024 * 1024 * 1024, available_memory: 7 * 1024 * 1024 * 1024,
51 }
52 }
53}
54
/// Execution context bound to one device; holds a (currently unused)
/// stream slot and a pooled-buffer free list.
pub struct GPUContext {
    device: DeviceInfo,
    #[allow(dead_code)] // reserved for future async-stream support
    stream: Option<usize>,
    #[allow(dead_code)] // reserved for future buffer reuse
    memory_pool: Arc<Mutex<Vec<Vec<f32>>>>,
}
63
64impl GPUContext {
65 pub fn new(device: DeviceInfo) -> Self {
66 GPUContext {
67 device,
68 stream: None,
69 memory_pool: Arc::new(Mutex::new(Vec::new())),
70 }
71 }
72
73 pub fn cpu() -> Self {
74 Self::new(DeviceInfo::cpu())
75 }
76
77 pub fn cuda(device_id: usize) -> Result<Self, String> {
78 if Self::is_cuda_available() {
80 Ok(Self::new(DeviceInfo::cuda(device_id)))
81 } else {
82 Err("CUDA not available".to_string())
83 }
84 }
85
86 pub fn is_cuda_available() -> bool {
87 false
89 }
90
91 pub fn is_opencl_available() -> bool {
92 false
94 }
95
96 pub fn device_count() -> usize {
97 if Self::is_cuda_available() { 1 } else { 0 }
99 }
100
101 pub fn device_info(&self) -> &DeviceInfo {
102 &self.device
103 }
104
105 pub fn synchronize(&self) {
106 }
108
109 pub fn allocate(&self, size: usize) -> Vec<f32> {
110 vec![0.0f32; size]
112 }
113
114 pub fn deallocate(&self, _buffer: Vec<f32>) {
115 }
117}
118
/// A tensor tagged with the device it nominally resides on. Storage is
/// always host memory regardless of the tag — no real transfer layer
/// exists yet.
pub struct GPUTensor {
    data: Vec<f32>,           // row-major element storage
    dims: Vec<usize>,         // shape; product should equal data.len() (not validated in `new`)
    device: DeviceType,       // device tag only; see `to_device`
    context: Arc<GPUContext>, // context the tensor was created on
}
126
127impl GPUTensor {
128 pub fn new(data: Vec<f32>, dims: Vec<usize>, context: Arc<GPUContext>) -> Self {
129 GPUTensor {
130 data,
131 dims,
132 device: context.device.device_type,
133 context,
134 }
135 }
136
137 pub fn from_tensor(tensor: &Tensor, context: Arc<GPUContext>) -> Self {
138 GPUTensor::new(
139 tensor.data_f32().to_vec(),
140 tensor.dims().to_vec(),
141 context,
142 )
143 }
144
145 pub fn to_tensor(&self) -> Tensor {
146 Tensor::from_slice(&self.data, &self.dims).unwrap()
147 }
148
149 pub fn to_device(&mut self, device: DeviceType) {
150 if self.device == device {
151 return;
152 }
153
154 self.device = device;
156 }
157
158 pub fn dims(&self) -> &[usize] {
159 &self.dims
160 }
161
162 pub fn device(&self) -> DeviceType {
163 self.device
164 }
165
166 pub fn matmul(&self, other: &GPUTensor) -> GPUTensor {
168 assert_eq!(self.dims.len(), 2);
169 assert_eq!(other.dims.len(), 2);
170 assert_eq!(self.dims[1], other.dims[0]);
171
172 let m = self.dims[0];
173 let k = self.dims[1];
174 let n = other.dims[1];
175
176 let mut result = vec![0.0f32; m * n];
177
178 for i in 0..m {
180 for j in 0..n {
181 let mut sum = 0.0f32;
182 for p in 0..k {
183 sum += self.data[i * k + p] * other.data[p * n + j];
184 }
185 result[i * n + j] = sum;
186 }
187 }
188
189 GPUTensor::new(result, vec![m, n], self.context.clone())
190 }
191
192 pub fn add(&self, other: &GPUTensor) -> GPUTensor {
194 assert_eq!(self.dims, other.dims);
195
196 let result: Vec<f32> = self.data.iter()
197 .zip(other.data.iter())
198 .map(|(&a, &b)| a + b)
199 .collect();
200
201 GPUTensor::new(result, self.dims.clone(), self.context.clone())
202 }
203
204 pub fn mul(&self, other: &GPUTensor) -> GPUTensor {
206 assert_eq!(self.dims, other.dims);
207
208 let result: Vec<f32> = self.data.iter()
209 .zip(other.data.iter())
210 .map(|(&a, &b)| a * b)
211 .collect();
212
213 GPUTensor::new(result, self.dims.clone(), self.context.clone())
214 }
215
216 pub fn scale(&self, scalar: f32) -> GPUTensor {
218 let result: Vec<f32> = self.data.iter()
219 .map(|&x| x * scalar)
220 .collect();
221
222 GPUTensor::new(result, self.dims.clone(), self.context.clone())
223 }
224
225 pub fn relu(&self) -> GPUTensor {
227 let result: Vec<f32> = self.data.iter()
228 .map(|&x| x.max(0.0))
229 .collect();
230
231 GPUTensor::new(result, self.dims.clone(), self.context.clone())
232 }
233
234 pub fn softmax(&self) -> GPUTensor {
236 assert_eq!(self.dims.len(), 2);
237 let batch_size = self.dims[0];
238 let features = self.dims[1];
239
240 let mut result = vec![0.0f32; self.data.len()];
241
242 for b in 0..batch_size {
243 let start = b * features;
244 let end = start + features;
245 let batch_data = &self.data[start..end];
246
247 let max_val = batch_data.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
249
250 let mut sum = 0.0f32;
252 for i in 0..features {
253 let exp_val = (batch_data[i] - max_val).exp();
254 result[start + i] = exp_val;
255 sum += exp_val;
256 }
257
258 for i in 0..features {
260 result[start + i] /= sum;
261 }
262 }
263
264 GPUTensor::new(result, self.dims.clone(), self.context.clone())
265 }
266
267 pub fn sum(&self) -> f32 {
269 self.data.iter().sum()
270 }
271
272 pub fn mean(&self) -> f32 {
274 self.sum() / self.data.len() as f32
275 }
276}
277
/// Neural-network primitives (convolution, batch norm, pooling) bound
/// to a device context.
pub struct GPUOps {
    context: Arc<GPUContext>,
}
282
283impl GPUOps {
284 pub fn new(context: Arc<GPUContext>) -> Self {
285 GPUOps { context }
286 }
287
288 pub fn conv2d(
290 &self,
291 input: &GPUTensor,
292 kernel: &GPUTensor,
293 stride: (usize, usize),
294 padding: (usize, usize),
295 ) -> GPUTensor {
296 let input_dims = input.dims();
300 let kernel_dims = kernel.dims();
301
302 assert_eq!(input_dims.len(), 4); assert_eq!(kernel_dims.len(), 4); let batch = input_dims[0];
306 let _in_channels = input_dims[1];
307 let in_h = input_dims[2];
308 let in_w = input_dims[3];
309
310 let out_channels = kernel_dims[0];
311 let kh = kernel_dims[2];
312 let kw = kernel_dims[3];
313
314 let out_h = (in_h + 2 * padding.0 - kh) / stride.0 + 1;
315 let out_w = (in_w + 2 * padding.1 - kw) / stride.1 + 1;
316
317 let output_size = batch * out_channels * out_h * out_w;
318 let output = vec![0.0f32; output_size];
319
320 GPUTensor::new(
324 output,
325 vec![batch, out_channels, out_h, out_w],
326 self.context.clone(),
327 )
328 }
329
330 pub fn batch_norm(
332 &self,
333 input: &GPUTensor,
334 gamma: &GPUTensor,
335 beta: &GPUTensor,
336 running_mean: &GPUTensor,
337 running_var: &GPUTensor,
338 eps: f32,
339 ) -> GPUTensor {
340 let dims = input.dims();
341 let channels = dims[1];
342 let spatial_size: usize = dims[2..].iter().product();
343
344 let mut output = input.data.clone();
345
346 for c in 0..channels {
348 let mean = running_mean.data[c];
349 let var = running_var.data[c];
350 let std = (var + eps).sqrt();
351
352 for b in 0..dims[0] {
353 for s in 0..spatial_size {
354 let idx = (b * channels + c) * spatial_size + s;
355 output[idx] = (output[idx] - mean) / std;
356 output[idx] = gamma.data[c] * output[idx] + beta.data[c];
357 }
358 }
359 }
360
361 GPUTensor::new(output, dims.to_vec(), self.context.clone())
362 }
363
364 pub fn max_pool2d(
366 &self,
367 input: &GPUTensor,
368 kernel_size: (usize, usize),
369 stride: (usize, usize),
370 ) -> GPUTensor {
371 let dims = input.dims();
372 assert_eq!(dims.len(), 4);
373
374 let batch = dims[0];
375 let channels = dims[1];
376 let in_h = dims[2];
377 let in_w = dims[3];
378
379 let out_h = (in_h - kernel_size.0) / stride.0 + 1;
380 let out_w = (in_w - kernel_size.1) / stride.1 + 1;
381
382 let mut output = vec![f32::NEG_INFINITY; batch * channels * out_h * out_w];
383
384 for b in 0..batch {
385 for c in 0..channels {
386 for oh in 0..out_h {
387 for ow in 0..out_w {
388 let mut max_val = f32::NEG_INFINITY;
389
390 for kh in 0..kernel_size.0 {
391 for kw in 0..kernel_size.1 {
392 let ih = oh * stride.0 + kh;
393 let iw = ow * stride.1 + kw;
394
395 if ih < in_h && iw < in_w {
396 let in_idx = ((b * channels + c) * in_h + ih) * in_w + iw;
397 max_val = max_val.max(input.data[in_idx]);
398 }
399 }
400 }
401
402 let out_idx = ((b * channels + c) * out_h + oh) * out_w + ow;
403 output[out_idx] = max_val;
404 }
405 }
406 }
407 }
408
409 GPUTensor::new(
410 output,
411 vec![batch, channels, out_h, out_w],
412 self.context.clone(),
413 )
414 }
415}
416
/// Tracks bytes allocated through a context plus the high-water mark.
pub struct GPUMemoryManager {
    context: Arc<GPUContext>,
    allocated: Arc<Mutex<usize>>, // bytes currently outstanding
    peak: Arc<Mutex<usize>>,      // high-water mark of `allocated`
}
423
424impl GPUMemoryManager {
425 pub fn new(context: Arc<GPUContext>) -> Self {
426 GPUMemoryManager {
427 context,
428 allocated: Arc::new(Mutex::new(0)),
429 peak: Arc::new(Mutex::new(0)),
430 }
431 }
432
433 pub fn allocate(&self, size: usize) -> Vec<f32> {
434 let mut allocated = self.allocated.lock().unwrap();
435 *allocated += size * std::mem::size_of::<f32>();
436
437 let mut peak = self.peak.lock().unwrap();
438 *peak = (*peak).max(*allocated);
439
440 self.context.allocate(size)
441 }
442
443 pub fn deallocate(&self, buffer: Vec<f32>) {
444 let size = buffer.len() * std::mem::size_of::<f32>();
445 let mut allocated = self.allocated.lock().unwrap();
446 *allocated = allocated.saturating_sub(size);
447
448 self.context.deallocate(buffer);
449 }
450
451 pub fn allocated_memory(&self) -> usize {
452 *self.allocated.lock().unwrap()
453 }
454
455 pub fn peak_memory(&self) -> usize {
456 *self.peak.lock().unwrap()
457 }
458
459 pub fn reset_peak(&self) {
460 let mut peak = self.peak.lock().unwrap();
461 *peak = *self.allocated.lock().unwrap();
462 }
463}
464
/// Dynamic loss-scaling state for mixed-precision training.
pub struct AutoMixedPrecision {
    enabled: bool,          // scaling is a no-op until enabled
    loss_scale: f32,        // current multiplier applied to the loss
    growth_factor: f32,     // multiplier applied after a clean interval
    backoff_factor: f32,    // multiplier applied when inf/NaN is found
    growth_interval: usize, // clean steps required before growing
    iterations: usize,      // clean steps since the last adjustment
}
474
475impl AutoMixedPrecision {
476 pub fn new() -> Self {
477 AutoMixedPrecision {
478 enabled: false,
479 loss_scale: 65536.0,
480 growth_factor: 2.0,
481 backoff_factor: 0.5,
482 growth_interval: 2000,
483 iterations: 0,
484 }
485 }
486
487 pub fn enable(mut self) -> Self {
488 self.enabled = true;
489 self
490 }
491
492 pub fn scale_loss(&mut self, loss: f32) -> f32 {
493 if self.enabled {
494 loss * self.loss_scale
495 } else {
496 loss
497 }
498 }
499
500 pub fn unscale_gradients(&mut self, gradients: &mut [f32]) {
501 if self.enabled {
502 for grad in gradients {
503 *grad /= self.loss_scale;
504 }
505 }
506 }
507
508 pub fn update_scale(&mut self, found_inf: bool) {
509 if !self.enabled {
510 return;
511 }
512
513 self.iterations += 1;
514
515 if found_inf {
516 self.loss_scale *= self.backoff_factor;
517 self.iterations = 0;
518 } else if self.iterations >= self.growth_interval {
519 self.loss_scale *= self.growth_factor;
520 self.iterations = 0;
521 }
522 }
523}
524
525impl Default for AutoMixedPrecision {
526 fn default() -> Self { Self::new() }
527}
528
#[cfg(test)]
mod tests {
    use super::*;

    // A default context reports itself as a CPU device.
    #[test]
    fn test_gpu_context() {
        let context = GPUContext::cpu();
        assert_eq!(context.device_info().device_type, DeviceType::CPU);
    }

    // Shape and scalar reductions on a small 2x2 tensor.
    #[test]
    fn test_gpu_tensor() {
        let context = Arc::new(GPUContext::cpu());
        let tensor = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context);

        assert_eq!(tensor.dims(), &[2, 2]);
        assert_eq!(tensor.sum(), 10.0);
        assert_eq!(tensor.mean(), 2.5);
    }

    // Element-wise add and scalar scaling.
    #[test]
    fn test_gpu_tensor_ops() {
        let context = Arc::new(GPUContext::cpu());

        let lhs = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context.clone());
        let rhs = GPUTensor::new(vec![2.0; 4], vec![2, 2], context.clone());

        let summed = lhs.add(&rhs);
        assert_eq!(summed.data, vec![3.0, 4.0, 5.0, 6.0]);

        let doubled = lhs.scale(2.0);
        assert_eq!(doubled.data, vec![2.0, 4.0, 6.0, 8.0]);
    }

    // Multiplying by the identity preserves the 2x2 shape.
    #[test]
    fn test_gpu_matmul() {
        let context = Arc::new(GPUContext::cpu());

        let lhs = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], context.clone());
        let identity = GPUTensor::new(vec![1.0, 0.0, 0.0, 1.0], vec![2, 2], context.clone());

        let product = lhs.matmul(&identity);
        assert_eq!(product.dims(), &[2, 2]);
    }

    // ReLU clamps negatives to zero and passes positives through.
    #[test]
    fn test_gpu_relu() {
        let context = Arc::new(GPUContext::cpu());
        let tensor = GPUTensor::new(vec![-1.0, 2.0, -3.0, 4.0], vec![2, 2], context);

        let activated = tensor.relu();
        assert_eq!(activated.data, vec![0.0, 2.0, 0.0, 4.0]);
    }

    // Allocation is tracked; deallocation returns the buffer.
    #[test]
    fn test_memory_manager() {
        let context = Arc::new(GPUContext::cpu());
        let manager = GPUMemoryManager::new(context);

        let buffer = manager.allocate(100);
        assert!(manager.allocated_memory() > 0);

        manager.deallocate(buffer);
    }

    // Enabled AMP scales the loss up and unscales gradients down.
    #[test]
    fn test_amp() {
        let mut amp = AutoMixedPrecision::new().enable();

        let loss = 1.0;
        let scaled = amp.scale_loss(loss);
        assert!(scaled > loss);

        let mut gradients = vec![1.0, 2.0, 3.0];
        amp.unscale_gradients(&mut gradients);
        assert!(gradients[0] < 1.0);
    }
}