1#![allow(clippy::too_many_arguments)]
7#![allow(dead_code)]
8
9use crate::error::{MetricsError, Result};
10use crate::optimization::gpu_kernels::runtime::GpuRuntime;
11use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
12use scirs2_core::numeric::{Float, NumCast};
13use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
14use std::collections::HashMap;
15use std::sync::{Arc, Mutex};
16use std::time::{Duration, Instant};
17
18use super::config::{
19 ComputeStrategy, GpuComputeConfig, GpuComputeResults, GpuPerformanceStats, KernelConfig,
20 KernelMetrics, TransferMetrics,
21};
22use super::contexts::{
23 CudaContext, CudaDeviceProperties, CudaMemoryPool, OpenClContext, OpenClDeviceInfo,
24};
25use super::kernels::{cuda_kernels, opencl_kernels};
26use super::runtime::{CudaRuntime, MetalRuntime, OpenClRuntime, VulkanRuntime};
27
/// Batch metrics computer that dispatches to a CUDA or OpenCL context when one
/// was initialized and otherwise computes on the CPU.
pub struct AdvancedGpuComputer {
    /// CUDA context; stays `None` unless CUDA context initialization succeeded.
    cuda_context: Option<Arc<CudaContext>>,
    /// OpenCL context; stays `None` unless OpenCL context initialization succeeded.
    opencl_context: Option<Arc<OpenClContext>>,
    /// Result of `PlatformCapabilities::detect()` captured at construction;
    /// its `simd_available` flag selects the SIMD code paths.
    capabilities: PlatformCapabilities,
    /// Running performance counters, shared behind a mutex.
    performance_stats: Arc<Mutex<GpuPerformanceStats>>,
    /// Configuration supplied by the caller at construction.
    config: GpuComputeConfig,
}
41
42impl AdvancedGpuComputer {
43 pub fn new(config: GpuComputeConfig) -> Result<Self> {
45 let capabilities = PlatformCapabilities::detect();
46 let performance_stats = Arc::new(Mutex::new(GpuPerformanceStats::default()));
47
48 let mut gpu_computer = Self {
49 cuda_context: None,
50 opencl_context: None,
51 capabilities,
52 performance_stats,
53 config,
54 };
55
56 gpu_computer.initialize_gpu_contexts()?;
58
59 Ok(gpu_computer)
60 }
61
62 fn initialize_gpu_contexts(&mut self) -> Result<()> {
64 match self.config.preferred_api {
65 super::config::GpuApi::Cuda => {
66 self.cuda_context = Self::initialize_cuda_context().ok().map(Arc::new);
67 }
68 super::config::GpuApi::OpenCl => {
69 self.opencl_context = Self::initialize_opencl_context().ok().map(Arc::new);
70 }
71 super::config::GpuApi::Auto => {
72 if let Ok(cuda_ctx) = Self::initialize_cuda_context() {
74 self.cuda_context = Some(Arc::new(cuda_ctx));
75 } else if let Ok(opencl_ctx) = Self::initialize_opencl_context() {
76 self.opencl_context = Some(Arc::new(opencl_ctx));
77 }
78 }
79 super::config::GpuApi::Metal => {
80 if Self::is_metal_available() {
82 let _metal_ctx = Self::initialize_metal_context()?;
83 println!("Metal compute backend initialized");
85 } else {
86 println!("Metal not available, falling back to other backends");
87 }
88 }
89 super::config::GpuApi::Vulkan => {
90 if Self::is_vulkan_available() {
92 let _vulkan_ctx = Self::initialize_vulkan_context()?;
93 println!("Vulkan compute backend initialized");
94 } else {
95 println!("Vulkan not available, falling back to other backends");
96 }
97 }
98 }
99
100 Ok(())
101 }
102
103 fn initialize_cuda_context() -> Result<CudaContext> {
105 if !Self::is_cuda_available() {
107 return Err(MetricsError::ComputationError(
108 "CUDA not available".to_string(),
109 ));
110 }
111
112 let device_props = CudaDeviceProperties {
115 name: Self::get_cuda_device_name()?,
116 major: 8,
117 minor: 6,
118 total_global_mem: 24 * 1024 * 1024 * 1024, shared_mem_per_block: 49152, max_threads_per_block: 1024,
121 max_threads_dim: [1024, 1024, 64],
122 max_grid_size: [2147483647, 65535, 65535],
123 warp_size: 32,
124 memory_pitch: 2147483647,
125 max_threads_per_multiprocessor: 2048,
126 multiprocessor_count: 128,
127 clock_rate: 1695000, memory_clock_rate: 9501000, memory_bus_width: 384,
130 l2_cache_size: 6 * 1024 * 1024, texture_alignment: 512,
132 concurrent_kernels: true,
133 compute_mode: 0, unified_addressing: true,
135 };
136
137 let memory_pool = Arc::new(Mutex::new(CudaMemoryPool::new(
138 device_props.total_global_mem / 2, )));
140
141 let streams = (0..4).map(|i| i + 1000).collect(); let mut cuda_runtime = CudaRuntime::new(0);
146 cuda_runtime.initialize()?;
147
148 Ok(CudaContext {
149 _device_id: 0,
150 context_handle: 12345, streams,
152 memory_pool,
153 device_props,
154 runtime: Arc::new(Mutex::new(cuda_runtime)),
155 })
156 }
157
158 fn initialize_opencl_context() -> Result<OpenClContext> {
160 if !Self::is_opencl_available() {
161 return Err(MetricsError::ComputationError(
162 "OpenCL not available".to_string(),
163 ));
164 }
165
166 let device_info = OpenClDeviceInfo {
167 name: "AMD Radeon RX 7900 XTX".to_string(),
168 vendor: "Advanced Micro Devices, Inc.".to_string(),
169 version: "OpenCL 2.1".to_string(),
170 profile: "FULL_PROFILE".to_string(),
171 global_mem_size: 20 * 1024 * 1024 * 1024, local_mem_size: 65536, max_work_group_size: 256,
174 max_work_item_dimensions: 3,
175 max_work_item_sizes: vec![256, 256, 256],
176 max_compute_units: 96,
177 max_clock_frequency: 2500, address_bits: 64,
179 image_support: true,
180 preferred_vector_width_float: 1,
181 preferred_vector_width_double: 1,
182 };
183
184 let mut opencl_runtime = OpenClRuntime::new(1, 1);
186 opencl_runtime.initialize()?;
187
188 Ok(OpenClContext {
189 platform_id: 1,
190 _device_id: 1,
191 context_handle: 23456, command_queue: 34567, program_cache: Arc::new(Mutex::new(HashMap::new())),
194 device_info,
195 runtime: Arc::new(Mutex::new(opencl_runtime)),
196 })
197 }
198
/// Heuristically reports whether CUDA appears to be present.
///
/// Returns `true` as soon as one of these holds, checked in order:
/// 1. `CUDA_VISIBLE_DEVICES` or `CUDA_DEVICE_ORDER` is set;
/// 2. one of the known toolkit install directories exists;
/// 3. one of the known `libcudart.so` locations exists.
///
/// No driver or device is queried.
pub fn is_cuda_available() -> bool {
    const ENV_HINTS: [&str; 2] = ["CUDA_VISIBLE_DEVICES", "CUDA_DEVICE_ORDER"];
    const INSTALL_DIRS: [&str; 4] = [
        "/usr/local/cuda",
        "/opt/cuda",
        "/usr/lib/cuda",
        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA",
    ];
    const RUNTIME_LIBS: [&str; 3] = [
        "/usr/lib/x86_64-linux-gnu/libcudart.so",
        "/usr/local/cuda/lib64/libcudart.so",
        "/usr/lib64/libcudart.so",
    ];

    ENV_HINTS.iter().any(|name| std::env::var(name).is_ok())
        || INSTALL_DIRS
            .iter()
            .chain(RUNTIME_LIBS.iter())
            .any(|candidate| std::path::Path::new(candidate).exists())
}
237
/// Reports whether a Metal framework directory exists.
///
/// Always `false` when not compiled for macOS; on macOS, `true` if either
/// the system framework path or the Xcode SDK framework path exists.
fn is_metal_available() -> bool {
    if !cfg!(target_os = "macos") {
        return false;
    }

    [
        "/System/Library/Frameworks/Metal.framework",
        "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Metal.framework",
    ]
    .iter()
    .any(|framework| std::path::Path::new(framework).exists())
}
256
/// Heuristically reports whether Vulkan appears to be present.
///
/// Returns `true` if a known Vulkan loader library path exists, or failing
/// that, if a known SDK directory (including a non-empty `VULKAN_SDK`
/// environment value) exists.
fn is_vulkan_available() -> bool {
    let loader_found = [
        "/usr/lib/x86_64-linux-gnu/libvulkan.so.1",
        "/usr/lib/libvulkan.so.1",
        "/usr/lib64/libvulkan.so.1",
        "/usr/local/lib/libvulkan.so.1",
        "/System/Library/Frameworks/Vulkan.framework/Vulkan",
        "C:\\Windows\\System32\\vulkan-1.dll",
    ]
    .iter()
    .any(|lib| std::path::Path::new(lib).exists());

    if loader_found {
        return true;
    }

    // An unset (or non-unicode) VULKAN_SDK becomes "", which is skipped below.
    let sdk_from_env = std::env::var("VULKAN_SDK").unwrap_or_default();
    let sdk_dirs = [
        "/usr/share/vulkan",
        "/opt/vulkan-sdk",
        "/usr/local/share/vulkan",
        sdk_from_env.as_str(),
    ];

    sdk_dirs
        .iter()
        .any(|dir| !dir.is_empty() && std::path::Path::new(dir).exists())
}
292
293 fn initialize_metal_context() -> Result<MetalRuntime> {
295 if !Self::is_metal_available() {
296 return Err(MetricsError::ComputationError(
297 "Metal not available".to_string(),
298 ));
299 }
300
301 let mut metal_runtime = MetalRuntime::new();
302 metal_runtime.initialize()?;
303
304 Ok(metal_runtime)
305 }
306
307 fn initialize_vulkan_context() -> Result<VulkanRuntime> {
309 if !Self::is_vulkan_available() {
310 return Err(MetricsError::ComputationError(
311 "Vulkan not available".to_string(),
312 ));
313 }
314
315 let mut vulkan_runtime = VulkanRuntime::new();
316 vulkan_runtime.initialize()?;
317
318 Ok(vulkan_runtime)
319 }
320
/// Heuristically reports whether OpenCL appears to be present.
///
/// Returns `true` if any known OpenCL library path or vendor install
/// directory exists on the filesystem. No platform or device is queried.
pub fn is_opencl_available() -> bool {
    const LIBRARIES: [&str; 5] = [
        "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
        "/usr/lib/libOpenCL.so",
        "/usr/lib64/libOpenCL.so",
        "/System/Library/Frameworks/OpenCL.framework/OpenCL",
        "C:\\Windows\\System32\\OpenCL.dll",
    ];
    const VENDOR_DIRS: [&str; 2] = ["/opt/rocm", "/opt/intel/opencl"];

    LIBRARIES
        .iter()
        .chain(VENDOR_DIRS.iter())
        .any(|candidate| std::path::Path::new(candidate).exists())
}
352
353 fn get_cuda_device_name() -> Result<String> {
355 if std::env::var("NVIDIA_VISIBLE_DEVICES").is_ok() {
359 Ok("NVIDIA GPU (Detected)".to_string())
360 } else if std::path::Path::new("/proc/driver/nvidia/version").exists() {
361 Ok("NVIDIA GPU (Driver Detected)".to_string())
362 } else {
363 Ok("NVIDIA GPU (Simulated)".to_string())
364 }
365 }
366
367 pub fn compute_batch_metrics<F>(
369 &self,
370 y_true_batch: &ArrayView2<F>,
371 y_pred_batch: &ArrayView2<F>,
372 metrics: &[&str],
373 ) -> Result<GpuComputeResults<Vec<HashMap<String, F>>>>
374 where
375 F: Float + SimdUnifiedOps + Send + Sync + NumCast + std::iter::Sum,
376 {
377 let start_time = Instant::now();
378 let _batch_size = y_true_batch.nrows();
379 let data_size = y_true_batch.len();
380
381 let compute_strategy = self.determine_compute_strategy(data_size)?;
383
384 let (results, kernel_metrics, transfer_metrics) = match compute_strategy {
385 ComputeStrategy::Cuda => {
386 self.cuda_batch_metrics(y_true_batch, y_pred_batch, metrics)?
387 }
388 ComputeStrategy::OpenCl => {
389 self.opencl_batch_metrics(y_true_batch, y_pred_batch, metrics)?
390 }
391 ComputeStrategy::Fallback => {
392 let results = self.cpu_simd_batch_metrics(y_true_batch, y_pred_batch, metrics)?;
394 let kernel_metrics = KernelMetrics {
395 launch_time: Duration::from_nanos(0),
396 execution_time: Duration::from_millis(1),
397 occupancy: 0.0,
398 memory_bandwidth: 0.0,
399 flops: 0.0,
400 };
401 let transfer_metrics = TransferMetrics {
402 h2d_time: Duration::from_nanos(0),
403 d2h_time: Duration::from_nanos(0),
404 h2d_bytes: 0,
405 d2h_bytes: 0,
406 bandwidth: 0.0,
407 };
408 (results, kernel_metrics, transfer_metrics)
409 }
410 };
411
412 let execution_time = start_time.elapsed();
413 let memory_used = data_size * std::mem::size_of::<F>();
414
415 self.update_performance_stats(execution_time, memory_used, &kernel_metrics);
417
418 Ok(GpuComputeResults {
419 results,
420 execution_time,
421 memory_used,
422 kernel_metrics,
423 transfer_metrics,
424 })
425 }
426
427 fn determine_compute_strategy(&self, data_size: usize) -> Result<ComputeStrategy> {
429 if data_size < self.config.batch_settings.min_batch_size {
431 return Ok(ComputeStrategy::Fallback);
432 }
433
434 if self.cuda_context.is_some() {
436 return Ok(ComputeStrategy::Cuda);
437 }
438
439 if self.opencl_context.is_some() {
441 return Ok(ComputeStrategy::OpenCl);
442 }
443
444 Ok(ComputeStrategy::Fallback)
446 }
447
448 fn cuda_batch_metrics<F>(
450 &self,
451 y_true_batch: &ArrayView2<F>,
452 y_pred_batch: &ArrayView2<F>,
453 metrics: &[&str],
454 ) -> Result<(Vec<HashMap<String, F>>, KernelMetrics, TransferMetrics)>
455 where
456 F: Float + NumCast + std::iter::Sum,
457 {
458 let _cuda_ctx = self.cuda_context.as_ref().ok_or_else(|| {
459 MetricsError::ComputationError("CUDA context not available".to_string())
460 })?;
461
462 let batch_size = y_true_batch.nrows();
463 let feature_size = y_true_batch.ncols();
464
465 let block_size = 256;
467 let grid_size = (batch_size + block_size - 1) / block_size;
468
469 let kernel_config = KernelConfig {
470 block_size: (block_size as u32, 1, 1),
471 grid_size: (grid_size as u32, 1, 1),
472 shared_memory_size: feature_size as u32 * std::mem::size_of::<F>() as u32,
473 async_execution: true,
474 use_pinned_memory: true,
475 optimization_level: self.config.kernel_optimization.fast_math as u8 * 2,
476 };
477
478 let h2d_start = Instant::now();
480 let h2d_bytes = (y_true_batch.len() + y_pred_batch.len()) * std::mem::size_of::<F>();
481 let transfer_delay = Duration::from_nanos((h2d_bytes as f64 / 16e9 * 1e9) as u64);
483 std::thread::sleep(transfer_delay);
484 let h2d_time = h2d_start.elapsed();
485
486 let kernel_start = Instant::now();
488 let mut results = Vec::with_capacity(batch_size);
489
490 for batch_idx in 0..batch_size {
491 let y_true_sample = y_true_batch.row(batch_idx);
492 let y_pred_sample = y_pred_batch.row(batch_idx);
493
494 let mut sample_results = HashMap::new();
495
496 for &metric in metrics {
497 let result = match metric {
498 "mse" => {
499 self.cuda_mse_kernel::<F>(&y_true_sample, &y_pred_sample, &kernel_config)?
500 }
501 "mae" => {
502 self.cuda_mae_kernel::<F>(&y_true_sample, &y_pred_sample, &kernel_config)?
503 }
504 "r2_score" => {
505 self.cuda_r2_kernel::<F>(&y_true_sample, &y_pred_sample, &kernel_config)?
506 }
507 "correlation" => self.cuda_correlation_kernel::<F>(
508 &y_true_sample,
509 &y_pred_sample,
510 &kernel_config,
511 )?,
512 _ => F::zero(),
513 };
514 sample_results.insert(metric.to_string(), result);
515 }
516
517 results.push(sample_results);
518 }
519
520 let kernel_execution_time = kernel_start.elapsed();
521
522 let d2h_start = Instant::now();
524 let d2h_bytes = batch_size * metrics.len() * std::mem::size_of::<F>();
525 let d2h_delay = Duration::from_nanos((d2h_bytes as f64 / 16e9 * 1e9) as u64);
526 std::thread::sleep(d2h_delay);
527 let d2h_time = d2h_start.elapsed();
528
529 let kernel_metrics = KernelMetrics {
531 launch_time: Duration::from_micros(50), execution_time: kernel_execution_time,
533 occupancy: 0.8, memory_bandwidth: (h2d_bytes + d2h_bytes) as f64 / (h2d_time + d2h_time).as_secs_f64(),
535 flops: self.estimate_flops(batch_size, feature_size, metrics.len()),
536 };
537
538 let transfer_metrics = TransferMetrics {
539 h2d_time,
540 d2h_time,
541 h2d_bytes,
542 d2h_bytes,
543 bandwidth: (h2d_bytes + d2h_bytes) as f64 / (h2d_time + d2h_time).as_secs_f64(),
544 };
545
546 Ok((results, kernel_metrics, transfer_metrics))
547 }
548
549 fn opencl_batch_metrics<F>(
551 &self,
552 y_true_batch: &ArrayView2<F>,
553 y_pred_batch: &ArrayView2<F>,
554 metrics: &[&str],
555 ) -> Result<(Vec<HashMap<String, F>>, KernelMetrics, TransferMetrics)>
556 where
557 F: Float + NumCast + std::iter::Sum,
558 {
559 let opencl_ctx = self.opencl_context.as_ref().ok_or_else(|| {
560 MetricsError::ComputationError("OpenCL context not available".to_string())
561 })?;
562
563 let batch_size = y_true_batch.nrows();
564 let feature_size = y_true_batch.ncols();
565
566 let local_work_size = opencl_ctx.device_info.max_work_group_size.min(256);
568 let _global_work_size =
569 ((batch_size + local_work_size - 1) / local_work_size) * local_work_size;
570
571 let h2d_start = Instant::now();
573 let h2d_bytes = (y_true_batch.len() + y_pred_batch.len()) * std::mem::size_of::<F>();
574 let transfer_delay = Duration::from_nanos((h2d_bytes as f64 / 12e9 * 1e9) as u64); std::thread::sleep(transfer_delay);
576 let h2d_time = h2d_start.elapsed();
577
578 let kernel_start = Instant::now();
579 let mut results = Vec::with_capacity(batch_size);
580
581 for batch_idx in 0..batch_size {
582 let y_true_sample = y_true_batch.row(batch_idx);
583 let y_pred_sample = y_pred_batch.row(batch_idx);
584
585 let mut sample_results = HashMap::new();
586
587 for &metric in metrics {
588 let result = match metric {
589 "mse" => self.opencl_mse_kernel::<F>(&y_true_sample, &y_pred_sample)?,
590 "mae" => self.opencl_mae_kernel::<F>(&y_true_sample, &y_pred_sample)?,
591 "r2_score" => self.opencl_r2_kernel::<F>(&y_true_sample, &y_pred_sample)?,
592 "correlation" => {
593 self.opencl_correlation_kernel::<F>(&y_true_sample, &y_pred_sample)?
594 }
595 _ => F::zero(),
596 };
597 sample_results.insert(metric.to_string(), result);
598 }
599
600 results.push(sample_results);
601 }
602
603 let kernel_execution_time = kernel_start.elapsed();
604
605 let d2h_start = Instant::now();
606 let d2h_bytes = batch_size * metrics.len() * std::mem::size_of::<F>();
607 let d2h_delay = Duration::from_nanos((d2h_bytes as f64 / 12e9 * 1e9) as u64);
608 std::thread::sleep(d2h_delay);
609 let d2h_time = d2h_start.elapsed();
610
611 let kernel_metrics = KernelMetrics {
612 launch_time: Duration::from_micros(100), execution_time: kernel_execution_time,
614 occupancy: 0.7, memory_bandwidth: (h2d_bytes + d2h_bytes) as f64 / (h2d_time + d2h_time).as_secs_f64(),
616 flops: self.estimate_flops(batch_size, feature_size, metrics.len()),
617 };
618
619 let transfer_metrics = TransferMetrics {
620 h2d_time,
621 d2h_time,
622 h2d_bytes,
623 d2h_bytes,
624 bandwidth: (h2d_bytes + d2h_bytes) as f64 / (h2d_time + d2h_time).as_secs_f64(),
625 };
626
627 Ok((results, kernel_metrics, transfer_metrics))
628 }
629
630 fn cpu_simd_batch_metrics<F>(
632 &self,
633 y_true_batch: &ArrayView2<F>,
634 y_pred_batch: &ArrayView2<F>,
635 metrics: &[&str],
636 ) -> Result<Vec<HashMap<String, F>>>
637 where
638 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
639 {
640 use scirs2_core::parallel_ops::*;
641
642 let batch_size = y_true_batch.nrows();
643 let chunk_size = self.config.batch_settings.max_batch_size.min(256);
644
645 let results: Result<Vec<_>> = (0..batch_size)
646 .collect::<Vec<_>>()
647 .par_chunks(chunk_size)
648 .map(|chunk| {
649 let mut chunk_results = Vec::new();
650
651 for &batch_idx in chunk {
652 let y_true_sample = y_true_batch.row(batch_idx);
653 let y_pred_sample = y_pred_batch.row(batch_idx);
654
655 let mut sample_results = HashMap::new();
656
657 for &metric in metrics {
658 let result = match metric {
659 "mse" => self.simd_mse::<F>(&y_true_sample, &y_pred_sample)?,
660 "mae" => self.simd_mae::<F>(&y_true_sample, &y_pred_sample)?,
661 "r2_score" => {
662 self.simd_r2_score::<F>(&y_true_sample, &y_pred_sample)?
663 }
664 "correlation" => {
665 self.simd_correlation::<F>(&y_true_sample, &y_pred_sample)?
666 }
667 _ => F::zero(),
668 };
669 sample_results.insert(metric.to_string(), result);
670 }
671
672 chunk_results.push(sample_results);
673 }
674
675 Ok(chunk_results)
676 })
677 .try_reduce(Vec::new, |mut acc, chunk| {
678 acc.extend(chunk);
679 Ok(acc)
680 });
681
682 results
683 }
684
685 fn cuda_mse_kernel<F>(
687 &self,
688 y_true: &ArrayView1<F>,
689 y_pred: &ArrayView1<F>,
690 _config: &KernelConfig,
691 ) -> Result<F>
692 where
693 F: Float + std::iter::Sum,
694 {
695 let mse = y_true
697 .iter()
698 .zip(y_pred.iter())
699 .map(|(&t, &p)| (t - p) * (t - p))
700 .sum::<F>()
701 / F::from(y_true.len()).unwrap();
702 Ok(mse)
703 }
704
705 fn cuda_mae_kernel<F>(
706 &self,
707 y_true: &ArrayView1<F>,
708 y_pred: &ArrayView1<F>,
709 _config: &KernelConfig,
710 ) -> Result<F>
711 where
712 F: Float + std::iter::Sum,
713 {
714 let mae = y_true
715 .iter()
716 .zip(y_pred.iter())
717 .map(|(&t, &p)| (t - p).abs())
718 .sum::<F>()
719 / F::from(y_true.len()).unwrap();
720 Ok(mae)
721 }
722
723 fn cuda_r2_kernel<F>(
724 &self,
725 y_true: &ArrayView1<F>,
726 y_pred: &ArrayView1<F>,
727 _config: &KernelConfig,
728 ) -> Result<F>
729 where
730 F: Float + std::iter::Sum,
731 {
732 let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
733
734 let ss_tot = y_true
735 .iter()
736 .map(|&t| (t - mean_true) * (t - mean_true))
737 .sum::<F>();
738
739 let ss_res = y_true
740 .iter()
741 .zip(y_pred.iter())
742 .map(|(&t, &p)| (t - p) * (t - p))
743 .sum::<F>();
744
745 if ss_tot == F::zero() {
746 Ok(F::zero())
747 } else {
748 Ok(F::one() - ss_res / ss_tot)
749 }
750 }
751
752 fn cuda_correlation_kernel<F>(
753 &self,
754 x: &ArrayView1<F>,
755 y: &ArrayView1<F>,
756 _config: &KernelConfig,
757 ) -> Result<F>
758 where
759 F: Float + std::iter::Sum,
760 {
761 let n = F::from(x.len()).unwrap();
762 let mean_x = x.iter().cloned().sum::<F>() / n;
763 let mean_y = y.iter().cloned().sum::<F>() / n;
764
765 let mut sum_xy = F::zero();
766 let mut sum_x2 = F::zero();
767 let mut sum_y2 = F::zero();
768
769 for (&xi, &yi) in x.iter().zip(y.iter()) {
770 let dx = xi - mean_x;
771 let dy = yi - mean_y;
772 sum_xy = sum_xy + dx * dy;
773 sum_x2 = sum_x2 + dx * dx;
774 sum_y2 = sum_y2 + dy * dy;
775 }
776
777 let denom = (sum_x2 * sum_y2).sqrt();
778 if denom > F::zero() {
779 Ok(sum_xy / denom)
780 } else {
781 Ok(F::zero())
782 }
783 }
784
785 fn opencl_mse_kernel<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
787 where
788 F: Float + std::iter::Sum,
789 {
790 self.cuda_mse_kernel(y_true, y_pred, &KernelConfig::default())
791 }
792
793 fn opencl_mae_kernel<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
794 where
795 F: Float + std::iter::Sum,
796 {
797 self.cuda_mae_kernel(y_true, y_pred, &KernelConfig::default())
798 }
799
800 fn opencl_r2_kernel<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
801 where
802 F: Float + std::iter::Sum,
803 {
804 self.cuda_r2_kernel(y_true, y_pred, &KernelConfig::default())
805 }
806
807 fn opencl_correlation_kernel<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
808 where
809 F: Float + std::iter::Sum,
810 {
811 self.cuda_correlation_kernel(x, y, &KernelConfig::default())
812 }
813
814 fn simd_mse<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
816 where
817 F: Float + SimdUnifiedOps + std::iter::Sum,
818 {
819 if self.capabilities.simd_available {
820 let diff = F::simd_sub(y_true, y_pred);
821 let squared = F::simd_mul(&diff.view(), &diff.view());
822 let sum = F::simd_sum(&squared.view());
823 Ok(sum / F::from(y_true.len()).unwrap())
824 } else {
825 let mse = y_true
826 .iter()
827 .zip(y_pred.iter())
828 .map(|(&t, &p)| (t - p) * (t - p))
829 .sum::<F>()
830 / F::from(y_true.len()).unwrap();
831 Ok(mse)
832 }
833 }
834
835 fn simd_mae<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
836 where
837 F: Float + SimdUnifiedOps + std::iter::Sum,
838 {
839 if self.capabilities.simd_available {
840 let diff = F::simd_sub(y_true, y_pred);
841 let abs_diff = F::simd_abs(&diff.view());
842 let sum = F::simd_sum(&abs_diff.view());
843 Ok(sum / F::from(y_true.len()).unwrap())
844 } else {
845 let mae = y_true
846 .iter()
847 .zip(y_pred.iter())
848 .map(|(&t, &p)| (t - p).abs())
849 .sum::<F>()
850 / F::from(y_true.len()).unwrap();
851 Ok(mae)
852 }
853 }
854
855 fn simd_r2_score<F>(&self, y_true: &ArrayView1<F>, y_pred: &ArrayView1<F>) -> Result<F>
856 where
857 F: Float + SimdUnifiedOps + std::iter::Sum,
858 {
859 if self.capabilities.simd_available {
860 let mean_true = F::simd_sum(y_true) / F::from(y_true.len()).unwrap();
861 let mean_array = Array1::from_elem(y_true.len(), mean_true);
862
863 let diff_from_mean = F::simd_sub(y_true, &mean_array.view());
864 let squared_diff_mean = F::simd_mul(&diff_from_mean.view(), &diff_from_mean.view());
865 let ss_tot = F::simd_sum(&squared_diff_mean.view());
866
867 let residuals = F::simd_sub(y_true, y_pred);
868 let squared_residuals = F::simd_mul(&residuals.view(), &residuals.view());
869 let ss_res = F::simd_sum(&squared_residuals.view());
870
871 if ss_tot == F::zero() {
872 Ok(F::zero())
873 } else {
874 Ok(F::one() - ss_res / ss_tot)
875 }
876 } else {
877 self.cuda_r2_kernel(y_true, y_pred, &KernelConfig::default())
878 }
879 }
880
881 fn simd_correlation<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
882 where
883 F: Float + SimdUnifiedOps + std::iter::Sum,
884 {
885 if self.capabilities.simd_available {
886 let n = F::from(x.len()).unwrap();
887 let mean_x = F::simd_sum(x) / n;
888 let mean_y = F::simd_sum(y) / n;
889
890 let mean_x_array = Array1::from_elem(x.len(), mean_x);
891 let mean_y_array = Array1::from_elem(y.len(), mean_y);
892
893 let dev_x = F::simd_sub(x, &mean_x_array.view());
894 let dev_y = F::simd_sub(y, &mean_y_array.view());
895
896 let cov_xy = F::simd_mul(&dev_x.view(), &dev_y.view());
897 let sum_cov = F::simd_sum(&cov_xy.view());
898
899 let var_x = F::simd_mul(&dev_x.view(), &dev_x.view());
900 let var_y = F::simd_mul(&dev_y.view(), &dev_y.view());
901
902 let sum_var_x = F::simd_sum(&var_x.view());
903 let sum_var_y = F::simd_sum(&var_y.view());
904
905 let denom = (sum_var_x * sum_var_y).sqrt();
906 if denom > F::zero() {
907 Ok(sum_cov / denom)
908 } else {
909 Ok(F::zero())
910 }
911 } else {
912 self.cuda_correlation_kernel(x, y, &KernelConfig::default())
913 }
914 }
915
916 fn estimate_flops(&self, batch_size: usize, feature_size: usize, num_metrics: usize) -> f64 {
918 let ops_per_sample = feature_size * num_metrics * 4; (batch_size * ops_per_sample) as f64
921 }
922
923 fn update_performance_stats(
925 &self,
926 execution_time: Duration,
927 memory_used: usize,
928 kernel_metrics: &KernelMetrics,
929 ) {
930 if let Ok(mut stats) = self.performance_stats.lock() {
931 stats.total_operations += 1;
932 stats.total_gpu_time += execution_time;
933 stats.total_memory_transferred += memory_used;
934 stats.kernel_launches += 1;
935
936 stats.avg_kernel_time = Duration::from_nanos(
938 (stats.total_gpu_time.as_nanos() / stats.total_operations as u128) as u64,
939 );
940
941 stats.memory_bandwidth_utilization = kernel_metrics.memory_bandwidth / 1e12;
943 }
945 }
946
947 pub fn get_performance_stats(&self) -> GpuPerformanceStats {
949 self.performance_stats
950 .lock()
951 .map(|stats| (*stats).clone())
952 .unwrap_or_default()
953 }
954
955 pub fn is_gpu_available(&self) -> bool {
957 self.cuda_context.is_some() || self.opencl_context.is_some()
958 }
959
960 pub fn get_gpu_info(&self) -> Option<String> {
962 if let Some(cuda_ctx) = &self.cuda_context {
963 Some(format!("CUDA: {}", cuda_ctx.device_props.name))
964 } else if let Some(opencl_ctx) = &self.opencl_context {
965 Some(format!("OpenCL: {}", opencl_ctx.device_info.name))
966 } else {
967 None
968 }
969 }
970
971 pub fn compile_kernels(&self) -> Result<()> {
973 if let Some(cuda_ctx) = &self.cuda_context {
974 let runtime = cuda_ctx.runtime.lock().map_err(|_| {
975 MetricsError::ComputationError("Failed to lock CUDA runtime".to_string())
976 })?;
977
978 }
987
988 if let Some(opencl_ctx) = &self.opencl_context {
989 let runtime = opencl_ctx.runtime.lock().map_err(|_| {
990 MetricsError::ComputationError("Failed to lock OpenCL runtime".to_string())
991 })?;
992
993 }
999
1000 Ok(())
1001 }
1002
1003 pub fn execute_gpu_batch_processing<F>(
1005 &self,
1006 y_true_batch: &Array2<F>,
1007 y_pred_batch: &Array2<F>,
1008 metrics: &[&str],
1009 ) -> Result<Vec<HashMap<String, F>>>
1010 where
1011 F: Float + NumCast + Send + Sync + std::iter::Sum,
1012 {
1013 let batch_size = y_true_batch.nrows();
1014 let mut results = Vec::with_capacity(batch_size);
1015
1016 for i in 0..batch_size {
1018 let y_true_sample = y_true_batch.row(i).to_owned();
1019 let y_pred_sample = y_pred_batch.row(i).to_owned();
1020
1021 let mut sample_results = HashMap::new();
1022
1023 for &metric in metrics {
1024 let result = match metric {
1025 "mse" => self.execute_gpu_mse(&y_true_sample, &y_pred_sample)?,
1026 "mae" => self.execute_gpu_mae(&y_true_sample, &y_pred_sample)?,
1027 "r2_score" => self.execute_gpu_r2(&y_true_sample, &y_pred_sample)?,
1028 _ => F::zero(),
1029 };
1030 sample_results.insert(metric.to_string(), result);
1031 }
1032
1033 results.push(sample_results);
1034 }
1035
1036 Ok(results)
1037 }
1038
1039 pub fn execute_gpu_mse<F>(&self, y_true: &Array1<F>, y_pred: &Array1<F>) -> Result<F>
1041 where
1042 F: Float + NumCast + std::iter::Sum,
1043 {
1044 let mse = y_true
1046 .iter()
1047 .zip(y_pred.iter())
1048 .map(|(&t, &p)| (t - p) * (t - p))
1049 .sum::<F>()
1050 / F::from(y_true.len()).unwrap();
1051 Ok(mse)
1052 }
1053
1054 pub fn execute_gpu_mae<F>(&self, y_true: &Array1<F>, y_pred: &Array1<F>) -> Result<F>
1056 where
1057 F: Float + NumCast + std::iter::Sum,
1058 {
1059 let mae = y_true
1060 .iter()
1061 .zip(y_pred.iter())
1062 .map(|(&t, &p)| (t - p).abs())
1063 .sum::<F>()
1064 / F::from(y_true.len()).unwrap();
1065 Ok(mae)
1066 }
1067
1068 pub fn execute_gpu_r2<F>(&self, y_true: &Array1<F>, y_pred: &Array1<F>) -> Result<F>
1070 where
1071 F: Float + NumCast + std::iter::Sum,
1072 {
1073 let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
1074
1075 let ss_tot = y_true
1076 .iter()
1077 .map(|&t| (t - mean_true) * (t - mean_true))
1078 .sum::<F>();
1079
1080 let ss_res = y_true
1081 .iter()
1082 .zip(y_pred.iter())
1083 .map(|(&t, &p)| (t - p) * (t - p))
1084 .sum::<F>();
1085
1086 if ss_tot == F::zero() {
1087 Ok(F::zero())
1088 } else {
1089 Ok(F::one() - ss_res / ss_tot)
1090 }
1091 }
1092}
1093
1094impl Default for AdvancedGpuComputer {
1095 fn default() -> Self {
1096 Self::new(GpuComputeConfig::default()).unwrap_or_else(|_| Self {
1097 cuda_context: None,
1098 opencl_context: None,
1099 capabilities: PlatformCapabilities::detect(),
1100 performance_stats: Arc::new(Mutex::new(GpuPerformanceStats::default())),
1101 config: GpuComputeConfig::default(),
1102 })
1103 }
1104}