1use crate::error::{MetricsError, Result};
8use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
9use scirs2_core::numeric::Float;
10use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
11use std::collections::HashMap;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
/// Settings that steer how [`GpuMetricsComputer`] chooses and runs a compute path.
///
/// A `Default` implementation supplies baseline values; `GpuMetricsComputerBuilder`
/// overrides a subset of the fields.
#[derive(Debug, Clone)]
pub struct GpuAccelConfig {
    /// Smallest input length for which `should_use_gpu` answers `true`
    /// (provided a GPU was detected).
    pub min_batch_size: usize,
    /// GPU memory budget; the builder setter names its argument `bytes`.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub max_gpu_memory: usize,
    /// Optional device selector.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub device_index: Option<usize>,
    /// Memory-pool toggle.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub enable_memory_pool: bool,
    /// Optimization level.
    /// NOTE(review): valid range is not established by the visible code.
    pub optimization_level: u8,
    /// When `true` and the platform reports SIMD support, non-GPU calls use the
    /// SIMD implementations instead of the scalar ones.
    pub enable_simd_fallback: bool,
    /// Connection pool size.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub connection_pool_size: usize,
    /// Circuit-breaker toggle.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub circuit_breaker_enabled: bool,
    /// Monitoring toggle.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub enable_monitoring: bool,
}
37
/// Description of one detected (or simulated) GPU device.
///
/// Values come from `nvidia-smi` output where available; every other detection
/// path fills in fixed placeholder numbers.
#[derive(Debug, Clone)]
pub struct GpuInfo {
    /// Human-readable device name.
    pub device_name: String,
    /// Compute capability as a `(major, minor)` pair.
    pub compute_capability: (u32, u32),
    /// Total device memory in bytes.
    pub total_memory: usize,
    /// Free device memory in bytes at detection time.
    pub available_memory: usize,
    /// Number of multiprocessors (estimated, not queried, on the CUDA path).
    pub multiprocessor_count: u32,
    /// Maximum threads per block.
    pub max_threads_per_block: u32,
    /// Whether double precision is considered supported.
    pub supports_double_precision: bool,
}
56
/// Settings for the CPU-parallel code paths of [`GpuMetricsComputer`].
#[derive(Debug, Clone)]
pub struct ParallelConfig {
    /// Requested thread count; `None` by default.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub num_threads: Option<usize>,
    /// Number of batch rows handed to each parallel chunk in `simd_batch_metrics`.
    pub min_chunk_size: usize,
    /// Work-stealing toggle.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub enable_work_stealing: bool,
    /// Thread placement policy.
    /// NOTE(review): not read by the code visible in this part of the file.
    pub thread_affinity: ThreadAffinity,
}
69
/// Thread placement policy carried by [`ParallelConfig`].
///
/// NOTE(review): no visible code acts on these variants; the descriptions below
/// are inferred from the names and should be confirmed.
#[derive(Debug, Clone)]
pub enum ThreadAffinity {
    /// No affinity requested.
    None,
    /// A list of core indices (presumably the cores to pin to).
    Cores(Vec<usize>),
    /// Presumably NUMA-aware placement.
    Numa,
    /// The value used by `ParallelConfig::default()`.
    Automatic,
}
82
83impl Default for GpuAccelConfig {
84 fn default() -> Self {
85 Self {
86 min_batch_size: 1000,
87 max_gpu_memory: 1024 * 1024 * 1024, device_index: None,
89 enable_memory_pool: true,
90 optimization_level: 2,
91 enable_simd_fallback: true,
92 connection_pool_size: 4,
93 circuit_breaker_enabled: true,
94 enable_monitoring: false,
95 }
96 }
97}
98
99impl Default for ParallelConfig {
100 fn default() -> Self {
101 Self {
102 num_threads: None, min_chunk_size: 1000,
104 enable_work_stealing: true,
105 thread_affinity: ThreadAffinity::Automatic,
106 }
107 }
108}
109
/// Metric calculator that dispatches between a (simulated) GPU path, SIMD
/// routines and plain scalar code.
pub struct GpuMetricsComputer {
    /// Caller-supplied tuning knobs.
    config: GpuAccelConfig,
    /// Platform feature flags captured once at construction.
    capabilities: PlatformCapabilities,
    /// Detected device, or `None` when no GPU was found.
    gpu_info: Option<GpuInfo>,
    /// Settings for the CPU-parallel batch path.
    parallel_config: ParallelConfig,
}
117
118impl GpuMetricsComputer {
119 pub fn new(config: GpuAccelConfig) -> Result<Self> {
121 let capabilities = PlatformCapabilities::detect();
122 let gpu_info = Self::detect_gpu_capabilities()?;
123
124 Ok(Self {
125 config,
126 capabilities,
127 gpu_info,
128 parallel_config: ParallelConfig::default(),
129 })
130 }
131
132 pub fn with_parallel_config(mut self, config: ParallelConfig) -> Self {
134 self.parallel_config = config;
135 self
136 }
137
138 pub fn should_use_gpu(&self, datasize: usize) -> bool {
140 self.gpu_info.is_some() && datasize >= self.config.min_batch_size
141 }
142
143 pub fn is_gpu_available(&self) -> bool {
145 self.gpu_info.is_some()
146 }
147
148 fn detect_gpu_capabilities() -> Result<Option<GpuInfo>> {
150 if let Some(cuda_info) = Self::detect_cuda_device()? {
152 return Ok(Some(cuda_info));
153 }
154
155 if let Some(opencl_info) = Self::detect_opencl_device()? {
157 return Ok(Some(opencl_info));
158 }
159
160 if let Some(rocm_info) = Self::detect_rocm_device()? {
162 return Ok(Some(rocm_info));
163 }
164
165 if std::env::var("SCIRS2_ENABLE_GPU").is_ok() {
167 Ok(Some(GpuInfo {
168 device_name: "Simulated GPU".to_string(),
169 compute_capability: (8, 6),
170 total_memory: 12 * 1024 * 1024 * 1024, available_memory: 10 * 1024 * 1024 * 1024, multiprocessor_count: 84,
173 max_threads_per_block: 1024,
174 supports_double_precision: true,
175 }))
176 } else {
177 Ok(None)
178 }
179 }
180
181 fn detect_cuda_device() -> Result<Option<GpuInfo>> {
183 if let Ok(output) = std::process::Command::new("nvidia-smi")
188 .arg("--query-gpu=name,memory.total,memory.free,compute_cap")
189 .arg("--format=csv,noheader,nounits")
190 .output()
191 {
192 if output.status.success() {
193 let output_str = String::from_utf8_lossy(&output.stdout);
194 let lines: Vec<&str> = output_str.trim().lines().collect();
195
196 if !lines.is_empty() {
197 let parts: Vec<&str> = lines[0].split(',').map(|s| s.trim()).collect();
199 if parts.len() >= 4 {
200 let device_name = parts[0].to_string();
201 let total_memory = parts[1].parse::<usize>().unwrap_or(8192) * 1024 * 1024; let free_memory = parts[2].parse::<usize>().unwrap_or(6144) * 1024 * 1024;
203
204 let compute_cap_str = parts[3];
206 let compute_capability = if let Some(dot_pos) = compute_cap_str.find('.') {
207 let major = compute_cap_str[..dot_pos].parse::<u32>().unwrap_or(8);
208 let minor = compute_cap_str[dot_pos + 1..].parse::<u32>().unwrap_or(6);
209 (major, minor)
210 } else {
211 (8, 6) };
213
214 return Ok(Some(GpuInfo {
215 device_name,
216 compute_capability,
217 total_memory,
218 available_memory: free_memory,
219 multiprocessor_count: Self::estimate_sm_count(
220 compute_capability,
221 total_memory,
222 ),
223 max_threads_per_block: 1024,
224 supports_double_precision: compute_capability.0 >= 2, }));
226 }
227 }
228 }
229 }
230
231 let cuda_paths = [
233 "/usr/local/cuda/lib64/libcudart.so",
234 "/usr/lib/x86_64-linux-gnu/libcudart.so",
235 "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.0\\bin\\cudart64_12.dll",
236 "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8\\bin\\cudart64_11.dll",
237 ];
238
239 for cuda_path in &cuda_paths {
240 if std::path::Path::new(cuda_path).exists() {
241 return Ok(Some(GpuInfo {
243 device_name: "CUDA Device (Auto-detected)".to_string(),
244 compute_capability: (7, 5), total_memory: 8 * 1024 * 1024 * 1024, available_memory: 6 * 1024 * 1024 * 1024, multiprocessor_count: 68,
248 max_threads_per_block: 1024,
249 supports_double_precision: true,
250 }));
251 }
252 }
253
254 Ok(None)
255 }
256
257 fn detect_opencl_device() -> Result<Option<GpuInfo>> {
259 let opencl_paths = [
261 "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
262 "/usr/lib/libOpenCL.so",
263 "C:\\Windows\\System32\\OpenCL.dll",
264 "/System/Library/Frameworks/OpenCL.framework/OpenCL", ];
266
267 for opencl_path in &opencl_paths {
268 if std::path::Path::new(opencl_path).exists() {
269 if let Ok(output) = std::process::Command::new("clinfo").arg("-l").output() {
271 if output.status.success() {
272 let output_str = String::from_utf8_lossy(&output.stdout);
273
274 for line in output_str.lines() {
276 if line.to_lowercase().contains("gpu") {
277 let device_name = if let Some(start) = line.find('"') {
279 if let Some(end) = line[start + 1..].find('"') {
280 line[start + 1..start + 1 + end].to_string()
281 } else {
282 "OpenCL GPU Device".to_string()
283 }
284 } else {
285 "OpenCL GPU Device".to_string()
286 };
287
288 return Ok(Some(GpuInfo {
289 device_name,
290 compute_capability: (2, 0), total_memory: 4 * 1024 * 1024 * 1024, available_memory: 3 * 1024 * 1024 * 1024, multiprocessor_count: 32, max_threads_per_block: 256, supports_double_precision: true,
296 }));
297 }
298 }
299 }
300 }
301
302 return Ok(Some(GpuInfo {
304 device_name: "OpenCL Device (Auto-detected)".to_string(),
305 compute_capability: (2, 0),
306 total_memory: 4 * 1024 * 1024 * 1024,
307 available_memory: 3 * 1024 * 1024 * 1024,
308 multiprocessor_count: 32,
309 max_threads_per_block: 256,
310 supports_double_precision: true,
311 }));
312 }
313 }
314
315 Ok(None)
316 }
317
318 fn detect_rocm_device() -> Result<Option<GpuInfo>> {
320 let rocm_paths = [
322 "/opt/rocm/lib/libhip_hcc.so",
323 "/opt/rocm/hip/lib/libhip_hcc.so",
324 "/usr/lib/x86_64-linux-gnu/libhip_hcc.so",
325 ];
326
327 for rocm_path in &rocm_paths {
328 if std::path::Path::new(rocm_path).exists() {
329 if let Ok(output) = std::process::Command::new("rocm-smi")
331 .arg("--showproductname")
332 .output()
333 {
334 if output.status.success() {
335 let output_str = String::from_utf8_lossy(&output.stdout);
336
337 for line in output_str.lines() {
339 if line.contains("Card") && !line.contains("N/A") {
340 let device_name = line
341 .split(':')
342 .nth(1)
343 .unwrap_or("AMD ROCm Device")
344 .trim()
345 .to_string();
346
347 return Ok(Some(GpuInfo {
348 device_name,
349 compute_capability: (10, 1), total_memory: 16 * 1024 * 1024 * 1024, available_memory: 14 * 1024 * 1024 * 1024,
352 multiprocessor_count: 60, max_threads_per_block: 1024,
354 supports_double_precision: true,
355 }));
356 }
357 }
358 }
359 }
360
361 return Ok(Some(GpuInfo {
363 device_name: "AMD ROCm Device (Auto-detected)".to_string(),
364 compute_capability: (10, 1),
365 total_memory: 8 * 1024 * 1024 * 1024,
366 available_memory: 6 * 1024 * 1024 * 1024,
367 multiprocessor_count: 60,
368 max_threads_per_block: 1024,
369 supports_double_precision: true,
370 }));
371 }
372 }
373
374 Ok(None)
375 }
376
/// Guesses the multiprocessor count from compute capability and memory size.
///
/// The memory size is truncated to whole GiB and looked up in a fixed table
/// keyed by capability (8.6, 8.9, 7.5) with a generic fallback for everything
/// else. The result is a heuristic, not a queried hardware value.
fn estimate_sm_count(_computecapability: (u32, u32), total_memory_bytes: usize) -> u32 {
    const BYTES_PER_GIB: usize = 1024 * 1024 * 1024;
    let whole_gib = total_memory_bytes / BYTES_PER_GIB;

    match (_computecapability, whole_gib) {
        // Compute capability 8.6
        ((8, 6), 24..) => 84,
        ((8, 6), 12..=23) => 82,
        ((8, 6), 10..=11) => 68,
        ((8, 6), 8..=9) => 58,
        ((8, 6), _) => 46,
        // Compute capability 8.9
        ((8, 9), 24..) => 128,
        ((8, 9), 16..=23) => 76,
        ((8, 9), 12..=15) => 60,
        ((8, 9), _) => 46,
        // Compute capability 7.5
        ((7, 5), 11..) => 68,
        ((7, 5), 8..=10) => 46,
        ((7, 5), _) => 36,
        // Any other capability: decide on memory alone.
        (_, 16..) => 80,
        (_, 8..=15) => 60,
        (_, 4..=7) => 20,
        (_, _) => 10,
    }
}
412
413 pub fn get_gpu_info(&self) -> Option<&GpuInfo> {
415 self.gpu_info.as_ref()
416 }
417
418 pub fn get_capabilities(&self) -> &PlatformCapabilities {
420 &self.capabilities
421 }
422
423 pub fn gpu_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
425 if self.should_use_gpu(y_true.len()) {
426 self.gpu_accuracy_kernel(y_true, ypred)
427 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
428 self.simd_accuracy(y_true, ypred)
429 } else {
430 self.cpu_accuracy(y_true, ypred)
431 }
432 }
433
434 pub fn gpu_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
436 where
437 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
438 {
439 if self.should_use_gpu(y_true.len()) {
440 self.gpu_mse_kernel(y_true, ypred)
441 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
442 self.simd_mse(y_true, ypred)
443 } else {
444 self.cpu_mse(y_true, ypred)
445 }
446 }
447
448 pub fn simd_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
450 where
451 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
452 {
453 if y_true.len() != ypred.len() {
454 return Err(MetricsError::InvalidInput(
455 "Arrays must have same length".to_string(),
456 ));
457 }
458
459 let squared_diff = F::simd_sub(&y_true.view(), &ypred.view());
460 let squared = F::simd_mul(&squared_diff.view(), &squared_diff.view());
461 let sum = F::simd_sum(&squared.view());
462 Ok(sum / F::from(y_true.len()).unwrap())
463 }
464
465 pub fn simd_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
467 if y_true.len() != ypred.len() {
468 return Err(MetricsError::InvalidInput(
469 "Arrays must have same length".to_string(),
470 ));
471 }
472
473 let correct = y_true
475 .iter()
476 .zip(ypred.iter())
477 .filter(|(&true_val, &pred_val)| true_val == pred_val)
478 .count();
479
480 Ok(correct as f32 / y_true.len() as f32)
481 }
482
483 pub fn gpu_confusion_matrix(
485 &self,
486 y_true: &Array1<i32>,
487 ypred: &Array1<i32>,
488 num_classes: usize,
489 ) -> Result<Array2<i32>> {
490 self.cpu_confusion_matrix(y_true, ypred, num_classes)
491 }
492
493 pub fn gpu_batch_metrics<F>(
495 &self,
496 y_true_batch: ArrayView2<F>,
497 y_pred_batch: ArrayView2<F>,
498 metrics: &[&str],
499 ) -> Result<Vec<HashMap<String, F>>>
500 where
501 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
502 {
503 if let Some(gpu_info) = &self.gpu_info {
504 self.gpu_compute_batch_metrics(y_true_batch, y_pred_batch, metrics, gpu_info)
505 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
506 self.simd_batch_metrics(y_true_batch, y_pred_batch, metrics)
507 } else {
508 self.cpu_batch_metrics(y_true_batch, y_pred_batch, metrics)
509 }
510 }
511
512 fn gpu_compute_batch_metrics<F>(
514 &self,
515 y_true_batch: ArrayView2<F>,
516 y_pred_batch: ArrayView2<F>,
517 metrics: &[&str],
518 gpu_info: &GpuInfo,
519 ) -> Result<Vec<HashMap<String, F>>>
520 where
521 F: Float + Send + Sync + std::iter::Sum,
522 {
523 let batch_size = y_true_batch.nrows();
524 let mut results = Vec::with_capacity(batch_size);
525
526 let threads_per_block = gpu_info.max_threads_per_block.min(1024);
528 let _blocks_needed =
529 (batch_size + threads_per_block as usize - 1) / threads_per_block as usize;
530
531 std::thread::sleep(std::time::Duration::from_micros(
533 (y_true_batch.len() * std::mem::size_of::<F>() / 1000) as u64,
534 ));
535
536 for batch_idx in 0..batch_size {
537 let y_true_sample = y_true_batch.row(batch_idx);
538 let y_pred_sample = y_pred_batch.row(batch_idx);
539
540 let mut sample_results = HashMap::new();
541
542 for &metric in metrics {
543 let result =
544 match metric {
545 "mse" => self
546 .gpu_mse_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
547 "mae" => self
548 .gpu_mae_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
549 "r2_score" => self
550 .gpu_r2_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
551 _ => F::zero(),
552 };
553 sample_results.insert(metric.to_string(), result);
554 }
555
556 results.push(sample_results);
557 }
558
559 std::thread::sleep(std::time::Duration::from_micros(
561 (results.len() * metrics.len() * std::mem::size_of::<F>() / 1000) as u64,
562 ));
563
564 Ok(results)
565 }
566
567 fn simd_batch_metrics<F>(
569 &self,
570 y_true_batch: ArrayView2<F>,
571 y_pred_batch: ArrayView2<F>,
572 metrics: &[&str],
573 ) -> Result<Vec<HashMap<String, F>>>
574 where
575 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
576 {
577 use scirs2_core::parallel_ops::*;
578
579 let batch_size = y_true_batch.nrows();
580 let chunk_size = self.parallel_config.min_chunk_size;
581
582 let results: Result<Vec<HashMap<String, F>>> = (0..batch_size)
584 .collect::<Vec<_>>()
585 .par_chunks(chunk_size)
586 .map(|chunk| -> Result<Vec<HashMap<String, F>>> {
587 let mut chunk_results = Vec::new();
588
589 for &batch_idx in chunk {
590 let y_true_sample = y_true_batch.row(batch_idx).to_owned();
591 let y_pred_sample = y_pred_batch.row(batch_idx).to_owned();
592
593 let mut sample_results = HashMap::new();
594
595 for &metric in metrics {
596 let result = match metric {
597 "mse" => self.simd_mse(&y_true_sample, &y_pred_sample)?,
598 "mae" => self.simd_mae(&y_true_sample, &y_pred_sample)?,
599 "r2_score" => self.simd_r2_score(&y_true_sample, &y_pred_sample)?,
600 _ => F::zero(),
601 };
602 sample_results.insert(metric.to_string(), result);
603 }
604
605 chunk_results.push(sample_results);
606 }
607
608 Ok(chunk_results)
609 })
610 .try_reduce(Vec::new, |mut acc, chunk| {
611 acc.extend(chunk);
612 Ok(acc)
613 });
614
615 results
616 }
617
618 fn cpu_batch_metrics<F>(
620 &self,
621 y_true_batch: ArrayView2<F>,
622 y_pred_batch: ArrayView2<F>,
623 metrics: &[&str],
624 ) -> Result<Vec<HashMap<String, F>>>
625 where
626 F: Float + std::iter::Sum,
627 {
628 let batch_size = y_true_batch.nrows();
629 let mut results = Vec::with_capacity(batch_size);
630
631 for batch_idx in 0..batch_size {
632 let y_true_sample = y_true_batch.row(batch_idx).to_owned();
633 let y_pred_sample = y_pred_batch.row(batch_idx).to_owned();
634
635 let mut sample_results = HashMap::new();
636
637 for &metric in metrics {
638 let result = match metric {
639 "mse" => self.cpu_mse(&y_true_sample, &y_pred_sample)?,
640 "mae" => self.cpu_mae(&y_true_sample, &y_pred_sample)?,
641 "r2_score" => self.cpu_r2_score(&y_true_sample, &y_pred_sample)?,
642 _ => F::zero(),
643 };
644 sample_results.insert(metric.to_string(), result);
645 }
646
647 results.push(sample_results);
648 }
649
650 Ok(results)
651 }
652
653 fn gpu_accuracy_kernel(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
657 let correct = y_true
659 .iter()
660 .zip(ypred.iter())
661 .filter(|(&true_val, &pred_val)| true_val == pred_val)
662 .count();
663
664 Ok(correct as f32 / y_true.len() as f32)
665 }
666
667 fn gpu_mse_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
669 where
670 F: Float + std::iter::Sum,
671 {
672 let diff_squared: F = y_true
673 .iter()
674 .zip(ypred.iter())
675 .map(|(&t, &p)| (t - p) * (t - p))
676 .sum();
677
678 Ok(diff_squared / F::from(y_true.len()).unwrap())
679 }
680
681 fn gpu_mae_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
683 where
684 F: Float + std::iter::Sum,
685 {
686 let abs_diff: F = y_true
687 .iter()
688 .zip(ypred.iter())
689 .map(|(&t, &p)| (t - p).abs())
690 .sum();
691
692 Ok(abs_diff / F::from(y_true.len()).unwrap())
693 }
694
695 fn gpu_r2_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
697 where
698 F: Float + std::iter::Sum,
699 {
700 let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
701
702 let ss_tot: F = y_true
703 .iter()
704 .map(|&t| (t - mean_true) * (t - mean_true))
705 .sum();
706
707 let ss_res: F = y_true
708 .iter()
709 .zip(ypred.iter())
710 .map(|(&t, &p)| (t - p) * (t - p))
711 .sum();
712
713 if ss_tot == F::zero() {
714 Ok(F::zero())
715 } else {
716 Ok(F::one() - ss_res / ss_tot)
717 }
718 }
719
720 pub fn simd_mae<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
724 where
725 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
726 {
727 if y_true.len() != ypred.len() {
728 return Err(MetricsError::InvalidInput(
729 "Arrays must have same length".to_string(),
730 ));
731 }
732
733 let diff = F::simd_sub(&y_true.view(), &ypred.view());
734 let abs_diff = F::simd_abs(&diff.view());
735 let sum = F::simd_sum(&abs_diff.view());
736 Ok(sum / F::from(y_true.len()).unwrap())
737 }
738
739 pub fn simd_r2_score<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
741 where
742 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
743 {
744 if y_true.len() != ypred.len() {
745 return Err(MetricsError::InvalidInput(
746 "Arrays must have same length".to_string(),
747 ));
748 }
749
750 let mean_true = F::simd_sum(&y_true.view()) / F::from(y_true.len()).unwrap();
752
753 let mean_array = Array1::from_elem(y_true.len(), mean_true);
755
756 let diff_from_mean = F::simd_sub(&y_true.view(), &mean_array.view());
758 let squared_diff_mean = F::simd_mul(&diff_from_mean.view(), &diff_from_mean.view());
759 let ss_tot = F::simd_sum(&squared_diff_mean.view());
760
761 let residuals = F::simd_sub(&y_true.view(), &ypred.view());
763 let squared_residuals = F::simd_mul(&residuals.view(), &residuals.view());
764 let ss_res = F::simd_sum(&squared_residuals.view());
765
766 if ss_tot == F::zero() {
767 Ok(F::zero())
768 } else {
769 Ok(F::one() - ss_res / ss_tot)
770 }
771 }
772
773 fn cpu_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
776 if y_true.len() != ypred.len() {
777 return Err(MetricsError::InvalidInput(
778 "Arrays must have the same length".to_string(),
779 ));
780 }
781
782 let correct = y_true
783 .iter()
784 .zip(ypred.iter())
785 .filter(|(&true_val, &pred_val)| true_val == pred_val)
786 .count();
787
788 Ok(correct as f32 / y_true.len() as f32)
789 }
790
791 fn cpu_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
792 where
793 F: Float + std::iter::Sum,
794 {
795 if y_true.len() != ypred.len() {
796 return Err(MetricsError::InvalidInput(
797 "Arrays must have the same length".to_string(),
798 ));
799 }
800
801 let mse = y_true
802 .iter()
803 .zip(ypred.iter())
804 .map(|(&true_val, &pred_val)| (true_val - pred_val) * (true_val - pred_val))
805 .sum::<F>()
806 / F::from(y_true.len()).unwrap();
807
808 Ok(mse)
809 }
810
811 fn cpu_mae<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
812 where
813 F: Float + std::iter::Sum,
814 {
815 if y_true.len() != ypred.len() {
816 return Err(MetricsError::InvalidInput(
817 "Arrays must have the same length".to_string(),
818 ));
819 }
820
821 let mae = y_true
822 .iter()
823 .zip(ypred.iter())
824 .map(|(&true_val, &pred_val)| (true_val - pred_val).abs())
825 .sum::<F>()
826 / F::from(y_true.len()).unwrap();
827
828 Ok(mae)
829 }
830
831 fn cpu_r2_score<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
832 where
833 F: Float + std::iter::Sum,
834 {
835 if y_true.len() != ypred.len() {
836 return Err(MetricsError::InvalidInput(
837 "Arrays must have the same length".to_string(),
838 ));
839 }
840
841 let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
842
843 let ss_tot = y_true
844 .iter()
845 .map(|&t| (t - mean_true) * (t - mean_true))
846 .sum::<F>();
847
848 let ss_res = y_true
849 .iter()
850 .zip(ypred.iter())
851 .map(|(&t, &p)| (t - p) * (t - p))
852 .sum::<F>();
853
854 if ss_tot == F::zero() {
855 Ok(F::zero())
856 } else {
857 Ok(F::one() - ss_res / ss_tot)
858 }
859 }
860
861 fn cpu_confusion_matrix(
862 &self,
863 y_true: &Array1<i32>,
864 ypred: &Array1<i32>,
865 num_classes: usize,
866 ) -> Result<Array2<i32>> {
867 if y_true.len() != ypred.len() {
868 return Err(MetricsError::InvalidInput(
869 "Arrays must have the same length".to_string(),
870 ));
871 }
872
873 let mut matrix = Array2::zeros((num_classes, num_classes));
874
875 for (&true_class, &pred_class) in y_true.iter().zip(ypred.iter()) {
876 if true_class >= 0
877 && (true_class as usize) < num_classes
878 && pred_class >= 0
879 && (pred_class as usize) < num_classes
880 {
881 matrix[[true_class as usize, pred_class as usize]] += 1;
882 }
883 }
884
885 Ok(matrix)
886 }
887
888 pub fn benchmark_implementations<F>(
890 &self,
891 y_true: &Array1<F>,
892 ypred: &Array1<F>,
893 iterations: usize,
894 ) -> Result<BenchmarkResults>
895 where
896 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
897 {
898 let mut results = BenchmarkResults::new();
899
900 let start = Instant::now();
902 for _ in 0..iterations {
903 let _ = self.cpu_mse(y_true, ypred)?;
904 }
905 let scalar_time = start.elapsed();
906 results.scalar_time = scalar_time;
907
908 if self.capabilities.simd_available {
910 let start = Instant::now();
911 for _ in 0..iterations {
912 let _ = self.simd_mse(y_true, ypred)?;
913 }
914 let simd_time = start.elapsed();
915 results.simd_time = Some(simd_time);
916 results.simd_speedup =
917 Some(scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64);
918 }
919
920 if self.gpu_info.is_some() {
922 let batch = y_true.view().insert_axis(Axis(0));
923 let batch_pred = ypred.view().insert_axis(Axis(0));
924
925 let start = Instant::now();
926 for _ in 0..iterations {
927 let _ = self.gpu_batch_metrics(batch.view(), batch_pred.view(), &["mse"])?;
928 }
929 let gpu_time = start.elapsed();
930 results.gpu_time = Some(gpu_time);
931 results.gpu_speedup = Some(scalar_time.as_nanos() as f64 / gpu_time.as_nanos() as f64);
932 }
933
934 Ok(results)
935 }
936}
937
/// Timings gathered by `benchmark_implementations`.
///
/// Optional fields are `None` when the corresponding backend was not measured.
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    /// Elapsed time of the scalar baseline.
    pub scalar_time: Duration,
    /// Elapsed time of the SIMD variant, if measured.
    pub simd_time: Option<Duration>,
    /// Elapsed time of the GPU variant, if measured.
    pub gpu_time: Option<Duration>,
    /// Scalar time divided by SIMD time, if measured.
    pub simd_speedup: Option<f64>,
    /// Scalar time divided by GPU time, if measured.
    pub gpu_speedup: Option<f64>,
}

impl BenchmarkResults {
    /// Creates an empty result set: zero scalar time, nothing else measured.
    pub fn new() -> Self {
        BenchmarkResults {
            scalar_time: Duration::ZERO,
            simd_time: None,
            simd_speedup: None,
            gpu_time: None,
            gpu_speedup: None,
        }
    }

    /// Names the fastest measured backend: `"GPU"`, `"SIMD"` or `"Scalar"`.
    ///
    /// A backend must be strictly faster to win; an unmeasured backend never
    /// wins. On a tie between GPU and SIMD (both faster than scalar) the
    /// answer is `"SIMD"`; on a tie with the scalar time it is `"Scalar"`.
    pub fn best_implementation(&self) -> &'static str {
        let scalar = self.scalar_time;

        let gpu_is_fastest = match self.gpu_time {
            Some(gpu) => gpu < scalar && self.simd_time.map_or(true, |simd| gpu < simd),
            None => false,
        };
        if gpu_is_fastest {
            return "GPU";
        }

        let simd_beats_scalar = self.simd_time.map_or(false, |simd| simd < scalar);
        if simd_beats_scalar {
            "SIMD"
        } else {
            "Scalar"
        }
    }
}

impl Default for BenchmarkResults {
    /// Same as [`BenchmarkResults::new`].
    fn default() -> Self {
        BenchmarkResults::new()
    }
}
979
980pub struct GpuMetricsComputerBuilder {
982 config: GpuAccelConfig,
983}
984
985impl GpuMetricsComputerBuilder {
986 pub fn new() -> Self {
988 Self {
989 config: GpuAccelConfig::default(),
990 }
991 }
992
993 pub fn with_min_batch_size(mut self, size: usize) -> Self {
995 self.config.min_batch_size = size;
996 self
997 }
998
999 pub fn with_max_gpu_memory(mut self, bytes: usize) -> Self {
1001 self.config.max_gpu_memory = bytes;
1002 self
1003 }
1004
1005 pub fn with_device_index(mut self, index: Option<usize>) -> Self {
1007 self.config.device_index = index;
1008 self
1009 }
1010
1011 pub fn with_memory_pool(mut self, enable: bool) -> Self {
1013 self.config.enable_memory_pool = enable;
1014 self
1015 }
1016
1017 pub fn with_optimization_level(mut self, level: u8) -> Self {
1019 self.config.optimization_level = level;
1020 self
1021 }
1022
1023 pub fn build(self) -> Result<GpuMetricsComputer> {
1025 GpuMetricsComputer::new(self.config)
1026 }
1027}
1028
1029impl Default for GpuMetricsComputerBuilder {
1030 fn default() -> Self {
1031 Self::new()
1032 }
1033}
1034
/// Coordinates metric computation across a list of GPU devices.
///
/// Bundles the device list with the helpers used to split work, track memory,
/// record timings and check device health.
pub struct AdvancedGpuOrchestrator {
    /// Devices that work is distributed over.
    pub devices: Vec<GpuInfo>,
    /// Decides which row range each device receives.
    pub load_balancer: LoadBalancer,
    /// Per-device memory bookkeeping.
    pub memory_manager: GpuMemoryManager,
    /// Shared with worker threads, which record their execution times in it.
    pub performance_monitor: Arc<PerformanceMonitor>,
    /// Consulted by `health_check` for each device.
    pub fault_manager: FaultToleranceManager,
}
1048
/// Work-distribution policy selected for a [`LoadBalancer`].
///
/// In the visible code every variant ends up delegating to the same
/// even-split routine, so the variants currently differ in name only.
#[derive(Debug, Clone)]
pub enum LoadBalancingStrategy {
    /// Even split across devices.
    RoundRobin,
    /// Intended to weight by device performance (currently an even split).
    PerformanceBased,
    /// Intended to weight by device memory (currently an even split).
    MemoryAware,
    /// The strategy the orchestrator constructs by default (currently an even split).
    Dynamic,
}
1061
/// Splits a number of work items into contiguous per-device ranges.
#[derive(Debug)]
pub struct LoadBalancer {
    /// Policy chosen at construction.
    strategy: LoadBalancingStrategy,
    /// Per-device performance figures, keyed by device index.
    /// NOTE(review): initialised empty and not read by the visible code.
    device_performance: HashMap<usize, f64>,
    /// Per-device memory usage figures, keyed by device index.
    /// NOTE(review): initialised empty and not read by the visible code.
    device_memory_usage: HashMap<usize, f64>,
    /// Starts at 0.
    /// NOTE(review): not read or advanced by the visible code.
    current_index: usize,
}
1070
/// Per-device memory bookkeeping used by the orchestrator.
#[derive(Debug)]
pub struct GpuMemoryManager {
    /// One memory pool per device index.
    device_pools: HashMap<usize, MemoryPool>,
    /// Allocated amount per device index.
    /// NOTE(review): units are not established by the visible code — presumably bytes.
    allocated_memory: HashMap<usize, usize>,
    /// Allocation policy chosen at construction.
    allocation_strategy: MemoryAllocationStrategy,
}
1081
/// Allocation policy selected for a [`GpuMemoryManager`].
///
/// NOTE(review): the visible code only stores the chosen variant; how each
/// policy behaves is not shown here.
#[derive(Debug, Clone)]
pub enum MemoryAllocationStrategy {
    /// Presumably first-fit allocation.
    FirstFit,
    /// Presumably best-fit allocation.
    BestFit,
    /// Presumably buddy-system allocation.
    BuddySystem,
    /// The policy the orchestrator constructs by default.
    PoolBased,
}
1094
/// Memory pool for a single device.
///
/// NOTE(review): only the constructor call (`MemoryPool::new(available_memory)`)
/// is visible here; field semantics below are inferred from names.
#[derive(Debug)]
pub struct MemoryPool {
    /// Blocks currently free.
    available_blocks: Vec<MemoryBlock>,
    /// Blocks currently handed out.
    allocated_blocks: Vec<MemoryBlock>,
    /// Pool capacity (presumably bytes).
    totalsize: usize,
    /// Remaining capacity (presumably bytes).
    available_size: usize,
}
1107
/// One block tracked by a [`MemoryPool`].
#[derive(Debug, Clone)]
pub struct MemoryBlock {
    /// Block address.
    /// NOTE(review): address space (device vs. pool offset) is not established here.
    pub address: usize,
    /// Block size (presumably bytes).
    pub size: usize,
    /// Time stamp recorded for the allocation.
    pub allocated_at: Instant,
}
1118
/// Per-device performance history, keyed by device index.
///
/// The orchestrator shares one instance with its worker threads through an
/// `Arc` and calls `record_execution_time` and `get_statistics` on it.
#[derive(Debug)]
pub struct PerformanceMonitor {
    /// Execution durations recorded per device.
    execution_times: HashMap<usize, Vec<Duration>>,
    /// Time-stamped memory usage samples per device.
    memory_usage_history: HashMap<usize, Vec<(Instant, usize)>>,
    /// Time-stamped throughput samples per device.
    throughput_history: HashMap<usize, Vec<(Instant, f64)>>,
    /// Error counters per device.
    error_counts: HashMap<usize, usize>,
}
1131
/// Fault-handling state used by the orchestrator's `health_check`.
#[derive(Debug)]
pub struct FaultToleranceManager {
    /// Circuit-breaker state per device index.
    circuit_breakers: HashMap<usize, CircuitBreakerState>,
    /// Retry settings.
    retry_policy: RetryPolicy,
    /// Interval between health checks.
    /// NOTE(review): not read by the code visible in this part of the file.
    health_check_interval: Duration,
}
1142
/// State of a per-device circuit breaker.
///
/// NOTE(review): transitions are not visible here; the descriptions follow the
/// conventional meaning of these names and should be confirmed.
#[derive(Debug, Clone)]
pub enum CircuitBreakerState {
    /// Presumably normal operation.
    Closed,
    /// Presumably tripped; carries a time stamp (likely when it opened).
    Open(Instant),
    /// Presumably probing whether the device has recovered.
    HalfOpen,
}
1150
/// Retry settings held by [`FaultToleranceManager`].
///
/// NOTE(review): no visible code consumes these values; the field descriptions
/// are inferred from names.
#[derive(Debug, Clone)]
pub struct RetryPolicy {
    /// Upper bound on retry attempts.
    pub max_retries: usize,
    /// Delay before the first retry.
    pub base_delay: Duration,
    /// Cap on the delay between retries.
    pub max_delay: Duration,
    /// Factor applied to the delay between successive retries.
    pub backoff_multiplier: f64,
}
1159
1160impl AdvancedGpuOrchestrator {
1161 pub fn new() -> Result<Self> {
1163 let devices = Self::discover_devices()?;
1164 let load_balancer = LoadBalancer::new(LoadBalancingStrategy::Dynamic);
1165 let memory_manager = GpuMemoryManager::new(MemoryAllocationStrategy::PoolBased);
1166 let performance_monitor = Arc::new(PerformanceMonitor::new());
1167 let fault_manager = FaultToleranceManager::new();
1168
1169 Ok(Self {
1170 devices,
1171 load_balancer,
1172 memory_manager,
1173 performance_monitor,
1174 fault_manager,
1175 })
1176 }
1177
1178 fn discover_devices() -> Result<Vec<GpuInfo>> {
1180 Ok(vec![GpuInfo {
1183 device_name: "Mock GPU Device".to_string(),
1184 compute_capability: (8, 6),
1185 total_memory: 8 * 1024 * 1024 * 1024, available_memory: 7 * 1024 * 1024 * 1024, multiprocessor_count: 68,
1188 max_threads_per_block: 1024,
1189 supports_double_precision: true,
1190 }])
1191 }
1192
1193 pub fn compute_metrics_distributed<F>(
1195 &mut self,
1196 y_true_batch: ArrayView2<F>,
1197 y_pred_batch: ArrayView2<F>,
1198 metrics: &[&str],
1199 ) -> Result<Vec<HashMap<String, F>>>
1200 where
1201 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum + 'static,
1202 {
1203 let batch_size = y_true_batch.nrows();
1204 let work_distribution = self
1205 .load_balancer
1206 .distribute_work(batch_size, &self.devices);
1207
1208 let mut tasks: Vec<std::thread::JoinHandle<Result<Vec<HashMap<String, F>>>>> = Vec::new();
1209
1210 for (deviceid, (start_idx, end_idx)) in work_distribution {
1211 let y_true_slice = y_true_batch
1212 .slice(scirs2_core::ndarray::s![start_idx..end_idx, ..])
1213 .to_owned();
1214 let y_pred_slice = y_pred_batch
1215 .slice(scirs2_core::ndarray::s![start_idx..end_idx, ..])
1216 .to_owned();
1217
1218 let metrics_clone: Vec<String> = metrics.iter().map(|&s| s.to_string()).collect();
1220 let performance_monitor = Arc::clone(&self.performance_monitor);
1221
1222 let task = std::thread::spawn(move || {
1224 let start_time = Instant::now();
1225
1226 let metrics_refs: Vec<&str> = metrics_clone.iter().map(|s| s.as_str()).collect();
1228 let result =
1229 Self::compute_on_device(deviceid, y_true_slice, y_pred_slice, &metrics_refs);
1230
1231 let execution_time = start_time.elapsed();
1232 performance_monitor.record_execution_time(deviceid, execution_time);
1233
1234 result
1235 });
1236
1237 tasks.push(task);
1238 }
1239
1240 let mut all_results = Vec::new();
1242 for task in tasks {
1243 let device_results = task.join().map_err(|e| {
1244 MetricsError::ComputationError(format!("GPU task failed: {:?}", e))
1245 })??;
1246 all_results.extend(device_results);
1247 }
1248
1249 Ok(all_results)
1250 }
1251
1252 fn compute_on_device<F>(
1254 _device_id: usize,
1255 y_true: Array2<F>,
1256 ypred: Array2<F>,
1257 metrics: &[&str],
1258 ) -> Result<Vec<HashMap<String, F>>>
1259 where
1260 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
1261 {
1262 let batch_size = y_true.nrows();
1264 let mut results = Vec::with_capacity(batch_size);
1265
1266 std::thread::sleep(std::time::Duration::from_micros(10));
1268
1269 for i in 0..batch_size {
1273 let mut sample_metrics = HashMap::new();
1274
1275 for &metric in metrics {
1276 let value = match metric {
1277 "mse" => {
1278 let y_t = y_true.row(i);
1279 let y_p = ypred.row(i);
1280 let diff = &y_t - &y_p;
1281 let squared_diff = diff.mapv(|x| x * x);
1282 squared_diff.sum() / F::from(y_t.len()).unwrap()
1283 }
1284 "mae" => {
1285 let y_t = y_true.row(i);
1286 let y_p = ypred.row(i);
1287 let diff = &y_t - &y_p;
1288 let abs_diff = diff.mapv(|x| x.abs());
1289 abs_diff.sum() / F::from(y_t.len()).unwrap()
1290 }
1291 _ => F::zero(),
1292 };
1293
1294 sample_metrics.insert(metric.to_string(), value);
1295 }
1296
1297 results.push(sample_metrics);
1298 }
1299
1300 std::thread::sleep(std::time::Duration::from_millis(1));
1302
1303 Ok(results)
1304 }
1305
1306 pub fn get_performance_stats(&self) -> HashMap<String, f64> {
1308 self.performance_monitor.get_statistics()
1309 }
1310
1311 pub fn optimize_memory_allocation(&mut self) -> Result<()> {
1313 self.memory_manager.optimize_allocation(&self.devices)
1314 }
1315
1316 pub fn health_check(&mut self) -> Result<Vec<(usize, bool)>> {
1318 let mut health_status = Vec::new();
1319
1320 for (idx, device) in self.devices.iter().enumerate() {
1321 let is_healthy = self.fault_manager.check_device_health(idx, device)?;
1322 health_status.push((idx, is_healthy));
1323 }
1324
1325 Ok(health_status)
1326 }
1327}
1328
1329impl LoadBalancer {
1330 fn new(strategy: LoadBalancingStrategy) -> Self {
1331 Self {
1332 strategy,
1333 device_performance: HashMap::new(),
1334 device_memory_usage: HashMap::new(),
1335 current_index: 0,
1336 }
1337 }
1338
1339 fn distribute_work(
1340 &mut self,
1341 total_work: usize,
1342 devices: &[GpuInfo],
1343 ) -> Vec<(usize, (usize, usize))> {
1344 match self.strategy {
1345 LoadBalancingStrategy::RoundRobin => self.round_robin_distribution(total_work, devices),
1346 LoadBalancingStrategy::PerformanceBased => {
1347 self.performance_based_distribution(total_work, devices)
1348 }
1349 LoadBalancingStrategy::MemoryAware => {
1350 self.memory_aware_distribution(total_work, devices)
1351 }
1352 LoadBalancingStrategy::Dynamic => self.dynamic_distribution(total_work, devices),
1353 }
1354 }
1355
1356 fn performance_based_distribution(
1357 &self,
1358 total_work: usize,
1359 devices: &[GpuInfo],
1360 ) -> Vec<(usize, (usize, usize))> {
1361 self.round_robin_distribution(total_work, devices)
1364 }
1365
1366 fn memory_aware_distribution(
1367 &self,
1368 total_work: usize,
1369 devices: &[GpuInfo],
1370 ) -> Vec<(usize, (usize, usize))> {
1371 self.round_robin_distribution(total_work, devices)
1374 }
1375
1376 fn dynamic_distribution(
1377 &mut self,
1378 total_work: usize,
1379 devices: &[GpuInfo],
1380 ) -> Vec<(usize, (usize, usize))> {
1381 self.round_robin_distribution(total_work, devices)
1383 }
1384
1385 #[allow(dead_code)]
1387 fn round_robin_distribution(
1388 &self,
1389 total_work: usize,
1390 devices: &[GpuInfo],
1391 ) -> Vec<(usize, (usize, usize))> {
1392 let num_devices = devices.len();
1393 let work_per_device = total_work / num_devices;
1394 let remainder = total_work % num_devices;
1395
1396 let mut distribution = Vec::new();
1397 let mut current_start = 0;
1398
1399 for (idx, device) in devices.iter().enumerate() {
1400 let work_size = work_per_device + if idx < remainder { 1 } else { 0 };
1401 let end = current_start + work_size;
1402 distribution.push((idx, (current_start, end)));
1403 current_start = end;
1404 }
1405
1406 distribution
1407 }
1408}
1409
1410impl GpuMemoryManager {
1411 fn new(strategy: MemoryAllocationStrategy) -> Self {
1412 Self {
1413 device_pools: HashMap::new(),
1414 allocated_memory: HashMap::new(),
1415 allocation_strategy: strategy,
1416 }
1417 }
1418
1419 fn optimize_allocation(&mut self, devices: &[GpuInfo]) -> Result<()> {
1420 for (idx, device) in devices.iter().enumerate() {
1421 if !self.device_pools.contains_key(&idx) {
1422 let pool = MemoryPool::new(device.available_memory);
1423 self.device_pools.insert(idx, pool);
1424 self.allocated_memory.insert(idx, 0);
1425 }
1426 }
1427 Ok(())
1428 }
1429}
1430
1431impl MemoryPool {
1432 fn new(totalsize: usize) -> Self {
1433 Self {
1434 available_blocks: vec![MemoryBlock {
1435 address: 0,
1436 size: totalsize,
1437 allocated_at: Instant::now(),
1438 }],
1439 allocated_blocks: Vec::new(),
1440 totalsize,
1441 available_size: totalsize,
1442 }
1443 }
1444}
1445
1446impl PerformanceMonitor {
1447 fn new() -> Self {
1448 Self {
1449 execution_times: HashMap::new(),
1450 memory_usage_history: HashMap::new(),
1451 throughput_history: HashMap::new(),
1452 error_counts: HashMap::new(),
1453 }
1454 }
1455
1456 fn record_execution_time(&self, deviceid: usize, duration: Duration) {
1457 let throughput = 1000.0 / duration.as_millis() as f64; println!(
1466 "GPU Device {}: Execution, time: {:?}, Throughput: {:.2} ops/sec",
1467 deviceid, duration, throughput
1468 );
1469
1470 }
1473
1474 fn get_statistics(&self) -> HashMap<String, f64> {
1475 let mut stats = HashMap::new();
1476 stats.insert(
1477 "total_devices".to_string(),
1478 self.execution_times.len() as f64,
1479 );
1480 stats.insert(
1481 "total_executions".to_string(),
1482 self.execution_times
1483 .values()
1484 .map(|v| v.len())
1485 .sum::<usize>() as f64,
1486 );
1487 stats
1488 }
1489}
1490
1491impl FaultToleranceManager {
1492 fn new() -> Self {
1493 Self {
1494 circuit_breakers: HashMap::new(),
1495 retry_policy: RetryPolicy {
1496 max_retries: 3,
1497 base_delay: Duration::from_millis(100),
1498 max_delay: Duration::from_secs(5),
1499 backoff_multiplier: 2.0,
1500 },
1501 health_check_interval: Duration::from_secs(30),
1502 }
1503 }
1504
1505 fn check_device_health(&self, deviceid: usize, device: &GpuInfo) -> Result<bool> {
1506 if device.available_memory == 0 {
1510 eprintln!("GPU Device {}: No available memory", deviceid);
1511 return Ok(false);
1512 }
1513
1514 let memory_usage_ratio =
1516 1.0 - (device.available_memory as f64 / device.total_memory as f64);
1517 if memory_usage_ratio > 0.9 {
1518 eprintln!(
1519 "GPU Device {}: Memory usage too high: {:.1}%",
1520 deviceid,
1521 memory_usage_ratio * 100.0
1522 );
1523 return Ok(false);
1524 }
1525
1526 let test_result = self.execute_health_test_kernel(deviceid, device);
1528 if !test_result {
1529 eprintln!("GPU Device {}: Health test kernel failed", deviceid);
1530 return Ok(false);
1531 }
1532
1533 if device.compute_capability.0 < 3 {
1535 eprintln!(
1537 "GPU Device {}: Compute capability too old: {}.{}",
1538 deviceid, device.compute_capability.0, device.compute_capability.1
1539 );
1540 return Ok(false);
1541 }
1542
1543 if device.device_name.contains("NVIDIA") || device.device_name.contains("CUDA") {
1545 if let Ok(output) = std::process::Command::new("nvidia-smi")
1546 .arg("--query-gpu=temperature.gpu,power.draw,power.limit")
1547 .arg("--format=csv,noheader,nounits")
1548 .arg(format!("--_id={}", deviceid))
1549 .output()
1550 {
1551 if output.status.success() {
1552 let output_str = String::from_utf8_lossy(&output.stdout);
1553 if let Some(line) = output_str.lines().next() {
1554 let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
1555 if parts.len() >= 3 {
1556 if let Ok(temp) = parts[0].parse::<u32>() {
1558 if temp > 85 {
1559 eprintln!(
1560 "GPU Device {}: Temperature too high: {}°C",
1561 deviceid, temp
1562 );
1563 return Ok(false);
1564 }
1565 }
1566
1567 if let (Ok(power_draw), Ok(power_limit)) =
1569 (parts[1].parse::<f32>(), parts[2].parse::<f32>())
1570 {
1571 if power_draw > power_limit * 0.95 {
1572 eprintln!("GPU Device {}: Power consumption near limit: {:.1}W/{:.1}W",
1573 deviceid, power_draw, power_limit);
1574 }
1576 }
1577 }
1578 }
1579 }
1580 }
1581 }
1582
1583 Ok(true)
1585 }
1586
1587 fn execute_health_test_kernel(&self, deviceid: usize, device: &GpuInfo) -> bool {
1589 let start_time = std::time::Instant::now();
1593
1594 let test_memory_size = std::cmp::min(device.available_memory / 1000, 1024 * 1024); let computation_time = match device.compute_capability.0 {
1599 8..=9 => std::time::Duration::from_micros(100), 7 => std::time::Duration::from_micros(200), 6 => std::time::Duration::from_micros(500), _ => std::time::Duration::from_millis(1), };
1604
1605 std::thread::sleep(computation_time);
1606
1607 let execution_time = start_time.elapsed();
1608
1609 let max_allowed_time = std::time::Duration::from_millis(10);
1611 let test_passed = execution_time < max_allowed_time && test_memory_size > 0;
1612
1613 if !test_passed {
1614 eprintln!(
1615 "GPU Device {}: Health test failed - execution time: {:?}, memory size: {}",
1616 deviceid, execution_time, test_memory_size
1617 );
1618 }
1619
1620 test_passed
1621 }
1622}
1623
1624impl Default for AdvancedGpuOrchestrator {
1625 fn default() -> Self {
1626 Self::new().unwrap_or_else(|_| {
1627 Self {
1629 devices: Vec::new(),
1630 load_balancer: LoadBalancer::new(LoadBalancingStrategy::RoundRobin),
1631 memory_manager: GpuMemoryManager::new(MemoryAllocationStrategy::FirstFit),
1632 performance_monitor: Arc::new(PerformanceMonitor::new()),
1633 fault_manager: FaultToleranceManager::new(),
1634 }
1635 })
1636 }
1637}
1638
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Construction with the default configuration succeeds and the GPU
    /// availability query can be called.
    #[test]
    #[ignore = "GPU availability varies by environment"]
    fn test_gpu_metrics_computer_creation() {
        let metrics = GpuMetricsComputer::new(GpuAccelConfig::default()).unwrap();
        // Only the call itself is exercised; the answer depends on the host.
        let _ = metrics.is_gpu_available();
    }

    /// Every builder setter is reflected in the resulting configuration.
    #[test]
    fn test_gpu_metrics_computer_builder() {
        let built = GpuMetricsComputerBuilder::new()
            .with_min_batch_size(500)
            .with_max_gpu_memory(512 * 1024 * 1024)
            .with_device_index(Some(0))
            .with_memory_pool(true)
            .with_optimization_level(3)
            .build()
            .unwrap();

        let cfg = &built.config;
        assert_eq!(cfg.min_batch_size, 500);
        assert_eq!(cfg.max_gpu_memory, 512 * 1024 * 1024);
        assert_eq!(cfg.device_index, Some(0));
        assert!(cfg.enable_memory_pool);
        assert_eq!(cfg.optimization_level, 3);
    }

    /// With the default configuration, a batch of 500 stays off the GPU and a
    /// batch of 1500 goes to it (requires a GPU, hence ignored by default).
    #[test]
    #[ignore = "GPU availability varies by environment"]
    fn test_should_use_gpu() {
        let metrics = GpuMetricsComputer::new(GpuAccelConfig::default()).unwrap();
        assert!(!metrics.should_use_gpu(500));
        assert!(metrics.should_use_gpu(1500));
    }

    /// Three of the six predictions match, so accuracy is 0.5.
    #[test]
    fn test_cpu_accuracy() {
        let metrics = GpuMetricsComputer::new(GpuAccelConfig::default()).unwrap();
        let labels = array![0, 1, 2, 0, 1, 2];
        let predictions = array![0, 2, 1, 0, 0, 2];

        let accuracy = metrics.gpu_accuracy(&labels, &predictions).unwrap();
        assert!((accuracy - 0.5).abs() < 1e-6);
    }

    /// Predictions that are each off by about 0.1 give a small positive MSE.
    #[test]
    #[ignore = "timeout"]
    fn test_cpu_mse() {
        let metrics = GpuMetricsComputer::new(GpuAccelConfig::default()).unwrap();
        let targets = array![1.0, 2.0, 3.0, 4.0];
        let predictions = array![1.1, 2.1, 2.9, 4.1];

        let mse = metrics.gpu_mse(&targets, &predictions).unwrap();
        assert!(mse > 0.0 && mse < 0.1);
    }

    /// The confusion matrix is 3x3 and both class-0 samples are predicted as class 0.
    #[test]
    fn test_cpu_confusion_matrix() {
        let metrics = GpuMetricsComputer::new(GpuAccelConfig::default()).unwrap();
        let labels = array![0, 1, 2, 0, 1, 2];
        let predictions = array![0, 2, 1, 0, 0, 2];

        let matrix = metrics
            .gpu_confusion_matrix(&labels, &predictions, 3)
            .unwrap();
        assert_eq!(matrix.shape(), &[3, 3]);
        assert_eq!(matrix[[0, 0]], 2);
    }
}