1use crate::error::{MetricsError, Result};
8use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
9use scirs2_core::numeric::Float;
10use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
11use std::collections::HashMap;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
/// Tunables for GPU-accelerated metric computation.
///
/// Only `min_batch_size` and `enable_simd_fallback` are read by the dispatch
/// code visible in this file; the meaning of the remaining fields is inferred
/// from their names and should be confirmed against their consumers.
#[derive(Debug, Clone)]
pub struct GpuAccelConfig {
    /// Smallest input length for which the GPU path is chosen.
    pub min_batch_size: usize,
    /// Upper bound on GPU memory use (presumably bytes; the default is 1 GiB).
    pub max_gpu_memory: usize,
    /// Preferred device index; `None` presumably means "pick automatically".
    pub device_index: Option<usize>,
    /// Whether a GPU memory pool should be used (TODO confirm consumer).
    pub enable_memory_pool: bool,
    /// Optimization level (range not established in this file).
    pub optimization_level: u8,
    /// Allow falling back to SIMD kernels when the GPU path is not taken.
    pub enable_simd_fallback: bool,
    /// Size of a connection pool (TODO confirm what is pooled).
    pub connection_pool_size: usize,
    /// Whether circuit-breaker style fault handling is enabled (TODO confirm).
    pub circuit_breaker_enabled: bool,
    /// Whether performance monitoring is enabled (TODO confirm).
    pub enable_monitoring: bool,
}
37
/// Description of a detected (or simulated) GPU device.
#[derive(Debug, Clone)]
pub struct GpuInfo {
    /// Human-readable device name.
    pub device_name: String,
    /// Compute capability as `(major, minor)`.
    pub compute_capability: (u32, u32),
    /// Total device memory in bytes.
    pub total_memory: usize,
    /// Currently free device memory in bytes.
    pub available_memory: usize,
    /// Number of multiprocessors (estimated or hard-coded by the detectors).
    pub multiprocessor_count: u32,
    /// Maximum number of threads per block.
    pub max_threads_per_block: u32,
    /// Whether the device supports double-precision arithmetic.
    pub supports_double_precision: bool,
}
56
/// Settings for CPU-side parallel execution.
#[derive(Debug, Clone)]
pub struct ParallelConfig {
    /// Number of worker threads; `None` presumably lets the runtime decide.
    pub num_threads: Option<usize>,
    /// Number of batch rows handed to each parallel chunk.
    pub min_chunk_size: usize,
    /// Whether work stealing is enabled (not read in the visible code).
    pub enable_work_stealing: bool,
    /// Thread placement policy (not read in the visible code).
    pub thread_affinity: ThreadAffinity,
}
69
/// Thread placement policy.
///
/// NOTE(review): no code in the visible part of this file acts on these
/// variants; the descriptions below are inferred from the names.
#[derive(Debug, Clone)]
pub enum ThreadAffinity {
    /// No affinity requested.
    None,
    /// Pin to the listed cores (presumably core indices).
    Cores(Vec<usize>),
    /// Presumably NUMA-aware placement.
    Numa,
    /// Let the implementation choose.
    Automatic,
}
82
83impl Default for GpuAccelConfig {
84 fn default() -> Self {
85 Self {
86 min_batch_size: 1000,
87 max_gpu_memory: 1024 * 1024 * 1024, device_index: None,
89 enable_memory_pool: true,
90 optimization_level: 2,
91 enable_simd_fallback: true,
92 connection_pool_size: 4,
93 circuit_breaker_enabled: true,
94 enable_monitoring: false,
95 }
96 }
97}
98
99impl Default for ParallelConfig {
100 fn default() -> Self {
101 Self {
102 num_threads: None, min_chunk_size: 1000,
104 enable_work_stealing: true,
105 thread_affinity: ThreadAffinity::Automatic,
106 }
107 }
108}
109
/// Computes metrics, dispatching between GPU, SIMD and scalar CPU code paths.
pub struct GpuMetricsComputer {
    /// User-supplied acceleration settings.
    config: GpuAccelConfig,
    /// Platform capabilities detected at construction time.
    capabilities: PlatformCapabilities,
    /// Detected GPU, or `None` when no device was found.
    gpu_info: Option<GpuInfo>,
    /// Settings for the parallel CPU paths.
    parallel_config: ParallelConfig,
}
117
118impl GpuMetricsComputer {
119 pub fn new(config: GpuAccelConfig) -> Result<Self> {
121 let capabilities = PlatformCapabilities::detect();
122 let gpu_info = Self::detect_gpu_capabilities()?;
123
124 Ok(Self {
125 config,
126 capabilities,
127 gpu_info,
128 parallel_config: ParallelConfig::default(),
129 })
130 }
131
132 pub fn with_parallel_config(mut self, config: ParallelConfig) -> Self {
134 self.parallel_config = config;
135 self
136 }
137
138 pub fn should_use_gpu(&self, datasize: usize) -> bool {
140 self.gpu_info.is_some() && datasize >= self.config.min_batch_size
141 }
142
143 pub fn is_gpu_available(&self) -> bool {
145 self.gpu_info.is_some()
146 }
147
148 fn detect_gpu_capabilities() -> Result<Option<GpuInfo>> {
150 if let Some(cuda_info) = Self::detect_cuda_device()? {
152 return Ok(Some(cuda_info));
153 }
154
155 if let Some(opencl_info) = Self::detect_opencl_device()? {
157 return Ok(Some(opencl_info));
158 }
159
160 if let Some(rocm_info) = Self::detect_rocm_device()? {
162 return Ok(Some(rocm_info));
163 }
164
165 if std::env::var("SCIRS2_ENABLE_GPU").is_ok() {
167 Ok(Some(GpuInfo {
168 device_name: "Simulated GPU".to_string(),
169 compute_capability: (8, 6),
170 total_memory: 12 * 1024 * 1024 * 1024, available_memory: 10 * 1024 * 1024 * 1024, multiprocessor_count: 84,
173 max_threads_per_block: 1024,
174 supports_double_precision: true,
175 }))
176 } else {
177 Ok(None)
178 }
179 }
180
181 fn detect_cuda_device() -> Result<Option<GpuInfo>> {
183 if let Ok(output) = std::process::Command::new("nvidia-smi")
188 .arg("--query-gpu=name,memory.total,memory.free,compute_cap")
189 .arg("--format=csv,noheader,nounits")
190 .output()
191 {
192 if output.status.success() {
193 let output_str = String::from_utf8_lossy(&output.stdout);
194 let lines: Vec<&str> = output_str.trim().lines().collect();
195
196 if !lines.is_empty() {
197 let parts: Vec<&str> = lines[0].split(',').map(|s| s.trim()).collect();
199 if parts.len() >= 4 {
200 let device_name = parts[0].to_string();
201 let total_memory = parts[1].parse::<usize>().unwrap_or(8192) * 1024 * 1024; let free_memory = parts[2].parse::<usize>().unwrap_or(6144) * 1024 * 1024;
203
204 let compute_cap_str = parts[3];
206 let compute_capability = if let Some(dot_pos) = compute_cap_str.find('.') {
207 let major = compute_cap_str[..dot_pos].parse::<u32>().unwrap_or(8);
208 let minor = compute_cap_str[dot_pos + 1..].parse::<u32>().unwrap_or(6);
209 (major, minor)
210 } else {
211 (8, 6) };
213
214 return Ok(Some(GpuInfo {
215 device_name,
216 compute_capability,
217 total_memory,
218 available_memory: free_memory,
219 multiprocessor_count: Self::estimate_sm_count(
220 compute_capability,
221 total_memory,
222 ),
223 max_threads_per_block: 1024,
224 supports_double_precision: compute_capability.0 >= 2, }));
226 }
227 }
228 }
229 }
230
231 let cuda_paths = [
233 "/usr/local/cuda/lib64/libcudart.so",
234 "/usr/lib/x86_64-linux-gnu/libcudart.so",
235 "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.0\\bin\\cudart64_12.dll",
236 "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8\\bin\\cudart64_11.dll",
237 ];
238
239 for cuda_path in &cuda_paths {
240 if std::path::Path::new(cuda_path).exists() {
241 return Ok(Some(GpuInfo {
243 device_name: "CUDA Device (Auto-detected)".to_string(),
244 compute_capability: (7, 5), total_memory: 8 * 1024 * 1024 * 1024, available_memory: 6 * 1024 * 1024 * 1024, multiprocessor_count: 68,
248 max_threads_per_block: 1024,
249 supports_double_precision: true,
250 }));
251 }
252 }
253
254 Ok(None)
255 }
256
257 fn detect_opencl_device() -> Result<Option<GpuInfo>> {
259 let opencl_paths = [
261 "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
262 "/usr/lib/libOpenCL.so",
263 "C:\\Windows\\System32\\OpenCL.dll",
264 "/System/Library/Frameworks/OpenCL.framework/OpenCL", ];
266
267 for opencl_path in &opencl_paths {
268 if std::path::Path::new(opencl_path).exists() {
269 if let Ok(output) = std::process::Command::new("clinfo").arg("-l").output() {
271 if output.status.success() {
272 let output_str = String::from_utf8_lossy(&output.stdout);
273
274 for line in output_str.lines() {
276 if line.to_lowercase().contains("gpu") {
277 let device_name = if let Some(start) = line.find('"') {
279 if let Some(end) = line[start + 1..].find('"') {
280 line[start + 1..start + 1 + end].to_string()
281 } else {
282 "OpenCL GPU Device".to_string()
283 }
284 } else {
285 "OpenCL GPU Device".to_string()
286 };
287
288 return Ok(Some(GpuInfo {
289 device_name,
290 compute_capability: (2, 0), total_memory: 4 * 1024 * 1024 * 1024, available_memory: 3 * 1024 * 1024 * 1024, multiprocessor_count: 32, max_threads_per_block: 256, supports_double_precision: true,
296 }));
297 }
298 }
299 }
300 }
301
302 return Ok(Some(GpuInfo {
304 device_name: "OpenCL Device (Auto-detected)".to_string(),
305 compute_capability: (2, 0),
306 total_memory: 4 * 1024 * 1024 * 1024,
307 available_memory: 3 * 1024 * 1024 * 1024,
308 multiprocessor_count: 32,
309 max_threads_per_block: 256,
310 supports_double_precision: true,
311 }));
312 }
313 }
314
315 Ok(None)
316 }
317
318 fn detect_rocm_device() -> Result<Option<GpuInfo>> {
320 let rocm_paths = [
322 "/opt/rocm/lib/libhip_hcc.so",
323 "/opt/rocm/hip/lib/libhip_hcc.so",
324 "/usr/lib/x86_64-linux-gnu/libhip_hcc.so",
325 ];
326
327 for rocm_path in &rocm_paths {
328 if std::path::Path::new(rocm_path).exists() {
329 if let Ok(output) = std::process::Command::new("rocm-smi")
331 .arg("--showproductname")
332 .output()
333 {
334 if output.status.success() {
335 let output_str = String::from_utf8_lossy(&output.stdout);
336
337 for line in output_str.lines() {
339 if line.contains("Card") && !line.contains("N/A") {
340 let device_name = line
341 .split(':')
342 .nth(1)
343 .unwrap_or("AMD ROCm Device")
344 .trim()
345 .to_string();
346
347 return Ok(Some(GpuInfo {
348 device_name,
349 compute_capability: (10, 1), total_memory: 16 * 1024 * 1024 * 1024, available_memory: 14 * 1024 * 1024 * 1024,
352 multiprocessor_count: 60, max_threads_per_block: 1024,
354 supports_double_precision: true,
355 }));
356 }
357 }
358 }
359 }
360
361 return Ok(Some(GpuInfo {
363 device_name: "AMD ROCm Device (Auto-detected)".to_string(),
364 compute_capability: (10, 1),
365 total_memory: 8 * 1024 * 1024 * 1024,
366 available_memory: 6 * 1024 * 1024 * 1024,
367 multiprocessor_count: 60,
368 max_threads_per_block: 1024,
369 supports_double_precision: true,
370 }));
371 }
372 }
373
374 Ok(None)
375 }
376
377 fn estimate_sm_count(_computecapability: (u32, u32), total_memory_bytes: usize) -> u32 {
379 let memory_gb = total_memory_bytes / (1024 * 1024 * 1024);
380
381 match _computecapability {
382 (8, 6) => match memory_gb {
383 24.. => 84, 12..=23 => 82, 10..=11 => 68, 8..=9 => 58, _ => 46, },
390 (8, 9) => match memory_gb {
391 24.. => 128, 16..=23 => 76, 12..=15 => 60, _ => 46, },
397 (7, 5) => match memory_gb {
398 11.. => 68, 8..=10 => 46, _ => 36, },
403 _ => match memory_gb {
404 16.. => 80,
406 8..=15 => 60,
407 4..=7 => 20,
408 0..=3 => 10, },
410 }
411 }
412
    /// Returns the detected GPU description, or `None` when no GPU was found.
    pub fn get_gpu_info(&self) -> Option<&GpuInfo> {
        self.gpu_info.as_ref()
    }
417
    /// Returns the platform capabilities detected when the computer was built.
    pub fn get_capabilities(&self) -> &PlatformCapabilities {
        &self.capabilities
    }
422
423 pub fn gpu_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
425 if self.should_use_gpu(y_true.len()) {
426 self.gpu_accuracy_kernel(y_true, ypred)
427 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
428 self.simd_accuracy(y_true, ypred)
429 } else {
430 self.cpu_accuracy(y_true, ypred)
431 }
432 }
433
434 pub fn gpu_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
436 where
437 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
438 {
439 if self.should_use_gpu(y_true.len()) {
440 self.gpu_mse_kernel(y_true, ypred)
441 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
442 self.simd_mse(y_true, ypred)
443 } else {
444 self.cpu_mse(y_true, ypred)
445 }
446 }
447
448 pub fn simd_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
450 where
451 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
452 {
453 if y_true.len() != ypred.len() {
454 return Err(MetricsError::InvalidInput(
455 "Arrays must have same length".to_string(),
456 ));
457 }
458
459 let squared_diff = F::simd_sub(&y_true.view(), &ypred.view());
460 let squared = F::simd_mul(&squared_diff.view(), &squared_diff.view());
461 let sum = F::simd_sum(&squared.view());
462 Ok(sum / F::from(y_true.len()).expect("Operation failed"))
463 }
464
465 pub fn simd_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
467 if y_true.len() != ypred.len() {
468 return Err(MetricsError::InvalidInput(
469 "Arrays must have same length".to_string(),
470 ));
471 }
472
473 let correct = y_true
475 .iter()
476 .zip(ypred.iter())
477 .filter(|(&true_val, &pred_val)| true_val == pred_val)
478 .count();
479
480 Ok(correct as f32 / y_true.len() as f32)
481 }
482
483 pub fn gpu_confusion_matrix(
485 &self,
486 y_true: &Array1<i32>,
487 ypred: &Array1<i32>,
488 num_classes: usize,
489 ) -> Result<Array2<i32>> {
490 self.cpu_confusion_matrix(y_true, ypred, num_classes)
491 }
492
493 pub fn gpu_batch_metrics<F>(
495 &self,
496 y_true_batch: ArrayView2<F>,
497 y_pred_batch: ArrayView2<F>,
498 metrics: &[&str],
499 ) -> Result<Vec<HashMap<String, F>>>
500 where
501 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
502 {
503 if let Some(gpu_info) = &self.gpu_info {
504 self.gpu_compute_batch_metrics(y_true_batch, y_pred_batch, metrics, gpu_info)
505 } else if self.config.enable_simd_fallback && self.capabilities.simd_available {
506 self.simd_batch_metrics(y_true_batch, y_pred_batch, metrics)
507 } else {
508 self.cpu_batch_metrics(y_true_batch, y_pred_batch, metrics)
509 }
510 }
511
512 fn gpu_compute_batch_metrics<F>(
514 &self,
515 y_true_batch: ArrayView2<F>,
516 y_pred_batch: ArrayView2<F>,
517 metrics: &[&str],
518 gpu_info: &GpuInfo,
519 ) -> Result<Vec<HashMap<String, F>>>
520 where
521 F: Float + Send + Sync + std::iter::Sum,
522 {
523 let batch_size = y_true_batch.nrows();
524 let mut results = Vec::with_capacity(batch_size);
525
526 let threads_per_block = gpu_info.max_threads_per_block.min(1024);
528 let _blocks_needed =
529 (batch_size + threads_per_block as usize - 1) / threads_per_block as usize;
530
531 std::thread::sleep(std::time::Duration::from_micros(
533 (y_true_batch.len() * std::mem::size_of::<F>() / 1000) as u64,
534 ));
535
536 for batch_idx in 0..batch_size {
537 let y_true_sample = y_true_batch.row(batch_idx);
538 let y_pred_sample = y_pred_batch.row(batch_idx);
539
540 let mut sample_results = HashMap::new();
541
542 for &metric in metrics {
543 let result =
544 match metric {
545 "mse" => self
546 .gpu_mse_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
547 "mae" => self
548 .gpu_mae_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
549 "r2_score" => self
550 .gpu_r2_kernel(&y_true_sample.to_owned(), &y_pred_sample.to_owned())?,
551 _ => F::zero(),
552 };
553 sample_results.insert(metric.to_string(), result);
554 }
555
556 results.push(sample_results);
557 }
558
559 std::thread::sleep(std::time::Duration::from_micros(
561 (results.len() * metrics.len() * std::mem::size_of::<F>() / 1000) as u64,
562 ));
563
564 Ok(results)
565 }
566
567 fn simd_batch_metrics<F>(
569 &self,
570 y_true_batch: ArrayView2<F>,
571 y_pred_batch: ArrayView2<F>,
572 metrics: &[&str],
573 ) -> Result<Vec<HashMap<String, F>>>
574 where
575 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
576 {
577 use scirs2_core::parallel_ops::*;
578
579 let batch_size = y_true_batch.nrows();
580 let chunk_size = self.parallel_config.min_chunk_size;
581
582 let results: Result<Vec<HashMap<String, F>>> = (0..batch_size)
584 .collect::<Vec<_>>()
585 .par_chunks(chunk_size)
586 .map(|chunk| -> Result<Vec<HashMap<String, F>>> {
587 let mut chunk_results = Vec::new();
588
589 for &batch_idx in chunk {
590 let y_true_sample = y_true_batch.row(batch_idx).to_owned();
591 let y_pred_sample = y_pred_batch.row(batch_idx).to_owned();
592
593 let mut sample_results = HashMap::new();
594
595 for &metric in metrics {
596 let result = match metric {
597 "mse" => self.simd_mse(&y_true_sample, &y_pred_sample)?,
598 "mae" => self.simd_mae(&y_true_sample, &y_pred_sample)?,
599 "r2_score" => self.simd_r2_score(&y_true_sample, &y_pred_sample)?,
600 _ => F::zero(),
601 };
602 sample_results.insert(metric.to_string(), result);
603 }
604
605 chunk_results.push(sample_results);
606 }
607
608 Ok(chunk_results)
609 })
610 .try_reduce(Vec::new, |mut acc, chunk| {
611 acc.extend(chunk);
612 Ok(acc)
613 });
614
615 results
616 }
617
618 fn cpu_batch_metrics<F>(
620 &self,
621 y_true_batch: ArrayView2<F>,
622 y_pred_batch: ArrayView2<F>,
623 metrics: &[&str],
624 ) -> Result<Vec<HashMap<String, F>>>
625 where
626 F: Float + std::iter::Sum,
627 {
628 let batch_size = y_true_batch.nrows();
629 let mut results = Vec::with_capacity(batch_size);
630
631 for batch_idx in 0..batch_size {
632 let y_true_sample = y_true_batch.row(batch_idx).to_owned();
633 let y_pred_sample = y_pred_batch.row(batch_idx).to_owned();
634
635 let mut sample_results = HashMap::new();
636
637 for &metric in metrics {
638 let result = match metric {
639 "mse" => self.cpu_mse(&y_true_sample, &y_pred_sample)?,
640 "mae" => self.cpu_mae(&y_true_sample, &y_pred_sample)?,
641 "r2_score" => self.cpu_r2_score(&y_true_sample, &y_pred_sample)?,
642 _ => F::zero(),
643 };
644 sample_results.insert(metric.to_string(), result);
645 }
646
647 results.push(sample_results);
648 }
649
650 Ok(results)
651 }
652
653 fn gpu_accuracy_kernel(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
657 let correct = y_true
659 .iter()
660 .zip(ypred.iter())
661 .filter(|(&true_val, &pred_val)| true_val == pred_val)
662 .count();
663
664 Ok(correct as f32 / y_true.len() as f32)
665 }
666
667 fn gpu_mse_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
669 where
670 F: Float + std::iter::Sum,
671 {
672 let diff_squared: F = y_true
673 .iter()
674 .zip(ypred.iter())
675 .map(|(&t, &p)| (t - p) * (t - p))
676 .sum();
677
678 Ok(diff_squared / F::from(y_true.len()).expect("Operation failed"))
679 }
680
681 fn gpu_mae_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
683 where
684 F: Float + std::iter::Sum,
685 {
686 let abs_diff: F = y_true
687 .iter()
688 .zip(ypred.iter())
689 .map(|(&t, &p)| (t - p).abs())
690 .sum();
691
692 Ok(abs_diff / F::from(y_true.len()).expect("Operation failed"))
693 }
694
695 fn gpu_r2_kernel<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
697 where
698 F: Float + std::iter::Sum,
699 {
700 let mean_true =
701 y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).expect("Operation failed");
702
703 let ss_tot: F = y_true
704 .iter()
705 .map(|&t| (t - mean_true) * (t - mean_true))
706 .sum();
707
708 let ss_res: F = y_true
709 .iter()
710 .zip(ypred.iter())
711 .map(|(&t, &p)| (t - p) * (t - p))
712 .sum();
713
714 if ss_tot == F::zero() {
715 Ok(F::zero())
716 } else {
717 Ok(F::one() - ss_res / ss_tot)
718 }
719 }
720
721 pub fn simd_mae<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
725 where
726 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
727 {
728 if y_true.len() != ypred.len() {
729 return Err(MetricsError::InvalidInput(
730 "Arrays must have same length".to_string(),
731 ));
732 }
733
734 let diff = F::simd_sub(&y_true.view(), &ypred.view());
735 let abs_diff = F::simd_abs(&diff.view());
736 let sum = F::simd_sum(&abs_diff.view());
737 Ok(sum / F::from(y_true.len()).expect("Operation failed"))
738 }
739
740 pub fn simd_r2_score<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
742 where
743 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
744 {
745 if y_true.len() != ypred.len() {
746 return Err(MetricsError::InvalidInput(
747 "Arrays must have same length".to_string(),
748 ));
749 }
750
751 let mean_true =
753 F::simd_sum(&y_true.view()) / F::from(y_true.len()).expect("Operation failed");
754
755 let mean_array = Array1::from_elem(y_true.len(), mean_true);
757
758 let diff_from_mean = F::simd_sub(&y_true.view(), &mean_array.view());
760 let squared_diff_mean = F::simd_mul(&diff_from_mean.view(), &diff_from_mean.view());
761 let ss_tot = F::simd_sum(&squared_diff_mean.view());
762
763 let residuals = F::simd_sub(&y_true.view(), &ypred.view());
765 let squared_residuals = F::simd_mul(&residuals.view(), &residuals.view());
766 let ss_res = F::simd_sum(&squared_residuals.view());
767
768 if ss_tot == F::zero() {
769 Ok(F::zero())
770 } else {
771 Ok(F::one() - ss_res / ss_tot)
772 }
773 }
774
775 fn cpu_accuracy(&self, y_true: &Array1<i32>, ypred: &Array1<i32>) -> Result<f32> {
778 if y_true.len() != ypred.len() {
779 return Err(MetricsError::InvalidInput(
780 "Arrays must have the same length".to_string(),
781 ));
782 }
783
784 let correct = y_true
785 .iter()
786 .zip(ypred.iter())
787 .filter(|(&true_val, &pred_val)| true_val == pred_val)
788 .count();
789
790 Ok(correct as f32 / y_true.len() as f32)
791 }
792
793 fn cpu_mse<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
794 where
795 F: Float + std::iter::Sum,
796 {
797 if y_true.len() != ypred.len() {
798 return Err(MetricsError::InvalidInput(
799 "Arrays must have the same length".to_string(),
800 ));
801 }
802
803 let mse = y_true
804 .iter()
805 .zip(ypred.iter())
806 .map(|(&true_val, &pred_val)| (true_val - pred_val) * (true_val - pred_val))
807 .sum::<F>()
808 / F::from(y_true.len()).expect("Operation failed");
809
810 Ok(mse)
811 }
812
813 fn cpu_mae<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
814 where
815 F: Float + std::iter::Sum,
816 {
817 if y_true.len() != ypred.len() {
818 return Err(MetricsError::InvalidInput(
819 "Arrays must have the same length".to_string(),
820 ));
821 }
822
823 let mae = y_true
824 .iter()
825 .zip(ypred.iter())
826 .map(|(&true_val, &pred_val)| (true_val - pred_val).abs())
827 .sum::<F>()
828 / F::from(y_true.len()).expect("Operation failed");
829
830 Ok(mae)
831 }
832
833 fn cpu_r2_score<F>(&self, y_true: &Array1<F>, ypred: &Array1<F>) -> Result<F>
834 where
835 F: Float + std::iter::Sum,
836 {
837 if y_true.len() != ypred.len() {
838 return Err(MetricsError::InvalidInput(
839 "Arrays must have the same length".to_string(),
840 ));
841 }
842
843 let mean_true =
844 y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).expect("Operation failed");
845
846 let ss_tot = y_true
847 .iter()
848 .map(|&t| (t - mean_true) * (t - mean_true))
849 .sum::<F>();
850
851 let ss_res = y_true
852 .iter()
853 .zip(ypred.iter())
854 .map(|(&t, &p)| (t - p) * (t - p))
855 .sum::<F>();
856
857 if ss_tot == F::zero() {
858 Ok(F::zero())
859 } else {
860 Ok(F::one() - ss_res / ss_tot)
861 }
862 }
863
864 fn cpu_confusion_matrix(
865 &self,
866 y_true: &Array1<i32>,
867 ypred: &Array1<i32>,
868 num_classes: usize,
869 ) -> Result<Array2<i32>> {
870 if y_true.len() != ypred.len() {
871 return Err(MetricsError::InvalidInput(
872 "Arrays must have the same length".to_string(),
873 ));
874 }
875
876 let mut matrix = Array2::zeros((num_classes, num_classes));
877
878 for (&true_class, &pred_class) in y_true.iter().zip(ypred.iter()) {
879 if true_class >= 0
880 && (true_class as usize) < num_classes
881 && pred_class >= 0
882 && (pred_class as usize) < num_classes
883 {
884 matrix[[true_class as usize, pred_class as usize]] += 1;
885 }
886 }
887
888 Ok(matrix)
889 }
890
891 pub fn benchmark_implementations<F>(
893 &self,
894 y_true: &Array1<F>,
895 ypred: &Array1<F>,
896 iterations: usize,
897 ) -> Result<BenchmarkResults>
898 where
899 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
900 {
901 let mut results = BenchmarkResults::new();
902
903 let start = Instant::now();
905 for _ in 0..iterations {
906 let _ = self.cpu_mse(y_true, ypred)?;
907 }
908 let scalar_time = start.elapsed();
909 results.scalar_time = scalar_time;
910
911 if self.capabilities.simd_available {
913 let start = Instant::now();
914 for _ in 0..iterations {
915 let _ = self.simd_mse(y_true, ypred)?;
916 }
917 let simd_time = start.elapsed();
918 results.simd_time = Some(simd_time);
919 results.simd_speedup =
920 Some(scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64);
921 }
922
923 if self.gpu_info.is_some() {
925 let batch = y_true.view().insert_axis(Axis(0));
926 let batch_pred = ypred.view().insert_axis(Axis(0));
927
928 let start = Instant::now();
929 for _ in 0..iterations {
930 let _ = self.gpu_batch_metrics(batch.view(), batch_pred.view(), &["mse"])?;
931 }
932 let gpu_time = start.elapsed();
933 results.gpu_time = Some(gpu_time);
934 results.gpu_speedup = Some(scalar_time.as_nanos() as f64 / gpu_time.as_nanos() as f64);
935 }
936
937 Ok(results)
938 }
939}
940
/// Timings collected by a benchmark of the scalar, SIMD and GPU paths.
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    /// Total time of the scalar implementation.
    pub scalar_time: Duration,
    /// Total time of the SIMD implementation, if it was run.
    pub simd_time: Option<Duration>,
    /// Total time of the GPU implementation, if it was run.
    pub gpu_time: Option<Duration>,
    /// Scalar time divided by SIMD time, if SIMD was run.
    pub simd_speedup: Option<f64>,
    /// Scalar time divided by GPU time, if the GPU was run.
    pub gpu_speedup: Option<f64>,
}

impl BenchmarkResults {
    /// Creates an empty result set: zero scalar time, nothing else measured.
    pub fn new() -> Self {
        Self::default()
    }

    /// Names the fastest implementation: `"GPU"`, `"SIMD"` or `"Scalar"`.
    ///
    /// Implementations that were not run never win. Ties go to the simpler
    /// implementation (Scalar before SIMD before GPU).
    pub fn best_implementation(&self) -> &'static str {
        let unmeasured = u128::MAX;
        // Listed in tie-break priority order; `min_by_key` returns the first
        // of several equal minima.
        let candidates: [(&'static str, u128); 3] = [
            ("Scalar", self.scalar_time.as_nanos()),
            (
                "SIMD",
                self.simd_time.map_or(unmeasured, |t| t.as_nanos()),
            ),
            ("GPU", self.gpu_time.map_or(unmeasured, |t| t.as_nanos())),
        ];

        candidates
            .iter()
            .min_by_key(|candidate| candidate.1)
            .map_or("Scalar", |candidate| candidate.0)
    }
}

impl Default for BenchmarkResults {
    fn default() -> Self {
        Self {
            scalar_time: Duration::default(),
            simd_time: None,
            gpu_time: None,
            simd_speedup: None,
            gpu_speedup: None,
        }
    }
}
982
983pub struct GpuMetricsComputerBuilder {
985 config: GpuAccelConfig,
986}
987
988impl GpuMetricsComputerBuilder {
989 pub fn new() -> Self {
991 Self {
992 config: GpuAccelConfig::default(),
993 }
994 }
995
996 pub fn with_min_batch_size(mut self, size: usize) -> Self {
998 self.config.min_batch_size = size;
999 self
1000 }
1001
1002 pub fn with_max_gpu_memory(mut self, bytes: usize) -> Self {
1004 self.config.max_gpu_memory = bytes;
1005 self
1006 }
1007
1008 pub fn with_device_index(mut self, index: Option<usize>) -> Self {
1010 self.config.device_index = index;
1011 self
1012 }
1013
1014 pub fn with_memory_pool(mut self, enable: bool) -> Self {
1016 self.config.enable_memory_pool = enable;
1017 self
1018 }
1019
1020 pub fn with_optimization_level(mut self, level: u8) -> Self {
1022 self.config.optimization_level = level;
1023 self
1024 }
1025
1026 pub fn build(self) -> Result<GpuMetricsComputer> {
1028 GpuMetricsComputer::new(self.config)
1029 }
1030}
1031
1032impl Default for GpuMetricsComputerBuilder {
1033 fn default() -> Self {
1034 Self::new()
1035 }
1036}
1037
/// Coordinates metric computation across several GPU devices.
pub struct AdvancedGpuOrchestrator {
    /// Devices work is distributed over.
    pub devices: Vec<GpuInfo>,
    /// Splits a batch into per-device row ranges.
    pub load_balancer: LoadBalancer,
    /// Per-device memory bookkeeping.
    pub memory_manager: GpuMemoryManager,
    /// Shared monitor that worker threads report execution times to.
    pub performance_monitor: Arc<PerformanceMonitor>,
    /// Device health checking and fault handling.
    pub fault_manager: FaultToleranceManager,
}
1051
/// Strategy used to split work across devices.
///
/// NOTE(review): in the visible code every strategy currently falls back to
/// the same even round-robin split.
#[derive(Debug, Clone)]
pub enum LoadBalancingStrategy {
    /// Even split across devices.
    RoundRobin,
    /// Presumably weighted by measured device performance.
    PerformanceBased,
    /// Presumably weighted by device memory availability.
    MemoryAware,
    /// Presumably adapts at runtime.
    Dynamic,
}
1064
/// Splits a batch of work into per-device row ranges.
#[derive(Debug)]
pub struct LoadBalancer {
    /// Strategy selecting the distribution routine.
    strategy: LoadBalancingStrategy,
    /// Per-device performance score keyed by device index (presumably;
    /// not read in the visible code).
    device_performance: HashMap<usize, f64>,
    /// Per-device memory usage keyed by device index (presumably;
    /// not read in the visible code).
    device_memory_usage: HashMap<usize, f64>,
    /// Round-robin cursor (initialized to 0; not read in the visible code).
    current_index: usize,
}
1073
/// Per-device GPU memory bookkeeping.
///
/// NOTE(review): field meanings are inferred from names and types; the code
/// that uses them is not visible here.
#[derive(Debug)]
pub struct GpuMemoryManager {
    /// Memory pool per device, presumably keyed by device index.
    device_pools: HashMap<usize, MemoryPool>,
    /// Allocated amount per device, presumably in bytes.
    allocated_memory: HashMap<usize, usize>,
    /// Strategy used when allocating.
    allocation_strategy: MemoryAllocationStrategy,
}
1084
/// Memory allocation strategy selector.
///
/// NOTE(review): the allocator code acting on these variants is not visible
/// here; variant names follow the classic allocation schemes.
#[derive(Debug, Clone)]
pub enum MemoryAllocationStrategy {
    /// Presumably: use the first block that is large enough.
    FirstFit,
    /// Presumably: use the smallest block that is large enough.
    BestFit,
    /// Presumably: buddy-system allocation.
    BuddySystem,
    /// Presumably: allocate from pre-created pools.
    PoolBased,
}
1097
/// A pool of memory blocks for one device.
///
/// NOTE(review): units are presumably bytes — confirm against the allocator.
#[derive(Debug)]
pub struct MemoryPool {
    /// Blocks currently free.
    available_blocks: Vec<MemoryBlock>,
    /// Blocks currently handed out.
    allocated_blocks: Vec<MemoryBlock>,
    /// Total pool capacity.
    totalsize: usize,
    /// Capacity not currently allocated.
    available_size: usize,
}
1110
/// A single block tracked by a [`MemoryPool`].
#[derive(Debug, Clone)]
pub struct MemoryBlock {
    /// Block address (presumably an offset or device address — TODO confirm).
    pub address: usize,
    /// Block size (presumably bytes).
    pub size: usize,
    /// Time at which the block was allocated.
    pub allocated_at: Instant,
}
1121
/// Collects per-device performance history.
///
/// NOTE(review): all maps are presumably keyed by device index; the
/// recording methods are defined outside the visible part of this file.
#[derive(Debug)]
pub struct PerformanceMonitor {
    /// Recorded execution times per device.
    execution_times: HashMap<usize, Vec<Duration>>,
    /// Timestamped memory usage samples per device.
    memory_usage_history: HashMap<usize, Vec<(Instant, usize)>>,
    /// Timestamped throughput samples per device.
    throughput_history: HashMap<usize, Vec<(Instant, f64)>>,
    /// Number of errors observed per device.
    error_counts: HashMap<usize, usize>,
}
1134
/// Tracks device health and retry behavior.
///
/// NOTE(review): the logic using these fields is outside the visible part of
/// this file.
#[derive(Debug)]
pub struct FaultToleranceManager {
    /// Circuit-breaker state per device (presumably keyed by device index).
    circuit_breakers: HashMap<usize, CircuitBreakerState>,
    /// Retry policy applied to failed operations.
    retry_policy: RetryPolicy,
    /// Interval between health checks.
    health_check_interval: Duration,
}
1145
/// State of a per-device circuit breaker.
///
/// NOTE(review): the transitions are not visible here; the descriptions
/// follow the conventional circuit-breaker meaning of the names.
#[derive(Debug, Clone)]
pub enum CircuitBreakerState {
    /// Presumably: operating normally, requests allowed.
    Closed,
    /// Presumably: tripped; the `Instant` likely records when it opened.
    Open(Instant),
    /// Presumably: probing whether the device has recovered.
    HalfOpen,
}
1153
/// Retry parameters (presumably exponential backoff — TODO confirm against
/// the code that applies them).
#[derive(Debug, Clone)]
pub struct RetryPolicy {
    /// Maximum number of retries.
    pub max_retries: usize,
    /// Delay before the first retry.
    pub base_delay: Duration,
    /// Upper bound on the delay between retries.
    pub max_delay: Duration,
    /// Factor by which the delay grows between retries.
    pub backoff_multiplier: f64,
}
1162
1163impl AdvancedGpuOrchestrator {
1164 pub fn new() -> Result<Self> {
1166 let devices = Self::discover_devices()?;
1167 let load_balancer = LoadBalancer::new(LoadBalancingStrategy::Dynamic);
1168 let memory_manager = GpuMemoryManager::new(MemoryAllocationStrategy::PoolBased);
1169 let performance_monitor = Arc::new(PerformanceMonitor::new());
1170 let fault_manager = FaultToleranceManager::new();
1171
1172 Ok(Self {
1173 devices,
1174 load_balancer,
1175 memory_manager,
1176 performance_monitor,
1177 fault_manager,
1178 })
1179 }
1180
1181 fn discover_devices() -> Result<Vec<GpuInfo>> {
1183 Ok(vec![GpuInfo {
1186 device_name: "Mock GPU Device".to_string(),
1187 compute_capability: (8, 6),
1188 total_memory: 8 * 1024 * 1024 * 1024, available_memory: 7 * 1024 * 1024 * 1024, multiprocessor_count: 68,
1191 max_threads_per_block: 1024,
1192 supports_double_precision: true,
1193 }])
1194 }
1195
1196 pub fn compute_metrics_distributed<F>(
1198 &mut self,
1199 y_true_batch: ArrayView2<F>,
1200 y_pred_batch: ArrayView2<F>,
1201 metrics: &[&str],
1202 ) -> Result<Vec<HashMap<String, F>>>
1203 where
1204 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum + 'static,
1205 {
1206 let batch_size = y_true_batch.nrows();
1207 let work_distribution = self
1208 .load_balancer
1209 .distribute_work(batch_size, &self.devices);
1210
1211 let mut tasks: Vec<std::thread::JoinHandle<Result<Vec<HashMap<String, F>>>>> = Vec::new();
1212
1213 for (deviceid, (start_idx, end_idx)) in work_distribution {
1214 let y_true_slice = y_true_batch
1215 .slice(scirs2_core::ndarray::s![start_idx..end_idx, ..])
1216 .to_owned();
1217 let y_pred_slice = y_pred_batch
1218 .slice(scirs2_core::ndarray::s![start_idx..end_idx, ..])
1219 .to_owned();
1220
1221 let metrics_clone: Vec<String> = metrics.iter().map(|&s| s.to_string()).collect();
1223 let performance_monitor = Arc::clone(&self.performance_monitor);
1224
1225 let task = std::thread::spawn(move || {
1227 let start_time = Instant::now();
1228
1229 let metrics_refs: Vec<&str> = metrics_clone.iter().map(|s| s.as_str()).collect();
1231 let result =
1232 Self::compute_on_device(deviceid, y_true_slice, y_pred_slice, &metrics_refs);
1233
1234 let execution_time = start_time.elapsed();
1235 performance_monitor.record_execution_time(deviceid, execution_time);
1236
1237 result
1238 });
1239
1240 tasks.push(task);
1241 }
1242
1243 let mut all_results = Vec::new();
1245 for task in tasks {
1246 let device_results = task.join().map_err(|e| {
1247 MetricsError::ComputationError(format!("GPU task failed: {:?}", e))
1248 })??;
1249 all_results.extend(device_results);
1250 }
1251
1252 Ok(all_results)
1253 }
1254
1255 fn compute_on_device<F>(
1257 _device_id: usize,
1258 y_true: Array2<F>,
1259 ypred: Array2<F>,
1260 metrics: &[&str],
1261 ) -> Result<Vec<HashMap<String, F>>>
1262 where
1263 F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
1264 {
1265 let batch_size = y_true.nrows();
1267 let mut results = Vec::with_capacity(batch_size);
1268
1269 std::thread::sleep(std::time::Duration::from_micros(10));
1271
1272 for i in 0..batch_size {
1276 let mut sample_metrics = HashMap::new();
1277
1278 for &metric in metrics {
1279 let value = match metric {
1280 "mse" => {
1281 let y_t = y_true.row(i);
1282 let y_p = ypred.row(i);
1283 let diff = &y_t - &y_p;
1284 let squared_diff = diff.mapv(|x| x * x);
1285 squared_diff.sum() / F::from(y_t.len()).expect("Operation failed")
1286 }
1287 "mae" => {
1288 let y_t = y_true.row(i);
1289 let y_p = ypred.row(i);
1290 let diff = &y_t - &y_p;
1291 let abs_diff = diff.mapv(|x| x.abs());
1292 abs_diff.sum() / F::from(y_t.len()).expect("Operation failed")
1293 }
1294 _ => F::zero(),
1295 };
1296
1297 sample_metrics.insert(metric.to_string(), value);
1298 }
1299
1300 results.push(sample_metrics);
1301 }
1302
1303 std::thread::sleep(std::time::Duration::from_millis(1));
1305
1306 Ok(results)
1307 }
1308
    /// Returns the statistics reported by the shared performance monitor.
    pub fn get_performance_stats(&self) -> HashMap<String, f64> {
        self.performance_monitor.get_statistics()
    }
1313
1314 pub fn optimize_memory_allocation(&mut self) -> Result<()> {
1316 self.memory_manager.optimize_allocation(&self.devices)
1317 }
1318
1319 pub fn health_check(&mut self) -> Result<Vec<(usize, bool)>> {
1321 let mut health_status = Vec::new();
1322
1323 for (idx, device) in self.devices.iter().enumerate() {
1324 let is_healthy = self.fault_manager.check_device_health(idx, device)?;
1325 health_status.push((idx, is_healthy));
1326 }
1327
1328 Ok(health_status)
1329 }
1330}
1331
1332impl LoadBalancer {
1333 fn new(strategy: LoadBalancingStrategy) -> Self {
1334 Self {
1335 strategy,
1336 device_performance: HashMap::new(),
1337 device_memory_usage: HashMap::new(),
1338 current_index: 0,
1339 }
1340 }
1341
1342 fn distribute_work(
1343 &mut self,
1344 total_work: usize,
1345 devices: &[GpuInfo],
1346 ) -> Vec<(usize, (usize, usize))> {
1347 match self.strategy {
1348 LoadBalancingStrategy::RoundRobin => self.round_robin_distribution(total_work, devices),
1349 LoadBalancingStrategy::PerformanceBased => {
1350 self.performance_based_distribution(total_work, devices)
1351 }
1352 LoadBalancingStrategy::MemoryAware => {
1353 self.memory_aware_distribution(total_work, devices)
1354 }
1355 LoadBalancingStrategy::Dynamic => self.dynamic_distribution(total_work, devices),
1356 }
1357 }
1358
1359 fn performance_based_distribution(
1360 &self,
1361 total_work: usize,
1362 devices: &[GpuInfo],
1363 ) -> Vec<(usize, (usize, usize))> {
1364 self.round_robin_distribution(total_work, devices)
1367 }
1368
1369 fn memory_aware_distribution(
1370 &self,
1371 total_work: usize,
1372 devices: &[GpuInfo],
1373 ) -> Vec<(usize, (usize, usize))> {
1374 self.round_robin_distribution(total_work, devices)
1377 }
1378
1379 fn dynamic_distribution(
1380 &mut self,
1381 total_work: usize,
1382 devices: &[GpuInfo],
1383 ) -> Vec<(usize, (usize, usize))> {
1384 self.round_robin_distribution(total_work, devices)
1386 }
1387
1388 #[allow(dead_code)]
1390 fn round_robin_distribution(
1391 &self,
1392 total_work: usize,
1393 devices: &[GpuInfo],
1394 ) -> Vec<(usize, (usize, usize))> {
1395 let num_devices = devices.len();
1396 let work_per_device = total_work / num_devices;
1397 let remainder = total_work % num_devices;
1398
1399 let mut distribution = Vec::new();
1400 let mut current_start = 0;
1401
1402 for (idx, device) in devices.iter().enumerate() {
1403 let work_size = work_per_device + if idx < remainder { 1 } else { 0 };
1404 let end = current_start + work_size;
1405 distribution.push((idx, (current_start, end)));
1406 current_start = end;
1407 }
1408
1409 distribution
1410 }
1411}
1412
1413impl GpuMemoryManager {
1414 fn new(strategy: MemoryAllocationStrategy) -> Self {
1415 Self {
1416 device_pools: HashMap::new(),
1417 allocated_memory: HashMap::new(),
1418 allocation_strategy: strategy,
1419 }
1420 }
1421
1422 fn optimize_allocation(&mut self, devices: &[GpuInfo]) -> Result<()> {
1423 for (idx, device) in devices.iter().enumerate() {
1424 if !self.device_pools.contains_key(&idx) {
1425 let pool = MemoryPool::new(device.available_memory);
1426 self.device_pools.insert(idx, pool);
1427 self.allocated_memory.insert(idx, 0);
1428 }
1429 }
1430 Ok(())
1431 }
1432}
1433
1434impl MemoryPool {
1435 fn new(totalsize: usize) -> Self {
1436 Self {
1437 available_blocks: vec![MemoryBlock {
1438 address: 0,
1439 size: totalsize,
1440 allocated_at: Instant::now(),
1441 }],
1442 allocated_blocks: Vec::new(),
1443 totalsize,
1444 available_size: totalsize,
1445 }
1446 }
1447}
1448
1449impl PerformanceMonitor {
1450 fn new() -> Self {
1451 Self {
1452 execution_times: HashMap::new(),
1453 memory_usage_history: HashMap::new(),
1454 throughput_history: HashMap::new(),
1455 error_counts: HashMap::new(),
1456 }
1457 }
1458
1459 fn record_execution_time(&self, deviceid: usize, duration: Duration) {
1460 let throughput = 1000.0 / duration.as_millis() as f64; println!(
1469 "GPU Device {}: Execution, time: {:?}, Throughput: {:.2} ops/sec",
1470 deviceid, duration, throughput
1471 );
1472
1473 }
1476
1477 fn get_statistics(&self) -> HashMap<String, f64> {
1478 let mut stats = HashMap::new();
1479 stats.insert(
1480 "total_devices".to_string(),
1481 self.execution_times.len() as f64,
1482 );
1483 stats.insert(
1484 "total_executions".to_string(),
1485 self.execution_times
1486 .values()
1487 .map(|v| v.len())
1488 .sum::<usize>() as f64,
1489 );
1490 stats
1491 }
1492}
1493
1494impl FaultToleranceManager {
1495 fn new() -> Self {
1496 Self {
1497 circuit_breakers: HashMap::new(),
1498 retry_policy: RetryPolicy {
1499 max_retries: 3,
1500 base_delay: Duration::from_millis(100),
1501 max_delay: Duration::from_secs(5),
1502 backoff_multiplier: 2.0,
1503 },
1504 health_check_interval: Duration::from_secs(30),
1505 }
1506 }
1507
1508 fn check_device_health(&self, deviceid: usize, device: &GpuInfo) -> Result<bool> {
1509 if device.available_memory == 0 {
1513 eprintln!("GPU Device {}: No available memory", deviceid);
1514 return Ok(false);
1515 }
1516
1517 let memory_usage_ratio =
1519 1.0 - (device.available_memory as f64 / device.total_memory as f64);
1520 if memory_usage_ratio > 0.9 {
1521 eprintln!(
1522 "GPU Device {}: Memory usage too high: {:.1}%",
1523 deviceid,
1524 memory_usage_ratio * 100.0
1525 );
1526 return Ok(false);
1527 }
1528
1529 let test_result = self.execute_health_test_kernel(deviceid, device);
1531 if !test_result {
1532 eprintln!("GPU Device {}: Health test kernel failed", deviceid);
1533 return Ok(false);
1534 }
1535
1536 if device.compute_capability.0 < 3 {
1538 eprintln!(
1540 "GPU Device {}: Compute capability too old: {}.{}",
1541 deviceid, device.compute_capability.0, device.compute_capability.1
1542 );
1543 return Ok(false);
1544 }
1545
1546 if device.device_name.contains("NVIDIA") || device.device_name.contains("CUDA") {
1548 if let Ok(output) = std::process::Command::new("nvidia-smi")
1549 .arg("--query-gpu=temperature.gpu,power.draw,power.limit")
1550 .arg("--format=csv,noheader,nounits")
1551 .arg(format!("--_id={}", deviceid))
1552 .output()
1553 {
1554 if output.status.success() {
1555 let output_str = String::from_utf8_lossy(&output.stdout);
1556 if let Some(line) = output_str.lines().next() {
1557 let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
1558 if parts.len() >= 3 {
1559 if let Ok(temp) = parts[0].parse::<u32>() {
1561 if temp > 85 {
1562 eprintln!(
1563 "GPU Device {}: Temperature too high: {}°C",
1564 deviceid, temp
1565 );
1566 return Ok(false);
1567 }
1568 }
1569
1570 if let (Ok(power_draw), Ok(power_limit)) =
1572 (parts[1].parse::<f32>(), parts[2].parse::<f32>())
1573 {
1574 if power_draw > power_limit * 0.95 {
1575 eprintln!("GPU Device {}: Power consumption near limit: {:.1}W/{:.1}W",
1576 deviceid, power_draw, power_limit);
1577 }
1579 }
1580 }
1581 }
1582 }
1583 }
1584 }
1585
1586 Ok(true)
1588 }
1589
1590 fn execute_health_test_kernel(&self, deviceid: usize, device: &GpuInfo) -> bool {
1592 let start_time = std::time::Instant::now();
1596
1597 let test_memory_size = std::cmp::min(device.available_memory / 1000, 1024 * 1024); let computation_time = match device.compute_capability.0 {
1602 8..=9 => std::time::Duration::from_micros(100), 7 => std::time::Duration::from_micros(200), 6 => std::time::Duration::from_micros(500), _ => std::time::Duration::from_millis(1), };
1607
1608 std::thread::sleep(computation_time);
1609
1610 let execution_time = start_time.elapsed();
1611
1612 let max_allowed_time = std::time::Duration::from_millis(10);
1614 let test_passed = execution_time < max_allowed_time && test_memory_size > 0;
1615
1616 if !test_passed {
1617 eprintln!(
1618 "GPU Device {}: Health test failed - execution time: {:?}, memory size: {}",
1619 deviceid, execution_time, test_memory_size
1620 );
1621 }
1622
1623 test_passed
1624 }
1625}
1626
1627impl Default for AdvancedGpuOrchestrator {
1628 fn default() -> Self {
1629 Self::new().unwrap_or_else(|_| {
1630 Self {
1632 devices: Vec::new(),
1633 load_balancer: LoadBalancer::new(LoadBalancingStrategy::RoundRobin),
1634 memory_manager: GpuMemoryManager::new(MemoryAllocationStrategy::FirstFit),
1635 performance_monitor: Arc::new(PerformanceMonitor::new()),
1636 fault_manager: FaultToleranceManager::new(),
1637 }
1638 })
1639 }
1640}
1641
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    /// Smoke test: construction with the default configuration succeeds and the
    /// availability query can be called. Its result is deliberately ignored.
    #[test]
    #[ignore = "GPU availability varies by environment"]
    fn test_gpu_metrics_computer_creation() {
        let default_config = GpuAccelConfig::default();
        let computer = GpuMetricsComputer::new(default_config).expect("Operation failed");
        let _ = computer.is_gpu_available();
    }

    /// Every value passed to the builder must show up in the built computer's config.
    #[test]
    fn test_gpu_metrics_computer_builder() {
        let builder = GpuMetricsComputerBuilder::new()
            .with_min_batch_size(500)
            .with_max_gpu_memory(512 * 1024 * 1024)
            .with_device_index(Some(0))
            .with_memory_pool(true)
            .with_optimization_level(3);
        let computer = builder.build().expect("Operation failed");

        let config = &computer.config;
        assert_eq!(config.min_batch_size, 500);
        assert_eq!(config.max_gpu_memory, 512 * 1024 * 1024);
        assert_eq!(config.device_index, Some(0));
        assert!(config.enable_memory_pool);
        assert_eq!(config.optimization_level, 3);
    }

    /// Batch sizes 500 and 1500 straddle the default `min_batch_size` of 1000.
    #[test]
    #[ignore = "GPU availability varies by environment"]
    fn test_should_use_gpu() {
        let default_config = GpuAccelConfig::default();
        let computer = GpuMetricsComputer::new(default_config).expect("Operation failed");

        let small_batch_on_gpu = computer.should_use_gpu(500);
        let large_batch_on_gpu = computer.should_use_gpu(1500);
        assert!(!small_batch_on_gpu);
        assert!(large_batch_on_gpu);
    }

    /// Three of the six predictions match the labels, so accuracy must be 0.5.
    #[test]
    fn test_cpu_accuracy() {
        let default_config = GpuAccelConfig::default();
        let computer = GpuMetricsComputer::new(default_config).expect("Operation failed");
        let labels = array![0, 1, 2, 0, 1, 2];
        let predictions = array![0, 2, 1, 0, 0, 2];

        let accuracy = computer
            .gpu_accuracy(&labels, &predictions)
            .expect("Operation failed");
        let deviation = (accuracy - 0.5).abs();
        assert!(deviation < 1e-6);
    }

    /// Predictions that are each off by about 0.1 give a small, strictly positive MSE.
    #[test]
    fn test_cpu_mse() {
        let default_config = GpuAccelConfig::default();
        let computer = GpuMetricsComputer::new(default_config).expect("Operation failed");
        let targets = array![1.0, 2.0, 3.0, 4.0];
        let predictions = array![1.1, 2.1, 2.9, 4.1];

        let mse = computer
            .gpu_mse(&targets, &predictions)
            .expect("Operation failed");
        assert!(0.0 < mse && mse < 0.1);
    }

    /// The matrix for three classes is 3x3, and both class-0 samples are predicted as 0.
    #[test]
    fn test_cpu_confusion_matrix() {
        let default_config = GpuAccelConfig::default();
        let computer = GpuMetricsComputer::new(default_config).expect("Operation failed");
        let labels = array![0, 1, 2, 0, 1, 2];
        let predictions = array![0, 2, 1, 0, 0, 2];

        let matrix = computer
            .gpu_confusion_matrix(&labels, &predictions, 3)
            .expect("Operation failed");
        assert_eq!(matrix.shape(), &[3, 3]);
        assert_eq!(matrix[[0, 0]], 2);
    }
}