#![allow(clippy::too_many_arguments)]
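//! SIMD-accelerated metric computations.
//!
//! Provides [`SimdMetrics`], which computes regression metrics (MSE, MAE, R², correlation),
//! exponential moving averages, logarithmic statistics, and batch matrix operations using
//! platform SIMD capabilities, with optional GPU detection and CPU-parallel fallbacks.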

use crate::error::{MetricsError, Result};
use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis};
use scirs2_core::numeric::Float;
use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
use std::collections::HashMap;

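/// Dispatches metric computations to SIMD, GPU, or scalar implementations based on the
/// detected platform capabilities and the configured options.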
pub struct SimdMetrics {
    capabilities: PlatformCapabilities,
    enable_simd: bool,
    gpu_info: Option<GpuInfo>,
    parallel_config: ParallelConfig,
}

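/// Properties of a detected (or simulated) GPU device.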
#[derive(Debug, Clone)]
pub struct GpuInfo {
    pub device_name: String,
    pub compute_capability: (u32, u32),
    pub total_memory: usize,
    pub available_memory: usize,
    pub multiprocessor_count: u32,
    pub max_threads_per_block: u32,
    pub supports_double_precision: bool,
}

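/// Configuration for CPU-parallel execution of batch computations.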
#[derive(Debug, Clone)]
pub struct ParallelConfig {
    pub num_threads: Option<usize>,
    pub min_chunk_size: usize,
    pub enable_work_stealing: bool,
    pub thread_affinity: ThreadAffinity,
}

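/// Thread affinity strategy for parallel execution.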
#[derive(Debug, Clone)]
pub enum ThreadAffinity {
    None,
    Cores(Vec<usize>),
    Numa,
    Automatic,
}

impl Default for ParallelConfig {
    fn default() -> Self {
        Self {
            num_threads: None,
            min_chunk_size: 1000,
            enable_work_stealing: true,
            thread_affinity: ThreadAffinity::Automatic,
        }
    }
}

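/// Results of the logarithmic operations over an input array.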
#[derive(Debug, Clone)]
pub struct LogOperationResults<F: Float> {
    pub log_values: Array1<F>,
    pub log_sum: F,
    pub log_product: F,
    pub geometric_mean: F,
}

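/// Selector for which batch matrix operation was requested.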
#[derive(Debug, Clone, Copy)]
pub enum MatrixOperation {
    Determinant,
    Trace,
    EigenvalueSum,
    All,
}

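/// Per-matrix results of a batch matrix computation.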
#[derive(Debug, Clone)]
pub struct BatchMatrixResults<F: Float> {
    pub determinants: Vec<F>,
    pub traces: Vec<F>,
    pub eigenvalue_sums: Vec<F>,
    pub operation_type: MatrixOperation,
}

impl SimdMetrics {
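    /// Creates a new `SimdMetrics`, detecting platform SIMD capabilities and probing for
    /// GPU support. SIMD execution is enabled by default.
    ///
    /// A minimal usage sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let metrics = SimdMetrics::new()?;
    /// let y_true = array![1.0, 2.0, 3.0];
    /// let y_pred = array![1.1, 1.9, 3.2];
    /// let mse = metrics.simd_mse(&y_true.view(), &y_pred.view())?;
    /// ```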
    pub fn new() -> Result<Self> {
        let capabilities = PlatformCapabilities::detect();
        let gpu_info = Self::detect_gpu_capabilities()?;

        Ok(Self {
            capabilities,
            enable_simd: true,
            gpu_info,
            parallel_config: ParallelConfig::default(),
        })
    }

    pub fn with_simd_enabled(mut self, enabled: bool) -> Self {
        self.enable_simd = enabled;
        self
    }

    pub fn with_parallel_config(mut self, config: ParallelConfig) -> Self {
        self.parallel_config = config;
        self
    }

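    /// Probes for GPU support: CUDA first, then OpenCL. When the `SCIRS2_ENABLE_GPU`
    /// environment variable is set, a simulated device is reported instead. Returns
    /// `Ok(None)` if no GPU is found.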
    fn detect_gpu_capabilities() -> Result<Option<GpuInfo>> {
        if let Ok(gpu_info) = Self::detect_cuda_capabilities() {
            return Ok(Some(gpu_info));
        }

        if let Ok(gpu_info) = Self::detect_opencl_capabilities() {
            return Ok(Some(gpu_info));
        }

        // Fall back to a simulated device when explicitly requested via environment variable.
        if std::env::var("SCIRS2_ENABLE_GPU").is_ok() {
            Ok(Some(GpuInfo {
                device_name: "Simulated GPU".to_string(),
                compute_capability: (8, 6),
                total_memory: 12 * 1024 * 1024 * 1024,
                available_memory: 10 * 1024 * 1024 * 1024,
                multiprocessor_count: 84,
                max_threads_per_block: 1024,
                supports_double_precision: true,
            }))
        } else {
            Ok(None)
        }
    }

    fn detect_cuda_capabilities() -> Result<GpuInfo> {
        if std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
            || std::path::Path::new("/usr/local/cuda").exists()
            || std::path::Path::new("/opt/cuda").exists()
        {
            // Hard-coded representative capabilities for a detected CUDA installation.
            Ok(GpuInfo {
                device_name: "NVIDIA RTX 4090".to_string(),
                compute_capability: (8, 9),
                total_memory: 24 * 1024 * 1024 * 1024,
                available_memory: 22 * 1024 * 1024 * 1024,
                multiprocessor_count: 128,
                max_threads_per_block: 1024,
                supports_double_precision: true,
            })
        } else {
            Err(MetricsError::ComputationError(
                "CUDA not available".to_string(),
            ))
        }
    }

    fn detect_opencl_capabilities() -> Result<GpuInfo> {
        if std::path::Path::new("/usr/lib/x86_64-linux-gnu/libOpenCL.so").exists()
            || std::path::Path::new("/usr/lib/libOpenCL.so").exists()
            || std::env::var("OPENCL_VENDOR_PATH").is_ok()
        {
            // Hard-coded representative capabilities for a detected OpenCL installation.
            Ok(GpuInfo {
                device_name: "AMD RX 7900 XTX".to_string(),
                compute_capability: (3, 0),
                total_memory: 20 * 1024 * 1024 * 1024,
                available_memory: 18 * 1024 * 1024 * 1024,
                multiprocessor_count: 96,
                max_threads_per_block: 256,
                supports_double_precision: true,
            })
        } else {
            Err(MetricsError::ComputationError(
                "OpenCL not available".to_string(),
            ))
        }
    }

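    /// Mean squared error between `y_true` and `ypred`, using SIMD element-wise operations
    /// when available and a scalar fallback otherwise. Returns an error if the inputs have
    /// different lengths.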
    pub fn simd_mse<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if y_true.len() != ypred.len() {
            return Err(MetricsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        if self.enable_simd && self.capabilities.simd_available {
            let squared_diff = F::simd_sub(y_true, ypred);
            let squared = F::simd_mul(&squared_diff.view(), &squared_diff.view());
            let sum = F::simd_sum(&squared.view());
            Ok(sum / F::from(y_true.len()).unwrap())
        } else {
            self.scalar_mse(y_true, ypred)
        }
    }

    pub fn simd_mae<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if y_true.len() != ypred.len() {
            return Err(MetricsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        if self.enable_simd && self.capabilities.simd_available {
            let diff = F::simd_sub(y_true, ypred);
            let abs_diff = F::simd_abs(&diff.view());
            let sum = F::simd_sum(&abs_diff.view());
            Ok(sum / F::from(y_true.len()).unwrap())
        } else {
            self.scalar_mae(y_true, ypred)
        }
    }

    pub fn simd_r2_score<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if y_true.len() != ypred.len() {
            return Err(MetricsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        if self.enable_simd && self.capabilities.simd_available {
            let mean_true = F::simd_sum(y_true) / F::from(y_true.len()).unwrap();

            let mean_array = Array1::from_elem(y_true.len(), mean_true);

            let diff_from_mean = F::simd_sub(y_true, &mean_array.view());
            let squared_diff_mean = F::simd_mul(&diff_from_mean.view(), &diff_from_mean.view());
            let ss_tot = F::simd_sum(&squared_diff_mean.view());

            let residuals = F::simd_sub(y_true, ypred);
            let squared_residuals = F::simd_mul(&residuals.view(), &residuals.view());
            let ss_res = F::simd_sum(&squared_residuals.view());

            if ss_tot == F::zero() {
                Ok(F::zero())
            } else {
                Ok(F::one() - ss_res / ss_tot)
            }
        } else {
            self.scalar_r2_score(y_true, ypred)
        }
    }

    pub fn simd_correlation<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if x.len() != y.len() {
            return Err(MetricsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        if self.enable_simd && self.capabilities.simd_available {
            let n = F::from(x.len()).unwrap();

            let mean_x = F::simd_sum(x) / n;
            let mean_y = F::simd_sum(y) / n;

            let mean_x_array = Array1::from_elem(x.len(), mean_x);
            let mean_y_array = Array1::from_elem(y.len(), mean_y);

            let dev_x = F::simd_sub(x, &mean_x_array.view());
            let dev_y = F::simd_sub(y, &mean_y_array.view());

            let cov_xy = F::simd_mul(&dev_x.view(), &dev_y.view());
            let sum_cov = F::simd_sum(&cov_xy.view());

            let var_x = F::simd_mul(&dev_x.view(), &dev_x.view());
            let var_y = F::simd_mul(&dev_y.view(), &dev_y.view());

            let sum_var_x = F::simd_sum(&var_x.view());
            let sum_var_y = F::simd_sum(&var_y.view());

            let denom = (sum_var_x * sum_var_y).sqrt();
            if denom > F::zero() {
                Ok(sum_cov / denom)
            } else {
                Ok(F::zero())
            }
        } else {
            self.scalar_correlation(x, y)
        }
    }

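    /// Exponential moving average with smoothing factor `alpha`. When SIMD is available the
    /// series is processed in chunks of four, holding the previous EMA constant within each
    /// chunk; otherwise the sequential recurrence is used.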
    pub fn simd_exponential_moving_average<F>(
        &self,
        values: &ArrayView1<F>,
        alpha: F,
    ) -> Result<Array1<F>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if values.is_empty() {
            return Ok(Array1::zeros(0));
        }

        let mut ema = Array1::zeros(values.len());
        ema[0] = values[0];

        if self.enable_simd && self.capabilities.simd_available && values.len() > 4 {
            let chunk_size = 4;
            let one_minus_alpha = F::one() - alpha;

            for i in (1..values.len()).step_by(chunk_size) {
                let end_idx = (i + chunk_size).min(values.len());
                let chunk_len = end_idx - i;

                if chunk_len == chunk_size {
                    // SIMD path: the EMA value preceding the chunk is held constant for all
                    // four elements, so this approximates the sequential recurrence.
                    let prev_ema = Array1::from_elem(chunk_size, ema[i - 1]);
                    let current_values = values.slice(s![i..end_idx]).to_owned();
                    let alpha_array = Array1::from_elem(chunk_size, alpha);
                    let one_minus_alpha_array = Array1::from_elem(chunk_size, one_minus_alpha);

                    let weighted_values = F::simd_mul(&current_values.view(), &alpha_array.view());
                    let weighted_prev =
                        F::simd_mul(&prev_ema.view(), &one_minus_alpha_array.view());
                    let chunk_ema = F::simd_add(&weighted_values.view(), &weighted_prev.view());

                    for j in 0..chunk_size {
                        ema[i + j] = chunk_ema[j];
                    }
                } else {
                    // Remainder of the series: fall back to the sequential recurrence.
                    for j in i..end_idx {
                        ema[j] = alpha * values[j] + one_minus_alpha * ema[j - 1];
                    }
                }
            }
        } else {
            for i in 1..values.len() {
                ema[i] = alpha * values[i] + (F::one() - alpha) * ema[i - 1];
            }
        }

        Ok(ema)
    }

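    /// Computes element-wise logarithms plus the log-sum, log-product, and geometric mean
    /// of `values`. Non-positive entries map to negative infinity.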
    pub fn simd_log_operations<F>(&self, values: &ArrayView1<F>) -> Result<LogOperationResults<F>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if self.enable_simd && self.capabilities.simd_available {
            let log_values = self.simd_fast_log(values)?;
            let log_sum = F::simd_sum(&log_values.view());
            // The log of a product equals the sum of the logs.
            let log_product = log_sum;
            let geometric_mean = (log_sum / F::from(values.len()).unwrap()).exp();

            Ok(LogOperationResults {
                log_values,
                log_sum,
                log_product,
                geometric_mean,
            })
        } else {
            self.scalar_log_operations(values)
        }
    }

    fn simd_fast_log<F>(&self, values: &ArrayView1<F>) -> Result<Array1<F>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        // Element-wise natural log; non-positive inputs map to negative infinity.
        let mut result = Array1::zeros(values.len());

        for (i, &val) in values.iter().enumerate() {
            if val > F::zero() {
                result[i] = val.ln();
            } else {
                result[i] = F::neg_infinity();
            }
        }

        Ok(result)
    }

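    /// Computes determinants, traces, and eigenvalue-sum estimates for a batch of square
    /// matrices, processing the batch in parallel when SIMD support is available. All three
    /// quantities are computed regardless of `operation`, which is recorded in the result.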
    pub fn simd_batch_matrix_operations<F>(
        &self,
        matrices: &[Array2<F>],
        operation: MatrixOperation,
    ) -> Result<BatchMatrixResults<F>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if matrices.is_empty() {
            return Err(MetricsError::InvalidInput(
                "No matrices provided".to_string(),
            ));
        }

        let mut determinants = Vec::with_capacity(matrices.len());
        let mut traces = Vec::with_capacity(matrices.len());
        let mut eigenvalue_sums = Vec::with_capacity(matrices.len());

        if self.enable_simd && self.capabilities.simd_available {
            use scirs2_core::parallel_ops::*;

            let results: Result<Vec<_>> = matrices
                .par_iter()
                .map(|matrix| {
                    let det = self.simd_determinant(matrix)?;
                    let trace = self.simd_trace(matrix)?;
                    let eigenval_sum = self.simd_eigenvalue_sum_estimate(matrix)?;
                    Ok((det, trace, eigenval_sum))
                })
                .collect();

            let results = results?;
            for (det, trace, eigenval) in results {
                determinants.push(det);
                traces.push(trace);
                eigenvalue_sums.push(eigenval);
            }
        } else {
            for matrix in matrices {
                determinants.push(self.scalar_determinant(matrix)?);
                traces.push(self.scalar_trace(matrix)?);
                eigenvalue_sums.push(self.scalar_eigenvalue_sum_estimate(matrix)?);
            }
        }

        Ok(BatchMatrixResults {
            determinants,
            traces,
            eigenvalue_sums,
            operation_type: operation,
        })
    }

    fn simd_determinant<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps,
    {
        let (nrows, ncols) = matrix.dim();
        if nrows != ncols {
            return Err(MetricsError::InvalidInput(
                "Matrix must be square".to_string(),
            ));
        }

        match nrows {
            1 => Ok(matrix[[0, 0]]),
            2 => Ok(matrix[[0, 0]] * matrix[[1, 1]] - matrix[[0, 1]] * matrix[[1, 0]]),
            3 => {
                // Direct cofactor expansion for the 3x3 case.
                let a = matrix[[0, 0]];
                let b = matrix[[0, 1]];
                let c = matrix[[0, 2]];
                let d = matrix[[1, 0]];
                let e = matrix[[1, 1]];
                let f = matrix[[1, 2]];
                let g = matrix[[2, 0]];
                let h = matrix[[2, 1]];
                let i = matrix[[2, 2]];

                Ok(a * (e * i - f * h) - b * (d * i - f * g) + c * (d * h - e * g))
            }
            _ => {
                // Larger matrices fall back to LU decomposition.
                self.scalar_determinant_lu(matrix)
            }
        }
    }

    fn simd_trace<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps,
    {
        let (nrows, ncols) = matrix.dim();
        if nrows != ncols {
            return Err(MetricsError::InvalidInput(
                "Matrix must be square".to_string(),
            ));
        }

        let mut trace = F::zero();
        for i in 0..nrows {
            trace = trace + matrix[[i, i]];
        }

        Ok(trace)
    }

    fn simd_eigenvalue_sum_estimate<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float + SimdUnifiedOps,
    {
        // The sum of a matrix's eigenvalues equals its trace.
        self.simd_trace(matrix)
    }

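    /// Computes the requested metrics (`"mse"`, `"mae"`, `"r2_score"`, `"correlation"`) for
    /// each row of the batch, dispatching to the GPU path when a device was detected and to
    /// the CPU-parallel path otherwise. Unknown metric names produce a value of zero.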
    pub fn gpu_batch_metrics<F>(
        &self,
        y_true_batch: &ArrayView2<F>,
        y_pred_batch: &ArrayView2<F>,
        metrics: &[&str],
    ) -> Result<Vec<HashMap<String, F>>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        if let Some(gpu_info) = &self.gpu_info {
            self.gpu_compute_batch_metrics(y_true_batch, y_pred_batch, metrics, gpu_info)
        } else {
            self.cpu_parallel_batch_metrics(y_true_batch, y_pred_batch, metrics)
        }
    }

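    /// CPU-side stand-in for a GPU batch computation: it sleeps to approximate transfer
    /// latency and evaluates the per-sample "kernel" functions with scalar arithmetic.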
    fn gpu_compute_batch_metrics<F>(
        &self,
        y_true_batch: &ArrayView2<F>,
        y_pred_batch: &ArrayView2<F>,
        metrics: &[&str],
        gpu_info: &GpuInfo,
    ) -> Result<Vec<HashMap<String, F>>>
    where
        F: Float + Send + Sync + std::iter::Sum,
    {
        let batch_size = y_true_batch.nrows();
        let mut results = Vec::with_capacity(batch_size);

        let threads_per_block = gpu_info.max_threads_per_block.min(1024);
        let _blocks_needed = batch_size.div_ceil(threads_per_block as usize);

        // Simulate host-to-device transfer time, proportional to the input size.
        std::thread::sleep(std::time::Duration::from_micros(
            (y_true_batch.len() * std::mem::size_of::<F>() / 1000) as u64,
        ));

        for batch_idx in 0..batch_size {
            let y_true_sample = y_true_batch.row(batch_idx);
            let y_pred_sample = y_pred_batch.row(batch_idx);

            let mut sample_results = HashMap::new();

            for &metric in metrics {
                let result = match metric {
                    "mse" => self.gpu_mse_kernel(&y_true_sample, &y_pred_sample)?,
                    "mae" => self.gpu_mae_kernel(&y_true_sample, &y_pred_sample)?,
                    "r2_score" => self.gpu_r2_kernel(&y_true_sample, &y_pred_sample)?,
                    "correlation" => self.gpu_correlation_kernel(&y_true_sample, &y_pred_sample)?,
                    _ => F::zero(),
                };
                sample_results.insert(metric.to_string(), result);
            }

            results.push(sample_results);
        }

        // Simulate device-to-host transfer time for the results.
        std::thread::sleep(std::time::Duration::from_micros(
            (results.len() * metrics.len() * std::mem::size_of::<F>() / 1000) as u64,
        ));

        Ok(results)
    }

    fn gpu_mse_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let diff_squared: F = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p) * (t - p))
            .sum();

        Ok(diff_squared / F::from(y_true.len()).unwrap())
    }

    fn gpu_mae_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let abs_diff: F = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p).abs())
            .sum();

        Ok(abs_diff / F::from(y_true.len()).unwrap())
    }

    fn gpu_r2_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();

        let ss_tot: F = y_true
            .iter()
            .map(|&t| (t - mean_true) * (t - mean_true))
            .sum();

        let ss_res: F = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p) * (t - p))
            .sum();

        if ss_tot == F::zero() {
            Ok(F::zero())
        } else {
            Ok(F::one() - ss_res / ss_tot)
        }
    }

    fn gpu_correlation_kernel<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let n = F::from(x.len()).unwrap();
        let mean_x = x.iter().cloned().sum::<F>() / n;
        let mean_y = y.iter().cloned().sum::<F>() / n;

        let mut sum_xy = F::zero();
        let mut sum_x2 = F::zero();
        let mut sum_y2 = F::zero();

        for (&xi, &yi) in x.iter().zip(y.iter()) {
            let dx = xi - mean_x;
            let dy = yi - mean_y;
            sum_xy = sum_xy + dx * dy;
            sum_x2 = sum_x2 + dx * dx;
            sum_y2 = sum_y2 + dy * dy;
        }

        let denom = (sum_x2 * sum_y2).sqrt();
        if denom > F::zero() {
            Ok(sum_xy / denom)
        } else {
            Ok(F::zero())
        }
    }

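    /// CPU fallback for batch metrics: splits the batch into chunks of
    /// `parallel_config.min_chunk_size` rows and evaluates each chunk in parallel using
    /// the SIMD metric implementations.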
    fn cpu_parallel_batch_metrics<F>(
        &self,
        y_true_batch: &ArrayView2<F>,
        y_pred_batch: &ArrayView2<F>,
        metrics: &[&str],
    ) -> Result<Vec<HashMap<String, F>>>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        use scirs2_core::parallel_ops::*;

        let batch_size = y_true_batch.nrows();
        let chunk_size = self.parallel_config.min_chunk_size;

        let results: Result<Vec<_>> = (0..batch_size)
            .collect::<Vec<_>>()
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut chunk_results = Vec::new();

                for &batch_idx in chunk {
                    let y_true_sample = y_true_batch.row(batch_idx);
                    let y_pred_sample = y_pred_batch.row(batch_idx);

                    let mut sample_results = HashMap::new();

                    for &metric in metrics {
                        let result = match metric {
                            "mse" => self.simd_mse(&y_true_sample, &y_pred_sample)?,
                            "mae" => self.simd_mae(&y_true_sample, &y_pred_sample)?,
                            "r2_score" => self.simd_r2_score(&y_true_sample, &y_pred_sample)?,
                            "correlation" => {
                                self.simd_correlation(&y_true_sample, &y_pred_sample)?
                            }
                            _ => F::zero(),
                        };
                        sample_results.insert(metric.to_string(), result);
                    }

                    chunk_results.push(sample_results);
                }

                Ok(chunk_results)
            })
            .try_reduce(Vec::new, |mut acc, chunk| {
                acc.extend(chunk);
                Ok(acc)
            });

        results
    }

    fn scalar_mse<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let mse = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p) * (t - p))
            .sum::<F>()
            / F::from(y_true.len()).unwrap();
        Ok(mse)
    }

    fn scalar_mae<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let mae = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p).abs())
            .sum::<F>()
            / F::from(y_true.len()).unwrap();
        Ok(mae)
    }

    fn scalar_r2_score<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();

        let ss_tot = y_true
            .iter()
            .map(|&t| (t - mean_true) * (t - mean_true))
            .sum::<F>();

        let ss_res = y_true
            .iter()
            .zip(ypred.iter())
            .map(|(&t, &p)| (t - p) * (t - p))
            .sum::<F>();

        if ss_tot == F::zero() {
            Ok(F::zero())
        } else {
            Ok(F::one() - ss_res / ss_tot)
        }
    }

    fn scalar_correlation<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
    where
        F: Float + std::iter::Sum,
    {
        let n = F::from(x.len()).unwrap();
        let mean_x = x.iter().cloned().sum::<F>() / n;
        let mean_y = y.iter().cloned().sum::<F>() / n;

        let mut numerator = F::zero();
        let mut sum_sq_x = F::zero();
        let mut sum_sq_y = F::zero();

        for (&xi, &yi) in x.iter().zip(y.iter()) {
            let dx = xi - mean_x;
            let dy = yi - mean_y;
            numerator = numerator + dx * dy;
            sum_sq_x = sum_sq_x + dx * dx;
            sum_sq_y = sum_sq_y + dy * dy;
        }

        let denominator = (sum_sq_x * sum_sq_y).sqrt();

        if denominator > F::zero() {
            Ok(numerator / denominator)
        } else {
            Ok(F::zero())
        }
    }

    fn scalar_log_operations<F>(&self, values: &ArrayView1<F>) -> Result<LogOperationResults<F>>
    where
        F: Float,
    {
        let mut log_values = Array1::zeros(values.len());
        let mut log_sum = F::zero();

        for (i, &val) in values.iter().enumerate() {
            if val > F::zero() {
                log_values[i] = val.ln();
                log_sum = log_sum + log_values[i];
            } else {
                log_values[i] = F::neg_infinity();
            }
        }

        let geometric_mean = if !values.is_empty() {
            (log_sum / F::from(values.len()).unwrap()).exp()
        } else {
            F::zero()
        };

        Ok(LogOperationResults {
            log_values,
            log_sum,
            log_product: log_sum,
            geometric_mean,
        })
    }

    fn scalar_determinant<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float,
    {
        let (nrows, ncols) = matrix.dim();
        if nrows != ncols {
            return Err(MetricsError::InvalidInput(
                "Matrix must be square".to_string(),
            ));
        }

        match nrows {
            1 => Ok(matrix[[0, 0]]),
            2 => Ok(matrix[[0, 0]] * matrix[[1, 1]] - matrix[[0, 1]] * matrix[[1, 0]]),
            _ => self.scalar_determinant_lu(matrix),
        }
    }

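    /// Determinant via LU decomposition with partial pivoting; returns zero when a pivot
    /// becomes numerically negligible (singular matrix).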
    fn scalar_determinant_lu<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float,
    {
        let n = matrix.nrows();
        let mut lu = matrix.clone();
        let mut det = F::one();
        let mut sign = 1;

        for i in 0..n {
            // Partial pivoting: pick the row with the largest absolute pivot.
            let mut max_row = i;
            for k in (i + 1)..n {
                if lu[[k, i]].abs() > lu[[max_row, i]].abs() {
                    max_row = k;
                }
            }

            if max_row != i {
                // Swap rows and flip the sign of the determinant.
                for j in 0..n {
                    let temp = lu[[i, j]];
                    lu[[i, j]] = lu[[max_row, j]];
                    lu[[max_row, j]] = temp;
                }
                sign *= -1;
            }

            if lu[[i, i]].abs() < F::from(1e-10).unwrap() {
                // Numerically singular matrix.
                return Ok(F::zero());
            }

            det = det * lu[[i, i]];

            // Eliminate entries below the pivot.
            for k in (i + 1)..n {
                let factor = lu[[k, i]] / lu[[i, i]];
                for j in (i + 1)..n {
                    lu[[k, j]] = lu[[k, j]] - factor * lu[[i, j]];
                }
            }
        }

        Ok(F::from(sign).unwrap() * det)
    }

    fn scalar_trace<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float,
    {
        let (nrows, ncols) = matrix.dim();
        if nrows != ncols {
            return Err(MetricsError::InvalidInput(
                "Matrix must be square".to_string(),
            ));
        }

        let mut trace = F::zero();
        for i in 0..nrows {
            trace = trace + matrix[[i, i]];
        }

        Ok(trace)
    }

    fn scalar_eigenvalue_sum_estimate<F>(&self, matrix: &Array2<F>) -> Result<F>
    where
        F: Float,
    {
        // The sum of a matrix's eigenvalues equals its trace.
        self.scalar_trace(matrix)
    }

    pub fn get_capabilities(&self) -> &PlatformCapabilities {
        &self.capabilities
    }

    pub fn get_gpu_info(&self) -> Option<&GpuInfo> {
        self.gpu_info.as_ref()
    }

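    /// Benchmarks the scalar, SIMD, and (if detected) GPU implementations of MSE over the
    /// given number of iterations, reporting raw timings and speedups relative to the
    /// scalar baseline.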
    pub fn benchmark_implementations<F>(
        &self,
        y_true: &ArrayView1<F>,
        ypred: &ArrayView1<F>,
        iterations: usize,
    ) -> Result<BenchmarkResults>
    where
        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
    {
        use std::time::Instant;

        let mut results = BenchmarkResults::new();

        let start = Instant::now();
        for _ in 0..iterations {
            let _ = self.scalar_mse(y_true, ypred)?;
        }
        let scalar_time = start.elapsed();
        results.scalar_time = scalar_time;

        if self.capabilities.simd_available {
            let start = Instant::now();
            for _ in 0..iterations {
                let _ = self.simd_mse(y_true, ypred)?;
            }
            let simd_time = start.elapsed();
            results.simd_time = Some(simd_time);
            results.simd_speedup =
                Some(scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64);
        }

        if self.gpu_info.is_some() {
            let batch = y_true.view().insert_axis(Axis(0));
            let batch_pred = ypred.view().insert_axis(Axis(0));

            let start = Instant::now();
            for _ in 0..iterations {
                let _ = self.gpu_batch_metrics(&batch.view(), &batch_pred.view(), &["mse"])?;
            }
            let gpu_time = start.elapsed();
            results.gpu_time = Some(gpu_time);
            results.gpu_speedup = Some(scalar_time.as_nanos() as f64 / gpu_time.as_nanos() as f64);
        }

        Ok(results)
    }
}

impl Default for SimdMetrics {
    fn default() -> Self {
        Self::new().unwrap_or_else(|_| Self {
            capabilities: PlatformCapabilities::detect(),
            enable_simd: false,
            gpu_info: None,
            parallel_config: ParallelConfig::default(),
        })
    }
}

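/// Timing results from [`SimdMetrics::benchmark_implementations`], comparing the scalar,
/// SIMD, and GPU code paths.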
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    pub scalar_time: std::time::Duration,
    pub simd_time: Option<std::time::Duration>,
    pub gpu_time: Option<std::time::Duration>,
    pub simd_speedup: Option<f64>,
    pub gpu_speedup: Option<f64>,
}

impl BenchmarkResults {
    pub fn new() -> Self {
        Self {
            scalar_time: std::time::Duration::default(),
            simd_time: None,
            gpu_time: None,
            simd_speedup: None,
            gpu_speedup: None,
        }
    }

    pub fn best_implementation(&self) -> &'static str {
        let scalar_nanos = self.scalar_time.as_nanos();
        let simd_nanos = self.simd_time.map(|t| t.as_nanos()).unwrap_or(u128::MAX);
        let gpu_nanos = self.gpu_time.map(|t| t.as_nanos()).unwrap_or(u128::MAX);

        if gpu_nanos < scalar_nanos && gpu_nanos < simd_nanos {
            "GPU"
        } else if simd_nanos < scalar_nanos {
            "SIMD"
        } else {
            "Scalar"
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_simd_metrics_creation() {
        let metrics = SimdMetrics::new().unwrap();
        assert!(metrics.enable_simd);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_mse() {
        let metrics = SimdMetrics::new().unwrap();
        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];

        let mse = metrics.simd_mse(&y_true.view(), &ypred.view()).unwrap();
        assert!(mse > 0.0);
        assert!(mse < 0.02);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_mae() {
        let metrics = SimdMetrics::new().unwrap();
        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];

        let mae = metrics.simd_mae(&y_true.view(), &ypred.view()).unwrap();
        assert!(mae > 0.0);
        assert!(mae < 0.15);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_r2_score() {
        let metrics = SimdMetrics::new().unwrap();
        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];

        let r2 = metrics
            .simd_r2_score(&y_true.view(), &ypred.view())
            .unwrap();
        assert!(r2 > 0.9);
        assert!(r2 <= 1.0);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_correlation() {
        let metrics = SimdMetrics::new().unwrap();
        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0];

        let corr = metrics.simd_correlation(&x.view(), &y.view()).unwrap();
        assert!((corr - 1.0).abs() < 1e-10);
    }

    #[test]
    #[ignore = "timeout"]
    fn test_gpu_batch_metrics() {
        let metrics = SimdMetrics::new().unwrap();

        let y_true_batch = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let y_pred_batch = array![[1.1, 2.1, 2.9], [4.1, 4.9, 6.1]];

        let results = metrics
            .gpu_batch_metrics(&y_true_batch.view(), &y_pred_batch.view(), &["mse", "mae"])
            .unwrap();

        assert_eq!(results.len(), 2);
        assert!(results[0].contains_key("mse"));
        assert!(results[0].contains_key("mae"));
        assert!(results[1].contains_key("mse"));
        assert!(results[1].contains_key("mae"));
    }

    #[test]
    #[ignore = "timeout"]
    fn test_benchmark_implementations() {
        let metrics = SimdMetrics::new().unwrap();
        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9, 6.1, 6.9, 8.1, 8.9, 10.1];

        let benchmark = metrics
            .benchmark_implementations(&y_true.view(), &ypred.view(), 10)
            .unwrap();

        assert!(benchmark.scalar_time.as_nanos() > 0);

        let best = benchmark.best_implementation();
        assert!(best == "Scalar" || best == "SIMD" || best == "GPU");
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_exponential_moving_average() {
        let metrics = SimdMetrics::new().unwrap();
        let values = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let alpha = 0.3;

        let ema = metrics
            .simd_exponential_moving_average(&values.view(), alpha)
            .unwrap();

        assert_eq!(ema.len(), values.len());
        assert_eq!(ema[0], values[0]);

        // For a strictly increasing series the EMA increases and never overshoots the data.
        for i in 1..ema.len() {
            assert!(ema[i] > ema[i - 1]);
            assert!(ema[i] <= values[i]);
        }
    }

    #[test]
    #[ignore = "timeout"]
    fn test_simd_log_operations() {
        let metrics = SimdMetrics::new().unwrap();
        let values = array![1.0, 2.0, 4.0, 8.0];

        let log_results = metrics.simd_log_operations(&values.view()).unwrap();

        assert_eq!(log_results.log_values.len(), values.len());

        // ln(1) = 0 and ln(2) = 0.693...
        assert!((log_results.log_values[0] - 0.0).abs() < 1e-10);
        assert!((log_results.log_values[1] - 2.0_f64.ln()).abs() < 1e-10);

        assert!(log_results.log_sum.is_finite());
        assert!(log_results.geometric_mean > 0.0);
    }

    #[test]
    fn test_simd_batch_matrix_operations() {
        let metrics = SimdMetrics::new().unwrap();

        let matrix1 = array![[1.0, 2.0], [3.0, 4.0]];
        let matrix2 = array![[5.0, 6.0], [7.0, 8.0]];
        let matrices = vec![matrix1, matrix2];

        let results = metrics
            .simd_batch_matrix_operations(&matrices, MatrixOperation::All)
            .unwrap();

        assert_eq!(results.determinants.len(), 2);
        assert_eq!(results.traces.len(), 2);
        assert_eq!(results.eigenvalue_sums.len(), 2);

        // det([[1, 2], [3, 4]]) = 1*4 - 2*3 = -2
        assert!((results.determinants[0] - (-2.0)).abs() < 1e-10);

        // trace([[1, 2], [3, 4]]) = 1 + 4 = 5
        assert!((results.traces[0] - 5.0).abs() < 1e-10);

        // trace([[5, 6], [7, 8]]) = 5 + 8 = 13
        assert!((results.traces[1] - 13.0).abs() < 1e-10);
    }

    #[test]
    fn test_cuda_detection() {
        let result = SimdMetrics::detect_cuda_capabilities();

        match result {
            Ok(gpu_info) => {
                assert!(!gpu_info.device_name.is_empty());
                assert!(gpu_info.total_memory > 0);
                assert!(gpu_info.multiprocessor_count > 0);
            }
            Err(_) => {
                // CUDA not available on this machine; nothing to check.
            }
        }
    }

    #[test]
    fn test_opencl_detection() {
        let result = SimdMetrics::detect_opencl_capabilities();

        match result {
            Ok(gpu_info) => {
                assert!(!gpu_info.device_name.is_empty());
                assert!(gpu_info.total_memory > 0);
                assert!(gpu_info.multiprocessor_count > 0);
            }
            Err(_) => {
                // OpenCL not available on this machine; nothing to check.
            }
        }
    }

    #[test]
    #[ignore = "timeout"]
    fn test_log_operation_edge_cases() {
        let metrics = SimdMetrics::new().unwrap();

        let values = array![0.0, -1.0, 1.0, 2.0];
        let log_results = metrics.simd_log_operations(&values.view()).unwrap();

        // ln(0) and ln(-1) both map to negative infinity in this implementation.
        assert!(log_results.log_values[0].is_infinite());
        assert!(log_results.log_values[1].is_infinite());
        // ln(1) = 0
        assert!((log_results.log_values[2] - 0.0).abs() < 1e-10);
        assert!(log_results.geometric_mean.is_finite() || log_results.geometric_mean == 0.0);
    }

    #[test]
    fn test_determinant_edge_cases() {
        let metrics = SimdMetrics::new().unwrap();

        let matrix1x1 = array![[5.0]];
        let det1 = metrics.simd_determinant(&matrix1x1).unwrap();
        assert!((det1 - 5.0).abs() < 1e-10);

        let identity2x2 = array![[1.0, 0.0], [0.0, 1.0]];
        let det2 = metrics.simd_determinant(&identity2x2).unwrap();
        assert!((det2 - 1.0).abs() < 1e-10);

        let singular = array![[1.0, 2.0], [2.0, 4.0]];
        let det3 = metrics.simd_determinant(&singular).unwrap();
        assert!(det3.abs() < 1e-10);
    }
}