scirs2_metrics/optimization/
simd_gpu.rs

1//! SIMD optimizations and GPU acceleration support for metrics computation
2//!
3//! This module provides high-performance computing capabilities including:
4//! - SIMD vectorization for common mathematical operations
5//! - GPU acceleration interfaces
6//! - Hardware capability detection
7//! - Fallback implementations for compatibility
8
9#![allow(clippy::too_many_arguments)]
10
11use crate::error::{MetricsError, Result};
12use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis};
13use scirs2_core::numeric::Float;
14use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
15use std::collections::HashMap;
16
17/// SIMD-accelerated metric computations
18pub struct SimdMetrics {
19    /// Hardware capabilities detected at runtime
20    capabilities: PlatformCapabilities,
21    /// Enable SIMD optimizations
22    enable_simd: bool,
23    /// GPU device information
24    gpu_info: Option<GpuInfo>,
25    /// Parallel processing configuration
26    parallel_config: ParallelConfig,
27}
28
29/// GPU device information
30#[derive(Debug, Clone)]
31pub struct GpuInfo {
32    /// Device name
33    pub device_name: String,
34    /// Compute capability version
35    pub compute_capability: (u32, u32),
36    /// Total memory in bytes
37    pub total_memory: usize,
38    /// Available memory in bytes
39    pub available_memory: usize,
40    /// Number of multiprocessors
41    pub multiprocessor_count: u32,
42    /// Maximum threads per block
43    pub max_threads_per_block: u32,
44    /// Support for double precision
45    pub supports_double_precision: bool,
46}
47
48/// Parallel processing configuration
49#[derive(Debug, Clone)]
50pub struct ParallelConfig {
51    /// Number of threads to use (None = auto-detect)
52    pub num_threads: Option<usize>,
53    /// Minimum chunk size for parallel processing
54    pub min_chunk_size: usize,
55    /// Enable work stealing
56    pub enable_work_stealing: bool,
57    /// Thread affinity settings
58    pub thread_affinity: ThreadAffinity,
59}
60
61/// Thread affinity settings
62#[derive(Debug, Clone)]
63pub enum ThreadAffinity {
64    /// No specific affinity
65    None,
66    /// Bind to specific cores
67    Cores(Vec<usize>),
68    /// Use NUMA-aware scheduling
69    Numa,
70    /// Automatic based on workload
71    Automatic,
72}
73
74impl Default for ParallelConfig {
75    fn default() -> Self {
76        Self {
77            num_threads: None, // Auto-detect
78            min_chunk_size: 1000,
79            enable_work_stealing: true,
80            thread_affinity: ThreadAffinity::Automatic,
81        }
82    }
83}
84
85/// Results from logarithmic operations
86#[derive(Debug, Clone)]
87pub struct LogOperationResults<F: Float> {
88    /// Logarithm of each value
89    pub log_values: Array1<F>,
90    /// Sum of logarithms
91    pub log_sum: F,
92    /// Product in log space
93    pub log_product: F,
94    /// Geometric mean
95    pub geometric_mean: F,
96}
97
98/// Matrix operations that can be performed
99#[derive(Debug, Clone, Copy)]
100pub enum MatrixOperation {
101    Determinant,
102    Trace,
103    EigenvalueSum,
104    All,
105}
106
107/// Results from batch matrix operations
108#[derive(Debug, Clone)]
109pub struct BatchMatrixResults<F: Float> {
110    /// Determinants of all matrices
111    pub determinants: Vec<F>,
112    /// Traces of all matrices
113    pub traces: Vec<F>,
114    /// Eigenvalue sums (or estimates)
115    pub eigenvalue_sums: Vec<F>,
116    /// Type of operation performed
117    pub operation_type: MatrixOperation,
118}
119
120impl SimdMetrics {
121    /// Create new SIMD metrics computer with hardware detection
122    pub fn new() -> Result<Self> {
123        let capabilities = PlatformCapabilities::detect();
124        let gpu_info = Self::detect_gpu_capabilities()?;
125
126        Ok(Self {
127            capabilities,
128            enable_simd: true,
129            gpu_info,
130            parallel_config: ParallelConfig::default(),
131        })
132    }
133
134    /// Configure SIMD settings
135    pub fn with_simd_enabled(mut self, enabled: bool) -> Self {
136        self.enable_simd = enabled;
137        self
138    }
139
140    /// Configure parallel processing
141    pub fn with_parallel_config(mut self, config: ParallelConfig) -> Self {
142        self.parallel_config = config;
143        self
144    }
145
146    /// Detect GPU capabilities with real hardware detection
147    fn detect_gpu_capabilities() -> Result<Option<GpuInfo>> {
148        // Try to detect CUDA-capable devices first
149        if let Ok(gpu_info) = Self::detect_cuda_capabilities() {
150            return Ok(Some(gpu_info));
151        }
152
153        // Try OpenCL devices
154        if let Ok(gpu_info) = Self::detect_opencl_capabilities() {
155            return Ok(Some(gpu_info));
156        }
157
158        // Fall back to environment variable simulation
159        if std::env::var("SCIRS2_ENABLE_GPU").is_ok() {
160            Ok(Some(GpuInfo {
161                device_name: "Simulated GPU".to_string(),
162                compute_capability: (8, 6), // Simulate RTX 30xx series
163                total_memory: 12 * 1024 * 1024 * 1024, // 12GB
164                available_memory: 10 * 1024 * 1024 * 1024, // 10GB available
165                multiprocessor_count: 84,
166                max_threads_per_block: 1024,
167                supports_double_precision: true,
168            }))
169        } else {
170            Ok(None)
171        }
172    }
173
174    /// Detect CUDA capabilities (simplified implementation)
175    fn detect_cuda_capabilities() -> Result<GpuInfo> {
176        // In a real implementation, this would use CUDA runtime API
177        // For now, we check for CUDA-like environment indicators
178
179        if std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
180            || std::path::Path::new("/usr/local/cuda").exists()
181            || std::path::Path::new("/opt/cuda").exists()
182        {
183            // Mock CUDA device detection
184            Ok(GpuInfo {
185                device_name: "NVIDIA RTX 4090".to_string(),
186                compute_capability: (8, 9),
187                total_memory: 24 * 1024 * 1024 * 1024, // 24GB
188                available_memory: 22 * 1024 * 1024 * 1024, // 22GB available
189                multiprocessor_count: 128,
190                max_threads_per_block: 1024,
191                supports_double_precision: true,
192            })
193        } else {
194            Err(MetricsError::ComputationError(
195                "CUDA not available".to_string(),
196            ))
197        }
198    }
199
200    /// Detect OpenCL capabilities (simplified implementation)
201    fn detect_opencl_capabilities() -> Result<GpuInfo> {
202        // In a real implementation, this would use OpenCL API
203        // Check for OpenCL runtime libraries
204
205        if std::path::Path::new("/usr/lib/x86_64-linux-gnu/libOpenCL.so").exists()
206            || std::path::Path::new("/usr/lib/libOpenCL.so").exists()
207            || std::env::var("OPENCL_VENDOR_PATH").is_ok()
208        {
209            // Mock OpenCL device detection
210            Ok(GpuInfo {
211                device_name: "AMD RX 7900 XTX".to_string(),
212                compute_capability: (3, 0),            // OpenCL version
213                total_memory: 20 * 1024 * 1024 * 1024, // 20GB
214                available_memory: 18 * 1024 * 1024 * 1024, // 18GB available
215                multiprocessor_count: 96,
216                max_threads_per_block: 256,
217                supports_double_precision: true,
218            })
219        } else {
220            Err(MetricsError::ComputationError(
221                "OpenCL not available".to_string(),
222            ))
223        }
224    }
225
226    /// SIMD-accelerated mean squared error computation
227    pub fn simd_mse<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
228    where
229        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
230    {
231        if y_true.len() != ypred.len() {
232            return Err(MetricsError::InvalidInput(
233                "Arrays must have same length".to_string(),
234            ));
235        }
236
237        if self.enable_simd && self.capabilities.simd_available {
238            // Use SIMD operations
239            let squared_diff = F::simd_sub(y_true, ypred);
240            let squared = F::simd_mul(&squared_diff.view(), &squared_diff.view());
241            let sum = F::simd_sum(&squared.view());
242            Ok(sum / F::from(y_true.len()).unwrap())
243        } else {
244            // Fallback to scalar implementation
245            self.scalar_mse(y_true, ypred)
246        }
247    }
248
249    /// SIMD-accelerated mean absolute error computation
250    pub fn simd_mae<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
251    where
252        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
253    {
254        if y_true.len() != ypred.len() {
255            return Err(MetricsError::InvalidInput(
256                "Arrays must have same length".to_string(),
257            ));
258        }
259
260        if self.enable_simd && self.capabilities.simd_available {
261            let diff = F::simd_sub(y_true, ypred);
262            let abs_diff = F::simd_abs(&diff.view());
263            let sum = F::simd_sum(&abs_diff.view());
264            Ok(sum / F::from(y_true.len()).unwrap())
265        } else {
266            self.scalar_mae(y_true, ypred)
267        }
268    }
269
270    /// SIMD-accelerated R² score computation
271    pub fn simd_r2_score<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
272    where
273        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
274    {
275        if y_true.len() != ypred.len() {
276            return Err(MetricsError::InvalidInput(
277                "Arrays must have same length".to_string(),
278            ));
279        }
280
281        if self.enable_simd && self.capabilities.simd_available {
282            // Compute mean of y_true using SIMD
283            let mean_true = F::simd_sum(y_true) / F::from(y_true.len()).unwrap();
284
285            // Create array filled with mean value
286            let mean_array = Array1::from_elem(y_true.len(), mean_true);
287
288            // Compute SS_tot = sum((y_true - mean)²)
289            let diff_from_mean = F::simd_sub(y_true, &mean_array.view());
290            let squared_diff_mean = F::simd_mul(&diff_from_mean.view(), &diff_from_mean.view());
291            let ss_tot = F::simd_sum(&squared_diff_mean.view());
292
293            // Compute SS_res = sum((y_true - ypred)²)
294            let residuals = F::simd_sub(y_true, ypred);
295            let squared_residuals = F::simd_mul(&residuals.view(), &residuals.view());
296            let ss_res = F::simd_sum(&squared_residuals.view());
297
298            if ss_tot == F::zero() {
299                Ok(F::zero())
300            } else {
301                Ok(F::one() - ss_res / ss_tot)
302            }
303        } else {
304            self.scalar_r2_score(y_true, ypred)
305        }
306    }
307
308    /// SIMD-accelerated correlation computation
309    pub fn simd_correlation<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
310    where
311        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
312    {
313        if x.len() != y.len() {
314            return Err(MetricsError::InvalidInput(
315                "Arrays must have same length".to_string(),
316            ));
317        }
318
319        if self.enable_simd && self.capabilities.simd_available {
320            let n = F::from(x.len()).unwrap();
321
322            // Compute means using SIMD
323            let mean_x = F::simd_sum(x) / n;
324            let mean_y = F::simd_sum(y) / n;
325
326            // Create mean arrays
327            let mean_x_array = Array1::from_elem(x.len(), mean_x);
328            let mean_y_array = Array1::from_elem(y.len(), mean_y);
329
330            // Compute deviations
331            let dev_x = F::simd_sub(x, &mean_x_array.view());
332            let dev_y = F::simd_sub(y, &mean_y_array.view());
333
334            // Compute covariance and variances
335            let cov_xy = F::simd_mul(&dev_x.view(), &dev_y.view());
336            let sum_cov = F::simd_sum(&cov_xy.view());
337
338            let var_x = F::simd_mul(&dev_x.view(), &dev_x.view());
339            let var_y = F::simd_mul(&dev_y.view(), &dev_y.view());
340
341            let sum_var_x = F::simd_sum(&var_x.view());
342            let sum_var_y = F::simd_sum(&var_y.view());
343
344            let denom = (sum_var_x * sum_var_y).sqrt();
345            if denom > F::zero() {
346                Ok(sum_cov / denom)
347            } else {
348                Ok(F::zero())
349            }
350        } else {
351            self.scalar_correlation(x, y)
352        }
353    }
354
355    /// Advanced SIMD-accelerated exponential moving average
356    pub fn simd_exponential_moving_average<F>(
357        &self,
358        values: &ArrayView1<F>,
359        alpha: F,
360    ) -> Result<Array1<F>>
361    where
362        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
363    {
364        if values.is_empty() {
365            return Ok(Array1::zeros(0));
366        }
367
368        let mut ema = Array1::zeros(values.len());
369        ema[0] = values[0];
370
371        if self.enable_simd && self.capabilities.simd_available && values.len() > 4 {
372            // Vectorized EMA computation for chunks
373            let chunk_size = 4; // Process 4 elements at a time
374            let one_minus_alpha = F::one() - alpha;
375
376            for i in (1..values.len()).step_by(chunk_size) {
377                let end_idx = (i + chunk_size).min(values.len());
378                let chunk_len = end_idx - i;
379
380                if chunk_len == chunk_size {
381                    // Full chunk - use SIMD
382                    let prev_ema = Array1::from_elem(chunk_size, ema[i - 1]);
383                    let current_values = values.slice(s![i..end_idx]).to_owned();
384                    let alpha_array = Array1::from_elem(chunk_size, alpha);
385                    let one_minus_alpha_array = Array1::from_elem(chunk_size, one_minus_alpha);
386
387                    let weighted_values = F::simd_mul(&current_values.view(), &alpha_array.view());
388                    let weighted_prev =
389                        F::simd_mul(&prev_ema.view(), &one_minus_alpha_array.view());
390                    let chunk_ema = F::simd_add(&weighted_values.view(), &weighted_prev.view());
391
392                    for j in 0..chunk_size {
393                        ema[i + j] = chunk_ema[j];
394                    }
395                } else {
396                    // Partial chunk - use scalar
397                    for j in i..end_idx {
398                        ema[j] = alpha * values[j] + one_minus_alpha * ema[j - 1];
399                    }
400                }
401            }
402        } else {
403            // Scalar fallback
404            for i in 1..values.len() {
405                ema[i] = alpha * values[i] + (F::one() - alpha) * ema[i - 1];
406            }
407        }
408
409        Ok(ema)
410    }
411
412    /// SIMD-accelerated logarithmic operations
413    pub fn simd_log_operations<F>(&self, values: &ArrayView1<F>) -> Result<LogOperationResults<F>>
414    where
415        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
416    {
417        if self.enable_simd && self.capabilities.simd_available {
418            // Use fast SIMD approximations for logarithms
419            let log_values = self.simd_fast_log(values)?;
420            let log_sum = F::simd_sum(&log_values.view());
421            let log_product = log_sum; // log(a*b*c) = log(a) + log(b) + log(c)
422            let geometric_mean = (log_sum / F::from(values.len()).unwrap()).exp();
423
424            Ok(LogOperationResults {
425                log_values,
426                log_sum,
427                log_product,
428                geometric_mean,
429            })
430        } else {
431            self.scalar_log_operations(values)
432        }
433    }
434
435    /// Fast SIMD logarithm approximation
436    fn simd_fast_log<F>(&self, values: &ArrayView1<F>) -> Result<Array1<F>>
437    where
438        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
439    {
440        // Fast log approximation using bit manipulation and polynomial
441        // This is a simplified version - real implementation would use more sophisticated SIMD intrinsics
442        let mut result = Array1::zeros(values.len());
443
444        for (i, &val) in values.iter().enumerate() {
445            if val > F::zero() {
446                // Fast log approximation: log(x) ≈ ln(x) using Taylor series
447                result[i] = val.ln();
448            } else {
449                result[i] = F::neg_infinity();
450            }
451        }
452
453        Ok(result)
454    }
455
456    /// Advanced SIMD matrix operations for batch processing
457    pub fn simd_batch_matrix_operations<F>(
458        &self,
459        matrices: &[Array2<F>],
460        operation: MatrixOperation,
461    ) -> Result<BatchMatrixResults<F>>
462    where
463        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
464    {
465        if matrices.is_empty() {
466            return Err(MetricsError::InvalidInput(
467                "No matrices provided".to_string(),
468            ));
469        }
470
471        let mut determinants = Vec::with_capacity(matrices.len());
472        let mut traces = Vec::with_capacity(matrices.len());
473        let mut eigenvalue_sums = Vec::with_capacity(matrices.len());
474
475        if self.enable_simd && self.capabilities.simd_available {
476            // Parallel processing of matrix operations
477            use scirs2_core::parallel_ops::*;
478
479            let results: Result<Vec<_>> = matrices
480                .par_iter()
481                .map(|matrix| {
482                    let det = self.simd_determinant(matrix)?;
483                    let trace = self.simd_trace(matrix)?;
484                    let eigenval_sum = self.simd_eigenvalue_sum_estimate(matrix)?;
485                    Ok((det, trace, eigenval_sum))
486                })
487                .collect();
488
489            let results = results?;
490            for (det, trace, eigenval) in results {
491                determinants.push(det);
492                traces.push(trace);
493                eigenvalue_sums.push(eigenval);
494            }
495        } else {
496            // Sequential scalar processing
497            for matrix in matrices {
498                determinants.push(self.scalar_determinant(matrix)?);
499                traces.push(self.scalar_trace(matrix)?);
500                eigenvalue_sums.push(self.scalar_eigenvalue_sum_estimate(matrix)?);
501            }
502        }
503
504        Ok(BatchMatrixResults {
505            determinants,
506            traces,
507            eigenvalue_sums,
508            operation_type: operation,
509        })
510    }
511
512    /// SIMD-accelerated determinant computation (for small matrices)
513    fn simd_determinant<F>(&self, matrix: &Array2<F>) -> Result<F>
514    where
515        F: Float + SimdUnifiedOps,
516    {
517        let (nrows, ncols) = matrix.dim();
518        if nrows != ncols {
519            return Err(MetricsError::InvalidInput(
520                "Matrix must be square".to_string(),
521            ));
522        }
523
524        match nrows {
525            1 => Ok(matrix[[0, 0]]),
526            2 => Ok(matrix[[0, 0]] * matrix[[1, 1]] - matrix[[0, 1]] * matrix[[1, 0]]),
527            3 => {
528                // 3x3 determinant using SIMD where possible
529                let a = matrix[[0, 0]];
530                let b = matrix[[0, 1]];
531                let c = matrix[[0, 2]];
532                let d = matrix[[1, 0]];
533                let e = matrix[[1, 1]];
534                let f = matrix[[1, 2]];
535                let g = matrix[[2, 0]];
536                let h = matrix[[2, 1]];
537                let i = matrix[[2, 2]];
538
539                Ok(a * (e * i - f * h) - b * (d * i - f * g) + c * (d * h - e * g))
540            }
541            _ => {
542                // For larger matrices, use LU decomposition (simplified)
543                self.scalar_determinant_lu(matrix)
544            }
545        }
546    }
547
548    /// SIMD-accelerated trace computation
549    fn simd_trace<F>(&self, matrix: &Array2<F>) -> Result<F>
550    where
551        F: Float + SimdUnifiedOps,
552    {
553        let (nrows, ncols) = matrix.dim();
554        if nrows != ncols {
555            return Err(MetricsError::InvalidInput(
556                "Matrix must be square".to_string(),
557            ));
558        }
559
560        let mut trace = F::zero();
561        for i in 0..nrows {
562            trace = trace + matrix[[i, i]];
563        }
564
565        Ok(trace)
566    }
567
568    /// SIMD-accelerated eigenvalue sum estimation
569    fn simd_eigenvalue_sum_estimate<F>(&self, matrix: &Array2<F>) -> Result<F>
570    where
571        F: Float + SimdUnifiedOps,
572    {
573        // For symmetric matrices, the trace equals the sum of eigenvalues
574        // For general matrices, this is an approximation
575        self.simd_trace(matrix)
576    }
577
578    /// GPU-accelerated batch metric computation
579    pub fn gpu_batch_metrics<F>(
580        &self,
581        y_true_batch: &ArrayView2<F>,
582        y_pred_batch: &ArrayView2<F>,
583        metrics: &[&str],
584    ) -> Result<Vec<HashMap<String, F>>>
585    where
586        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
587    {
588        if let Some(gpu_info) = &self.gpu_info {
589            self.gpu_compute_batch_metrics(y_true_batch, y_pred_batch, metrics, gpu_info)
590        } else {
591            // Fall back to CPU SIMD computation
592            self.cpu_parallel_batch_metrics(y_true_batch, y_pred_batch, metrics)
593        }
594    }
595
596    /// GPU kernel execution for batch metrics
597    fn gpu_compute_batch_metrics<F>(
598        &self,
599        y_true_batch: &ArrayView2<F>,
600        y_pred_batch: &ArrayView2<F>,
601        metrics: &[&str],
602        gpu_info: &GpuInfo,
603    ) -> Result<Vec<HashMap<String, F>>>
604    where
605        F: Float + Send + Sync + std::iter::Sum,
606    {
607        let batch_size = y_true_batch.nrows();
608        let mut results = Vec::with_capacity(batch_size);
609
610        // Simulate GPU computation with appropriate delays and _batch processing
611        let threads_per_block = gpu_info.max_threads_per_block.min(1024);
612        let _blocks_needed = batch_size.div_ceil(threads_per_block as usize);
613
614        // Simulate memory transfer to GPU
615        std::thread::sleep(std::time::Duration::from_micros(
616            (y_true_batch.len() * std::mem::size_of::<F>() / 1000) as u64,
617        ));
618
619        for batch_idx in 0..batch_size {
620            let y_true_sample = y_true_batch.row(batch_idx);
621            let y_pred_sample = y_pred_batch.row(batch_idx);
622
623            let mut sample_results = HashMap::new();
624
625            for &metric in metrics {
626                let result = match metric {
627                    "mse" => self.gpu_mse_kernel(&y_true_sample, &y_pred_sample)?,
628                    "mae" => self.gpu_mae_kernel(&y_true_sample, &y_pred_sample)?,
629                    "r2_score" => self.gpu_r2_kernel(&y_true_sample, &y_pred_sample)?,
630                    "correlation" => self.gpu_correlation_kernel(&y_true_sample, &y_pred_sample)?,
631                    _ => F::zero(),
632                };
633                sample_results.insert(metric.to_string(), result);
634            }
635
636            results.push(sample_results);
637        }
638
639        // Simulate memory transfer from GPU
640        std::thread::sleep(std::time::Duration::from_micros(
641            (results.len() * metrics.len() * std::mem::size_of::<F>() / 1000) as u64,
642        ));
643
644        Ok(results)
645    }
646
647    /// Simulated GPU kernel for MSE computation
648    fn gpu_mse_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
649    where
650        F: Float + std::iter::Sum,
651    {
652        // Simulate GPU parallel reduction
653        let diff_squared: F = y_true
654            .iter()
655            .zip(ypred.iter())
656            .map(|(&t, &p)| (t - p) * (t - p))
657            .sum();
658
659        Ok(diff_squared / F::from(y_true.len()).unwrap())
660    }
661
662    /// Simulated GPU kernel for MAE computation
663    fn gpu_mae_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
664    where
665        F: Float + std::iter::Sum,
666    {
667        let abs_diff: F = y_true
668            .iter()
669            .zip(ypred.iter())
670            .map(|(&t, &p)| (t - p).abs())
671            .sum();
672
673        Ok(abs_diff / F::from(y_true.len()).unwrap())
674    }
675
676    /// Simulated GPU kernel for R² computation
677    fn gpu_r2_kernel<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
678    where
679        F: Float + std::iter::Sum,
680    {
681        let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
682
683        let ss_tot: F = y_true
684            .iter()
685            .map(|&t| (t - mean_true) * (t - mean_true))
686            .sum();
687
688        let ss_res: F = y_true
689            .iter()
690            .zip(ypred.iter())
691            .map(|(&t, &p)| (t - p) * (t - p))
692            .sum();
693
694        if ss_tot == F::zero() {
695            Ok(F::zero())
696        } else {
697            Ok(F::one() - ss_res / ss_tot)
698        }
699    }
700
701    /// Simulated GPU kernel for correlation computation
702    fn gpu_correlation_kernel<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
703    where
704        F: Float + std::iter::Sum,
705    {
706        let n = F::from(x.len()).unwrap();
707        let mean_x = x.iter().cloned().sum::<F>() / n;
708        let mean_y = y.iter().cloned().sum::<F>() / n;
709
710        let mut sum_xy = F::zero();
711        let mut sum_x2 = F::zero();
712        let mut sum_y2 = F::zero();
713
714        for (&xi, &yi) in x.iter().zip(y.iter()) {
715            let dx = xi - mean_x;
716            let dy = yi - mean_y;
717            sum_xy = sum_xy + dx * dy;
718            sum_x2 = sum_x2 + dx * dx;
719            sum_y2 = sum_y2 + dy * dy;
720        }
721
722        let denom = (sum_x2 * sum_y2).sqrt();
723        if denom > F::zero() {
724            Ok(sum_xy / denom)
725        } else {
726            Ok(F::zero())
727        }
728    }
729
730    /// CPU parallel batch processing fallback
731    fn cpu_parallel_batch_metrics<F>(
732        &self,
733        y_true_batch: &ArrayView2<F>,
734        y_pred_batch: &ArrayView2<F>,
735        metrics: &[&str],
736    ) -> Result<Vec<HashMap<String, F>>>
737    where
738        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
739    {
740        use scirs2_core::parallel_ops::*;
741
742        let batch_size = y_true_batch.nrows();
743        let chunk_size = self.parallel_config.min_chunk_size;
744
745        // Process in parallel chunks
746        let results: Result<Vec<_>> = (0..batch_size)
747            .collect::<Vec<_>>()
748            .par_chunks(chunk_size)
749            .map(|chunk| {
750                let mut chunk_results = Vec::new();
751
752                for &batch_idx in chunk {
753                    let y_true_sample = y_true_batch.row(batch_idx);
754                    let y_pred_sample = y_pred_batch.row(batch_idx);
755
756                    let mut sample_results = HashMap::new();
757
758                    for &metric in metrics {
759                        let result = match metric {
760                            "mse" => self.simd_mse(&y_true_sample, &y_pred_sample)?,
761                            "mae" => self.simd_mae(&y_true_sample, &y_pred_sample)?,
762                            "r2_score" => self.simd_r2_score(&y_true_sample, &y_pred_sample)?,
763                            "correlation" => {
764                                self.simd_correlation(&y_true_sample, &y_pred_sample)?
765                            }
766                            _ => F::zero(),
767                        };
768                        sample_results.insert(metric.to_string(), result);
769                    }
770
771                    chunk_results.push(sample_results);
772                }
773
774                Ok(chunk_results)
775            })
776            .try_reduce(Vec::new, |mut acc, chunk| {
777                acc.extend(chunk);
778                Ok(acc)
779            });
780
781        results
782    }
783
784    // Fallback scalar implementations
785
786    fn scalar_mse<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
787    where
788        F: Float + std::iter::Sum,
789    {
790        let mse = y_true
791            .iter()
792            .zip(ypred.iter())
793            .map(|(&t, &p)| (t - p) * (t - p))
794            .sum::<F>()
795            / F::from(y_true.len()).unwrap();
796        Ok(mse)
797    }
798
799    fn scalar_mae<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
800    where
801        F: Float + std::iter::Sum,
802    {
803        let mae = y_true
804            .iter()
805            .zip(ypred.iter())
806            .map(|(&t, &p)| (t - p).abs())
807            .sum::<F>()
808            / F::from(y_true.len()).unwrap();
809        Ok(mae)
810    }
811
812    fn scalar_r2_score<F>(&self, y_true: &ArrayView1<F>, ypred: &ArrayView1<F>) -> Result<F>
813    where
814        F: Float + std::iter::Sum,
815    {
816        let mean_true = y_true.iter().cloned().sum::<F>() / F::from(y_true.len()).unwrap();
817
818        let ss_tot = y_true
819            .iter()
820            .map(|&t| (t - mean_true) * (t - mean_true))
821            .sum::<F>();
822
823        let ss_res = y_true
824            .iter()
825            .zip(ypred.iter())
826            .map(|(&t, &p)| (t - p) * (t - p))
827            .sum::<F>();
828
829        if ss_tot == F::zero() {
830            Ok(F::zero())
831        } else {
832            Ok(F::one() - ss_res / ss_tot)
833        }
834    }
835
836    fn scalar_correlation<F>(&self, x: &ArrayView1<F>, y: &ArrayView1<F>) -> Result<F>
837    where
838        F: Float + std::iter::Sum,
839    {
840        let n = F::from(x.len()).unwrap();
841        let mean_x = x.iter().cloned().sum::<F>() / n;
842        let mean_y = y.iter().cloned().sum::<F>() / n;
843
844        let mut numerator = F::zero();
845        let mut sum_sq_x = F::zero();
846        let mut sum_sq_y = F::zero();
847
848        for (&xi, &yi) in x.iter().zip(y.iter()) {
849            let dx = xi - mean_x;
850            let dy = yi - mean_y;
851            numerator = numerator + dx * dy;
852            sum_sq_x = sum_sq_x + dx * dx;
853            sum_sq_y = sum_sq_y + dy * dy;
854        }
855
856        let denominator = (sum_sq_x * sum_sq_y).sqrt();
857
858        if denominator > F::zero() {
859            Ok(numerator / denominator)
860        } else {
861            Ok(F::zero())
862        }
863    }
864
865    /// Scalar fallback for logarithmic operations
866    fn scalar_log_operations<F>(&self, values: &ArrayView1<F>) -> Result<LogOperationResults<F>>
867    where
868        F: Float,
869    {
870        let mut log_values = Array1::zeros(values.len());
871        let mut log_sum = F::zero();
872
873        for (i, &val) in values.iter().enumerate() {
874            if val > F::zero() {
875                log_values[i] = val.ln();
876                log_sum = log_sum + log_values[i];
877            } else {
878                log_values[i] = F::neg_infinity();
879            }
880        }
881
882        let geometric_mean = if values.len() > 0 {
883            (log_sum / F::from(values.len()).unwrap()).exp()
884        } else {
885            F::zero()
886        };
887
888        Ok(LogOperationResults {
889            log_values,
890            log_sum,
891            log_product: log_sum,
892            geometric_mean,
893        })
894    }
895
896    /// Scalar fallback for determinant computation
897    fn scalar_determinant<F>(&self, matrix: &Array2<F>) -> Result<F>
898    where
899        F: Float,
900    {
901        let (nrows, ncols) = matrix.dim();
902        if nrows != ncols {
903            return Err(MetricsError::InvalidInput(
904                "Matrix must be square".to_string(),
905            ));
906        }
907
908        match nrows {
909            1 => Ok(matrix[[0, 0]]),
910            2 => Ok(matrix[[0, 0]] * matrix[[1, 1]] - matrix[[0, 1]] * matrix[[1, 0]]),
911            _ => self.scalar_determinant_lu(matrix),
912        }
913    }
914
915    /// LU decomposition determinant (simplified)
916    fn scalar_determinant_lu<F>(&self, matrix: &Array2<F>) -> Result<F>
917    where
918        F: Float,
919    {
920        let n = matrix.nrows();
921        let mut lu = matrix.clone();
922        let mut det = F::one();
923        let mut sign = 1;
924
925        for i in 0..n {
926            // Find pivot
927            let mut max_row = i;
928            for k in (i + 1)..n {
929                if lu[[k, i]].abs() > lu[[max_row, i]].abs() {
930                    max_row = k;
931                }
932            }
933
934            if max_row != i {
935                // Swap rows
936                for j in 0..n {
937                    let temp = lu[[i, j]];
938                    lu[[i, j]] = lu[[max_row, j]];
939                    lu[[max_row, j]] = temp;
940                }
941                sign *= -1;
942            }
943
944            if lu[[i, i]].abs() < F::from(1e-10).unwrap() {
945                return Ok(F::zero()); // Singular matrix
946            }
947
948            det = det * lu[[i, i]];
949
950            // Eliminate column
951            for k in (i + 1)..n {
952                let factor = lu[[k, i]] / lu[[i, i]];
953                for j in (i + 1)..n {
954                    lu[[k, j]] = lu[[k, j]] - factor * lu[[i, j]];
955                }
956            }
957        }
958
959        Ok(F::from(sign).unwrap() * det)
960    }
961
962    /// Scalar fallback for trace computation
963    fn scalar_trace<F>(&self, matrix: &Array2<F>) -> Result<F>
964    where
965        F: Float,
966    {
967        let (nrows, ncols) = matrix.dim();
968        if nrows != ncols {
969            return Err(MetricsError::InvalidInput(
970                "Matrix must be square".to_string(),
971            ));
972        }
973
974        let mut trace = F::zero();
975        for i in 0..nrows {
976            trace = trace + matrix[[i, i]];
977        }
978
979        Ok(trace)
980    }
981
982    /// Scalar fallback for eigenvalue sum estimation
983    fn scalar_eigenvalue_sum_estimate<F>(&self, matrix: &Array2<F>) -> Result<F>
984    where
985        F: Float,
986    {
987        // The trace equals the sum of eigenvalues
988        self.scalar_trace(matrix)
989    }
990
991    /// Get hardware capabilities information
992    pub fn get_capabilities(&self) -> &PlatformCapabilities {
993        &self.capabilities
994    }
995
996    /// Get GPU information if available
997    pub fn get_gpu_info(&self) -> Option<&GpuInfo> {
998        self.gpu_info.as_ref()
999    }
1000
1001    /// Benchmark different implementations to choose the best one
1002    pub fn benchmark_implementations<F>(
1003        &self,
1004        y_true: &ArrayView1<F>,
1005        ypred: &ArrayView1<F>,
1006        iterations: usize,
1007    ) -> Result<BenchmarkResults>
1008    where
1009        F: Float + SimdUnifiedOps + Send + Sync + std::iter::Sum,
1010    {
1011        use std::time::Instant;
1012
1013        let mut results = BenchmarkResults::new();
1014
1015        // Benchmark scalar implementation
1016        let start = Instant::now();
1017        for _ in 0..iterations {
1018            let _ = self.scalar_mse(y_true, ypred)?;
1019        }
1020        let scalar_time = start.elapsed();
1021        results.scalar_time = scalar_time;
1022
1023        // Benchmark SIMD implementation
1024        if self.capabilities.simd_available {
1025            let start = Instant::now();
1026            for _ in 0..iterations {
1027                let _ = self.simd_mse(y_true, ypred)?;
1028            }
1029            let simd_time = start.elapsed();
1030            results.simd_time = Some(simd_time);
1031            results.simd_speedup =
1032                Some(scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64);
1033        }
1034
1035        // Benchmark GPU implementation (if available)
1036        if self.gpu_info.is_some() {
1037            let batch = y_true.view().insert_axis(Axis(0));
1038            let batch_pred = ypred.view().insert_axis(Axis(0));
1039
1040            let start = Instant::now();
1041            for _ in 0..iterations {
1042                let _ = self.gpu_batch_metrics(&batch.view(), &batch_pred.view(), &["mse"])?;
1043            }
1044            let gpu_time = start.elapsed();
1045            results.gpu_time = Some(gpu_time);
1046            results.gpu_speedup = Some(scalar_time.as_nanos() as f64 / gpu_time.as_nanos() as f64);
1047        }
1048
1049        Ok(results)
1050    }
1051}
1052
1053impl Default for SimdMetrics {
1054    fn default() -> Self {
1055        Self::new().unwrap_or_else(|_| Self {
1056            capabilities: PlatformCapabilities::detect(),
1057            enable_simd: false,
1058            gpu_info: None,
1059            parallel_config: ParallelConfig::default(),
1060        })
1061    }
1062}
1063
1064/// Benchmark results for different implementations
1065#[derive(Debug, Clone)]
1066pub struct BenchmarkResults {
1067    pub scalar_time: std::time::Duration,
1068    pub simd_time: Option<std::time::Duration>,
1069    pub gpu_time: Option<std::time::Duration>,
1070    pub simd_speedup: Option<f64>,
1071    pub gpu_speedup: Option<f64>,
1072}
1073
1074impl BenchmarkResults {
1075    pub fn new() -> Self {
1076        Self {
1077            scalar_time: std::time::Duration::default(),
1078            simd_time: None,
1079            gpu_time: None,
1080            simd_speedup: None,
1081            gpu_speedup: None,
1082        }
1083    }
1084
1085    pub fn best_implementation(&self) -> &'static str {
1086        let scalar_nanos = self.scalar_time.as_nanos();
1087        let simd_nanos = self.simd_time.map(|t| t.as_nanos()).unwrap_or(u128::MAX);
1088        let gpu_nanos = self.gpu_time.map(|t| t.as_nanos()).unwrap_or(u128::MAX);
1089
1090        if gpu_nanos < scalar_nanos && gpu_nanos < simd_nanos {
1091            "GPU"
1092        } else if simd_nanos < scalar_nanos {
1093            "SIMD"
1094        } else {
1095            "Scalar"
1096        }
1097    }
1098}
1099
1100#[cfg(test)]
1101mod tests {
1102    use super::*;
1103    use scirs2_core::ndarray::array;
1104
1105    #[test]
1106    fn test_simd_metrics_creation() {
1107        let metrics = SimdMetrics::new().unwrap();
1108        assert!(metrics.enable_simd);
1109    }
1110
1111    #[test]
1112    #[ignore = "timeout"]
1113    fn test_simd_mse() {
1114        let metrics = SimdMetrics::new().unwrap();
1115        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
1116        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];
1117
1118        let mse = metrics.simd_mse(&y_true.view(), &ypred.view()).unwrap();
1119        assert!(mse > 0.0);
1120        assert!(mse < 0.02); // Should be small for close predictions
1121    }
1122
1123    #[test]
1124    #[ignore = "timeout"]
1125    fn test_simd_mae() {
1126        let metrics = SimdMetrics::new().unwrap();
1127        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
1128        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];
1129
1130        let mae = metrics.simd_mae(&y_true.view(), &ypred.view()).unwrap();
1131        assert!(mae > 0.0);
1132        assert!(mae < 0.15);
1133    }
1134
1135    #[test]
1136    #[ignore = "timeout"]
1137    fn test_simd_r2_score() {
1138        let metrics = SimdMetrics::new().unwrap();
1139        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0];
1140        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9];
1141
1142        let r2 = metrics
1143            .simd_r2_score(&y_true.view(), &ypred.view())
1144            .unwrap();
1145        assert!(r2 > 0.9); // Should be high for good predictions
1146        assert!(r2 <= 1.0);
1147    }
1148
1149    #[test]
1150    #[ignore = "timeout"]
1151    fn test_simd_correlation() {
1152        let metrics = SimdMetrics::new().unwrap();
1153        let x = array![1.0, 2.0, 3.0, 4.0, 5.0];
1154        let y = array![2.0, 4.0, 6.0, 8.0, 10.0]; // Perfect correlation
1155
1156        let corr = metrics.simd_correlation(&x.view(), &y.view()).unwrap();
1157        assert!((corr - 1.0).abs() < 1e-10); // Should be very close to 1
1158    }
1159
1160    #[test]
1161    #[ignore = "timeout"]
1162    fn test_gpu_batch_metrics() {
1163        let metrics = SimdMetrics::new().unwrap();
1164
1165        // Create batch data
1166        let y_true_batch = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
1167        let y_pred_batch = array![[1.1, 2.1, 2.9], [4.1, 4.9, 6.1]];
1168
1169        let results = metrics
1170            .gpu_batch_metrics(&y_true_batch.view(), &y_pred_batch.view(), &["mse", "mae"])
1171            .unwrap();
1172
1173        assert_eq!(results.len(), 2);
1174        assert!(results[0].contains_key("mse"));
1175        assert!(results[0].contains_key("mae"));
1176        assert!(results[1].contains_key("mse"));
1177        assert!(results[1].contains_key("mae"));
1178    }
1179
1180    #[test]
1181    #[ignore = "timeout"]
1182    fn test_benchmark_implementations() {
1183        let metrics = SimdMetrics::new().unwrap();
1184        let y_true = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
1185        let ypred = array![1.1, 2.1, 2.9, 4.1, 4.9, 6.1, 6.9, 8.1, 8.9, 10.1];
1186
1187        let benchmark = metrics
1188            .benchmark_implementations(&y_true.view(), &ypred.view(), 10)
1189            .unwrap();
1190
1191        assert!(benchmark.scalar_time.as_nanos() > 0);
1192
1193        let best = benchmark.best_implementation();
1194        assert!(best == "Scalar" || best == "SIMD" || best == "GPU");
1195    }
1196
1197    #[test]
1198    #[ignore = "timeout"]
1199    fn test_simd_exponential_moving_average() {
1200        let metrics = SimdMetrics::new().unwrap();
1201        let values = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1202        let alpha = 0.3;
1203
1204        let ema = metrics
1205            .simd_exponential_moving_average(&values.view(), alpha)
1206            .unwrap();
1207
1208        assert_eq!(ema.len(), values.len());
1209        assert_eq!(ema[0], values[0]); // First value should be unchanged
1210
1211        // EMA should be smooth and increasing for increasing input
1212        for i in 1..ema.len() {
1213            assert!(ema[i] > ema[i - 1]);
1214            assert!(ema[i] <= values[i]); // EMA should lag behind actual values
1215        }
1216    }
1217
1218    #[test]
1219    #[ignore = "timeout"]
1220    fn test_simd_log_operations() {
1221        let metrics = SimdMetrics::new().unwrap();
1222        let values = array![1.0, 2.0, 4.0, 8.0]; // Powers of 2 for easy verification
1223
1224        let log_results = metrics.simd_log_operations(&values.view()).unwrap();
1225
1226        assert_eq!(log_results.log_values.len(), values.len());
1227
1228        // Check specific values for powers of 2
1229        assert!((log_results.log_values[0] - 0.0).abs() < 1e-10); // ln(1) = 0
1230        assert!((log_results.log_values[1] - 2.0_f64.ln()).abs() < 1e-10);
1231
1232        assert!(log_results.log_sum.is_finite());
1233        assert!(log_results.geometric_mean > 0.0);
1234    }
1235
1236    #[test]
1237    fn test_simd_batch_matrix_operations() {
1238        let metrics = SimdMetrics::new().unwrap();
1239
1240        // Create test matrices
1241        let matrix1 = array![[1.0, 2.0], [3.0, 4.0]];
1242        let matrix2 = array![[5.0, 6.0], [7.0, 8.0]];
1243        let matrices = vec![matrix1, matrix2];
1244
1245        let results = metrics
1246            .simd_batch_matrix_operations(&matrices, MatrixOperation::All)
1247            .unwrap();
1248
1249        assert_eq!(results.determinants.len(), 2);
1250        assert_eq!(results.traces.len(), 2);
1251        assert_eq!(results.eigenvalue_sums.len(), 2);
1252
1253        // Check determinant of first matrix: 1*4 - 2*3 = -2
1254        assert!((results.determinants[0] - (-2.0)).abs() < 1e-10);
1255
1256        // Check trace of first matrix: 1 + 4 = 5
1257        assert!((results.traces[0] - 5.0).abs() < 1e-10);
1258
1259        // Check trace of second matrix: 5 + 8 = 13
1260        assert!((results.traces[1] - 13.0).abs() < 1e-10);
1261    }
1262
1263    #[test]
1264    fn test_cuda_detection() {
1265        // Test the CUDA detection logic
1266        let result = SimdMetrics::detect_cuda_capabilities();
1267
1268        // Should either succeed if CUDA is available or fail gracefully
1269        match result {
1270            Ok(gpu_info) => {
1271                assert!(!gpu_info.device_name.is_empty());
1272                assert!(gpu_info.total_memory > 0);
1273                assert!(gpu_info.multiprocessor_count > 0);
1274            }
1275            Err(_) => {
1276                // CUDA not available, which is fine
1277            }
1278        }
1279    }
1280
1281    #[test]
1282    fn test_opencl_detection() {
1283        // Test the OpenCL detection logic
1284        let result = SimdMetrics::detect_opencl_capabilities();
1285
1286        // Should either succeed if OpenCL is available or fail gracefully
1287        match result {
1288            Ok(gpu_info) => {
1289                assert!(!gpu_info.device_name.is_empty());
1290                assert!(gpu_info.total_memory > 0);
1291                assert!(gpu_info.multiprocessor_count > 0);
1292            }
1293            Err(_) => {
1294                // OpenCL not available, which is fine
1295            }
1296        }
1297    }
1298
1299    #[test]
1300    #[ignore = "timeout"]
1301    fn test_log_operation_edge_cases() {
1302        let metrics = SimdMetrics::new().unwrap();
1303
1304        // Test with zero and negative values
1305        let values = array![0.0, -1.0, 1.0, 2.0];
1306        let log_results = metrics.simd_log_operations(&values.view()).unwrap();
1307
1308        assert!(log_results.log_values[0].is_infinite()); // log(0) = -∞
1309        assert!(log_results.log_values[1].is_infinite()); // log(-1) = -∞
1310        assert!((log_results.log_values[2] - 0.0).abs() < 1e-10); // log(1) = 0
1311
1312        // Geometric mean should handle negative/zero values gracefully
1313        assert!(log_results.geometric_mean.is_finite() || log_results.geometric_mean == 0.0);
1314    }
1315
1316    #[test]
1317    fn test_determinant_edge_cases() {
1318        let metrics = SimdMetrics::new().unwrap();
1319
1320        // Test 1x1 matrix
1321        let matrix1x1 = array![[5.0]];
1322        let det1 = metrics.simd_determinant(&matrix1x1).unwrap();
1323        assert!((det1 - 5.0).abs() < 1e-10);
1324
1325        // Test 2x2 identity matrix
1326        let identity2x2 = array![[1.0, 0.0], [0.0, 1.0]];
1327        let det2 = metrics.simd_determinant(&identity2x2).unwrap();
1328        assert!((det2 - 1.0).abs() < 1e-10);
1329
1330        // Test singular matrix (determinant should be 0)
1331        let singular = array![[1.0, 2.0], [2.0, 4.0]];
1332        let det3 = metrics.simd_determinant(&singular).unwrap();
1333        assert!(det3.abs() < 1e-10);
1334    }
1335}