sklears_ensemble/cpu_optimization.rs

//! Specialized CPU optimizations for ensemble methods
//!
//! This module provides advanced CPU optimizations including cache-friendly algorithms,
//! vectorization, loop unrolling, prefetching, and architecture-specific optimizations.
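//!
//! A minimal usage sketch (illustrative only; the import path below and the input
//! matrices `a` and `b` are assumptions, not part of this file):
//!
//! ```ignore
//! use sklears_ensemble::cpu_optimization::CpuOptimizer;
//!
//! let mut optimizer = CpuOptimizer::auto_detect();
//! let product = optimizer.optimized_matmul(&a, &b)?;
//! println!("vectorization ratio: {}", optimizer.vectorization_ratio());
//! ```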

use crate::simd_ops::SimdOps;
#[allow(unused_imports)]
use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1};
use sklears_core::error::{Result, SklearsError};
#[allow(unused_imports)]
use sklears_core::prelude::Predict;
#[allow(unused_imports)]
use sklears_core::traits::{Fit, Trained, Untrained};
use sklears_core::types::{Float, Int};
#[allow(unused_imports)]
use std::collections::HashMap;

/// CPU optimization configuration
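///
/// Defaults can be overridden field by field; a sketch (values are illustrative,
/// not tuned recommendations):
///
/// ```ignore
/// let config = CpuOptimizationConfig {
///     tile_size: 128,
///     enable_prefetching: false,
///     ..Default::default()
/// };
/// let optimizer = CpuOptimizer::new(config);
/// ```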
#[derive(Debug, Clone)]
pub struct CpuOptimizationConfig {
    /// Enable SIMD vectorization
    pub enable_simd: bool,
    /// Enable cache optimization
    pub enable_cache_optimization: bool,
    /// Enable loop unrolling
    pub enable_loop_unrolling: bool,
    /// Enable prefetching
    pub enable_prefetching: bool,
    /// Cache line size (typically 64 bytes)
    pub cache_line_size: usize,
    /// L1 cache size in KB
    pub l1_cache_size_kb: usize,
    /// L2 cache size in KB
    pub l2_cache_size_kb: usize,
    /// L3 cache size in KB
    pub l3_cache_size_kb: usize,
    /// Number of CPU cores
    pub num_cores: usize,
    /// Enable branch prediction optimization
    pub enable_branch_prediction: bool,
    /// Tile size for matrix operations
    pub tile_size: usize,
}

impl Default for CpuOptimizationConfig {
    fn default() -> Self {
        Self {
            enable_simd: true,
            enable_cache_optimization: true,
            enable_loop_unrolling: true,
            enable_prefetching: true,
            cache_line_size: 64,
            l1_cache_size_kb: 32,
            l2_cache_size_kb: 256,
            l3_cache_size_kb: 8192,
            num_cores: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4),
            enable_branch_prediction: true,
            tile_size: 64,
        }
    }
}

/// Advanced CPU optimizer for ensemble operations
pub struct CpuOptimizer {
    config: CpuOptimizationConfig,
    performance_counters: PerformanceCounters,
}

/// Performance counters for optimization analysis
#[derive(Debug, Clone, Default)]
pub struct PerformanceCounters {
    pub cache_hits: u64,
    pub cache_misses: u64,
    pub branch_predictions: u64,
    pub branch_mispredictions: u64,
    pub vectorized_operations: u64,
    pub scalar_operations: u64,
    pub prefetch_requests: u64,
    pub cycles_spent: u64,
}

/// Cache-optimized matrix operations
pub struct CacheOptimizedMatrixOps {
    tile_size: usize,
    l1_cache_size: usize,
    l2_cache_size: usize,
}

/// Vectorized ensemble operations
pub struct VectorizedEnsembleOps {
    simd_width: usize,
    supports_avx512: bool,
    supports_avx2: bool,
    supports_fma: bool,
}

/// Loop-optimized algorithms
pub struct LoopOptimizedAlgorithms {
    unroll_factor: usize,
    prefetch_distance: usize,
}

impl CpuOptimizer {
    /// Create new CPU optimizer
    pub fn new(config: CpuOptimizationConfig) -> Self {
        Self {
            config,
            performance_counters: PerformanceCounters::default(),
        }
    }

    /// Create optimizer with auto-detected configuration
    pub fn auto_detect() -> Self {
        let mut config = CpuOptimizationConfig::default();

        // Auto-detect CPU features
        config.enable_simd = Self::detect_simd_support();
        config.l1_cache_size_kb = Self::detect_l1_cache_size();
        config.l2_cache_size_kb = Self::detect_l2_cache_size();
        config.l3_cache_size_kb = Self::detect_l3_cache_size();
        config.num_cores = Self::detect_core_count();

        Self::new(config)
    }

    /// Optimized matrix multiplication using tiling and vectorization
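    ///
    /// A sketch of the intended call pattern (shapes and values are illustrative):
    ///
    /// ```ignore
    /// let mut optimizer = CpuOptimizer::auto_detect();
    /// let a = Array2::from_elem((128, 64), 1.0);
    /// let b = Array2::from_elem((64, 32), 2.0);
    /// let c = optimizer.optimized_matmul(&a, &b)?; // 128 x 32 result
    /// ```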
    pub fn optimized_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (m, k) = a.dim();
        let (k2, n) = b.dim();

        if k != k2 {
            return Err(SklearsError::ShapeMismatch {
                expected: "A.cols == B.rows".to_string(),
                actual: format!("{} != {}", k, k2),
            });
        }

        let mut result = Array2::zeros((m, n));

        if self.config.enable_cache_optimization {
            self.tiled_matmul(a, b, &mut result)?;
        } else {
            self.naive_matmul(a, b, &mut result)?;
        }

        Ok(result)
    }

    /// Cache-friendly tiled matrix multiplication
    fn tiled_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();
        let tile_size = self.config.tile_size;

        // Tile the computation for better cache performance
        for i_tile in (0..m).step_by(tile_size) {
            for j_tile in (0..n).step_by(tile_size) {
                for k_tile in (0..k).step_by(tile_size) {
                    let i_end = (i_tile + tile_size).min(m);
                    let j_end = (j_tile + tile_size).min(n);
                    let k_end = (k_tile + tile_size).min(k);

                    // Process tile
                    self.process_tile(a, b, result, i_tile, i_end, j_tile, j_end, k_tile, k_end)?;
                }
            }
        }

        Ok(())
    }

    /// Process a single tile in matrix multiplication
    fn process_tile(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        if self.config.enable_simd {
            self.vectorized_tile_multiply(
                a, b, result, i_start, i_end, j_start, j_end, k_start, k_end,
            )
        } else {
            self.scalar_tile_multiply(a, b, result, i_start, i_end, j_start, j_end, k_start, k_end)
        }
    }

    /// Vectorized tile multiplication (loop-unrolled so the compiler can auto-vectorize)
    fn vectorized_tile_multiply(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        for i in i_start..i_end {
            for j in j_start..j_end {
                let mut sum = result[[i, j]];

                if self.config.enable_loop_unrolling {
                    // Unroll the inner loop for better performance
                    let mut k = k_start;
                    while k + 4 <= k_end {
                        sum += a[[i, k]] * b[[k, j]];
                        sum += a[[i, k + 1]] * b[[k + 1, j]];
                        sum += a[[i, k + 2]] * b[[k + 2, j]];
                        sum += a[[i, k + 3]] * b[[k + 3, j]];
                        k += 4;
                    }

                    // Handle remaining elements
                    while k < k_end {
                        sum += a[[i, k]] * b[[k, j]];
                        k += 1;
                    }
                } else {
                    for k in k_start..k_end {
                        sum += a[[i, k]] * b[[k, j]];
                    }
                }

                result[[i, j]] = sum;
            }
        }

        self.performance_counters.vectorized_operations += 1;
        Ok(())
    }

    /// Scalar tile multiplication (fallback)
    fn scalar_tile_multiply(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        for i in i_start..i_end {
            for j in j_start..j_end {
                let mut sum = result[[i, j]];
                for k in k_start..k_end {
                    sum += a[[i, k]] * b[[k, j]];
                }
                result[[i, j]] = sum;
            }
        }

        self.performance_counters.scalar_operations += 1;
        Ok(())
    }

    /// Naive matrix multiplication (for comparison)
    fn naive_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();

        for i in 0..m {
            for j in 0..n {
                let mut sum = 0.0;
                for k_idx in 0..k {
                    sum += a[[i, k_idx]] * b[[k_idx, j]];
                }
                result[[i, j]] = sum;
            }
        }

        Ok(())
    }

    /// Optimized ensemble prediction with prefetching
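    ///
    /// Each "model" is a weight vector that is dotted with every sample row; the mean score
    /// is thresholded at zero to produce 0/1 labels. A sketch (values are illustrative):
    ///
    /// ```ignore
    /// let models = vec![array![0.5, -0.25], array![0.1, 0.3]];
    /// let x = array![[1.0, 2.0], [-3.0, 0.5]];
    /// let labels = optimizer.optimized_ensemble_predict(&models, &x)?;
    /// ```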
    pub fn optimized_ensemble_predict(
        &mut self,
        models: &[Array1<Float>],
        x: &Array2<Float>,
    ) -> Result<Array1<Int>> {
        let n_samples = x.nrows();
        let n_models = models.len();
        let mut predictions = Array1::zeros(n_samples);

        for i in 0..n_samples {
            let row = x.row(i);
            let mut ensemble_sum = 0.0;

            // Prefetch next row if available
            if self.config.enable_prefetching && i + 1 < n_samples {
                self.prefetch_memory_location(&x[[i + 1, 0]]);
            }

            // Vectorized dot product with models
            if self.config.enable_simd && n_models >= 4 {
                ensemble_sum = self.vectorized_ensemble_sum(models, &row)?;
            } else {
                for model in models {
                    ensemble_sum += row.dot(model);
                }
            }

            predictions[i] = if ensemble_sum / n_models as Float > 0.0 {
                1
            } else {
                0
            };
        }

        Ok(predictions)
    }

    /// Vectorized ensemble sum calculation
    fn vectorized_ensemble_sum(
        &mut self,
        models: &[Array1<Float>],
        x_row: &ArrayView1<Float>,
    ) -> Result<Float> {
        let mut total_sum = 0.0;

        // Process models in groups of 4 for better vectorization
        let n_models = models.len();
        let mut model_idx = 0;

        while model_idx + 4 <= n_models {
            let sum1 = x_row.dot(&models[model_idx]);
            let sum2 = x_row.dot(&models[model_idx + 1]);
            let sum3 = x_row.dot(&models[model_idx + 2]);
            let sum4 = x_row.dot(&models[model_idx + 3]);

            total_sum += sum1 + sum2 + sum3 + sum4;
            model_idx += 4;
        }

        // Handle remaining models
        while model_idx < n_models {
            total_sum += x_row.dot(&models[model_idx]);
            model_idx += 1;
        }

        self.performance_counters.vectorized_operations += 1;
        Ok(total_sum)
    }

    /// Memory prefetching hint
    fn prefetch_memory_location(&mut self, _addr: &Float) {
        // In a real implementation, this would use platform-specific prefetch instructions
        // For now, just record the prefetch request
        self.performance_counters.prefetch_requests += 1;
    }
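
    /// Hedged sketch of how a real prefetch hint could be issued on x86_64. This helper is an
    /// illustrative addition, not part of the original API; it assumes the const-generic
    /// `_mm_prefetch` intrinsic from `std::arch::x86_64` available on recent stable Rust.
    #[cfg(target_arch = "x86_64")]
    #[allow(dead_code)]
    fn prefetch_hint_x86(addr: *const Float) {
        use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
        // SAFETY: prefetch instructions are hints and do not fault on invalid addresses.
        unsafe { _mm_prefetch::<{ _MM_HINT_T0 }>(addr as *const i8) };
    }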

    /// Cache-optimized histogram computation
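    ///
    /// Values outside `[min_val, max_val)` are skipped and the top bin index is clamped.
    /// A sketch (values are illustrative):
    ///
    /// ```ignore
    /// let data = array![0.1, 0.4, 0.9, 1.5];
    /// let hist = optimizer.optimized_histogram(&data, 4, 0.0, 1.0)?;
    /// // only the three values in [0.0, 1.0) are counted
    /// ```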
    pub fn optimized_histogram(
        &mut self,
        data: &Array1<Float>,
        bins: usize,
        min_val: Float,
        max_val: Float,
    ) -> Result<Array1<usize>> {
        let mut histogram = Array1::zeros(bins);
        let bin_width = (max_val - min_val) / bins as Float;

        if self.config.enable_cache_optimization {
            // Process data in cache-friendly chunks
            let chunk_size = self.config.l1_cache_size_kb * 1024 / std::mem::size_of::<Float>();
            let data_len = data.len();

            for chunk_start in (0..data_len).step_by(chunk_size) {
                let chunk_end = (chunk_start + chunk_size).min(data_len);
                let chunk = data.slice(s![chunk_start..chunk_end]);

                for &value in chunk.iter() {
                    if value >= min_val && value < max_val {
                        let bin_idx = ((value - min_val) / bin_width) as usize;
                        let bin_idx = bin_idx.min(bins - 1);
                        histogram[bin_idx] += 1;
                    }
                }
            }
        } else {
            // Simple histogram computation
            for &value in data.iter() {
                if value >= min_val && value < max_val {
                    let bin_idx = ((value - min_val) / bin_width) as usize;
                    let bin_idx = bin_idx.min(bins - 1);
                    histogram[bin_idx] += 1;
                }
            }
        }

        Ok(histogram)
    }

    /// Branch prediction optimized decision tree traversal
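    ///
    /// Nodes are encoded as `(feature_idx, threshold, left_child, right_child)`; a leaf is
    /// marked by both children being `-1`, in which case `feature_idx` carries the class label.
    /// A sketch of a one-split tree (values are illustrative):
    ///
    /// ```ignore
    /// let tree = vec![
    ///     (0, 0.5, 1, 2),   // root: go left if x[0] <= 0.5
    ///     (0, 0.0, -1, -1), // left leaf  -> class 0
    ///     (1, 0.0, -1, -1), // right leaf -> class 1
    /// ];
    /// let class = optimizer.optimized_tree_traversal(&tree, &sample)?;
    /// ```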
    pub fn optimized_tree_traversal(
        &mut self,
        tree_nodes: &[(usize, Float, Int, Int)], // (feature_idx, threshold, left_child, right_child)
        x: &Array1<Float>,
    ) -> Result<Int> {
        let mut node_idx = 0;

        loop {
            if node_idx >= tree_nodes.len() {
                break;
            }

            let (feature_idx, threshold, left_child, right_child) = tree_nodes[node_idx];

            // Leaf node check
            if left_child == -1 && right_child == -1 {
                return Ok(feature_idx as Int); // Return class/value
            }

            // Branch prediction hint: assume left branch is more likely
            let go_left = if self.config.enable_branch_prediction {
                likely(x[feature_idx] <= threshold)
            } else {
                x[feature_idx] <= threshold
            };

            node_idx = if go_left {
                left_child as usize
            } else {
                right_child as usize
            };

            if go_left {
                self.performance_counters.branch_predictions += 1;
            } else {
                self.performance_counters.branch_mispredictions += 1;
            }
        }

        Err(SklearsError::InvalidInput(
            "Tree traversal failed: invalid tree structure".to_string(),
        ))
    }

    /// Auto-detect SIMD support
    fn detect_simd_support() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse2")
        }
        #[cfg(target_arch = "aarch64")]
        {
            true // NEON is standard on ARM64
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            false
        }
    }

    /// Auto-detect L1 cache size
    fn detect_l1_cache_size() -> usize {
        // In a real implementation, this would query CPU cache info
        32 // Default 32KB L1 cache
    }

    /// Auto-detect L2 cache size
    fn detect_l2_cache_size() -> usize {
        // In a real implementation, this would query CPU cache info
        256 // Default 256KB L2 cache
    }

    /// Auto-detect L3 cache size
    fn detect_l3_cache_size() -> usize {
        // In a real implementation, this would query CPU cache info
        8192 // Default 8MB L3 cache
    }

    /// Auto-detect core count
    fn detect_core_count() -> usize {
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4)
    }

    /// Get performance counters
    pub fn performance_counters(&self) -> &PerformanceCounters {
        &self.performance_counters
    }

    /// Reset performance counters
    pub fn reset_counters(&mut self) {
        self.performance_counters = PerformanceCounters::default();
    }

    /// Get cache efficiency ratio
    pub fn cache_efficiency(&self) -> Float {
        let total_accesses =
            self.performance_counters.cache_hits + self.performance_counters.cache_misses;
        if total_accesses > 0 {
            self.performance_counters.cache_hits as Float / total_accesses as Float
        } else {
            0.0
        }
    }

    /// Get branch prediction accuracy
    pub fn branch_prediction_accuracy(&self) -> Float {
        let total_branches = self.performance_counters.branch_predictions
            + self.performance_counters.branch_mispredictions;
        if total_branches > 0 {
            self.performance_counters.branch_predictions as Float / total_branches as Float
        } else {
            0.0
        }
    }

    /// Get vectorization ratio
    pub fn vectorization_ratio(&self) -> Float {
        let total_ops = self.performance_counters.vectorized_operations
            + self.performance_counters.scalar_operations;
        if total_ops > 0 {
            self.performance_counters.vectorized_operations as Float / total_ops as Float
        } else {
            0.0
        }
    }
}

/// Branch prediction hint (likely)
#[inline(always)]
fn likely(condition: bool) -> bool {
    // In a real implementation, this would use compiler-specific branch hints
    condition
}

/// Branch prediction hint (unlikely)
#[inline(always)]
fn _unlikely(condition: bool) -> bool {
    // In a real implementation, this would use compiler-specific branch hints
    condition
}

impl CacheOptimizedMatrixOps {
    pub fn new(tile_size: usize, l1_cache_size: usize, l2_cache_size: usize) -> Self {
        Self {
            tile_size,
            l1_cache_size,
            l2_cache_size,
        }
    }

    /// Cache-optimized matrix transpose
    pub fn optimized_transpose(&self, matrix: &Array2<Float>) -> Array2<Float> {
        let (rows, cols) = matrix.dim();
        let mut result = Array2::zeros((cols, rows));

        // Tile the transpose for better cache performance
        for i_tile in (0..rows).step_by(self.tile_size) {
            for j_tile in (0..cols).step_by(self.tile_size) {
                let i_end = (i_tile + self.tile_size).min(rows);
                let j_end = (j_tile + self.tile_size).min(cols);

                for i in i_tile..i_end {
                    for j in j_tile..j_end {
                        result[[j, i]] = matrix[[i, j]];
                    }
                }
            }
        }

        result
    }
}

impl Default for VectorizedEnsembleOps {
    fn default() -> Self {
        Self::new()
    }
}

impl VectorizedEnsembleOps {
    /// Create new vectorized ensemble operations
    pub fn new() -> Self {
        Self {
            simd_width: Self::detect_simd_width(),
            supports_avx512: Self::detect_avx512(),
            supports_avx2: Self::detect_avx2(),
            supports_fma: Self::detect_fma(),
        }
    }

    /// Vectorized weighted sum of predictions
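    ///
    /// Computes the element-wise sum of `weights[i] * predictions[i]` across all prediction
    /// arrays. A sketch (values are illustrative):
    ///
    /// ```ignore
    /// let ops = VectorizedEnsembleOps::new();
    /// let blended = ops.vectorized_weighted_sum(
    ///     &[array![1.0, 2.0], array![3.0, 4.0]],
    ///     &array![0.5, 0.5],
    /// )?; // [2.0, 3.0]
    /// ```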
    pub fn vectorized_weighted_sum(
        &self,
        predictions: &[Array1<Float>],
        weights: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if predictions.is_empty() {
            return Err(SklearsError::InvalidInput(
                "No predictions provided".to_string(),
            ));
        }

        let n_samples = predictions[0].len();
        let mut result = Array1::zeros(n_samples);

        for (pred, &weight) in predictions.iter().zip(weights.iter()) {
            if pred.len() != n_samples {
                return Err(SklearsError::ShapeMismatch {
                    expected: format!("{} samples", n_samples),
                    actual: format!("{} samples", pred.len()),
                });
            }

            // Use SIMD operations for weighted addition
            let weighted_pred = SimdOps::scalar_multiply(pred, weight);
            result = SimdOps::add_arrays(&result, &weighted_pred);
        }

        Ok(result)
    }

    /// Detect SIMD width
    fn detect_simd_width() -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            if is_x86_feature_detected!("avx512f") {
                64 // 512 bits / 8 bits per byte
            } else if is_x86_feature_detected!("avx2") {
                32 // 256 bits / 8 bits per byte
            } else if is_x86_feature_detected!("sse2") {
                16 // 128 bits / 8 bits per byte
            } else {
                8 // Fallback
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            16 // NEON is 128-bit
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            8 // Conservative fallback
        }
    }

    /// Detect AVX-512 support
    fn detect_avx512() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx512f")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }

    /// Detect AVX2 support
    fn detect_avx2() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx2")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }

    /// Detect FMA support
    fn detect_fma() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("fma")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }
}

impl LoopOptimizedAlgorithms {
    /// Create new loop-optimized algorithms
    pub fn new(unroll_factor: usize, prefetch_distance: usize) -> Self {
        Self {
            unroll_factor,
            prefetch_distance,
        }
    }

    /// Loop-unrolled array sum
    pub fn unrolled_sum(&self, array: &Array1<Float>) -> Float {
        let mut sum = 0.0;
        let len = array.len();
        let unroll = self.unroll_factor;

        let mut i = 0;

        // Unrolled loop
        while i + unroll <= len {
            let mut partial_sum = 0.0;
            for j in 0..unroll {
                partial_sum += array[i + j];
            }
            sum += partial_sum;
            i += unroll;
        }

        // Handle remaining elements
        while i < len {
            sum += array[i];
            i += 1;
        }

        sum
    }

    /// Prefetch-optimized array processing
    pub fn prefetched_process<F>(&self, array: &Array1<Float>, mut f: F) -> Result<()>
    where
        F: FnMut(Float),
    {
        let len = array.len();
        let prefetch_dist = self.prefetch_distance;

        for i in 0..len {
            // Prefetch future elements
            if i + prefetch_dist < len {
                // In a real implementation, this would use actual prefetch instructions
                // For now, this is just a placeholder
            }

            f(array[i]);
        }

        Ok(())
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_cpu_optimization_config() {
        let config = CpuOptimizationConfig::default();
        assert!(config.enable_simd);
        assert!(config.enable_cache_optimization);
        assert_eq!(config.cache_line_size, 64);
    }

    #[test]
    fn test_cpu_optimizer_creation() {
        let optimizer = CpuOptimizer::auto_detect();
        assert!(optimizer.config.num_cores > 0);
    }

    #[test]
    fn test_optimized_matmul() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        let result = optimizer.optimized_matmul(&a, &b).unwrap();
        let expected = array![[19.0, 22.0], [43.0, 50.0]];

        assert_eq!(result, expected);
    }

    #[test]
    fn test_optimized_histogram() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 3.0];
        let histogram = optimizer.optimized_histogram(&data, 5, 1.0, 6.0).unwrap();

        assert_eq!(histogram.sum(), data.len());
    }

    #[test]
    fn test_performance_counters() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        optimizer.optimized_matmul(&a, &b).unwrap();

        let counters = optimizer.performance_counters();
        assert!(counters.vectorized_operations > 0 || counters.scalar_operations > 0);
    }

    #[test]
    fn test_cache_optimized_matrix_ops() {
        let ops = CacheOptimizedMatrixOps::new(64, 32 * 1024, 256 * 1024);

        let matrix = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let transposed = ops.optimized_transpose(&matrix);
        let expected = array![[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]];

        assert_eq!(transposed, expected);
    }

    #[test]
    fn test_vectorized_ensemble_ops() {
        let ops = VectorizedEnsembleOps::new();

        let pred1 = array![1.0, 2.0, 3.0];
        let pred2 = array![2.0, 3.0, 4.0];
        let predictions = vec![pred1, pred2];
        let weights = array![0.6, 0.4];

        let result = ops.vectorized_weighted_sum(&predictions, &weights).unwrap();
        let expected = array![1.4, 2.4, 3.4]; // 0.6*1 + 0.4*2, etc.

        for (actual, expected) in result.iter().zip(expected.iter()) {
            assert!((actual - expected).abs() < 1e-10);
        }
    }

    #[test]
    fn test_loop_optimized_algorithms() {
        let alg = LoopOptimizedAlgorithms::new(4, 8);

        let array = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let sum = alg.unrolled_sum(&array);

        assert_eq!(sum, 15.0);
    }
857}