//! CPU-level optimization utilities: cache-aware (tiled) matrix kernels,
//! unrolled inner loops, software-prefetch hooks, and simple performance
//! counters for ensemble workloads.

use crate::simd_ops::SimdOps;
#[allow(unused_imports)]
use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1};
use sklears_core::error::{Result, SklearsError};
#[allow(unused_imports)]
use sklears_core::prelude::Predict;
#[allow(unused_imports)]
use sklears_core::traits::{Fit, Trained, Untrained};
use sklears_core::types::{Float, Int};
#[allow(unused_imports)]
use std::collections::HashMap;

/// Configuration for CPU-level optimizations.
#[derive(Debug, Clone)]
pub struct CpuOptimizationConfig {
    /// Enable SIMD-friendly code paths where supported.
    pub enable_simd: bool,
    /// Enable cache-aware (tiled/chunked) processing.
    pub enable_cache_optimization: bool,
    /// Enable manual unrolling of hot inner loops.
    pub enable_loop_unrolling: bool,
    /// Enable software prefetching of upcoming data.
    pub enable_prefetching: bool,
    /// Cache line size in bytes.
    pub cache_line_size: usize,
    /// L1 data cache size in KiB.
    pub l1_cache_size_kb: usize,
    /// L2 cache size in KiB.
    pub l2_cache_size_kb: usize,
    /// L3 cache size in KiB.
    pub l3_cache_size_kb: usize,
    /// Number of available CPU cores.
    pub num_cores: usize,
    /// Enable branch-prediction-friendly traversal paths.
    pub enable_branch_prediction: bool,
    /// Tile size (in elements per dimension) for blocked matrix operations.
    pub tile_size: usize,
}

impl Default for CpuOptimizationConfig {
    fn default() -> Self {
        Self {
            enable_simd: true,
            enable_cache_optimization: true,
            enable_loop_unrolling: true,
            enable_prefetching: true,
            cache_line_size: 64,
            l1_cache_size_kb: 32,
            l2_cache_size_kb: 256,
            l3_cache_size_kb: 8192,
            num_cores: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4),
            enable_branch_prediction: true,
            tile_size: 64,
        }
    }
}

/// Applies the individual optimizations according to a [`CpuOptimizationConfig`]
/// and tracks rough performance counters.
pub struct CpuOptimizer {
    config: CpuOptimizationConfig,
    performance_counters: PerformanceCounters,
}

/// Software-maintained counters approximating hardware performance events.
#[derive(Debug, Clone, Default)]
pub struct PerformanceCounters {
    pub cache_hits: u64,
    pub cache_misses: u64,
    pub branch_predictions: u64,
    pub branch_mispredictions: u64,
    pub vectorized_operations: u64,
    pub scalar_operations: u64,
    pub prefetch_requests: u64,
    pub cycles_spent: u64,
}

/// Cache-aware matrix primitives (tiled transpose, etc.).
pub struct CacheOptimizedMatrixOps {
    tile_size: usize,
    l1_cache_size: usize,
    l2_cache_size: usize,
}

/// SIMD capability probe and vectorized ensemble reductions.
pub struct VectorizedEnsembleOps {
    simd_width: usize,
    supports_avx512: bool,
    supports_avx2: bool,
    supports_fma: bool,
}

/// Loop-level optimizations: manual unrolling and software prefetching.
pub struct LoopOptimizedAlgorithms {
    unroll_factor: usize,
    prefetch_distance: usize,
}

impl CpuOptimizer {
    /// Create an optimizer with an explicit configuration.
    pub fn new(config: CpuOptimizationConfig) -> Self {
        Self {
            config,
            performance_counters: PerformanceCounters::default(),
        }
    }

    /// Create an optimizer by probing the current CPU for SIMD support,
    /// cache sizes, and core count.
    pub fn auto_detect() -> Self {
        let mut config = CpuOptimizationConfig::default();

        config.enable_simd = Self::detect_simd_support();
        config.l1_cache_size_kb = Self::detect_l1_cache_size();
        config.l2_cache_size_kb = Self::detect_l2_cache_size();
        config.l3_cache_size_kb = Self::detect_l3_cache_size();
        config.num_cores = Self::detect_core_count();

        Self::new(config)
    }

    /// Matrix multiplication that dispatches to the cache-tiled kernel when
    /// cache optimization is enabled, and to a naive triple loop otherwise.
    pub fn optimized_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (m, k) = a.dim();
        let (k2, n) = b.dim();

        if k != k2 {
            return Err(SklearsError::ShapeMismatch {
                expected: "A.cols == B.rows".to_string(),
                actual: format!("{} != {}", k, k2),
            });
        }

        let mut result = Array2::zeros((m, n));

        if self.config.enable_cache_optimization {
            self.tiled_matmul(a, b, &mut result)?;
        } else {
            self.naive_matmul(a, b, &mut result)?;
        }

        Ok(result)
    }

    /// Blocked (tiled) matrix multiplication: iterates over `tile_size`-sized
    /// blocks of the output so each block's operands stay cache-resident.
    fn tiled_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();
        let tile_size = self.config.tile_size;

        for i_tile in (0..m).step_by(tile_size) {
            for j_tile in (0..n).step_by(tile_size) {
                for k_tile in (0..k).step_by(tile_size) {
                    let i_end = (i_tile + tile_size).min(m);
                    let j_end = (j_tile + tile_size).min(n);
                    let k_end = (k_tile + tile_size).min(k);

                    self.process_tile(a, b, result, i_tile, i_end, j_tile, j_end, k_tile, k_end)?;
                }
            }
        }

        Ok(())
    }

    /// Multiply a single tile, choosing the unrolled or scalar kernel.
    #[allow(clippy::too_many_arguments)]
    fn process_tile(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        if self.config.enable_simd {
            self.vectorized_tile_multiply(
                a, b, result, i_start, i_end, j_start, j_end, k_start, k_end,
            )
        } else {
            self.scalar_tile_multiply(a, b, result, i_start, i_end, j_start, j_end, k_start, k_end)
        }
    }

    /// Tile kernel with a 4-way manually unrolled inner loop. It contains no
    /// explicit intrinsics; the unrolled form leaves vectorization to the
    /// compiler.
    #[allow(clippy::too_many_arguments)]
    fn vectorized_tile_multiply(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        for i in i_start..i_end {
            for j in j_start..j_end {
                let mut sum = result[[i, j]];

                if self.config.enable_loop_unrolling {
                    // Process four k-steps per iteration, then mop up the remainder.
                    let mut k = k_start;
                    while k + 4 <= k_end {
                        sum += a[[i, k]] * b[[k, j]];
                        sum += a[[i, k + 1]] * b[[k + 1, j]];
                        sum += a[[i, k + 2]] * b[[k + 2, j]];
                        sum += a[[i, k + 3]] * b[[k + 3, j]];
                        k += 4;
                    }

                    while k < k_end {
                        sum += a[[i, k]] * b[[k, j]];
                        k += 1;
                    }
                } else {
                    for k in k_start..k_end {
                        sum += a[[i, k]] * b[[k, j]];
                    }
                }

                result[[i, j]] = sum;
            }
        }

        self.performance_counters.vectorized_operations += 1;
        Ok(())
    }

    /// Straightforward scalar tile kernel (no unrolling).
    #[allow(clippy::too_many_arguments)]
    fn scalar_tile_multiply(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        for i in i_start..i_end {
            for j in j_start..j_end {
                let mut sum = result[[i, j]];
                for k in k_start..k_end {
                    sum += a[[i, k]] * b[[k, j]];
                }
                result[[i, j]] = sum;
            }
        }

        self.performance_counters.scalar_operations += 1;
        Ok(())
    }

    /// Unblocked triple-loop multiplication, used when cache optimization is off.
    fn naive_matmul(
        &mut self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();

        for i in 0..m {
            for j in 0..n {
                let mut sum = 0.0;
                for k_idx in 0..k {
                    sum += a[[i, k_idx]] * b[[k_idx, j]];
                }
                result[[i, j]] = sum;
            }
        }

        Ok(())
    }

    /// Binary ensemble prediction for linear models: each model is a weight
    /// vector, the per-sample score is the mean of the model dot products, and
    /// the predicted class is 1 for a positive mean and 0 otherwise.
    pub fn optimized_ensemble_predict(
        &mut self,
        models: &[Array1<Float>],
        x: &Array2<Float>,
    ) -> Result<Array1<Int>> {
        let n_samples = x.nrows();
        let n_models = models.len();
        let mut predictions = Array1::zeros(n_samples);

        for i in 0..n_samples {
            let row = x.row(i);
            let mut ensemble_sum = 0.0;

            // Hint the next sample's row into cache while this one is processed.
            if self.config.enable_prefetching && i + 1 < n_samples {
                self.prefetch_memory_location(&x[[i + 1, 0]]);
            }

            if self.config.enable_simd && n_models >= 4 {
                ensemble_sum = self.vectorized_ensemble_sum(models, &row)?;
            } else {
                for model in models {
                    ensemble_sum += row.dot(model);
                }
            }

            predictions[i] = if ensemble_sum / n_models as Float > 0.0 {
                1
            } else {
                0
            };
        }

        Ok(predictions)
    }

    /// Sum of dot products over all models for one sample row, processed four
    /// models at a time so the independent accumulations can overlap.
    fn vectorized_ensemble_sum(
        &mut self,
        models: &[Array1<Float>],
        x_row: &ArrayView1<Float>,
    ) -> Result<Float> {
        let mut total_sum = 0.0;

        let n_models = models.len();
        let mut model_idx = 0;

        while model_idx + 4 <= n_models {
            let sum1 = x_row.dot(&models[model_idx]);
            let sum2 = x_row.dot(&models[model_idx + 1]);
            let sum3 = x_row.dot(&models[model_idx + 2]);
            let sum4 = x_row.dot(&models[model_idx + 3]);

            total_sum += sum1 + sum2 + sum3 + sum4;
            model_idx += 4;
        }

        // Remaining models (fewer than four).
        while model_idx < n_models {
            total_sum += x_row.dot(&models[model_idx]);
            model_idx += 1;
        }

        self.performance_counters.vectorized_operations += 1;
        Ok(total_sum)
    }

    /// Record a prefetch request. This is currently a bookkeeping placeholder:
    /// no hardware prefetch instruction is issued.
    fn prefetch_memory_location(&mut self, _addr: &Float) {
        self.performance_counters.prefetch_requests += 1;
    }
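
    /// Sketch (an assumption, not part of the original API and not called yet):
    /// how a real hardware prefetch could be issued. On x86_64 this uses
    /// `std::arch::x86_64::_mm_prefetch`, assuming the const-generic signature
    /// of recent toolchains; on other architectures it is a no-op.
    #[allow(dead_code)]
    fn hardware_prefetch(addr: *const Float) {
        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
            // Pull the cache line containing `addr` into all cache levels.
            unsafe { _mm_prefetch::<_MM_HINT_T0>(addr as *const i8) };
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            let _ = addr;
        }
    }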

    /// Histogram of `data` over `bins` equal-width bins spanning `[min_val, max_val)`.
    /// With cache optimization enabled, the input is processed in L1-sized chunks.
    pub fn optimized_histogram(
        &mut self,
        data: &Array1<Float>,
        bins: usize,
        min_val: Float,
        max_val: Float,
    ) -> Result<Array1<usize>> {
        let mut histogram = Array1::zeros(bins);
        let bin_width = (max_val - min_val) / bins as Float;

        if self.config.enable_cache_optimization {
            // Chunk size chosen so one chunk of values fits in the L1 data cache.
            let chunk_size = self.config.l1_cache_size_kb * 1024 / std::mem::size_of::<Float>();
            let data_len = data.len();

            for chunk_start in (0..data_len).step_by(chunk_size) {
                let chunk_end = (chunk_start + chunk_size).min(data_len);
                let chunk = data.slice(s![chunk_start..chunk_end]);

                for &value in chunk.iter() {
                    if value >= min_val && value < max_val {
                        let bin_idx = ((value - min_val) / bin_width) as usize;
                        let bin_idx = bin_idx.min(bins - 1);
                        histogram[bin_idx] += 1;
                    }
                }
            }
        } else {
            for &value in data.iter() {
                if value >= min_val && value < max_val {
                    let bin_idx = ((value - min_val) / bin_width) as usize;
                    let bin_idx = bin_idx.min(bins - 1);
                    histogram[bin_idx] += 1;
                }
            }
        }

        Ok(histogram)
    }

    /// Decision-tree inference over a flat node array. Each node is
    /// `(feature_idx, threshold, left_child, right_child)`; leaves are encoded
    /// with both children set to -1 and store the predicted class in the
    /// `feature_idx` slot.
    pub fn optimized_tree_traversal(
        &mut self,
        tree_nodes: &[(usize, Float, Int, Int)],
        x: &Array1<Float>,
    ) -> Result<Int> {
        let mut node_idx = 0;

        loop {
            if node_idx >= tree_nodes.len() {
                break;
            }

            let (feature_idx, threshold, left_child, right_child) = tree_nodes[node_idx];

            // Leaf node: return the stored class label.
            if left_child == -1 && right_child == -1 {
                return Ok(feature_idx as Int);
            }

            let go_left = if self.config.enable_branch_prediction {
                likely(x[feature_idx] <= threshold)
            } else {
                x[feature_idx] <= threshold
            };

            node_idx = if go_left {
                left_child as usize
            } else {
                right_child as usize
            };

            // Heuristic bookkeeping: left branches are counted as predicted,
            // right branches as mispredicted.
            if go_left {
                self.performance_counters.branch_predictions += 1;
            } else {
                self.performance_counters.branch_mispredictions += 1;
            }
        }

        Err(SklearsError::InvalidInput(
            "Tree traversal failed: invalid tree structure".to_string(),
        ))
    }

    /// Runtime SIMD detection: AVX2/SSE2 on x86_64, NEON assumed on aarch64.
    fn detect_simd_support() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse2")
        }
        #[cfg(target_arch = "aarch64")]
        {
            // NEON is mandatory on aarch64.
            true
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            false
        }
    }

    /// Conservative default (KiB); real cache-size probing is not implemented.
    fn detect_l1_cache_size() -> usize {
        32
    }

    /// Conservative default (KiB); real cache-size probing is not implemented.
    fn detect_l2_cache_size() -> usize {
        256
    }

    /// Conservative default (KiB); real cache-size probing is not implemented.
    fn detect_l3_cache_size() -> usize {
        8192
    }

    /// Number of logical cores, falling back to 4 if detection fails.
    fn detect_core_count() -> usize {
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4)
    }

    /// Read-only access to the accumulated performance counters.
    pub fn performance_counters(&self) -> &PerformanceCounters {
        &self.performance_counters
    }

    /// Reset all performance counters to zero.
    pub fn reset_counters(&mut self) {
        self.performance_counters = PerformanceCounters::default();
    }

    /// Cache hit rate, or 0.0 if no cache accesses have been recorded.
    pub fn cache_efficiency(&self) -> Float {
        let total_accesses =
            self.performance_counters.cache_hits + self.performance_counters.cache_misses;
        if total_accesses > 0 {
            self.performance_counters.cache_hits as Float / total_accesses as Float
        } else {
            0.0
        }
    }

    /// Fraction of branches counted as correctly predicted, or 0.0 if none.
    pub fn branch_prediction_accuracy(&self) -> Float {
        let total_branches = self.performance_counters.branch_predictions
            + self.performance_counters.branch_mispredictions;
        if total_branches > 0 {
            self.performance_counters.branch_predictions as Float / total_branches as Float
        } else {
            0.0
        }
    }

    /// Fraction of recorded operations that went through the vectorized path.
    pub fn vectorization_ratio(&self) -> Float {
        let total_ops = self.performance_counters.vectorized_operations
            + self.performance_counters.scalar_operations;
        if total_ops > 0 {
            self.performance_counters.vectorized_operations as Float / total_ops as Float
        } else {
            0.0
        }
    }
}

/// Branch "hint" placeholder: currently an identity function. A `#[cold]`-based
/// alternative is sketched below.
#[inline(always)]
fn likely(condition: bool) -> bool {
    condition
}

/// Counterpart placeholder for unlikely branches; also an identity function.
#[inline(always)]
fn _unlikely(condition: bool) -> bool {
    condition
}
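
// Sketch (an assumption, not wired into the module): on stable Rust a branch
// can be biased without intrinsics by routing the rarely-taken arm through a
// `#[cold]` function. The hypothetical helper below is illustrative only.
#[cold]
#[inline(never)]
#[allow(dead_code)]
fn cold_branch_taken() {
    // Intentionally empty: calling this from the unlikely arm of a branch
    // encourages the compiler to lay that arm out as the slow path.
}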

impl CacheOptimizedMatrixOps {
    /// Create matrix ops with an explicit tile size and cache sizes (bytes).
    pub fn new(tile_size: usize, l1_cache_size: usize, l2_cache_size: usize) -> Self {
        Self {
            tile_size,
            l1_cache_size,
            l2_cache_size,
        }
    }

    /// Cache-friendly transpose: copies `tile_size`-square blocks so both the
    /// source and destination accesses stay within cache-sized regions.
    pub fn optimized_transpose(&self, matrix: &Array2<Float>) -> Array2<Float> {
        let (rows, cols) = matrix.dim();
        let mut result = Array2::zeros((cols, rows));

        for i_tile in (0..rows).step_by(self.tile_size) {
            for j_tile in (0..cols).step_by(self.tile_size) {
                let i_end = (i_tile + self.tile_size).min(rows);
                let j_end = (j_tile + self.tile_size).min(cols);

                for i in i_tile..i_end {
                    for j in j_tile..j_end {
                        result[[j, i]] = matrix[[i, j]];
                    }
                }
            }
        }

        result
    }
}

impl Default for VectorizedEnsembleOps {
    fn default() -> Self {
        Self::new()
    }
}

impl VectorizedEnsembleOps {
    /// Probe the current CPU for SIMD width and feature support.
    pub fn new() -> Self {
        Self {
            simd_width: Self::detect_simd_width(),
            supports_avx512: Self::detect_avx512(),
            supports_avx2: Self::detect_avx2(),
            supports_fma: Self::detect_fma(),
        }
    }

    /// Weighted sum of per-model prediction vectors using the SIMD helpers:
    /// `result = sum_i(weights[i] * predictions[i])`.
    pub fn vectorized_weighted_sum(
        &self,
        predictions: &[Array1<Float>],
        weights: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if predictions.is_empty() {
            return Err(SklearsError::InvalidInput(
                "No predictions provided".to_string(),
            ));
        }

        let n_samples = predictions[0].len();
        let mut result = Array1::zeros(n_samples);

        for (pred, &weight) in predictions.iter().zip(weights.iter()) {
            if pred.len() != n_samples {
                return Err(SklearsError::ShapeMismatch {
                    expected: format!("{} samples", n_samples),
                    actual: format!("{} samples", pred.len()),
                });
            }

            let weighted_pred = SimdOps::scalar_multiply(pred, weight);
            result = SimdOps::add_arrays(&result, &weighted_pred);
        }

        Ok(result)
    }

    /// SIMD register width in bytes: 64 for AVX-512, 32 for AVX2, 16 for
    /// SSE2/NEON, and 8 as a scalar fallback.
    fn detect_simd_width() -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            if is_x86_feature_detected!("avx512f") {
                64
            } else if is_x86_feature_detected!("avx2") {
                32
            } else if is_x86_feature_detected!("sse2") {
                16
            } else {
                8
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            // 128-bit NEON registers.
            16
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            8
        }
    }

    fn detect_avx512() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx512f")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }

    fn detect_avx2() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("avx2")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }

    fn detect_fma() -> bool {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("fma")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    }
}

impl LoopOptimizedAlgorithms {
    /// Create with an explicit unroll factor and prefetch distance (elements).
    pub fn new(unroll_factor: usize, prefetch_distance: usize) -> Self {
        Self {
            unroll_factor,
            prefetch_distance,
        }
    }

    /// Sum with an `unroll_factor`-way unrolled main loop and a scalar tail.
    pub fn unrolled_sum(&self, array: &Array1<Float>) -> Float {
        let mut sum = 0.0;
        let len = array.len();
        let unroll = self.unroll_factor;

        let mut i = 0;

        while i + unroll <= len {
            let mut partial_sum = 0.0;
            for j in 0..unroll {
                partial_sum += array[i + j];
            }
            sum += partial_sum;
            i += unroll;
        }

        // Scalar tail for the remaining elements.
        while i < len {
            sum += array[i];
            i += 1;
        }

        sum
    }

    /// Apply `f` to each element; the look-ahead check marks where a software
    /// prefetch of `array[i + prefetch_distance]` would be issued.
    pub fn prefetched_process<F>(&self, array: &Array1<Float>, mut f: F) -> Result<()>
    where
        F: FnMut(Float),
    {
        let len = array.len();
        let prefetch_dist = self.prefetch_distance;

        for i in 0..len {
            if i + prefetch_dist < len {
                // Placeholder: no prefetch instruction is currently issued here.
            }

            f(array[i]);
        }

        Ok(())
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_cpu_optimization_config() {
        let config = CpuOptimizationConfig::default();
        assert!(config.enable_simd);
        assert!(config.enable_cache_optimization);
        assert_eq!(config.cache_line_size, 64);
    }

    #[test]
    fn test_cpu_optimizer_creation() {
        let optimizer = CpuOptimizer::auto_detect();
        assert!(optimizer.config.num_cores > 0);
    }

    #[test]
    fn test_optimized_matmul() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        let result = optimizer.optimized_matmul(&a, &b).unwrap();
        let expected = array![[19.0, 22.0], [43.0, 50.0]];

        assert_eq!(result, expected);
    }

    #[test]
    fn test_optimized_histogram() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let data = array![1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 3.0];
        let histogram = optimizer.optimized_histogram(&data, 5, 1.0, 6.0).unwrap();

        assert_eq!(histogram.sum(), data.len());
    }

    #[test]
    fn test_performance_counters() {
        let mut optimizer = CpuOptimizer::auto_detect();

        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        optimizer.optimized_matmul(&a, &b).unwrap();

        let counters = optimizer.performance_counters();
        assert!(counters.vectorized_operations > 0 || counters.scalar_operations > 0);
    }

    #[test]
    fn test_cache_optimized_matrix_ops() {
        let ops = CacheOptimizedMatrixOps::new(64, 32 * 1024, 256 * 1024);

        let matrix = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let transposed = ops.optimized_transpose(&matrix);
        let expected = array![[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]];

        assert_eq!(transposed, expected);
    }

    #[test]
    fn test_vectorized_ensemble_ops() {
        let ops = VectorizedEnsembleOps::new();

        let pred1 = array![1.0, 2.0, 3.0];
        let pred2 = array![2.0, 3.0, 4.0];
        let predictions = vec![pred1, pred2];
        let weights = array![0.6, 0.4];

        let result = ops.vectorized_weighted_sum(&predictions, &weights).unwrap();
        let expected = array![1.4, 2.4, 3.4];

        for (actual, expected) in result.iter().zip(expected.iter()) {
            assert!((actual - expected).abs() < 1e-10);
        }
    }

    #[test]
    fn test_loop_optimized_algorithms() {
        let alg = LoopOptimizedAlgorithms::new(4, 8);

        let array = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let sum = alg.unrolled_sum(&array);

        assert_eq!(sum, 15.0);
    }
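
    // Additional usage sketches for APIs not covered above. Expected values
    // follow directly from the implementations in this module; the specific
    // inputs are illustrative assumptions.
    #[test]
    fn test_optimized_ensemble_predict() {
        let mut optimizer = CpuOptimizer::auto_detect();

        // Two linear models; with fewer than four models the scalar path is taken.
        let models = vec![array![1.0, 1.0], array![-0.5, 0.5]];
        let x = array![[1.0, 2.0], [-1.0, -2.0]];

        let predictions = optimizer.optimized_ensemble_predict(&models, &x).unwrap();

        // Row 0 has a positive mean score (class 1), row 1 a negative one (class 0).
        assert_eq!(predictions[0], 1);
        assert_eq!(predictions[1], 0);
    }

    #[test]
    fn test_optimized_tree_traversal() {
        let mut optimizer = CpuOptimizer::auto_detect();

        // Depth-1 tree: the root splits on feature 0 at 0.5; leaves use -1
        // child indices and store the predicted class in the feature slot.
        let tree = vec![(0, 0.5, 1, 2), (1, 0.0, -1, -1), (2, 0.0, -1, -1)];

        let left = optimizer.optimized_tree_traversal(&tree, &array![0.3]).unwrap();
        let right = optimizer.optimized_tree_traversal(&tree, &array![0.8]).unwrap();

        assert_eq!(left, 1);
        assert_eq!(right, 2);
    }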
}