sklears_compose/simd_optimizations.rs

//! SIMD optimizations for parallel pipeline execution
//!
//! This module provides SIMD-optimized operations for data processing
//! in machine learning pipelines, focusing on vectorized operations
//! that can significantly improve performance.
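//!
//! A minimal usage sketch (illustrative only; marked `ignore` so it is not
//! compiled as a doctest):
//!
//! ```ignore
//! use scirs2_core::ndarray::array;
//!
//! let ops = SimdOps::default();
//! let a = array![1.0, 2.0, 3.0, 4.0];
//! let b = array![2.0, 3.0, 4.0, 5.0];
//! let dot = ops.dot_product(&a.view(), &b.view()).unwrap(); // 40.0
//! ```
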
use num_cpus;
use scirs2_core::ndarray::{
    s, Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Zip,
};
use sklears_core::{
    error::Result as SklResult,
    prelude::SklearsError,
    traits::Estimator,
    types::{Float, FloatBounds},
};
// The raw intrinsics below live in `std::arch::x86_64`, which only exists on
// 64-bit x86 targets, so gate the import on `x86_64` alone.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// SIMD configuration for optimized operations
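///
/// A hedged construction sketch (the field value is illustrative, not a tuned
/// recommendation; marked `ignore` so it is not compiled as a doctest):
///
/// ```ignore
/// let config = SimdConfig {
///     simd_threshold: 128,        // only vectorize arrays of at least 128 elements
///     ..SimdConfig::default()     // keep the detected CPU features
/// };
/// let ops = SimdOps::new(config);
/// ```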
#[derive(Debug, Clone)]
pub struct SimdConfig {
    /// Enable AVX2 instructions
    pub use_avx2: bool,
    /// Enable AVX512 instructions
    pub use_avx512: bool,
    /// Enable FMA instructions
    pub use_fma: bool,
    /// Vectorization width (elements per vector)
    pub vector_width: usize,
    /// Memory alignment requirements
    pub alignment: usize,
    /// Minimum array size for SIMD operations
    pub simd_threshold: usize,
}

impl Default for SimdConfig {
    fn default() -> Self {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            Self {
                use_avx2: is_x86_feature_detected!("avx2"),
                use_avx512: is_x86_feature_detected!("avx512f"),
                use_fma: is_x86_feature_detected!("fma"),
                vector_width: if is_x86_feature_detected!("avx512f") {
                    16
                } else if is_x86_feature_detected!("avx2") {
                    8
                } else {
                    4
                },
                alignment: 64,
                simd_threshold: 64,
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            Self {
                use_avx2: false,
                use_avx512: false,
                use_fma: false,
                vector_width: 4,
                alignment: 64,
                simd_threshold: 64,
            }
        }
    }
}

/// SIMD-optimized operations for pipeline processing
pub struct SimdOps {
    config: SimdConfig,
}

impl SimdOps {
    /// Create new SIMD operations instance
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self { config }
    }

    /// Create with default configuration
    #[must_use]
    pub fn default() -> Self {
        Self::new(SimdConfig::default())
    }

    /// Vectorized addition of two arrays
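    ///
    /// A minimal usage sketch (illustrative; marked `ignore` so it is not compiled
    /// as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let ops = SimdOps::default();
    /// let a = array![1.0, 2.0, 3.0, 4.0];
    /// let b = array![5.0, 6.0, 7.0, 8.0];
    /// let sum = ops.add_arrays(&a.view(), &b.view()).unwrap();
    /// assert_eq!(sum, array![6.0, 8.0, 10.0, 12.0]);
    /// ```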
    pub fn add_arrays(
        &self,
        a: &ArrayView1<Float>,
        b: &ArrayView1<Float>,
    ) -> SklResult<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have the same length".to_string(),
            ));
        }

        let len = a.len();
        let mut result = Array1::zeros(len);

        // Scalar implementation; `add_arrays_avx2` below requires contiguous
        // buffers and is not dispatched automatically yet.
        Zip::from(&mut result)
            .and(a)
            .and(b)
            .for_each(|r, &a_val, &b_val| *r = a_val + b_val);

        Ok(result)
    }

    /// AVX2-optimized array addition
    ///
    /// Assumes `a`, `b`, and `result` are contiguous in memory.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn add_arrays_avx2(
        &self,
        a: &ArrayView1<Float>,
        b: &ArrayView1<Float>,
        result: &mut ArrayViewMut1<Float>,
    ) -> SklResult<()> {
        let len = a.len();
        let vector_len = 4; // AVX2 processes 4 f64 values at once
        let chunks = len / vector_len;

        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();
        let result_ptr = result.as_mut_ptr();

        // Process chunks of 4 elements
        for i in 0..chunks {
            let offset = i * vector_len;

            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let b_vec = _mm256_loadu_pd(b_ptr.add(offset));
            let sum_vec = _mm256_add_pd(a_vec, b_vec);

            _mm256_storeu_pd(result_ptr.add(offset), sum_vec);
        }

        // Handle remaining elements
        for i in (chunks * vector_len)..len {
            *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
        }

        Ok(())
    }

    /// Vectorized matrix multiplication
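    ///
    /// A minimal usage sketch (illustrative; marked `ignore` so it is not compiled
    /// as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let ops = SimdOps::default();
    /// let a = array![[1.0, 2.0], [3.0, 4.0]];
    /// let b = array![[5.0, 6.0], [7.0, 8.0]];
    /// let c = ops.matrix_multiply(&a.view(), &b.view()).unwrap();
    /// assert_eq!(c, array![[19.0, 22.0], [43.0, 50.0]]);
    /// ```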
    pub fn matrix_multiply(
        &self,
        a: &ArrayView2<Float>,
        b: &ArrayView2<Float>,
    ) -> SklResult<Array2<Float>> {
        if a.ncols() != b.nrows() {
            return Err(SklearsError::InvalidInput(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let (m, k) = (a.nrows(), a.ncols());
        let n = b.ncols();
        let mut result = Array2::zeros((m, n));

        // Naive triple loop; `matrix_multiply_avx2` below assumes row-major,
        // contiguous inputs and is not dispatched automatically yet.
        for i in 0..m {
            for j in 0..n {
                let mut sum = 0.0;
                for idx in 0..k {
                    sum += a[[i, idx]] * b[[idx, j]];
                }
                result[[i, j]] = sum;
            }
        }

        Ok(result)
    }

    /// AVX2-optimized matrix multiplication
    ///
    /// Assumes row-major, contiguous `b` and `result` buffers.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn matrix_multiply_avx2(
        &self,
        a: &ArrayView2<Float>,
        b: &ArrayView2<Float>,
        result: &mut ArrayViewMut2<Float>,
    ) -> SklResult<()> {
        let (m, k) = (a.nrows(), a.ncols());
        let n = b.ncols();
        let vector_len = 4; // AVX2 processes 4 f64 values at once

        for i in 0..m {
            for j in (0..n).step_by(vector_len) {
                let end_j = std::cmp::min(j + vector_len, n);
                let mut sum_vec = _mm256_setzero_pd();

                for idx in 0..k {
                    let a_val = _mm256_broadcast_sd(&a[[i, idx]]);

                    if end_j - j == vector_len {
                        let b_vec = _mm256_loadu_pd(b.as_ptr().add(idx * n + j));
                        sum_vec = _mm256_fmadd_pd(a_val, b_vec, sum_vec);
                    } else {
                        // Handle remainder columns with scalar accumulation
                        for col in j..end_j {
                            let prev_sum = result[[i, col]];
                            result[[i, col]] = prev_sum + a[[i, idx]] * b[[idx, col]];
                        }
                        continue;
                    }
                }

                if end_j - j == vector_len {
                    _mm256_storeu_pd(result.as_mut_ptr().add(i * n + j), sum_vec);
                }
            }
        }

        Ok(())
    }

    /// Vectorized dot product
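    ///
    /// A minimal usage sketch (illustrative; marked `ignore` so it is not compiled
    /// as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let ops = SimdOps::default();
    /// let a = array![1.0, 2.0, 3.0, 4.0];
    /// let b = array![2.0, 3.0, 4.0, 5.0];
    /// assert_eq!(ops.dot_product(&a.view(), &b.view()).unwrap(), 40.0);
    /// ```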
    pub fn dot_product(&self, a: &ArrayView1<Float>, b: &ArrayView1<Float>) -> SklResult<Float> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have the same length".to_string(),
            ));
        }

        // Scalar implementation; `dot_product_avx2` below is not dispatched
        // automatically yet.
        Ok(a.iter().zip(b.iter()).map(|(x, y)| x * y).sum())
    }

    /// AVX2-optimized dot product
    ///
    /// Assumes `a` and `b` are contiguous in memory.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn dot_product_avx2(
        &self,
        a: &ArrayView1<Float>,
        b: &ArrayView1<Float>,
    ) -> SklResult<Float> {
        let len = a.len();
        let vector_len = 4; // AVX2 processes 4 f64 values at once
        let chunks = len / vector_len;

        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();

        let mut sum_vec = _mm256_setzero_pd();

        // Process chunks
        for i in 0..chunks {
            let offset = i * vector_len;
            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let b_vec = _mm256_loadu_pd(b_ptr.add(offset));
            sum_vec = _mm256_fmadd_pd(a_vec, b_vec, sum_vec);
        }

        // Horizontal sum of vector elements
        let mut result: [f64; 4] = [0.0; 4];
        _mm256_storeu_pd(result.as_mut_ptr(), sum_vec);
        let mut total_sum = result.iter().sum::<f64>();

        // Handle remainder
        for i in (chunks * vector_len)..len {
            total_sum += *a_ptr.add(i) * *b_ptr.add(i);
        }

        Ok(total_sum)
    }

    /// Vectorized element-wise operations
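    ///
    /// A minimal usage sketch applying a closure element-wise (illustrative; marked
    /// `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::array;
    ///
    /// let ops = SimdOps::default();
    /// let a = array![1.0, 2.0, 3.0];
    /// let doubled = ops.elementwise_op(&a.view(), |x| x * 2.0).unwrap();
    /// assert_eq!(doubled, array![2.0, 4.0, 6.0]);
    /// ```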
    pub fn elementwise_op<F>(&self, a: &ArrayView1<Float>, op: F) -> SklResult<Array1<Float>>
    where
        F: Fn(Float) -> Float + Copy,
    {
        let len = a.len();
        let mut result = Array1::zeros(len);

        if len >= self.config.simd_threshold && self.can_vectorize_op() {
            // For operations that can be vectorized, use SIMD
            self.elementwise_op_simd(a, op, &mut result.view_mut())?;
        } else {
            // Fallback to scalar operations
            Zip::from(&mut result)
                .and(a)
                .for_each(|r, &val| *r = op(val));
        }

        Ok(result)
    }

    /// Check if operation can be vectorized efficiently
    fn can_vectorize_op(&self) -> bool {
        // This is a simplified check - in practice, you'd analyze the operation
        self.config.use_avx2
    }

    /// SIMD element-wise operations (simplified implementation)
    fn elementwise_op_simd<F>(
        &self,
        a: &ArrayView1<Float>,
        op: F,
        result: &mut ArrayViewMut1<Float>,
    ) -> SklResult<()>
    where
        F: Fn(Float) -> Float + Copy,
    {
        // For complex operations, fall back to scalar for now.
        // Real implementations would use lookup tables or polynomial approximations.
        Zip::from(result).and(a).for_each(|r, &val| *r = op(val));
        Ok(())
    }

    /// Vectorized normalization (L2 norm)
    pub fn normalize_l2(&self, a: &ArrayView1<Float>) -> SklResult<Array1<Float>> {
        let norm = self.dot_product(a, a)?.sqrt();
        if norm == 0.0 {
            return Ok(a.to_owned());
        }

        let inv_norm = 1.0 / norm;
        self.elementwise_op(a, |x| x * inv_norm)
    }

    /// Vectorized scaling
    pub fn scale(&self, a: &ArrayView1<Float>, scale_factor: Float) -> SklResult<Array1<Float>> {
        // Scalar implementation; `scale_avx2` below is not dispatched automatically yet.
        Ok(a.mapv(|x| x * scale_factor))
    }

    /// AVX2-optimized scaling
    ///
    /// Assumes `a` is contiguous in memory.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn scale_avx2(
        &self,
        a: &ArrayView1<Float>,
        scale_factor: Float,
    ) -> SklResult<Array1<Float>> {
        let len = a.len();
        let vector_len = 4; // AVX2 processes 4 f64 values at once
        let chunks = len / vector_len;

        let mut result = Array1::zeros(len);
        let a_ptr = a.as_ptr();
        let result_ptr: *mut Float = result.as_mut_ptr();
        let scale_vec = _mm256_broadcast_sd(&scale_factor);

        // Process chunks
        for i in 0..chunks {
            let offset = i * vector_len;
            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let scaled_vec = _mm256_mul_pd(a_vec, scale_vec);
            _mm256_storeu_pd(result_ptr.add(offset), scaled_vec);
        }

        // Handle remainder
        for i in (chunks * vector_len)..len {
            *result_ptr.add(i) = *a_ptr.add(i) * scale_factor;
        }

        Ok(result)
    }

    /// Memory-aligned array allocation for optimal SIMD performance
    #[must_use]
    pub fn aligned_array_1d(&self, size: usize) -> Array1<Float> {
        // Use ndarray's default allocation which should be reasonably aligned
        // In production, you'd want to use aligned allocation
        Array1::zeros(size)
    }

    /// Memory-aligned matrix allocation
    #[must_use]
    pub fn aligned_array_2d(&self, rows: usize, cols: usize) -> Array2<Float> {
        Array2::zeros((rows, cols))
    }

    /// Check if arrays are properly aligned for SIMD operations
    #[must_use]
    pub fn is_aligned(&self, ptr: *const Float) -> bool {
        (ptr as usize) % self.config.alignment == 0
    }

    /// Get optimal chunk size for parallel SIMD operations
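    ///
    /// The chunk size is the larger of `simd_threshold` and `total_size / n_cpus`,
    /// rounded down to a multiple of `vector_width`. A hedged worked example
    /// (assuming 8 logical CPUs and the default AVX2 config with `vector_width == 8`):
    /// `total_size = 1000` gives `max(64, 125) = 125`, which rounds down to `120`.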
    #[must_use]
    pub fn optimal_chunk_size(&self, total_size: usize) -> usize {
        let base_chunk = std::cmp::max(self.config.simd_threshold, total_size / num_cpus::get());

        // Round to vector boundary
        (base_chunk / self.config.vector_width) * self.config.vector_width
    }

    /// Vectorized feature standardization (z-score normalization)
    ///
    /// Uses the sample variance (denominator `n_samples - 1`) for each feature.
    pub fn standardize_features(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
        let (n_samples, n_features) = data.dim();
        let mut standardized = Array2::zeros((n_samples, n_features));

        for j in 0..n_features {
            let column = data.column(j);
            let mean = column.sum() / (n_samples as Float);
            let variance = column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>()
                / ((n_samples - 1) as Float);
            let std_dev = variance.sqrt();

            if std_dev > 0.0 {
                for i in 0..n_samples {
                    standardized[[i, j]] = (data[[i, j]] - mean) / std_dev;
                }
            } else {
                for i in 0..n_samples {
                    standardized[[i, j]] = 0.0;
                }
            }
        }

        Ok(standardized)
    }

    /// Vectorized min-max scaling
    pub fn min_max_scale(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
        let (n_samples, n_features) = data.dim();
        let mut scaled = Array2::zeros((n_samples, n_features));

        for j in 0..n_features {
            let column = data.column(j);
            let min_val = column.fold(Float::INFINITY, |acc, &x| acc.min(x));
            let max_val = column.fold(Float::NEG_INFINITY, |acc, &x| acc.max(x));
            let range = max_val - min_val;

            if range > 0.0 {
                for i in 0..n_samples {
                    scaled[[i, j]] = (data[[i, j]] - min_val) / range;
                }
            } else {
                for i in 0..n_samples {
                    scaled[[i, j]] = 0.0;
                }
            }
        }

        Ok(scaled)
    }

    /// Generate polynomial features up to specified degree
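    ///
    /// The number of generated features is the number of degree-`d` monomials with
    /// repetition over `n` inputs, summed for `d = 1..=degree`, i.e.
    /// `sum_d C(n + d - 1, d)`. For example, `n = 2`, `degree = 2` yields
    /// `C(2, 1) + C(3, 2) = 2 + 3 = 5` columns: `x1, x2, x1^2, x1*x2, x2^2`.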
    pub fn polynomial_features(
        &self,
        data: &ArrayView2<Float>,
        degree: usize,
    ) -> SklResult<Array2<Float>> {
        if degree == 0 {
            return Err(SklearsError::InvalidInput(
                "Degree must be at least 1".to_string(),
            ));
        }

        let (n_samples, n_features) = data.dim();

        // Calculate number of output features for polynomial expansion
        let mut n_output_features = 0;
        for d in 1..=degree {
            n_output_features += Self::binomial_coefficient(n_features + d - 1, d);
        }

        let mut result = Array2::zeros((n_samples, n_output_features));

        for i in 0..n_samples {
            let mut col_idx = 0;

            // Generate polynomial terms
            for d in 1..=degree {
                Self::generate_polynomial_terms_recursive(
                    data.row(i).as_slice().unwrap(),
                    d,
                    0,
                    1.0,
                    result.row_mut(i).as_slice_mut().unwrap(),
                    &mut col_idx,
                );
            }
        }

        Ok(result)
    }

    /// Generate polynomial terms recursively
    fn generate_polynomial_terms_recursive(
        features: &[Float],
        remaining_degree: usize,
        start_idx: usize,
        current_term: Float,
        output: &mut [Float],
        col_idx: &mut usize,
    ) {
        if remaining_degree == 0 {
            output[*col_idx] = current_term;
            *col_idx += 1;
            return;
        }

        for i in start_idx..features.len() {
            Self::generate_polynomial_terms_recursive(
                features,
                remaining_degree - 1,
                i,
                current_term * features[i],
                output,
                col_idx,
            );
        }
    }

    /// Calculate binomial coefficient C(n, k)
    fn binomial_coefficient(n: usize, k: usize) -> usize {
        if k > n {
            return 0;
        }
        if k == 0 || k == n {
            return 1;
        }

        let k = k.min(n - k); // Take advantage of symmetry
        let mut result = 1;
        for i in 0..k {
            result = result * (n - i) / (i + 1);
        }
        result
    }

    /// Vectorized sum with SIMD optimization
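    ///
    /// Dispatches to the AVX2 path for large contiguous inputs on `x86_64` and
    /// otherwise falls back to `ndarray`'s scalar sum. A minimal usage sketch
    /// (illustrative; marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array1;
    ///
    /// let ops = SimdOps::default();
    /// let data = Array1::from_elem(1_000, 0.5);
    /// assert!((ops.vectorized_sum(&data.view()).unwrap() - 500.0).abs() < 1e-9);
    /// ```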
    pub fn vectorized_sum(&self, data: &ArrayView1<Float>) -> SklResult<Float> {
        // The AVX2 path reads the buffer through raw pointers, so only take it for
        // contiguous views that are large enough to amortize the setup cost.
        if data.len() >= self.config.simd_threshold
            && self.config.use_avx2
            && data.is_standard_layout()
        {
            #[cfg(target_arch = "x86_64")]
            unsafe {
                return self.vectorized_sum_avx2(data);
            }
        }

        // Fallback to standard sum
        Ok(data.sum())
    }

    /// AVX2-optimized vectorized sum
    ///
    /// Assumes `data` is contiguous in memory.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn vectorized_sum_avx2(&self, data: &ArrayView1<Float>) -> SklResult<Float> {
        let len = data.len();
        let vector_len = 4; // AVX2 processes 4 f64 values at once
        let chunks = len / vector_len;

        let data_ptr = data.as_ptr();
        let mut sum_vec = _mm256_setzero_pd();

        // Process chunks of 4 elements
        for i in 0..chunks {
            let offset = i * vector_len;
            let data_vec = _mm256_loadu_pd(data_ptr.add(offset));
            sum_vec = _mm256_add_pd(sum_vec, data_vec);
        }

        // Horizontal sum of vector elements
        let mut result_array: [f64; 4] = [0.0; 4];
        _mm256_storeu_pd(result_array.as_mut_ptr(), sum_vec);
        let mut total_sum = result_array.iter().sum::<f64>();

        // Handle remaining elements
        for i in (chunks * vector_len)..len {
            total_sum += *data_ptr.add(i);
        }

        Ok(total_sum)
    }

    /// Ultra-fast unsafe memory copy optimized for aligned data
    ///
    /// # Safety
    ///
    /// This function is unsafe because:
    /// - It assumes src and dst pointers are valid for `count` elements
    /// - It assumes proper memory alignment for SIMD operations
    /// - It performs unchecked pointer arithmetic
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    pub unsafe fn fast_aligned_copy(
        &self,
        src: *const Float,
        dst: *mut Float,
        count: usize,
    ) -> SklResult<()> {
        // Validate alignment assumptions
        debug_assert!(src as usize % self.config.alignment == 0);
        debug_assert!(dst as usize % self.config.alignment == 0);
        debug_assert!(count > 0);

        let vector_len = 4; // AVX2 handles 4 f64 at once
        let simd_count = count / vector_len;
        let remainder = count % vector_len;

        // SIMD copy for bulk data
        for i in 0..simd_count {
            let offset = i * vector_len;
            let data_vec = _mm256_load_pd(src.add(offset));
            _mm256_store_pd(dst.add(offset), data_vec);
        }

        // Handle remainder with scalar copy
        let remainder_start = simd_count * vector_len;
        for i in 0..remainder {
            *dst.add(remainder_start + i) = *src.add(remainder_start + i);
        }

        Ok(())
    }

    /// Unsafe unrolled matrix multiplication for small fixed-size matrices
    ///
    /// # Safety
    ///
    /// This function is unsafe because:
    /// - It assumes matrix pointers are valid and properly aligned
    /// - It performs manual loop unrolling with unchecked indexing
    /// - It assumes matrices are in row-major order
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    pub unsafe fn fast_small_matrix_mul(
        &self,
        a: *const Float,
        b: *const Float,
        c: *mut Float,
        m: usize,
        n: usize,
        k: usize,
    ) -> SklResult<()> {
        debug_assert!(m > 0 && n > 0 && k > 0);
        debug_assert!(m <= 16 && n <= 16 && k <= 16); // Optimized for small matrices

        // Manual loop unrolling for small matrices (4x4 case optimized)
        if m == 4 && n == 4 && k == 4 {
            // Highly optimized 4x4 x 4x4 multiplication
            let a00 = *a.add(0);
            let a01 = *a.add(1);
            let a02 = *a.add(2);
            let a03 = *a.add(3);
            let a10 = *a.add(4);
            let a11 = *a.add(5);
            let a12 = *a.add(6);
            let a13 = *a.add(7);
            let a20 = *a.add(8);
            let a21 = *a.add(9);
            let a22 = *a.add(10);
            let a23 = *a.add(11);
            let a30 = *a.add(12);
            let a31 = *a.add(13);
            let a32 = *a.add(14);
            let a33 = *a.add(15);

            let b00 = *b.add(0);
            let b01 = *b.add(1);
            let b02 = *b.add(2);
            let b03 = *b.add(3);
            let b10 = *b.add(4);
            let b11 = *b.add(5);
            let b12 = *b.add(6);
            let b13 = *b.add(7);
            let b20 = *b.add(8);
            let b21 = *b.add(9);
            let b22 = *b.add(10);
            let b23 = *b.add(11);
            let b30 = *b.add(12);
            let b31 = *b.add(13);
            let b32 = *b.add(14);
            let b33 = *b.add(15);

            // Unrolled computation
            *c.add(0) = a00 * b00 + a01 * b10 + a02 * b20 + a03 * b30;
            *c.add(1) = a00 * b01 + a01 * b11 + a02 * b21 + a03 * b31;
            *c.add(2) = a00 * b02 + a01 * b12 + a02 * b22 + a03 * b32;
            *c.add(3) = a00 * b03 + a01 * b13 + a02 * b23 + a03 * b33;

            *c.add(4) = a10 * b00 + a11 * b10 + a12 * b20 + a13 * b30;
            *c.add(5) = a10 * b01 + a11 * b11 + a12 * b21 + a13 * b31;
            *c.add(6) = a10 * b02 + a11 * b12 + a12 * b22 + a13 * b32;
            *c.add(7) = a10 * b03 + a11 * b13 + a12 * b23 + a13 * b33;

            *c.add(8) = a20 * b00 + a21 * b10 + a22 * b20 + a23 * b30;
            *c.add(9) = a20 * b01 + a21 * b11 + a22 * b21 + a23 * b31;
            *c.add(10) = a20 * b02 + a21 * b12 + a22 * b22 + a23 * b32;
            *c.add(11) = a20 * b03 + a21 * b13 + a22 * b23 + a23 * b33;

            *c.add(12) = a30 * b00 + a31 * b10 + a32 * b20 + a33 * b30;
            *c.add(13) = a30 * b01 + a31 * b11 + a32 * b21 + a33 * b31;
            *c.add(14) = a30 * b02 + a31 * b12 + a32 * b22 + a33 * b32;
            *c.add(15) = a30 * b03 + a31 * b13 + a32 * b23 + a33 * b33;
        } else {
            // Fallback for other small sizes (simple triple loop)
            for i in 0..m {
                for j in 0..n {
                    let mut sum = 0.0;
                    for idx in 0..k {
                        sum += *a.add(i * k + idx) * *b.add(idx * n + j);
                    }
                    *c.add(i * n + j) = sum;
                }
            }
        }

        Ok(())
    }

    /// Unsafe cache-oblivious matrix transpose for better cache performance
    ///
    /// # Safety
    ///
    /// This function is unsafe because:
    /// - It performs unchecked pointer arithmetic
    /// - It assumes valid memory layouts for source and destination
    /// - It uses recursive divide-and-conquer with raw pointers
    pub unsafe fn cache_oblivious_transpose(
        &self,
        src: *const Float,
        dst: *mut Float,
        src_rows: usize,
        src_cols: usize,
        dst_cols: usize,
        row_start: usize,
        col_start: usize,
        block_rows: usize,
        block_cols: usize,
    ) -> SklResult<()> {
        const CACHE_BLOCK_SIZE: usize = 64; // Cache line optimized

        if block_rows <= CACHE_BLOCK_SIZE && block_cols <= CACHE_BLOCK_SIZE {
            // Base case: small block, do simple transpose
            for i in 0..block_rows {
                for j in 0..block_cols {
                    let src_idx = (row_start + i) * src_cols + (col_start + j);
                    let dst_idx = (col_start + j) * dst_cols + (row_start + i);
                    *dst.add(dst_idx) = *src.add(src_idx);
                }
            }
        } else if block_rows >= block_cols {
            // Split rows
            let mid_rows = block_rows / 2;
            self.cache_oblivious_transpose(
                src, dst, src_rows, src_cols, dst_cols, row_start, col_start, mid_rows, block_cols,
            )?;
            self.cache_oblivious_transpose(
                src,
                dst,
                src_rows,
                src_cols,
                dst_cols,
                row_start + mid_rows,
                col_start,
                block_rows - mid_rows,
                block_cols,
            )?;
        } else {
            // Split columns
            let mid_cols = block_cols / 2;
            self.cache_oblivious_transpose(
                src, dst, src_rows, src_cols, dst_cols, row_start, col_start, block_rows, mid_cols,
            )?;
            self.cache_oblivious_transpose(
                src,
                dst,
                src_rows,
                src_cols,
                dst_cols,
                row_start,
                col_start + mid_cols,
                block_rows,
                block_cols - mid_cols,
            )?;
        }

        Ok(())
    }

    /// Unsafe vectorized sum with manual four-way loop unrolling
    ///
    /// # Safety
    ///
    /// This function is unsafe because:
    /// - It assumes proper memory alignment and validity
    /// - It performs unchecked SIMD operations
    /// - It uses manual loop unrolling for maximum performance
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    pub unsafe fn fast_vectorized_sum(&self, data: *const Float, len: usize) -> SklResult<Float> {
        debug_assert!(len > 0);
        debug_assert!(data as usize % self.config.alignment == 0);

        let vector_len = 4; // AVX2 processes 4 f64 values at once
        let unroll_factor = 4; // Process 4 vectors per iteration
        let unrolled_len = vector_len * unroll_factor;
        let unrolled_count = len / unrolled_len;
        let remainder = len % unrolled_len;

        // Initialize accumulators
        let mut sum1 = _mm256_setzero_pd();
        let mut sum2 = _mm256_setzero_pd();
        let mut sum3 = _mm256_setzero_pd();
        let mut sum4 = _mm256_setzero_pd();

        // Unrolled vectorized loop
        for i in 0..unrolled_count {
            let offset = i * unrolled_len;

            let vec1 = _mm256_load_pd(data.add(offset));
            let vec2 = _mm256_load_pd(data.add(offset + vector_len));
            let vec3 = _mm256_load_pd(data.add(offset + vector_len * 2));
            let vec4 = _mm256_load_pd(data.add(offset + vector_len * 3));

            sum1 = _mm256_add_pd(sum1, vec1);
            sum2 = _mm256_add_pd(sum2, vec2);
            sum3 = _mm256_add_pd(sum3, vec3);
            sum4 = _mm256_add_pd(sum4, vec4);
        }

        // Combine partial sums
        let combined = _mm256_add_pd(_mm256_add_pd(sum1, sum2), _mm256_add_pd(sum3, sum4));

        // Horizontal sum; use an unaligned store because the stack array is only
        // guaranteed 8-byte alignment, not the 32 bytes an aligned store requires.
        let mut result: [f64; 4] = [0.0; 4];
        _mm256_storeu_pd(result.as_mut_ptr(), combined);
        let mut total: Float = result.iter().sum();

        // Handle remainder
        let remainder_start = unrolled_count * unrolled_len;
        for i in 0..remainder {
            total += *data.add(remainder_start + i);
        }

        Ok(total)
    }
}

/// SIMD-optimized feature transformations
pub struct SimdFeatureOps {
    simd_ops: SimdOps,
}

impl SimdFeatureOps {
    /// Create new SIMD feature operations
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self {
            simd_ops: SimdOps::new(config),
        }
    }

    /// Vectorized standardization (z-score normalization)
    ///
    /// Uses the population variance (denominator `n_rows`) for each feature.
    pub fn standardize(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
        let (n_rows, n_cols) = data.dim();
        let mut result = Array2::zeros((n_rows, n_cols));

        // Compute mean and std for each feature
        for col in 0..n_cols {
            let column = data.column(col);
            let mean = column.sum() / n_rows as Float;

            // Compute variance
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>() / n_rows as Float;
            let std_dev = variance.sqrt();

            if std_dev > 0.0 {
                let inv_std = 1.0 / std_dev;
                let standardized = self
                    .simd_ops
                    .elementwise_op(&column, |x| (x - mean) * inv_std)?;
                result.column_mut(col).assign(&standardized);
            } else {
                result.column_mut(col).fill(0.0);
            }
        }

        Ok(result)
    }

    /// Vectorized min-max scaling
    pub fn min_max_scale(
        &self,
        data: &ArrayView2<Float>,
        feature_range: (Float, Float),
    ) -> SklResult<Array2<Float>> {
        let (min_val, max_val) = feature_range;
        let scale = max_val - min_val;

        let (n_rows, n_cols) = data.dim();
        let mut result = Array2::zeros((n_rows, n_cols));

        for col in 0..n_cols {
            let column = data.column(col);
            let col_min = column.iter().copied().fold(Float::INFINITY, Float::min);
            let col_max = column.iter().copied().fold(Float::NEG_INFINITY, Float::max);

            if col_max > col_min {
                let col_range = col_max - col_min;
                let scaled = self
                    .simd_ops
                    .elementwise_op(&column, |x| min_val + scale * (x - col_min) / col_range)?;
                result.column_mut(col).assign(&scaled);
            } else {
                result.column_mut(col).fill(min_val);
            }
        }

        Ok(result)
    }

    /// Vectorized polynomial features
    pub fn polynomial_features(
        &self,
        data: &ArrayView2<Float>,
        degree: usize,
    ) -> SklResult<Array2<Float>> {
        if degree < 1 {
            return Err(SklearsError::InvalidInput(
                "Degree must be at least 1".to_string(),
            ));
        }

        let (n_rows, n_cols) = data.dim();

        // Calculate number of polynomial features
        let n_output_features = self.calculate_poly_features(n_cols, degree);
        let mut result = Array2::zeros((n_rows, n_output_features));

        // Copy original features
        result.slice_mut(s![.., ..n_cols]).assign(data);
        let mut feature_idx = n_cols;

        // Generate polynomial combinations
        for deg in 2..=degree {
            feature_idx +=
                self.generate_combinations(data, &mut result.view_mut(), deg, feature_idx)?;
        }

        Ok(result)
    }

    /// Calculate number of polynomial features
    fn calculate_poly_features(&self, n_features: usize, degree: usize) -> usize {
        // This is a simplified calculation - a real implementation would use combinatorics
        let mut total = n_features;
        for d in 2..=degree {
            total += (n_features as f64).powi(d as i32) as usize / d; // Approximation
        }
        total
    }

    /// Generate polynomial combinations
    fn generate_combinations(
        &self,
        data: &ArrayView2<Float>,
        result: &mut ArrayViewMut2<Float>,
        degree: usize,
        start_idx: usize,
    ) -> SklResult<usize> {
        let n_cols = data.ncols();
        let mut feature_idx = start_idx;

        // Simplified: just add squared features for degree 2
        if degree == 2 {
            for col in 0..n_cols {
                let column = data.column(col);
                let squared = self.simd_ops.elementwise_op(&column, |x| x * x)?;
                result.column_mut(feature_idx).assign(&squared);
                feature_idx += 1;
            }
        }

        Ok(feature_idx - start_idx)
    }
}

/// Cache-friendly data layouts for SIMD operations
pub struct SimdDataLayout {
    config: SimdConfig,
}

impl SimdDataLayout {
    /// Create new SIMD data layout optimizer
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self { config }
    }

    /// Transpose matrix for better cache locality
    #[must_use]
    pub fn transpose_for_simd(&self, data: &ArrayView2<Float>) -> Array2<Float> {
        data.t().to_owned()
    }

    /// Reshape data for optimal SIMD processing
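    ///
    /// Columns are zero-padded up to the next multiple of `vector_width`. A hedged
    /// sketch (assuming the default AVX2 config with `vector_width == 8`): a
    /// `2 x 3` input becomes a `2 x 8` array whose last five columns are zeros.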
    #[must_use]
    pub fn optimize_layout(&self, data: &ArrayView2<Float>) -> Array2<Float> {
        let (rows, cols) = data.dim();

        // Pad columns to vector boundary if beneficial
        let padded_cols = if cols % self.config.vector_width != 0 {
            ((cols / self.config.vector_width) + 1) * self.config.vector_width
        } else {
            cols
        };

        if padded_cols > cols {
            let mut padded = Array2::zeros((rows, padded_cols));
            padded.slice_mut(s![.., ..cols]).assign(data);
            padded
        } else {
            data.to_owned()
        }
    }

    /// Create memory-efficient chunks for parallel processing
    #[must_use]
    pub fn create_chunks(
        &self,
        data: &ArrayView2<Float>,
        chunk_size: Option<usize>,
    ) -> Vec<Array2<Float>> {
        let (rows, _cols) = data.dim();
        let chunk_sz = chunk_size.unwrap_or(self.config.simd_threshold);

        let mut chunks = Vec::new();
        for start in (0..rows).step_by(chunk_sz) {
            let end = std::cmp::min(start + chunk_sz, rows);
            let chunk = data.slice(s![start..end, ..]).to_owned();
            chunks.push(chunk);
        }

        chunks
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_simd_config() {
        let config = SimdConfig::default();
        assert!(config.vector_width >= 4);
        assert!(config.alignment >= 16);
        assert!(config.simd_threshold > 0);
    }

    #[test]
    fn test_simd_add_arrays() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![5.0, 6.0, 7.0, 8.0];

        let result = simd_ops.add_arrays(&a.view(), &b.view()).unwrap();
        let expected = array![6.0, 8.0, 10.0, 12.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_dot_product() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![2.0, 3.0, 4.0, 5.0];

        let result = simd_ops.dot_product(&a.view(), &b.view()).unwrap();
        let expected = 1.0 * 2.0 + 2.0 * 3.0 + 3.0 * 4.0 + 4.0 * 5.0; // = 40.0

        assert!((result - expected).abs() < 1e-6);
    }

    #[test]
    fn test_matrix_multiply() {
        let simd_ops = SimdOps::default();
        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        let result = simd_ops.matrix_multiply(&a.view(), &b.view()).unwrap();
        let expected = array![[19.0, 22.0], [43.0, 50.0]];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_scale() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let scale_factor = 2.0;

        let result = simd_ops.scale(&a.view(), scale_factor).unwrap();
        let expected = array![2.0, 4.0, 6.0, 8.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_normalize_l2() {
        let simd_ops = SimdOps::default();
        let a = array![3.0, 4.0, 0.0];

        let result = simd_ops.normalize_l2(&a.view()).unwrap();
        // The L2 norm of [3, 4, 0] is 5.0
        let expected = array![3.0 / 5.0, 4.0 / 5.0, 0.0 / 5.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_feature_standardize() {
        let config = SimdConfig::default();
        let feature_ops = SimdFeatureOps::new(config);

        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let result = feature_ops.standardize(&data.view()).unwrap();

        // Check that each column has approximately zero mean and unit variance
        for col in 0..result.ncols() {
            let column = result.column(col);
            let mean = column.sum() / column.len() as Float;
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>() / column.len() as Float;

            assert!(mean.abs() < 1e-10, "Mean should be approximately zero");
            assert!(
                (variance - 1.0).abs() < 1e-6,
                "Variance should be approximately one"
            );
        }
    }

    #[test]
    fn test_min_max_scale() {
        let config = SimdConfig::default();
        let feature_ops = SimdFeatureOps::new(config);

        let data = array![[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]];
        let result = feature_ops.min_max_scale(&data.view(), (0.0, 1.0)).unwrap();

        // Check that values are in [0, 1] range
        for &val in result.iter() {
            assert!(
                (0.0..=1.0).contains(&val),
                "Values should be in [0, 1] range"
            );
        }

        // Check that min/max values are mapped correctly
        assert!((result[[0, 0]] - 0.0).abs() < 1e-6); // min of first column
        assert!((result[[2, 0]] - 1.0).abs() < 1e-6); // max of first column
    }

    #[test]
    fn test_optimal_chunk_size() {
        let simd_ops = SimdOps::default();
        let chunk_size = simd_ops.optimal_chunk_size(1000);

        assert!(chunk_size > 0);
        assert!(chunk_size % simd_ops.config.vector_width == 0);
    }

    #[test]
    fn test_data_layout_optimization() {
        let config = SimdConfig::default();
        let layout = SimdDataLayout::new(config);

        let data = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let optimized = layout.optimize_layout(&data.view());

        // Should maintain data integrity
        assert_eq!(optimized.slice(s![.., ..3]), data);
    }
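
    // Sketch test for the `vectorized_sum` dispatch path. The input length
    // (1000 elements, chosen purely for illustration) exceeds the default
    // `simd_threshold`, so the AVX2 branch is taken when the CPU supports it.
    #[test]
    fn test_vectorized_sum_large_input() {
        let simd_ops = SimdOps::default();
        let data = Array1::from_elem(1000, 0.5);

        let result = simd_ops.vectorized_sum(&data.view()).unwrap();

        assert!((result - 500.0).abs() < 1e-6);
    }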
}