1use num_cpus;
8use scirs2_core::ndarray::{
9 s, Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2, Zip,
10};
11use sklears_core::{
12 error::Result as SklResult,
13 prelude::SklearsError,
14 traits::Estimator,
15 types::{Float, FloatBounds},
16};
17#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
18use std::arch::x86_64::*;
19
/// Tuning knobs for the SIMD kernels in this module.
///
/// Defaults come from runtime CPU-feature detection; see the `Default` impl
/// below. `PartialEq`/`Eq` are derived so configurations can be compared in
/// tests and caching logic.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimdConfig {
    /// Whether AVX2 kernels may be dispatched.
    pub use_avx2: bool,
    /// Whether AVX-512 kernels may be dispatched.
    pub use_avx512: bool,
    /// Whether fused multiply-add instructions may be used.
    pub use_fma: bool,
    /// Preferred number of lanes per vector operation.
    pub vector_width: usize,
    /// Required pointer alignment (bytes) for aligned loads/stores.
    pub alignment: usize,
    /// Minimum element count before SIMD paths are attempted.
    pub simd_threshold: usize,
}
36
37impl Default for SimdConfig {
38 fn default() -> Self {
39 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
40 {
41 Self {
42 use_avx2: is_x86_feature_detected!("avx2"),
43 use_avx512: is_x86_feature_detected!("avx512f"),
44 use_fma: is_x86_feature_detected!("fma"),
45 vector_width: if is_x86_feature_detected!("avx512f") {
46 16
47 } else if is_x86_feature_detected!("avx2") {
48 8
49 } else {
50 4
51 },
52 alignment: 64,
53 simd_threshold: 64,
54 }
55 }
56 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
57 {
58 Self {
59 use_avx2: false,
60 use_avx512: false,
61 use_fma: false,
62 vector_width: 4,
63 alignment: 64,
64 simd_threshold: 64,
65 }
66 }
67 }
68}
69
70pub struct SimdOps {
72 config: SimdConfig,
73}
74
75impl SimdOps {
    /// Creates a `SimdOps` dispatcher with an explicit configuration.
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self { config }
    }
81
    /// Creates a `SimdOps` using auto-detected CPU features.
    ///
    /// NOTE(review): an inherent method named `default` shadows the
    /// conventional trait; consider adding `impl Default for SimdOps` so
    /// generic code can construct one (clippy: `should_implement_trait`).
    #[must_use]
    pub fn default() -> Self {
        Self::new(SimdConfig::default())
    }
87
88 pub fn add_arrays(
90 &self,
91 a: &ArrayView1<Float>,
92 b: &ArrayView1<Float>,
93 ) -> SklResult<Array1<Float>> {
94 if a.len() != b.len() {
95 return Err(SklearsError::InvalidInput(
96 "Arrays must have the same length".to_string(),
97 ));
98 }
99
100 let len = a.len();
101 let mut result = Array1::zeros(len);
102
103 Zip::from(&mut result)
105 .and(a)
106 .and(b)
107 .for_each(|r, &a_val, &b_val| *r = a_val + b_val);
108
109 Ok(result)
110 }
111
    /// AVX2 kernel: element-wise addition using 256-bit unaligned loads,
    /// 4 `f64` lanes per iteration, with a scalar tail.
    ///
    /// # Safety
    /// Caller must ensure AVX2 is available and that `a`, `b`, and `result`
    /// all have the same length.
    /// NOTE(review): the raw-pointer arithmetic assumes all three views are
    /// contiguous (standard layout); a strided view would be read/written
    /// incorrectly — confirm callers guarantee this.
    /// NOTE(review): this kernel is currently never dispatched by `add_arrays`.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn add_arrays_avx2(
        &self,
        a: &ArrayView1<Float>,
        b: &ArrayView1<Float>,
        result: &mut ArrayViewMut1<Float>,
    ) -> SklResult<()> {
        let len = a.len();
        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; let chunks = len / vector_len;
        let remainder = len % vector_len; // NOTE(review): unused; the tail is derived from `chunks` below

        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();
        let result_ptr = result.as_mut_ptr();

        for i in 0..chunks {
            let offset = i * vector_len;

            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let b_vec = _mm256_loadu_pd(b_ptr.add(offset));
            let sum_vec = _mm256_add_pd(a_vec, b_vec);

            _mm256_storeu_pd(result_ptr.add(offset), sum_vec);
        }

        // Scalar tail for the last `len % 4` elements.
        for i in (chunks * vector_len)..len {
            *result_ptr.add(i) = *a_ptr.add(i) + *b_ptr.add(i);
        }

        Ok(())
    }
148
149 pub fn matrix_multiply(
151 &self,
152 a: &ArrayView2<Float>,
153 b: &ArrayView2<Float>,
154 ) -> SklResult<Array2<Float>> {
155 if a.ncols() != b.nrows() {
156 return Err(SklearsError::InvalidInput(
157 "Matrix dimensions incompatible for multiplication".to_string(),
158 ));
159 }
160
161 let (m, k) = (a.nrows(), a.ncols());
162 let n = b.ncols();
163 let mut result = Array2::zeros((m, n));
164
165 for i in 0..m {
167 for j in 0..n {
168 let mut sum = 0.0;
169 for idx in 0..k {
170 sum += a[[i, idx]] * b[[idx, j]];
171 }
172 result[[i, j]] = sum;
173 }
174 }
175
176 Ok(result)
177 }
178
    /// AVX2+FMA matrix-product kernel processing 4 output columns at a time.
    ///
    /// # Safety
    /// Caller must ensure AVX2/FMA are available and that the dimensions of
    /// `a`, `b`, and `result` agree (`result` is m x n).
    /// NOTE(review): `b.as_ptr().add(idx * n + j)` assumes `b` is contiguous
    /// row-major — confirm callers guarantee this.
    /// NOTE(review): the ragged tail (`end_j - j < 4`) accumulates straight
    /// into `result`, which is only correct when `result` starts zeroed; the
    /// full-width path overwrites instead. This kernel is currently never
    /// dispatched by `matrix_multiply`.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn matrix_multiply_avx2(
        &self,
        a: &ArrayView2<Float>,
        b: &ArrayView2<Float>,
        result: &mut ArrayViewMut2<Float>,
    ) -> SklResult<()> {
        let (m, k) = (a.nrows(), a.ncols());
        let n = b.ncols();
        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; for i in 0..m {
            for j in (0..n).step_by(vector_len) {
                let end_j = std::cmp::min(j + vector_len, n);
                let mut sum_vec = _mm256_setzero_pd();

                for idx in 0..k {
                    // Broadcast a[i, idx] across all 4 lanes.
                    let a_val = _mm256_broadcast_sd(&a[[i, idx]]);

                    if end_j - j == vector_len {
                        let b_vec = _mm256_loadu_pd(b.as_ptr().add(idx * n + j));
                        sum_vec = _mm256_fmadd_pd(a_val, b_vec, sum_vec);
                    } else {
                        // Ragged tail: scalar accumulation per column.
                        for col in j..end_j {
                            let prev_sum = result[[i, col]];
                            result[[i, col]] = prev_sum + a[[i, idx]] * b[[idx, col]];
                        }
                        continue;
                    }
                }

                if end_j - j == vector_len {
                    _mm256_storeu_pd(result.as_mut_ptr().add(i * n + j), sum_vec);
                }
            }
        }

        Ok(())
    }
221
222 pub fn dot_product(&self, a: &ArrayView1<Float>, b: &ArrayView1<Float>) -> SklResult<Float> {
224 if a.len() != b.len() {
225 return Err(SklearsError::InvalidInput(
226 "Arrays must have the same length".to_string(),
227 ));
228 }
229
230 let len = a.len();
231
232 Ok(a.iter().zip(b.iter()).map(|(x, y)| x * y).sum())
234 }
235
    /// AVX2+FMA dot-product kernel: accumulates 4 `f64` lanes per iteration,
    /// then reduces the vector accumulator horizontally.
    ///
    /// # Safety
    /// Caller must ensure AVX2/FMA are available and that `a` and `b` are
    /// equal-length and contiguous (the raw-pointer walk assumes standard
    /// layout).
    /// NOTE(review): this kernel is currently never dispatched by
    /// `dot_product`.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn dot_product_avx2(
        &self,
        a: &ArrayView1<Float>,
        b: &ArrayView1<Float>,
    ) -> SklResult<Float> {
        let len = a.len();
        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; let chunks = len / vector_len;
        let remainder = len % vector_len; // NOTE(review): unused; the tail is derived from `chunks`

        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();

        let mut sum_vec = _mm256_setzero_pd();

        for i in 0..chunks {
            let offset = i * vector_len;
            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let b_vec = _mm256_loadu_pd(b_ptr.add(offset));
            // sum_vec += a_vec * b_vec, fused (one rounding step per lane).
            sum_vec = _mm256_fmadd_pd(a_vec, b_vec, sum_vec);
        }

        // Horizontal reduction: spill lanes (unaligned store is safe for a
        // stack array) and sum them.
        let mut result: [f64; 4] = [0.0; 4];
        _mm256_storeu_pd(result.as_mut_ptr(), sum_vec);
        let mut total_sum = result.iter().sum::<f64>();

        // Scalar tail.
        for i in (chunks * vector_len)..len {
            total_sum += *a_ptr.add(i) * *b_ptr.add(i);
        }

        Ok(total_sum)
    }
274
275 pub fn elementwise_op<F>(&self, a: &ArrayView1<Float>, op: F) -> SklResult<Array1<Float>>
277 where
278 F: Fn(Float) -> Float + Copy,
279 {
280 let len = a.len();
281 let mut result = Array1::zeros(len);
282
283 if len >= self.config.simd_threshold && self.can_vectorize_op() {
284 self.elementwise_op_simd(a, op, &mut result.view_mut())?;
286 } else {
287 Zip::from(&mut result)
289 .and(a)
290 .for_each(|r, &val| *r = op(val));
291 }
292
293 Ok(result)
294 }
295
    /// Returns whether the generic element-wise SIMD path may be taken
    /// (currently gated on AVX2 availability only).
    fn can_vectorize_op(&self) -> bool {
        self.config.use_avx2
    }
301
    /// "SIMD" element-wise application.
    ///
    /// NOTE(review): despite the name this is a scalar `Zip` loop — an
    /// arbitrary closure cannot be mapped onto intrinsics; the compiler may
    /// still auto-vectorize simple `op`s.
    fn elementwise_op_simd<F>(
        &self,
        a: &ArrayView1<Float>,
        op: F,
        result: &mut ArrayViewMut1<Float>,
    ) -> SklResult<()>
    where
        F: Fn(Float) -> Float + Copy,
    {
        Zip::from(result).and(a).for_each(|r, &val| *r = op(val));
        Ok(())
    }
317
318 pub fn normalize_l2(&self, a: &ArrayView1<Float>) -> SklResult<Array1<Float>> {
320 let norm = self.dot_product(a, a)?.sqrt();
321 if norm == 0.0 {
322 return Ok(a.to_owned());
323 }
324
325 let inv_norm = 1.0 / norm;
326 self.elementwise_op(a, |x| x * inv_norm)
327 }
328
329 pub fn scale(&self, a: &ArrayView1<Float>, scale_factor: Float) -> SklResult<Array1<Float>> {
331 Ok(a.mapv(|x| x * scale_factor))
333 }
334
    /// AVX2 scaling kernel: multiplies 4 `f64` lanes per iteration by a
    /// broadcast factor, with a scalar tail.
    ///
    /// # Safety
    /// Caller must ensure AVX2 is available and that `a` is contiguous
    /// (standard layout) — the raw-pointer walk assumes it.
    /// NOTE(review): this kernel is currently never dispatched by `scale`.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn scale_avx2(
        &self,
        a: &ArrayView1<Float>,
        scale_factor: Float,
    ) -> SklResult<Array1<Float>> {
        let len = a.len();
        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; let chunks = len / vector_len;
        let remainder = len % vector_len; // NOTE(review): unused; the tail is derived from `chunks`

        let mut result = Array1::zeros(len);
        let a_ptr = a.as_ptr();
        let result_ptr: *mut Float = result.as_mut_ptr();
        // One broadcast of the scalar factor, reused by every iteration.
        let scale_vec = _mm256_broadcast_sd(&scale_factor);

        for i in 0..chunks {
            let offset = i * vector_len;
            let a_vec = _mm256_loadu_pd(a_ptr.add(offset));
            let scaled_vec = _mm256_mul_pd(a_vec, scale_vec);
            _mm256_storeu_pd(result_ptr.add(offset), scaled_vec);
        }

        // Scalar tail for the last `len % 4` elements.
        for i in (chunks * vector_len)..len {
            *result_ptr.add(i) = *a_ptr.add(i) * scale_factor;
        }

        Ok(result)
    }
368
    /// Allocates a zero-filled 1-D array intended for SIMD use.
    ///
    /// NOTE(review): despite the name, this uses the default allocator and
    /// does NOT guarantee `config.alignment`; pair it with unaligned loads
    /// (`loadu`) or check `is_aligned` before using aligned intrinsics.
    #[must_use]
    pub fn aligned_array_1d(&self, size: usize) -> Array1<Float> {
        Array1::zeros(size)
    }
376
    /// Allocates a zero-filled 2-D array intended for SIMD use.
    ///
    /// NOTE(review): as with `aligned_array_1d`, no alignment beyond the
    /// allocator's default is actually guaranteed.
    #[must_use]
    pub fn aligned_array_2d(&self, rows: usize, cols: usize) -> Array2<Float> {
        Array2::zeros((rows, cols))
    }
382
    /// Returns true when `ptr` meets the configured byte alignment.
    #[must_use]
    pub fn is_aligned(&self, ptr: *const Float) -> bool {
        (ptr as usize) % self.config.alignment == 0
    }
388
389 #[must_use]
391 pub fn optimal_chunk_size(&self, total_size: usize) -> usize {
392 let base_chunk = std::cmp::max(self.config.simd_threshold, total_size / num_cpus::get());
393
394 (base_chunk / self.config.vector_width) * self.config.vector_width
396 }
397
398 pub fn standardize_features(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
400 let (n_samples, n_features) = data.dim();
401 let mut standardized = Array2::zeros((n_samples, n_features));
402
403 for j in 0..n_features {
404 let column = data.column(j);
405 let mean = column.sum() / (n_samples as Float);
406 let variance = column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>()
407 / ((n_samples - 1) as Float);
408 let std_dev = variance.sqrt();
409
410 if std_dev > 0.0 {
411 for i in 0..n_samples {
412 standardized[[i, j]] = (data[[i, j]] - mean) / std_dev;
413 }
414 } else {
415 for i in 0..n_samples {
416 standardized[[i, j]] = 0.0;
417 }
418 }
419 }
420
421 Ok(standardized)
422 }
423
424 pub fn min_max_scale(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
426 let (n_samples, n_features) = data.dim();
427 let mut scaled = Array2::zeros((n_samples, n_features));
428
429 for j in 0..n_features {
430 let column = data.column(j);
431 let min_val = column.fold(Float::INFINITY, |acc, &x| acc.min(x));
432 let max_val = column.fold(Float::NEG_INFINITY, |acc, &x| acc.max(x));
433 let range = max_val - min_val;
434
435 if range > 0.0 {
436 for i in 0..n_samples {
437 scaled[[i, j]] = (data[[i, j]] - min_val) / range;
438 }
439 } else {
440 for i in 0..n_samples {
441 scaled[[i, j]] = 0.0;
442 }
443 }
444 }
445
446 Ok(scaled)
447 }
448
449 pub fn polynomial_features(
451 &self,
452 data: &ArrayView2<Float>,
453 degree: usize,
454 ) -> SklResult<Array2<Float>> {
455 if degree == 0 {
456 return Err(SklearsError::InvalidInput(
457 "Degree must be at least 1".to_string(),
458 ));
459 }
460
461 let (n_samples, n_features) = data.dim();
462
463 let mut n_output_features = 0;
465 for d in 1..=degree {
466 n_output_features += Self::binomial_coefficient(n_features + d - 1, d);
467 }
468
469 let mut result = Array2::zeros((n_samples, n_output_features));
470
471 for i in 0..n_samples {
472 let mut col_idx = 0;
473
474 for d in 1..=degree {
476 Self::generate_polynomial_terms_recursive(
477 data.row(i).as_slice().unwrap(),
478 d,
479 0,
480 1.0,
481 result.row_mut(i).as_slice_mut().unwrap(),
482 &mut col_idx,
483 );
484 }
485 }
486
487 Ok(result)
488 }
489
    /// Recursively emits every monomial of exactly `remaining_degree` built
    /// from `features[start_idx..]` (combinations with repetition), writing
    /// each finished product into `output[*col_idx]` and advancing the index.
    ///
    /// Recursing with the same `i` (not `i + 1`) allows repeated factors such
    /// as x0^2, matching the C(n + d - 1, d) count used by the caller.
    fn generate_polynomial_terms_recursive(
        features: &[Float],
        remaining_degree: usize,
        start_idx: usize,
        current_term: Float,
        output: &mut [Float],
        col_idx: &mut usize,
    ) {
        // Base case: the accumulated product is a complete monomial.
        if remaining_degree == 0 {
            output[*col_idx] = current_term;
            *col_idx += 1;
            return;
        }

        for i in start_idx..features.len() {
            Self::generate_polynomial_terms_recursive(
                features,
                remaining_degree - 1,
                i,
                current_term * features[i],
                output,
                col_idx,
            );
        }
    }
516
517 fn binomial_coefficient(n: usize, k: usize) -> usize {
519 if k > n {
520 return 0;
521 }
522 if k == 0 || k == n {
523 return 1;
524 }
525
526 let k = k.min(n - k); let mut result = 1;
528 for i in 0..k {
529 result = result * (n - i) / (i + 1);
530 }
531 result
532 }
533
534 pub fn vectorized_sum(&self, data: &ArrayView1<Float>) -> SklResult<Float> {
536 if data.len() >= self.config.simd_threshold && self.config.use_avx2 {
537 #[cfg(target_arch = "x86_64")]
538 unsafe {
539 return self.vectorized_sum_avx2(data);
540 }
541 }
542
543 Ok(data.sum())
545 }
546
    /// AVX2 summation kernel: accumulates 4 `f64` lanes per iteration, then
    /// reduces horizontally and adds a scalar tail.
    ///
    /// # Safety
    /// Caller must ensure AVX2 is available and that `data` is contiguous
    /// (standard layout) — the raw-pointer walk assumes it.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn vectorized_sum_avx2(&self, data: &ArrayView1<Float>) -> SklResult<Float> {
        let len = data.len();
        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; let chunks = len / vector_len;
        let remainder = len % vector_len; // NOTE(review): unused; the tail is derived from `chunks`

        let data_ptr = data.as_ptr();
        let mut sum_vec = _mm256_setzero_pd();

        for i in 0..chunks {
            let offset = i * vector_len;
            let data_vec = _mm256_loadu_pd(data_ptr.add(offset));
            sum_vec = _mm256_add_pd(sum_vec, data_vec);
        }

        // Horizontal reduction: unaligned spill (safe for a stack array),
        // then sum the four lanes.
        let mut result_array: [f64; 4] = [0.0; 4];
        _mm256_storeu_pd(result_array.as_mut_ptr(), sum_vec);
        let mut total_sum = result_array.iter().sum::<f64>();

        // Scalar tail for the last `len % 4` elements.
        for i in (chunks * vector_len)..len {
            total_sum += *data_ptr.add(i);
        }

        Ok(total_sum)
    }
578
    /// Copies `count` elements from `src` to `dst` using aligned 256-bit
    /// moves for the bulk and scalar copies for the tail.
    ///
    /// # Safety
    /// - `src` must be valid for `count` reads and `dst` for `count` writes;
    ///   the buffers must not overlap.
    /// - Both pointers must satisfy the configured alignment:
    ///   `_mm256_load_pd` / `_mm256_store_pd` fault on unaligned addresses,
    ///   and the alignment here is only checked by `debug_assert!` (i.e. not
    ///   in release builds).
    /// - AVX2 must be available.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    pub unsafe fn fast_aligned_copy(
        &self,
        src: *const Float,
        dst: *mut Float,
        count: usize,
    ) -> SklResult<()> {
        // Contracts verified in debug builds only.
        debug_assert!(src as usize % self.config.alignment == 0);
        debug_assert!(dst as usize % self.config.alignment == 0);
        debug_assert!(count > 0);

        // 4 f64 lanes per 256-bit vector.
        let vector_len = 4; let simd_count = count / vector_len;
        let remainder = count % vector_len;

        for i in 0..simd_count {
            let offset = i * vector_len;
            let data_vec = _mm256_load_pd(src.add(offset));
            _mm256_store_pd(dst.add(offset), data_vec);
        }

        // Scalar tail for the last `count % 4` elements.
        let remainder_start = simd_count * vector_len;
        for i in 0..remainder {
            *dst.add(remainder_start + i) = *src.add(remainder_start + i);
        }

        Ok(())
    }
619
    /// Multiplies small row-major matrices (`c = a * b`, m x k by k x n),
    /// with a fully unrolled fast path for the 4x4 case.
    ///
    /// # Safety
    /// - `a` must be valid for `m * k` reads, `b` for `k * n` reads, and `c`
    ///   for `m * n` writes; the buffers must not overlap.
    /// - AVX2/FMA must be available (required by `target_feature`).
    ///   NOTE(review): the body uses no intrinsics — the attribute only
    ///   permits the compiler to auto-vectorize the scalar code.
    /// - Dimension limits (<= 16) are only checked by `debug_assert!`.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2", enable = "fma")]
    pub unsafe fn fast_small_matrix_mul(
        &self,
        a: *const Float,
        b: *const Float,
        c: *mut Float,
        m: usize,
        n: usize,
        k: usize,
    ) -> SklResult<()> {
        debug_assert!(m > 0 && n > 0 && k > 0);
        debug_assert!(m <= 16 && n <= 16 && k <= 16); if m == 4 && n == 4 && k == 4 {
            // 4x4 fast path: hoist both operands into locals, then compute
            // all 16 dot products fully unrolled.
            let a00 = *a.add(0);
            let a01 = *a.add(1);
            let a02 = *a.add(2);
            let a03 = *a.add(3);
            let a10 = *a.add(4);
            let a11 = *a.add(5);
            let a12 = *a.add(6);
            let a13 = *a.add(7);
            let a20 = *a.add(8);
            let a21 = *a.add(9);
            let a22 = *a.add(10);
            let a23 = *a.add(11);
            let a30 = *a.add(12);
            let a31 = *a.add(13);
            let a32 = *a.add(14);
            let a33 = *a.add(15);

            let b00 = *b.add(0);
            let b01 = *b.add(1);
            let b02 = *b.add(2);
            let b03 = *b.add(3);
            let b10 = *b.add(4);
            let b11 = *b.add(5);
            let b12 = *b.add(6);
            let b13 = *b.add(7);
            let b20 = *b.add(8);
            let b21 = *b.add(9);
            let b22 = *b.add(10);
            let b23 = *b.add(11);
            let b30 = *b.add(12);
            let b31 = *b.add(13);
            let b32 = *b.add(14);
            let b33 = *b.add(15);

            // Row 0 of c.
            *c.add(0) = a00 * b00 + a01 * b10 + a02 * b20 + a03 * b30;
            *c.add(1) = a00 * b01 + a01 * b11 + a02 * b21 + a03 * b31;
            *c.add(2) = a00 * b02 + a01 * b12 + a02 * b22 + a03 * b32;
            *c.add(3) = a00 * b03 + a01 * b13 + a02 * b23 + a03 * b33;

            // Row 1 of c.
            *c.add(4) = a10 * b00 + a11 * b10 + a12 * b20 + a13 * b30;
            *c.add(5) = a10 * b01 + a11 * b11 + a12 * b21 + a13 * b31;
            *c.add(6) = a10 * b02 + a11 * b12 + a12 * b22 + a13 * b32;
            *c.add(7) = a10 * b03 + a11 * b13 + a12 * b23 + a13 * b33;

            // Row 2 of c.
            *c.add(8) = a20 * b00 + a21 * b10 + a22 * b20 + a23 * b30;
            *c.add(9) = a20 * b01 + a21 * b11 + a22 * b21 + a23 * b31;
            *c.add(10) = a20 * b02 + a21 * b12 + a22 * b22 + a23 * b32;
            *c.add(11) = a20 * b03 + a21 * b13 + a22 * b23 + a23 * b33;

            // Row 3 of c.
            *c.add(12) = a30 * b00 + a31 * b10 + a32 * b20 + a33 * b30;
            *c.add(13) = a30 * b01 + a31 * b11 + a32 * b21 + a33 * b31;
            *c.add(14) = a30 * b02 + a31 * b12 + a32 * b22 + a33 * b32;
            *c.add(15) = a30 * b03 + a31 * b13 + a32 * b23 + a33 * b33;
        } else {
            // Generic naive triple loop for every other small shape.
            for i in 0..m {
                for j in 0..n {
                    let mut sum = 0.0;
                    for idx in 0..k {
                        sum += *a.add(i * k + idx) * *b.add(idx * n + j);
                    }
                    *c.add(i * n + j) = sum;
                }
            }
        }

        Ok(())
    }
714
    /// Recursively transposes the `block_rows` x `block_cols` sub-block of
    /// row-major `src` (anchored at `row_start`, `col_start`) into `dst`,
    /// halving the longer dimension until blocks are small enough to copy
    /// with good locality (cache-oblivious algorithm).
    ///
    /// # Safety
    /// - `src` must be valid for reads and `dst` for writes over the entire
    ///   requested block (row-major, with row strides `src_cols` and
    ///   `dst_cols` respectively); the buffers must not overlap.
    /// - NOTE(review): `src_rows` is never read — bounds are implied solely
    ///   by the initial call's block arguments; confirm callers pass a block
    ///   that actually fits both buffers.
    pub unsafe fn cache_oblivious_transpose(
        &self,
        src: *const Float,
        dst: *mut Float,
        src_rows: usize,
        src_cols: usize,
        dst_cols: usize,
        row_start: usize,
        col_start: usize,
        block_rows: usize,
        block_cols: usize,
    ) -> SklResult<()> {
        // Base-case edge length (elements): small enough to transpose
        // directly.
        const CACHE_BLOCK_SIZE: usize = 64; if block_rows <= CACHE_BLOCK_SIZE && block_cols <= CACHE_BLOCK_SIZE {
            for i in 0..block_rows {
                for j in 0..block_cols {
                    let src_idx = (row_start + i) * src_cols + (col_start + j);
                    let dst_idx = (col_start + j) * dst_cols + (row_start + i);
                    *dst.add(dst_idx) = *src.add(src_idx);
                }
            }
        } else if block_rows >= block_cols {
            // Split along the row dimension.
            let mid_rows = block_rows / 2;
            self.cache_oblivious_transpose(
                src, dst, src_rows, src_cols, dst_cols, row_start, col_start, mid_rows, block_cols,
            )?;
            self.cache_oblivious_transpose(
                src,
                dst,
                src_rows,
                src_cols,
                dst_cols,
                row_start + mid_rows,
                col_start,
                block_rows - mid_rows,
                block_cols,
            )?;
        } else {
            // Split along the column dimension.
            let mid_cols = block_cols / 2;
            self.cache_oblivious_transpose(
                src, dst, src_rows, src_cols, dst_cols, row_start, col_start, block_rows, mid_cols,
            )?;
            self.cache_oblivious_transpose(
                src,
                dst,
                src_rows,
                src_cols,
                dst_cols,
                row_start,
                col_start + mid_cols,
                block_rows,
                block_cols - mid_cols,
            )?;
        }

        Ok(())
    }
784
785 #[cfg(target_arch = "x86_64")]
794 #[target_feature(enable = "avx2", enable = "fma")]
795 pub unsafe fn fast_vectorized_sum(&self, data: *const Float, len: usize) -> SklResult<Float> {
796 debug_assert!(len > 0);
797 debug_assert!(data as usize % self.config.alignment == 0);
798
799 let vector_len = 4; let unroll_factor = 4; let unrolled_len = vector_len * unroll_factor;
802 let unrolled_count = len / unrolled_len;
803 let remainder = len % unrolled_len;
804
805 let mut sum1 = _mm256_setzero_pd();
807 let mut sum2 = _mm256_setzero_pd();
808 let mut sum3 = _mm256_setzero_pd();
809 let mut sum4 = _mm256_setzero_pd();
810
811 for i in 0..unrolled_count {
813 let offset = i * unrolled_len;
814
815 let vec1 = _mm256_load_pd(data.add(offset));
816 let vec2 = _mm256_load_pd(data.add(offset + vector_len));
817 let vec3 = _mm256_load_pd(data.add(offset + vector_len * 2));
818 let vec4 = _mm256_load_pd(data.add(offset + vector_len * 3));
819
820 sum1 = _mm256_add_pd(sum1, vec1);
821 sum2 = _mm256_add_pd(sum2, vec2);
822 sum3 = _mm256_add_pd(sum3, vec3);
823 sum4 = _mm256_add_pd(sum4, vec4);
824 }
825
826 let combined = _mm256_add_pd(_mm256_add_pd(sum1, sum2), _mm256_add_pd(sum3, sum4));
828
829 let mut result: [f64; 4] = [0.0; 4];
831 _mm256_store_pd(result.as_mut_ptr(), combined);
832 let mut total: Float = result.iter().sum();
833
834 let remainder_start = unrolled_count * unrolled_len;
836 for i in 0..remainder {
837 total += *data.add(remainder_start + i);
838 }
839
840 Ok(total)
841 }
842}
843
/// Feature-engineering helpers (standardize, min-max scale, polynomial
/// features) built on top of [`SimdOps`].
pub struct SimdFeatureOps {
    // Underlying element-wise kernel dispatcher.
    simd_ops: SimdOps,
}
848
849impl SimdFeatureOps {
    /// Creates feature ops backed by the given SIMD configuration.
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self {
            simd_ops: SimdOps::new(config),
        }
    }
857
    /// Standardizes each column to zero mean and unit variance; constant
    /// columns are set to 0.
    ///
    /// Uses the population variance (`/ n`), unlike
    /// `SimdOps::standardize_features`, which divides by `n - 1`.
    /// NOTE(review): confirm which convention is intended — the unit tests
    /// currently pin this (population) behavior.
    pub fn standardize(&self, data: &ArrayView2<Float>) -> SklResult<Array2<Float>> {
        let (n_rows, n_cols) = data.dim();
        let mut result = Array2::zeros((n_rows, n_cols));

        for col in 0..n_cols {
            let column = data.column(col);
            let mean = column.sum() / n_rows as Float;

            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>() / n_rows as Float;
            let std_dev = variance.sqrt();

            if std_dev > 0.0 {
                // Multiply by the reciprocal so the per-element closure
                // avoids a division.
                let inv_std = 1.0 / std_dev;
                let standardized = self
                    .simd_ops
                    .elementwise_op(&column, |x| (x - mean) * inv_std)?;
                result.column_mut(col).assign(&standardized);
            } else {
                result.column_mut(col).fill(0.0);
            }
        }

        Ok(result)
    }
886
887 pub fn min_max_scale(
889 &self,
890 data: &ArrayView2<Float>,
891 feature_range: (Float, Float),
892 ) -> SklResult<Array2<Float>> {
893 let (min_val, max_val) = feature_range;
894 let scale = max_val - min_val;
895
896 let (n_rows, n_cols) = data.dim();
897 let mut result = Array2::zeros((n_rows, n_cols));
898
899 for col in 0..n_cols {
900 let column = data.column(col);
901 let col_min = column.iter().copied().fold(Float::INFINITY, Float::min);
902 let col_max = column.iter().copied().fold(Float::NEG_INFINITY, Float::max);
903
904 if col_max > col_min {
905 let col_range = col_max - col_min;
906 let scaled = self
907 .simd_ops
908 .elementwise_op(&column, |x| min_val + scale * (x - col_min) / col_range)?;
909 result.column_mut(col).assign(&scaled);
910 } else {
911 result.column_mut(col).fill(min_val);
912 }
913 }
914
915 Ok(result)
916 }
917
    /// Appends polynomial feature columns to `data` (the original degree-1
    /// features come first).
    ///
    /// NOTE(review): this pipeline is incomplete — `generate_combinations`
    /// only emits per-feature squares for degree 2 and nothing for degrees
    /// >= 3, while `calculate_poly_features` over-counts, so the trailing
    /// output columns remain zero. Confirm intended behavior before relying
    /// on the output width.
    ///
    /// # Errors
    /// Returns `SklearsError::InvalidInput` when `degree < 1`.
    pub fn polynomial_features(
        &self,
        data: &ArrayView2<Float>,
        degree: usize,
    ) -> SklResult<Array2<Float>> {
        if degree < 1 {
            return Err(SklearsError::InvalidInput(
                "Degree must be at least 1".to_string(),
            ));
        }

        let (n_rows, n_cols) = data.dim();

        let n_output_features = self.calculate_poly_features(n_cols, degree);
        let mut result = Array2::zeros((n_rows, n_output_features));

        // Leading columns: the unmodified input features (degree 1).
        result.slice_mut(s![.., ..n_cols]).assign(data);
        let mut feature_idx = n_cols;

        for deg in 2..=degree {
            feature_idx +=
                self.generate_combinations(data, &mut result.view_mut(), deg, feature_idx)?;
        }

        Ok(result)
    }
948
    /// Estimates the number of output columns for `polynomial_features`.
    ///
    /// NOTE(review): `n^d / d` is not the count of degree-d monomials (that
    /// would be C(n + d - 1, d)), nor the number of columns
    /// `generate_combinations` actually fills (n squares for d == 2). The
    /// formula only over-allocates; unfilled columns stay zero.
    fn calculate_poly_features(&self, n_features: usize, degree: usize) -> usize {
        // Degree-1 terms: the original features themselves.
        let mut total = n_features;
        for d in 2..=degree {
            total += (n_features as f64).powi(d as i32) as usize / d; }
        total
    }
958
    /// Writes higher-degree feature columns into `result` starting at
    /// `start_idx`, returning how many columns were produced.
    ///
    /// NOTE(review): only `degree == 2` is implemented, and then only the
    /// per-feature squares (x_i^2) — cross terms (x_i * x_j) and degrees
    /// >= 3 are silently skipped, reporting 0 new columns.
    fn generate_combinations(
        &self,
        data: &ArrayView2<Float>,
        result: &mut ArrayViewMut2<Float>,
        degree: usize,
        start_idx: usize,
    ) -> SklResult<usize> {
        // `n_rows` is currently unused; only the column count drives the loop.
        let (n_rows, n_cols) = data.dim();
        let mut feature_idx = start_idx;

        if degree == 2 {
            for col in 0..n_cols {
                let column = data.column(col);
                let squared = self.simd_ops.elementwise_op(&column, |x| x * x)?;
                result.column_mut(feature_idx).assign(&squared);
                feature_idx += 1;
            }
        }

        Ok(feature_idx - start_idx)
    }
982}
983
/// Helpers for arranging matrices in SIMD-friendly memory layouts
/// (transposition, width padding, row chunking).
pub struct SimdDataLayout {
    // Supplies the vector width and chunking threshold.
    config: SimdConfig,
}
988
989impl SimdDataLayout {
    /// Creates a layout helper with the given configuration.
    #[must_use]
    pub fn new(config: SimdConfig) -> Self {
        Self { config }
    }
995
    /// Returns an owned transposed copy of `data`, so former column-wise
    /// access becomes contiguous row-wise access.
    #[must_use]
    pub fn transpose_for_simd(&self, data: &ArrayView2<Float>) -> Array2<Float> {
        data.t().to_owned()
    }
1001
1002 #[must_use]
1004 pub fn optimize_layout(&self, data: &ArrayView2<Float>) -> Array2<Float> {
1005 let (rows, cols) = data.dim();
1006
1007 let padded_cols = if cols % self.config.vector_width != 0 {
1009 ((cols / self.config.vector_width) + 1) * self.config.vector_width
1010 } else {
1011 cols
1012 };
1013
1014 if padded_cols > cols {
1015 let mut padded = Array2::zeros((rows, padded_cols));
1016 padded.slice_mut(s![.., ..cols]).assign(data);
1017 padded
1018 } else {
1019 data.to_owned()
1020 }
1021 }
1022
1023 #[must_use]
1025 pub fn create_chunks(
1026 &self,
1027 data: &ArrayView2<Float>,
1028 chunk_size: Option<usize>,
1029 ) -> Vec<Array2<Float>> {
1030 let (rows, _cols) = data.dim();
1031 let chunk_sz = chunk_size.unwrap_or(self.config.simd_threshold);
1032
1033 let mut chunks = Vec::new();
1034 for start in (0..rows).step_by(chunk_sz) {
1035 let end = std::cmp::min(start + chunk_sz, rows);
1036 let chunk = data.slice(s![start..end, ..]).to_owned();
1037 chunks.push(chunk);
1038 }
1039
1040 chunks
1041 }
1042}
1043
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_simd_config() {
        // Feature detection must always yield usable, non-degenerate values.
        let config = SimdConfig::default();
        assert!(config.vector_width >= 4);
        assert!(config.alignment >= 16);
        assert!(config.simd_threshold > 0);
    }

    #[test]
    fn test_simd_add_arrays() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![5.0, 6.0, 7.0, 8.0];

        let result = simd_ops.add_arrays(&a.view(), &b.view()).unwrap();
        let expected = array![6.0, 8.0, 10.0, 12.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_dot_product() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![2.0, 3.0, 4.0, 5.0];

        let result = simd_ops.dot_product(&a.view(), &b.view()).unwrap();
        let expected = 1.0 * 2.0 + 2.0 * 3.0 + 3.0 * 4.0 + 4.0 * 5.0;
        assert!((result - expected).abs() < 1e-6);
    }

    #[test]
    fn test_matrix_multiply() {
        let simd_ops = SimdOps::default();
        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        let result = simd_ops.matrix_multiply(&a.view(), &b.view()).unwrap();
        let expected = array![[19.0, 22.0], [43.0, 50.0]];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_scale() {
        let simd_ops = SimdOps::default();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let scale_factor = 2.0;

        let result = simd_ops.scale(&a.view(), scale_factor).unwrap();
        let expected = array![2.0, 4.0, 6.0, 8.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_normalize_l2() {
        let simd_ops = SimdOps::default();
        let a = array![3.0, 4.0, 0.0];

        let result = simd_ops.normalize_l2(&a.view()).unwrap();
        // ||a||_2 = sqrt(3^2 + 4^2) = 5.  (A dead, f32-typed `norm` local was
        // removed here.)
        let expected = array![3.0 / 5.0, 4.0 / 5.0, 0.0 / 5.0];

        assert!((result - expected).mapv(|x| x.abs()).sum() < 1e-6);
    }

    #[test]
    fn test_feature_standardize() {
        let config = SimdConfig::default();
        let feature_ops = SimdFeatureOps::new(config);

        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let result = feature_ops.standardize(&data.view()).unwrap();

        // Every column should come out with zero mean and unit (population)
        // variance.
        for col in 0..result.ncols() {
            let column = result.column(col);
            let mean = column.sum() / column.len() as Float;
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<Float>() / column.len() as Float;

            assert!(mean.abs() < 1e-10, "Mean should be approximately zero");
            assert!(
                (variance - 1.0).abs() < 1e-6,
                "Variance should be approximately one"
            );
        }
    }

    #[test]
    fn test_min_max_scale() {
        let config = SimdConfig::default();
        let feature_ops = SimdFeatureOps::new(config);

        let data = array![[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]];
        let result = feature_ops.min_max_scale(&data.view(), (0.0, 1.0)).unwrap();

        for &val in result.iter() {
            assert!(
                (0.0..=1.0).contains(&val),
                "Values should be in [0, 1] range"
            );
        }

        // Column extremes must map exactly onto the requested range bounds.
        assert!((result[[0, 0]] - 0.0).abs() < 1e-6);
        assert!((result[[2, 0]] - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_optimal_chunk_size() {
        let simd_ops = SimdOps::default();
        let chunk_size = simd_ops.optimal_chunk_size(1000);

        assert!(chunk_size > 0);
        // Chunks must stay a whole multiple of the vector width.
        assert!(chunk_size % simd_ops.config.vector_width == 0);
    }

    #[test]
    fn test_data_layout_optimization() {
        let config = SimdConfig::default();
        let layout = SimdDataLayout::new(config);

        let data = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let optimized = layout.optimize_layout(&data.view());

        // Original values survive in the leading columns; any extra columns
        // are zero padding.
        assert_eq!(optimized.slice(s![.., ..3]), data);
    }
}