use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, Axis};
use rayon::prelude::*;
use sklears_core::{error::Result, prelude::SklearsError, types::Float};

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

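/// Configuration for SIMD-accelerated linear algebra kernels.
///
/// Selects which x86 instruction-set extensions may be used (gated by both
/// target architecture and crate features), the minimum vector length below
/// which the scalar path is preferred, and blocking/parallelism parameters.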
#[derive(Debug, Clone)]
pub struct SimdConfig {
    pub use_avx512: bool,
    pub use_avx2: bool,
    pub use_sse: bool,
    pub min_vector_length: usize,
    pub parallel_simd: bool,
    pub block_size: usize,
}

impl Default for SimdConfig {
    fn default() -> Self {
        Self {
            use_avx512: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && cfg!(feature = "avx512"),
            use_avx2: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && cfg!(feature = "avx2"),
            use_sse: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && cfg!(feature = "sse"),
            min_vector_length: 8,
            parallel_simd: true,
            block_size: 64,
        }
    }
}

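/// Matrix and vector operations backed by hand-written SIMD kernels, with
/// scalar fallbacks for short inputs and non-x86 targets.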
pub struct SimdMatrixOps {
    config: SimdConfig,
}

impl SimdMatrixOps {
    pub fn new() -> Self {
        Self {
            config: SimdConfig::default(),
        }
    }

    pub fn with_config(config: SimdConfig) -> Self {
        Self { config }
    }

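    /// Computes the matrix-vector product `matrix * vector` using SIMD dot
    /// products per row, switching to a Rayon-parallel path for matrices with
    /// at least 64 rows when `parallel_simd` is enabled.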
    pub fn simd_matvec(
        &self,
        matrix: &Array2<Float>,
        vector: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if matrix.ncols() != vector.len() {
            return Err(SklearsError::InvalidInput(
                "Matrix columns must match vector length".to_string(),
            ));
        }

        let nrows = matrix.nrows();
        let mut result = Array1::zeros(nrows);

        if self.config.parallel_simd && nrows >= 64 {
            self.parallel_simd_matvec(matrix, vector, &mut result)?;
        } else {
            self.sequential_simd_matvec(matrix, vector, &mut result)?;
        }

        Ok(result)
    }

    fn sequential_simd_matvec(
        &self,
        matrix: &Array2<Float>,
        vector: &Array1<Float>,
        result: &mut Array1<Float>,
    ) -> Result<()> {
        for (i, result_elem) in result.iter_mut().enumerate() {
            let row = matrix.row(i);
            *result_elem = self.simd_dot_product(&row, vector.view())?;
        }
        Ok(())
    }

    fn parallel_simd_matvec(
        &self,
        matrix: &Array2<Float>,
        vector: &Array1<Float>,
        result: &mut Array1<Float>,
    ) -> Result<()> {
        let results: Result<Vec<Float>> = (0..matrix.nrows())
            .into_par_iter()
            .map(|i| -> Result<Float> {
                let row = matrix.row(i);
                self.simd_dot_product(&row, vector.view())
            })
            .collect();

        let computed_results = results?;
        for (i, value) in computed_results.into_iter().enumerate() {
            result[i] = value;
        }
        Ok(())
    }

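    /// Computes the dot product of two equal-length vectors, dispatching to
    /// AVX-512, AVX2, or SSE kernels when enabled and the input is long
    /// enough, and to a scalar loop otherwise.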
    pub fn simd_dot_product(&self, a: &ArrayView1<Float>, b: ArrayView1<Float>) -> Result<Float> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Vectors must have same length".to_string(),
            ));
        }

        if a.len() >= self.config.min_vector_length {
            let a_vec: Vec<Float> = a.to_vec();
            let b_vec: Vec<Float> = b.to_vec();

            self.manual_simd_dot_product(&a_vec, &b_vec)
        } else {
            Ok(a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum())
        }
    }

    fn manual_simd_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        let len = a.len();
        let mut sum = 0.0;

        if self.config.use_avx512 && len >= 8 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                sum += unsafe { self.avx512_dot_product(a, b)? };
            }
            #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
            {
                sum += self.avx512_dot_product(a, b)?;
            }
        } else if self.config.use_avx2 && len >= 4 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                sum += unsafe { self.avx2_dot_product(a, b)? };
            }
            #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
            {
                sum += self.avx2_dot_product(a, b)?;
            }
        } else if self.config.use_sse && len >= 2 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                sum += unsafe { self.sse_dot_product(a, b)? };
            }
            #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
            {
                sum += self.sse_dot_product(a, b)?;
            }
        } else {
            sum = a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum();
        }

        Ok(sum)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx512f")]
    unsafe fn avx512_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        let len = a.len();
        let simd_len = len & !7;
        let mut sum = _mm512_setzero_pd();

        for i in (0..simd_len).step_by(8) {
            let a_vec = _mm512_loadu_pd(a.as_ptr().add(i));
            let b_vec = _mm512_loadu_pd(b.as_ptr().add(i));
            let prod = _mm512_mul_pd(a_vec, b_vec);
            sum = _mm512_add_pd(sum, prod);
        }

        let sum_array: [f64; 8] = std::mem::transmute(sum);
        let mut result = sum_array.iter().sum::<f64>();

        for i in simd_len..len {
            result += a[i] * b[i];
        }

        Ok(result)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        let len = a.len();
        let simd_len = len & !3;
        let mut sum = _mm256_setzero_pd();

        for i in (0..simd_len).step_by(4) {
            let a_vec = _mm256_loadu_pd(a.as_ptr().add(i));
            let b_vec = _mm256_loadu_pd(b.as_ptr().add(i));
            let prod = _mm256_mul_pd(a_vec, b_vec);
            sum = _mm256_add_pd(sum, prod);
        }

        let sum_array: [f64; 4] = std::mem::transmute(sum);
        let mut result = sum_array.iter().sum::<f64>();

        for i in simd_len..len {
            result += a[i] * b[i];
        }

        Ok(result)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse2")]
    unsafe fn sse_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        let len = a.len();
        let simd_len = len & !1;
        let mut sum = _mm_setzero_pd();

        for i in (0..simd_len).step_by(2) {
            let a_vec = _mm_loadu_pd(a.as_ptr().add(i));
            let b_vec = _mm_loadu_pd(b.as_ptr().add(i));
            let prod = _mm_mul_pd(a_vec, b_vec);
            sum = _mm_add_pd(sum, prod);
        }

        let sum_array: [f64; 2] = std::mem::transmute(sum);
        let mut result = sum_array.iter().sum::<f64>();

        for i in simd_len..len {
            result += a[i] * b[i];
        }

        Ok(result)
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    fn avx512_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        Ok(a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum())
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    fn avx2_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        Ok(a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum())
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    fn sse_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        Ok(a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum())
    }

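    /// Computes the matrix product `a * b`. Large operands (all dimensions at
    /// least `block_size`) use a cache-blocked tiled kernel; smaller operands
    /// use a row-by-column SIMD kernel.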
    pub fn simd_matmul(&self, a: &Array2<Float>, b: &Array2<Float>) -> Result<Array2<Float>> {
        if a.ncols() != b.nrows() {
            return Err(SklearsError::InvalidInput(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let (m, k) = (a.nrows(), a.ncols());
        let n = b.ncols();
        let mut result = Array2::zeros((m, n));

        if m >= self.config.block_size && n >= self.config.block_size && k >= self.config.block_size
        {
            self.tiled_simd_matmul(a, b, &mut result)?;
        } else {
            self.simple_simd_matmul(a, b, &mut result)?;
        }

        Ok(result)
    }

    fn simple_simd_matmul(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let n = b.ncols();

        if self.config.parallel_simd {
            let rows = (0..a.nrows()).collect::<Vec<_>>();
            let results: Result<Vec<Vec<Float>>> = rows
                .par_iter()
                .map(|&i| -> Result<Vec<Float>> {
                    let a_row = a.row(i);
                    let mut row_result = vec![0.0; n];
                    for j in 0..n {
                        let b_col = b.column(j);
                        row_result[j] = self.simd_dot_product(&a_row, b_col)?;
                    }
                    Ok(row_result)
                })
                .collect();

            let computed_results = results?;
            for (i, row_data) in computed_results.into_iter().enumerate() {
                for (j, value) in row_data.into_iter().enumerate() {
                    result[[i, j]] = value;
                }
            }
        } else {
            for i in 0..a.nrows() {
                let a_row = a.row(i);
                for j in 0..n {
                    let b_col = b.column(j);
                    result[[i, j]] = self.simd_dot_product(&a_row, b_col)?;
                }
            }
        }

        Ok(())
    }

    fn tiled_simd_matmul(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
    ) -> Result<()> {
        let (m, k, n) = (a.nrows(), a.ncols(), b.ncols());
        let block_size = self.config.block_size;

        for i_block in (0..m).step_by(block_size) {
            for j_block in (0..n).step_by(block_size) {
                for k_block in (0..k).step_by(block_size) {
                    let i_end = (i_block + block_size).min(m);
                    let j_end = (j_block + block_size).min(n);
                    let k_end = (k_block + block_size).min(k);

                    self.compute_block(
                        a, b, result, i_block, i_end, j_block, j_end, k_block, k_end,
                    )?;
                }
            }
        }

        Ok(())
    }

    fn compute_block(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
        result: &mut Array2<Float>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
    ) -> Result<()> {
        for i in i_start..i_end {
            for j in j_start..j_end {
                let mut sum = 0.0;

                let k_len = k_end - k_start;
                if k_len >= self.config.min_vector_length {
                    let a_slice = a.slice(s![i, k_start..k_end]);
                    let b_slice = b.slice(s![k_start..k_end, j]);
                    sum = self.simd_dot_product(&a_slice, b_slice)?;
                } else {
                    for k in k_start..k_end {
                        sum += a[[i, k]] * b[[k, j]];
                    }
                }

                result[[i, j]] += sum;
            }
        }

        Ok(())
    }

    pub fn simd_element_wise_add(
        &self,
        a: &Array1<Float>,
        b: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        Ok(a + b)
    }

    pub fn simd_element_wise_subtract(
        &self,
        a: &Array1<Float>,
        b: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        Ok(a - b)
    }

    pub fn simd_element_wise_multiply(
        &self,
        a: &Array1<Float>,
        b: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        Ok(a * b)
    }

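    /// Computes the sample covariance matrix of `data` (rows are samples,
    /// columns are features) as `centered^T * centered / (n_samples - 1)`.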
    pub fn simd_covariance(&self, data: &Array2<Float>) -> Result<Array2<Float>> {
        let n_samples = data.nrows() as Float;
        let _n_features = data.ncols();

        let mean = self.simd_column_mean(data)?;

        let mut centered_data = Array2::zeros(data.raw_dim());
        for (mut row, data_row) in centered_data
            .axis_iter_mut(Axis(0))
            .zip(data.axis_iter(Axis(0)))
        {
            row.assign(&self.simd_element_wise_subtract(&data_row.to_owned(), &mean)?);
        }

        let covariance = self.simd_matmul(&centered_data.t().to_owned(), &centered_data)?;
        Ok(covariance / (n_samples - 1.0))
    }

    fn simd_column_mean(&self, data: &Array2<Float>) -> Result<Array1<Float>> {
        let n_samples = data.nrows() as Float;
        let n_features = data.ncols();
        let mut mean = Array1::zeros(n_features);

        if self.config.parallel_simd {
            let results: Result<Vec<Float>> = (0..n_features)
                .into_par_iter()
                .map(|j| -> Result<Float> {
                    let col = data.column(j);
                    Ok(col.sum() / n_samples)
                })
                .collect();

            let computed_means = results?;
            for (j, value) in computed_means.into_iter().enumerate() {
                mean[j] = value;
            }
        } else {
            for (j, mean_elem) in mean.iter_mut().enumerate() {
                let col = data.column(j);
                *mean_elem = col.sum() / n_samples;
            }
        }

        Ok(mean)
    }

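    /// Computes the Mahalanobis distance
    /// `sqrt((x - mean)^T * inv_cov * (x - mean))` using SIMD matrix-vector
    /// and dot-product kernels.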
    pub fn simd_mahalanobis_distance(
        &self,
        x: &Array1<Float>,
        mean: &Array1<Float>,
        inv_cov: &Array2<Float>,
    ) -> Result<Float> {
        let diff = self.simd_element_wise_subtract(x, mean)?;

        let temp = self.simd_matvec(inv_cov, &diff)?;

        let distance_squared = self.simd_dot_product(&diff.view(), temp.view())?;

        Ok(distance_squared.sqrt())
    }

    pub fn check_simd_support(&self) -> SimdSupport {
        SimdSupport {
            avx512: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && self.runtime_feature_detect("avx512f"),
            avx2: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && self.runtime_feature_detect("avx2"),
            sse: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && self.runtime_feature_detect("sse2"),
            fma: cfg!(any(target_arch = "x86", target_arch = "x86_64"))
                && self.runtime_feature_detect("fma"),
        }
    }

    fn runtime_feature_detect(&self, feature: &str) -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            match feature {
                "avx512f" => is_x86_feature_detected!("avx512f"),
                "avx2" => is_x86_feature_detected!("avx2"),
                "sse2" => is_x86_feature_detected!("sse2"),
                "fma" => is_x86_feature_detected!("fma"),
                _ => false,
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            let _ = feature;
            false
        }
    }
}

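/// Runtime-detected SIMD capabilities of the current CPU.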
#[derive(Debug, Clone)]
pub struct SimdSupport {
    pub avx512: bool,
    pub avx2: bool,
    pub sse: bool,
    pub fma: bool,
}

impl Default for SimdMatrixOps {
    fn default() -> Self {
        Self::new()
    }
}

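/// Convenience trait exposing SIMD-accelerated operations directly on
/// ndarray types.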
pub trait SimdArrayOps<T> {
    fn simd_dot(&self, other: &Array1<T>) -> Result<T>;

    fn simd_dot_matrix(&self, vector: &Array1<T>) -> Result<Array1<T>>;

    fn simd_add(&self, other: &Array1<T>) -> Result<Array1<T>>;
}

impl SimdArrayOps<Float> for Array1<Float> {
    fn simd_dot(&self, other: &Array1<Float>) -> Result<Float> {
        let simd_ops = SimdMatrixOps::new();
        simd_ops.simd_dot_product(&self.view(), other.view())
    }

    fn simd_dot_matrix(&self, _vector: &Array1<Float>) -> Result<Array1<Float>> {
        Err(SklearsError::InvalidInput(
            "Not applicable for 1D array".to_string(),
        ))
    }

    fn simd_add(&self, other: &Array1<Float>) -> Result<Array1<Float>> {
        let simd_ops = SimdMatrixOps::new();
        simd_ops.simd_element_wise_add(self, other)
    }
}

impl SimdArrayOps<Float> for Array2<Float> {
    fn simd_dot(&self, _other: &Array1<Float>) -> Result<Float> {
        Err(SklearsError::InvalidInput(
            "Use simd_dot_matrix for matrix-vector operations".to_string(),
        ))
    }

    fn simd_dot_matrix(&self, vector: &Array1<Float>) -> Result<Array1<Float>> {
        let simd_ops = SimdMatrixOps::new();
        simd_ops.simd_matvec(self, vector)
    }

    fn simd_add(&self, _other: &Array1<Float>) -> Result<Array1<Float>> {
        Err(SklearsError::InvalidInput(
            "Not applicable for 2D array with 1D array".to_string(),
        ))
    }
}

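/// Higher-level SIMD routines (batched small-matrix operations, softmax,
/// distances, standardization) built on top of [`SimdMatrixOps`].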
pub struct AdvancedSimdOps {
    base_ops: SimdMatrixOps,
    config: SimdConfig,
}

impl AdvancedSimdOps {
    pub fn new() -> Self {
        Self {
            base_ops: SimdMatrixOps::new(),
            config: SimdConfig::default(),
        }
    }

    pub fn with_config(config: SimdConfig) -> Self {
        Self {
            base_ops: SimdMatrixOps::with_config(config.clone()),
            config,
        }
    }

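    /// Computes eigenvalues for a vertical stack of 2x2 matrices using the
    /// closed-form trace/determinant formula; the two eigenvalues of matrix
    /// `i` are stored at indices `2 * i` and `2 * i + 1`.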
    pub fn simd_eigenvalues_2x2(&self, matrices: &Array2<Float>) -> Result<Array1<Float>> {
        if matrices.nrows() % 2 != 0 || matrices.ncols() % 2 != 0 {
            return Err(SklearsError::InvalidInput(
                "Input must be stacks of 2x2 matrices".to_string(),
            ));
        }

        let n_matrices = matrices.nrows() / 2;
        let mut eigenvalues = Array1::zeros(n_matrices * 2);

        for i in 0..n_matrices {
            let matrix_start = i * 2;
            let a = matrices[[matrix_start, 0]];
            let b = matrices[[matrix_start, 1]];
            let c = matrices[[matrix_start + 1, 0]];
            let d = matrices[[matrix_start + 1, 1]];

            let trace = a + d;
            let det = a * d - b * c;
            let discriminant = (trace * trace - 4.0 * det).sqrt();

            eigenvalues[i * 2] = (trace + discriminant) / 2.0;
            eigenvalues[i * 2 + 1] = (trace - discriminant) / 2.0;
        }

        Ok(eigenvalues)
    }

697
698 pub fn simd_batch_inverse_2x2(&self, matrices: &Array2<Float>) -> Result<Array2<Float>> {
700 if matrices.nrows() % 2 != 0 || matrices.ncols() % 2 != 0 {
701 return Err(SklearsError::InvalidInput(
702 "Input must be stacks of 2x2 matrices".to_string(),
703 ));
704 }
705
706 let n_matrices = matrices.nrows() / 2;
707 let mut inverses = Array2::zeros(matrices.raw_dim());
708
709 for i in 0..n_matrices {
710 let matrix_start = i * 2;
711 let a = matrices[[matrix_start, 0]];
712 let b = matrices[[matrix_start, 1]];
713 let c = matrices[[matrix_start + 1, 0]];
714 let d = matrices[[matrix_start + 1, 1]];
715
716 let det = a * d - b * c;
717 if det.abs() < Float::EPSILON {
718 return Err(SklearsError::NumericalError(format!(
719 "Matrix {} is singular",
720 i
721 )));
722 }
723
724 let inv_det = 1.0 / det;
725 inverses[[matrix_start, 0]] = d * inv_det;
726 inverses[[matrix_start, 1]] = -b * inv_det;
727 inverses[[matrix_start + 1, 0]] = -c * inv_det;
728 inverses[[matrix_start + 1, 1]] = a * inv_det;
729 }
730
731 Ok(inverses)
732 }
733
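    /// Computes a numerically stable softmax by subtracting the maximum value
    /// before exponentiation.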
    pub fn simd_softmax(&self, x: &Array1<Float>) -> Result<Array1<Float>> {
        if x.is_empty() {
            return Err(SklearsError::InvalidInput(
                "Input array is empty".to_string(),
            ));
        }

        let max_val = x.iter().fold(Float::NEG_INFINITY, |acc, &val| acc.max(val));

        let shifted: Array1<Float> = x.mapv(|val| val - max_val);
        let exp_values = self.simd_exp(&shifted)?;

        let sum_exp = exp_values.sum();

        Ok(exp_values / sum_exp)
    }

    fn simd_exp(&self, x: &Array1<Float>) -> Result<Array1<Float>> {
        let len = x.len();
        let mut result = Array1::ones(len);

        if len >= self.config.min_vector_length {
            let x_vec = x.to_vec();
            let exp_vec = self.vectorized_exp(&x_vec)?;
            result.assign(&Array1::from_vec(exp_vec));
        } else {
            result.iter_mut().zip(x.iter()).for_each(|(res, &val)| {
                *res = val.exp();
            });
        }

        Ok(result)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn vectorized_exp(&self, x: &[Float]) -> Result<Vec<Float>> {
        if self.config.use_avx2 && x.len() >= 4 {
            unsafe { self.avx2_exp(x) }
        } else if self.config.use_sse && x.len() >= 2 {
            unsafe { self.sse_exp(x) }
        } else {
            Ok(x.iter().map(|&val| val.exp()).collect())
        }
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    fn vectorized_exp(&self, x: &[Float]) -> Result<Vec<Float>> {
        Ok(x.iter().map(|&val| val.exp()).collect())
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_exp(&self, x: &[Float]) -> Result<Vec<Float>> {
        let len = x.len();
        let simd_len = len & !3;
        let mut result = vec![0.0; len];

        for i in (0..simd_len).step_by(4) {
            let _x_vec = _mm256_loadu_pd(x.as_ptr().add(i));

            let exp_vec = _mm256_set_pd(x[i + 3].exp(), x[i + 2].exp(), x[i + 1].exp(), x[i].exp());

            _mm256_storeu_pd(result.as_mut_ptr().add(i), exp_vec);
        }

        for i in simd_len..len {
            result[i] = x[i].exp();
        }

        Ok(result)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse2")]
    unsafe fn sse_exp(&self, x: &[Float]) -> Result<Vec<Float>> {
        let len = x.len();
        let simd_len = len & !1;
        let mut result = vec![0.0; len];

        for i in (0..simd_len).step_by(2) {
            let exp_vec = _mm_set_pd(x[i + 1].exp(), x[i].exp());
            _mm_storeu_pd(result.as_mut_ptr().add(i), exp_vec);
        }

        for i in simd_len..len {
            result[i] = x[i].exp();
        }

        Ok(result)
    }

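    /// Computes log-determinants for a vertical stack of 2x2 or 3x3 matrices,
    /// returning an error for non-positive determinants or unsupported sizes.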
    pub fn simd_log_determinant_batch(
        &self,
        matrices: &Array2<Float>,
        size: usize,
    ) -> Result<Array1<Float>> {
        let n_matrices = matrices.nrows() / size;
        let mut log_dets = Array1::zeros(n_matrices);

        match size {
            2 => {
                for i in 0..n_matrices {
                    let offset = i * 2;
                    let a = matrices[[offset, 0]];
                    let b = matrices[[offset, 1]];
                    let c = matrices[[offset + 1, 0]];
                    let d = matrices[[offset + 1, 1]];

                    let det = a * d - b * c;
                    if det <= 0.0 {
                        return Err(SklearsError::NumericalError(format!(
                            "Matrix {} has non-positive determinant",
                            i
                        )));
                    }
                    log_dets[i] = det.ln();
                }
            }
            3 => {
                for i in 0..n_matrices {
                    let offset = i * 3;
                    let a = matrices[[offset, 0]];
                    let b = matrices[[offset, 1]];
                    let c = matrices[[offset, 2]];
                    let d = matrices[[offset + 1, 0]];
                    let e = matrices[[offset + 1, 1]];
                    let f = matrices[[offset + 1, 2]];
                    let g = matrices[[offset + 2, 0]];
                    let h = matrices[[offset + 2, 1]];
                    let i_elem = matrices[[offset + 2, 2]];

                    let det =
                        a * (e * i_elem - f * h) - b * (d * i_elem - f * g) + c * (d * h - e * g);
                    if det <= 0.0 {
                        return Err(SklearsError::NumericalError(format!(
                            "Matrix {} has non-positive determinant",
                            i
                        )));
                    }
                    log_dets[i] = det.ln();
                }
            }
            _ => {
                return Err(SklearsError::InvalidInput(
                    "Only 2x2 and 3x3 matrices supported in batch mode".to_string(),
                ));
            }
        }

        Ok(log_dets)
    }

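    /// Evaluates the quadratic form `x^T * matrix * x` for every row `x` of
    /// `vectors`, in parallel for large batches.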
    pub fn simd_batch_quadratic_form(
        &self,
        vectors: &Array2<Float>,
        matrix: &Array2<Float>,
    ) -> Result<Array1<Float>> {
        let n_vectors = vectors.nrows();
        let mut results = Array1::zeros(n_vectors);

        if self.config.parallel_simd && n_vectors >= 64 {
            let parallel_results: Result<Vec<Float>> = (0..n_vectors)
                .into_par_iter()
                .map(|i| -> Result<Float> {
                    let x = vectors.row(i);
                    let temp = self.base_ops.simd_matvec(matrix, &x.to_owned())?;
                    self.base_ops.simd_dot_product(&x, temp.view())
                })
                .collect();

            let computed_results = parallel_results?;
            for (i, value) in computed_results.into_iter().enumerate() {
                results[i] = value;
            }
        } else {
            for i in 0..n_vectors {
                let x = vectors.row(i);
                let temp = self.base_ops.simd_matvec(matrix, &x.to_owned())?;
                results[i] = self.base_ops.simd_dot_product(&x, temp.view())?;
            }
        }

        Ok(results)
    }

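    /// Computes the Euclidean distance between every row of `x` and every row
    /// of `y`, returning an `n_x` by `n_y` distance matrix.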
    pub fn simd_pairwise_distances(
        &self,
        x: &Array2<Float>,
        y: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (n_x, dim_x) = x.dim();
        let (n_y, dim_y) = y.dim();

        if dim_x != dim_y {
            return Err(SklearsError::InvalidInput(
                "X and Y must have same number of features".to_string(),
            ));
        }

        let mut distances = Array2::zeros((n_x, n_y));

        if self.config.parallel_simd {
            let results: Result<Vec<Vec<Float>>> = (0..n_x)
                .into_par_iter()
                .map(|i| -> Result<Vec<Float>> {
                    let x_row = x.row(i);
                    let mut row_distances = Vec::with_capacity(n_y);

                    for j in 0..n_y {
                        let y_row = y.row(j);
                        let diff = self
                            .base_ops
                            .simd_element_wise_subtract(&x_row.to_owned(), &y_row.to_owned())?;
                        let dist_sq = self.base_ops.simd_dot_product(&diff.view(), diff.view())?;
                        row_distances.push(dist_sq.sqrt());
                    }
                    Ok(row_distances)
                })
                .collect();

            let computed_results = results?;
            for (i, row_data) in computed_results.into_iter().enumerate() {
                for (j, dist) in row_data.into_iter().enumerate() {
                    distances[[i, j]] = dist;
                }
            }
        } else {
            for i in 0..n_x {
                for j in 0..n_y {
                    let x_row = x.row(i);
                    let y_row = y.row(j);
                    let diff = self
                        .base_ops
                        .simd_element_wise_subtract(&x_row.to_owned(), &y_row.to_owned())?;
                    let dist_sq = self.base_ops.simd_dot_product(&diff.view(), diff.view())?;
                    distances[[i, j]] = dist_sq.sqrt();
                }
            }
        }

        Ok(distances)
    }

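    /// Computes the cross-covariance matrix between the columns of `x` and
    /// the columns of `y` as `centered_x^T * centered_y / (n_samples - 1)`.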
    pub fn simd_cross_covariance(
        &self,
        x: &Array2<Float>,
        y: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        if x.nrows() != y.nrows() {
            return Err(SklearsError::InvalidInput(
                "X and Y must have same number of samples".to_string(),
            ));
        }

        let n_samples = x.nrows() as Float;

        let mean_x = self.base_ops.simd_column_mean(x)?;
        let mean_y = self.base_ops.simd_column_mean(y)?;

        let mut centered_x = Array2::zeros(x.raw_dim());
        let mut centered_y = Array2::zeros(y.raw_dim());

        for (i, (mut cx_row, mut cy_row)) in centered_x
            .axis_iter_mut(Axis(0))
            .zip(centered_y.axis_iter_mut(Axis(0)))
            .enumerate()
        {
            let x_row = x.row(i);
            let y_row = y.row(i);
            cx_row.assign(
                &self
                    .base_ops
                    .simd_element_wise_subtract(&x_row.to_owned(), &mean_x)?,
            );
            cy_row.assign(
                &self
                    .base_ops
                    .simd_element_wise_subtract(&y_row.to_owned(), &mean_y)?,
            );
        }

        let cross_cov = self
            .base_ops
            .simd_matmul(&centered_x.t().to_owned(), &centered_y)?;
        Ok(cross_cov / (n_samples - 1.0))
    }

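    /// Standardizes each column of `data` to zero mean and unit variance
    /// (using the sample standard deviation); columns with near-zero standard
    /// deviation map to zero.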
    pub fn simd_standardize(&self, data: &Array2<Float>) -> Result<Array2<Float>> {
        let mean = self.base_ops.simd_column_mean(data)?;

        let mut std_dev = Array1::zeros(data.ncols());
        for (j, std_elem) in std_dev.iter_mut().enumerate() {
            let col = data.column(j);
            let mean_j = mean[j];
            let variance = col.iter().map(|&x| (x - mean_j).powi(2)).sum::<Float>()
                / (data.nrows() - 1) as Float;
            *std_elem = variance.sqrt();
        }

        let mut standardized = Array2::zeros(data.raw_dim());
        for (i, mut std_row) in standardized.axis_iter_mut(Axis(0)).enumerate() {
            let data_row = data.row(i);
            let centered = self
                .base_ops
                .simd_element_wise_subtract(&data_row.to_owned(), &mean)?;
            let scaled = self.element_wise_divide(&centered, &std_dev)?;
            std_row.assign(&scaled);
        }

        Ok(standardized)
    }

    fn element_wise_divide(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Arrays must have same length".to_string(),
            ));
        }

        Ok(a.iter()
            .zip(b.iter())
            .map(|(&a_val, &b_val)| {
                if b_val.abs() < Float::EPSILON {
                    0.0
                } else {
                    a_val / b_val
                }
            })
            .collect::<Array1<Float>>())
    }

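    /// Computes `log(sum(exp(x)))` with the max-shift trick for numerical
    /// stability.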
    pub fn simd_log_sum_exp(&self, x: &Array1<Float>) -> Result<Float> {
        if x.is_empty() {
            return Err(SklearsError::InvalidInput(
                "Input array is empty".to_string(),
            ));
        }

        let max_val = x.iter().fold(Float::NEG_INFINITY, |acc, &val| acc.max(val));

        let shifted = x.mapv(|val| val - max_val);
        let exp_values = self.simd_exp(&shifted)?;
        let sum_exp = exp_values.sum();

        Ok(sum_exp.ln() + max_val)
    }

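    /// Times a single SIMD dot product against a scalar dot product of the
    /// same length and reports the raw timings and resulting speedup factor.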
    pub fn benchmark_simd_vs_scalar(&self, size: usize) -> SimdBenchmarkResults {
        let a = Array1::from_vec((0..size).map(|i| i as Float).collect());
        let b = Array1::from_vec((0..size).map(|i| (i + 1) as Float).collect());

        let start = std::time::Instant::now();
        let _simd_result = self.base_ops.simd_dot_product(&a.view(), b.view());
        let simd_time = start.elapsed();

        let start = std::time::Instant::now();
        let _scalar_result: Float = a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum();
        let scalar_time = start.elapsed();

        SimdBenchmarkResults {
            size,
            simd_time_ns: simd_time.as_nanos() as u64,
            scalar_time_ns: scalar_time.as_nanos() as u64,
            speedup: scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64,
        }
    }
}

impl Default for AdvancedSimdOps {
    fn default() -> Self {
        Self::new()
    }
}

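/// Timing results from [`AdvancedSimdOps::benchmark_simd_vs_scalar`].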
#[derive(Debug, Clone)]
pub struct SimdBenchmarkResults {
    pub size: usize,
    pub simd_time_ns: u64,
    pub scalar_time_ns: u64,
    pub speedup: f64,
}

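/// SIMD operations for AArch64 targets. The current dot-product implementation
/// is a scalar fallback; NEON support is reported as always available.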
#[cfg(target_arch = "aarch64")]
pub struct NeonSimdOps {
    config: SimdConfig,
}

#[cfg(target_arch = "aarch64")]
impl Default for NeonSimdOps {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(target_arch = "aarch64")]
impl NeonSimdOps {
    pub fn new() -> Self {
        Self {
            config: SimdConfig::default(),
        }
    }

    pub fn neon_dot_product(&self, a: &[Float], b: &[Float]) -> Result<Float> {
        Ok(a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum())
    }

    pub fn check_neon_support(&self) -> bool {
        true
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_simd_dot_product() {
        let simd_ops = SimdMatrixOps::new();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![5.0, 6.0, 7.0, 8.0];

        let result = simd_ops.simd_dot_product(&a.view(), b.view()).unwrap();
        let expected = 1.0 * 5.0 + 2.0 * 6.0 + 3.0 * 7.0 + 4.0 * 8.0;
        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    #[test]
    fn test_simd_matvec() {
        let simd_ops = SimdMatrixOps::new();
        let matrix = array![[1.0, 2.0], [3.0, 4.0]];
        let vector = array![5.0, 6.0];

        let result = simd_ops.simd_matvec(&matrix, &vector).unwrap();
        let expected = array![1.0 * 5.0 + 2.0 * 6.0, 3.0 * 5.0 + 4.0 * 6.0];
        assert_abs_diff_eq!(result[0], expected[0], epsilon = 1e-10);
        assert_abs_diff_eq!(result[1], expected[1], epsilon = 1e-10);
    }

    #[test]
    fn test_simd_matmul() {
        let simd_ops = SimdMatrixOps::new();
        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[5.0, 6.0], [7.0, 8.0]];

        let result = simd_ops.simd_matmul(&a, &b).unwrap();
        assert_abs_diff_eq!(result[[0, 0]], 19.0, epsilon = 1e-10);
        assert_abs_diff_eq!(result[[0, 1]], 22.0, epsilon = 1e-10);
        assert_abs_diff_eq!(result[[1, 0]], 43.0, epsilon = 1e-10);
        assert_abs_diff_eq!(result[[1, 1]], 50.0, epsilon = 1e-10);
    }

    #[test]
    fn test_simd_element_wise_ops() {
        let simd_ops = SimdMatrixOps::new();
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![5.0, 6.0, 7.0, 8.0];

        let add_result = simd_ops.simd_element_wise_add(&a, &b).unwrap();
        let expected_add = array![6.0, 8.0, 10.0, 12.0];

        for (r, e) in add_result.iter().zip(expected_add.iter()) {
            assert_abs_diff_eq!(*r, *e, epsilon = 1e-10);
        }

        let sub_result = simd_ops.simd_element_wise_subtract(&a, &b).unwrap();
        let expected_sub = array![-4.0, -4.0, -4.0, -4.0];

        for (r, e) in sub_result.iter().zip(expected_sub.iter()) {
            assert_abs_diff_eq!(*r, *e, epsilon = 1e-10);
        }
    }

    #[test]
    fn test_simd_covariance() {
        let simd_ops = SimdMatrixOps::new();
        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let cov = simd_ops.simd_covariance(&data).unwrap();

        assert!(cov.nrows() == 2 && cov.ncols() == 2);
        assert!(cov[[0, 0]] > 0.0);
        assert!(cov[[1, 1]] > 0.0);
        assert_abs_diff_eq!(cov[[0, 1]], cov[[1, 0]], epsilon = 1e-10);
    }

    #[test]
    fn test_simd_support_detection() {
        let simd_ops = SimdMatrixOps::new();
        let support = simd_ops.check_simd_support();

        println!("SIMD Support: {:?}", support);
    }

    #[test]
    fn test_simd_array_ops_trait() {
        let a = array![1.0, 2.0, 3.0, 4.0];
        let b = array![5.0, 6.0, 7.0, 8.0];

        let dot_result = a.simd_dot(&b).unwrap();
        let expected = 70.0;
        assert_abs_diff_eq!(dot_result, expected, epsilon = 1e-10);

        let add_result = a.simd_add(&b).unwrap();
        let expected_add = array![6.0, 8.0, 10.0, 12.0];

        for (r, e) in add_result.iter().zip(expected_add.iter()) {
            assert_abs_diff_eq!(*r, *e, epsilon = 1e-10);
        }
    }
}