//! SIMD-accelerated `f64` kernels with runtime feature detection
//! (AVX2/SSE2 on x86/x86_64, NEON on aarch64) and scalar fallbacks.

use scirs2_core::ndarray::{Array1, Array2, Axis};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

/// Configuration for SIMD dispatch.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SimdConfig {
    /// Master switch; when `false`, every operation takes the scalar path.
    pub enabled: bool,
    /// Minimum slice length before the SIMD path is used.
    pub min_size_threshold: usize,
    /// Optionally force a specific vector width (currently unused by the kernels).
    pub force_width: Option<usize>,
    /// Process rows in parallel for large arrays (requires the `parallel` feature).
    pub use_parallel: bool,
}

impl Default for SimdConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            min_size_threshold: 32,
            force_width: None,
            use_parallel: true,
        }
    }
}

/// Adds `scalar` to each element of `data` in place, dispatching to the
/// widest SIMD instruction set detected at runtime.
pub fn add_scalar_f64_simd(data: &mut [f64], scalar: f64, config: &SimdConfig) {
    if !config.enabled || data.len() < config.min_size_threshold {
        add_scalar_f64_scalar(data, scalar);
        return;
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { add_scalar_f64_avx2(data, scalar) };
            return;
        } else if is_x86_feature_detected!("sse2") {
            unsafe { add_scalar_f64_sse2(data, scalar) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    unsafe {
        add_scalar_f64_neon(data, scalar)
    };

    #[cfg(not(target_arch = "aarch64"))]
    add_scalar_f64_scalar(data, scalar);
}

/// Subtracts `scalar` from each element of `data` in place by adding the
/// negated scalar.
pub fn sub_scalar_f64_simd(data: &mut [f64], scalar: f64, config: &SimdConfig) {
    add_scalar_f64_simd(data, -scalar, config);
}

/// Multiplies each element of `data` by `scalar` in place, dispatching to
/// the widest SIMD instruction set detected at runtime.
pub fn mul_scalar_f64_simd(data: &mut [f64], scalar: f64, config: &SimdConfig) {
    if !config.enabled || data.len() < config.min_size_threshold {
        mul_scalar_f64_scalar(data, scalar);
        return;
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { mul_scalar_f64_avx2(data, scalar) };
            return;
        } else if is_x86_feature_detected!("sse2") {
            unsafe { mul_scalar_f64_sse2(data, scalar) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    unsafe {
        mul_scalar_f64_neon(data, scalar)
    };

    #[cfg(not(target_arch = "aarch64"))]
    mul_scalar_f64_scalar(data, scalar);
}

/// Divides each element of `data` by `scalar` in place via multiplication by
/// the reciprocal. A zero divisor leaves the data unchanged rather than
/// producing infinities.
pub fn div_scalar_f64_simd(data: &mut [f64], scalar: f64, config: &SimdConfig) {
    if scalar != 0.0 {
        mul_scalar_f64_simd(data, 1.0 / scalar, config);
    }
}

/// Computes `result = a + b` elementwise. Panics if the slice lengths differ.
pub fn add_vectors_f64_simd(a: &[f64], b: &[f64], result: &mut [f64], config: &SimdConfig) {
    assert_eq!(a.len(), b.len());
    assert_eq!(a.len(), result.len());

    if !config.enabled || a.len() < config.min_size_threshold {
        add_vectors_f64_scalar(a, b, result);
        return;
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { add_vectors_f64_avx2(a, b, result) };
            return;
        } else if is_x86_feature_detected!("sse2") {
            unsafe { add_vectors_f64_sse2(a, b, result) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    unsafe {
        add_vectors_f64_neon(a, b, result)
    };

    #[cfg(not(target_arch = "aarch64"))]
    add_vectors_f64_scalar(a, b, result);
}

/// Computes `result = a - b` elementwise. Panics if the slice lengths differ.
pub fn sub_vectors_f64_simd(a: &[f64], b: &[f64], result: &mut [f64], config: &SimdConfig) {
    assert_eq!(a.len(), b.len());
    assert_eq!(a.len(), result.len());

    if !config.enabled || a.len() < config.min_size_threshold {
        sub_vectors_f64_scalar(a, b, result);
        return;
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { sub_vectors_f64_avx2(a, b, result) };
            return;
        } else if is_x86_feature_detected!("sse2") {
            unsafe { sub_vectors_f64_sse2(a, b, result) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    unsafe {
        sub_vectors_f64_neon(a, b, result)
    };

    #[cfg(not(target_arch = "aarch64"))]
    sub_vectors_f64_scalar(a, b, result);
}

/// Computes the arithmetic mean of `data`, returning `0.0` for an empty slice.
pub fn mean_f64_simd(data: &[f64], config: &SimdConfig) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    if !config.enabled || data.len() < config.min_size_threshold {
        return mean_f64_scalar(data);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { mean_f64_avx2(data) };
        } else if is_x86_feature_detected!("sse2") {
            return unsafe { mean_f64_sse2(data) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    return unsafe { mean_f64_neon(data) };

    #[cfg(not(target_arch = "aarch64"))]
    mean_f64_scalar(data)
}

/// Computes the sample variance of `data` about a precomputed `mean`
/// (denominator `n - 1`), returning `0.0` for fewer than two elements.
pub fn variance_f64_simd(data: &[f64], mean: f64, config: &SimdConfig) -> f64 {
    if data.len() <= 1 {
        return 0.0;
    }

    if !config.enabled || data.len() < config.min_size_threshold {
        return variance_f64_scalar(data, mean);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { variance_f64_avx2(data, mean) };
        } else if is_x86_feature_detected!("sse2") {
            return unsafe { variance_f64_sse2(data, mean) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    return unsafe { variance_f64_neon(data, mean) };

    #[cfg(not(target_arch = "aarch64"))]
    variance_f64_scalar(data, mean)
}

/// Returns `(min, max)` of `data`, or `(NaN, NaN)` for an empty slice.
pub fn min_max_f64_simd(data: &[f64], config: &SimdConfig) -> (f64, f64) {
    if data.is_empty() {
        return (f64::NAN, f64::NAN);
    }

    if !config.enabled || data.len() < config.min_size_threshold {
        return min_max_f64_scalar(data);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { min_max_f64_avx2(data) };
        } else if is_x86_feature_detected!("sse2") {
            return unsafe { min_max_f64_sse2(data) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    return unsafe { min_max_f64_neon(data) };

    #[cfg(not(target_arch = "aarch64"))]
    min_max_f64_scalar(data)
}

// ---- Scalar fallback implementations ----

fn add_scalar_f64_scalar(data: &mut [f64], scalar: f64) {
    for x in data.iter_mut() {
        *x += scalar;
    }
}

fn mul_scalar_f64_scalar(data: &mut [f64], scalar: f64) {
    for x in data.iter_mut() {
        *x *= scalar;
    }
}

fn add_vectors_f64_scalar(a: &[f64], b: &[f64], result: &mut [f64]) {
    for ((x, y), r) in a.iter().zip(b.iter()).zip(result.iter_mut()) {
        *r = x + y;
    }
}

fn sub_vectors_f64_scalar(a: &[f64], b: &[f64], result: &mut [f64]) {
    for ((x, y), r) in a.iter().zip(b.iter()).zip(result.iter_mut()) {
        *r = x - y;
    }
}

fn mean_f64_scalar(data: &[f64]) -> f64 {
    data.iter().sum::<f64>() / data.len() as f64
}

fn variance_f64_scalar(data: &[f64], mean: f64) -> f64 {
    data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (data.len() - 1) as f64
}

fn min_max_f64_scalar(data: &[f64]) -> (f64, f64) {
    let mut min = data[0];
    let mut max = data[0];

    for &x in &data[1..] {
        if x < min {
            min = x;
        }
        if x > max {
            max = x;
        }
    }

    (min, max)
}

// ---- SSE2 implementations (x86/x86_64) ----

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn add_scalar_f64_sse2(data: &mut [f64], scalar: f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let scalar_vec = _mm_set1_pd(scalar);
    let mut i = 0;

    // Two f64 lanes per 128-bit register.
    while i + 2 <= data.len() {
        let data_vec = _mm_loadu_pd(data.as_ptr().add(i));
        let result = _mm_add_pd(data_vec, scalar_vec);
        _mm_storeu_pd(data.as_mut_ptr().add(i), result);
        i += 2;
    }

    // Scalar remainder.
    while i < data.len() {
        data[i] += scalar;
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn mul_scalar_f64_sse2(data: &mut [f64], scalar: f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let scalar_vec = _mm_set1_pd(scalar);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = _mm_loadu_pd(data.as_ptr().add(i));
        let result = _mm_mul_pd(data_vec, scalar_vec);
        _mm_storeu_pd(data.as_mut_ptr().add(i), result);
        i += 2;
    }

    while i < data.len() {
        data[i] *= scalar;
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn add_vectors_f64_sse2(a: &[f64], b: &[f64], result: &mut [f64]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut i = 0;

    while i + 2 <= a.len() {
        let a_vec = _mm_loadu_pd(a.as_ptr().add(i));
        let b_vec = _mm_loadu_pd(b.as_ptr().add(i));
        let result_vec = _mm_add_pd(a_vec, b_vec);
        _mm_storeu_pd(result.as_mut_ptr().add(i), result_vec);
        i += 2;
    }

    while i < a.len() {
        result[i] = a[i] + b[i];
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn sub_vectors_f64_sse2(a: &[f64], b: &[f64], result: &mut [f64]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut i = 0;

    while i + 2 <= a.len() {
        let a_vec = _mm_loadu_pd(a.as_ptr().add(i));
        let b_vec = _mm_loadu_pd(b.as_ptr().add(i));
        let result_vec = _mm_sub_pd(a_vec, b_vec);
        _mm_storeu_pd(result.as_mut_ptr().add(i), result_vec);
        i += 2;
    }

    while i < a.len() {
        result[i] = a[i] - b[i];
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn mean_f64_sse2(data: &[f64]) -> f64 {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut sum = _mm_setzero_pd();
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = _mm_loadu_pd(data.as_ptr().add(i));
        sum = _mm_add_pd(sum, data_vec);
        i += 2;
    }

    // Horizontal sum of the two lanes, then the scalar remainder.
    let mut result = [0.0f64; 2];
    _mm_storeu_pd(result.as_mut_ptr(), sum);
    let mut scalar_sum = result[0] + result[1];

    while i < data.len() {
        scalar_sum += data[i];
        i += 1;
    }

    scalar_sum / data.len() as f64
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn variance_f64_sse2(data: &[f64], mean: f64) -> f64 {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mean_vec = _mm_set1_pd(mean);
    let mut sum = _mm_setzero_pd();
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = _mm_loadu_pd(data.as_ptr().add(i));
        let diff = _mm_sub_pd(data_vec, mean_vec);
        let squared = _mm_mul_pd(diff, diff);
        sum = _mm_add_pd(sum, squared);
        i += 2;
    }

    let mut result = [0.0f64; 2];
    _mm_storeu_pd(result.as_mut_ptr(), sum);
    let mut scalar_sum = result[0] + result[1];

    while i < data.len() {
        let diff = data[i] - mean;
        scalar_sum += diff * diff;
        i += 1;
    }

    scalar_sum / (data.len() - 1) as f64
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn min_max_f64_sse2(data: &[f64]) -> (f64, f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut min_vec = _mm_set1_pd(data[0]);
    let mut max_vec = _mm_set1_pd(data[0]);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = _mm_loadu_pd(data.as_ptr().add(i));
        min_vec = _mm_min_pd(min_vec, data_vec);
        max_vec = _mm_max_pd(max_vec, data_vec);
        i += 2;
    }

    let mut min_result = [0.0f64; 2];
    let mut max_result = [0.0f64; 2];
    _mm_storeu_pd(min_result.as_mut_ptr(), min_vec);
    _mm_storeu_pd(max_result.as_mut_ptr(), max_vec);

    let mut min_val = min_result[0].min(min_result[1]);
    let mut max_val = max_result[0].max(max_result[1]);

    while i < data.len() {
        if data[i] < min_val {
            min_val = data[i];
        }
        if data[i] > max_val {
            max_val = data[i];
        }
        i += 1;
    }

    (min_val, max_val)
}

// ---- AVX2 implementations (x86/x86_64) ----

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_scalar_f64_avx2(data: &mut [f64], scalar: f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let scalar_vec = _mm256_set1_pd(scalar);
    let mut i = 0;

    // Four f64 lanes per 256-bit register.
    while i + 4 <= data.len() {
        let data_vec = _mm256_loadu_pd(data.as_ptr().add(i));
        let result = _mm256_add_pd(data_vec, scalar_vec);
        _mm256_storeu_pd(data.as_mut_ptr().add(i), result);
        i += 4;
    }

    // Scalar remainder.
    while i < data.len() {
        data[i] += scalar;
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn mul_scalar_f64_avx2(data: &mut [f64], scalar: f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let scalar_vec = _mm256_set1_pd(scalar);
    let mut i = 0;

    while i + 4 <= data.len() {
        let data_vec = _mm256_loadu_pd(data.as_ptr().add(i));
        let result = _mm256_mul_pd(data_vec, scalar_vec);
        _mm256_storeu_pd(data.as_mut_ptr().add(i), result);
        i += 4;
    }

    while i < data.len() {
        data[i] *= scalar;
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_vectors_f64_avx2(a: &[f64], b: &[f64], result: &mut [f64]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut i = 0;

    while i + 4 <= a.len() {
        let a_vec = _mm256_loadu_pd(a.as_ptr().add(i));
        let b_vec = _mm256_loadu_pd(b.as_ptr().add(i));
        let result_vec = _mm256_add_pd(a_vec, b_vec);
        _mm256_storeu_pd(result.as_mut_ptr().add(i), result_vec);
        i += 4;
    }

    while i < a.len() {
        result[i] = a[i] + b[i];
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn sub_vectors_f64_avx2(a: &[f64], b: &[f64], result: &mut [f64]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut i = 0;

    while i + 4 <= a.len() {
        let a_vec = _mm256_loadu_pd(a.as_ptr().add(i));
        let b_vec = _mm256_loadu_pd(b.as_ptr().add(i));
        let result_vec = _mm256_sub_pd(a_vec, b_vec);
        _mm256_storeu_pd(result.as_mut_ptr().add(i), result_vec);
        i += 4;
    }

    while i < a.len() {
        result[i] = a[i] - b[i];
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn mean_f64_avx2(data: &[f64]) -> f64 {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut sum = _mm256_setzero_pd();
    let mut i = 0;

    while i + 4 <= data.len() {
        let data_vec = _mm256_loadu_pd(data.as_ptr().add(i));
        sum = _mm256_add_pd(sum, data_vec);
        i += 4;
    }

    // Horizontal sum of the four lanes, then the scalar remainder.
    let mut result = [0.0f64; 4];
    _mm256_storeu_pd(result.as_mut_ptr(), sum);
    let mut scalar_sum = result.iter().sum::<f64>();

    while i < data.len() {
        scalar_sum += data[i];
        i += 1;
    }

    scalar_sum / data.len() as f64
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn variance_f64_avx2(data: &[f64], mean: f64) -> f64 {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mean_vec = _mm256_set1_pd(mean);
    let mut sum = _mm256_setzero_pd();
    let mut i = 0;

    while i + 4 <= data.len() {
        let data_vec = _mm256_loadu_pd(data.as_ptr().add(i));
        let diff = _mm256_sub_pd(data_vec, mean_vec);
        let squared = _mm256_mul_pd(diff, diff);
        sum = _mm256_add_pd(sum, squared);
        i += 4;
    }

    let mut result = [0.0f64; 4];
    _mm256_storeu_pd(result.as_mut_ptr(), sum);
    let mut scalar_sum = result.iter().sum::<f64>();

    while i < data.len() {
        let diff = data[i] - mean;
        scalar_sum += diff * diff;
        i += 1;
    }

    scalar_sum / (data.len() - 1) as f64
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn min_max_f64_avx2(data: &[f64]) -> (f64, f64) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    let mut min_vec = _mm256_set1_pd(data[0]);
    let mut max_vec = _mm256_set1_pd(data[0]);
    let mut i = 0;

    while i + 4 <= data.len() {
        let data_vec = _mm256_loadu_pd(data.as_ptr().add(i));
        min_vec = _mm256_min_pd(min_vec, data_vec);
        max_vec = _mm256_max_pd(max_vec, data_vec);
        i += 4;
    }

    let mut min_result = [0.0f64; 4];
    let mut max_result = [0.0f64; 4];
    _mm256_storeu_pd(min_result.as_mut_ptr(), min_vec);
    _mm256_storeu_pd(max_result.as_mut_ptr(), max_vec);

    let mut min_val = min_result.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let mut max_val = max_result.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    while i < data.len() {
        if data[i] < min_val {
            min_val = data[i];
        }
        if data[i] > max_val {
            max_val = data[i];
        }
        i += 1;
    }

    (min_val, max_val)
}

// ---- NEON implementations (aarch64) ----

#[cfg(target_arch = "aarch64")]
unsafe fn add_scalar_f64_neon(data: &mut [f64], scalar: f64) {
    use std::arch::aarch64::*;

    let scalar_vec = vdupq_n_f64(scalar);
    let mut i = 0;

    // Two f64 lanes per 128-bit register.
    while i + 2 <= data.len() {
        let data_vec = vld1q_f64(data.as_ptr().add(i));
        let result = vaddq_f64(data_vec, scalar_vec);
        vst1q_f64(data.as_mut_ptr().add(i), result);
        i += 2;
    }

    // Scalar remainder.
    while i < data.len() {
        data[i] += scalar;
        i += 1;
    }
}

#[cfg(target_arch = "aarch64")]
unsafe fn mul_scalar_f64_neon(data: &mut [f64], scalar: f64) {
    use std::arch::aarch64::*;

    let scalar_vec = vdupq_n_f64(scalar);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = vld1q_f64(data.as_ptr().add(i));
        let result = vmulq_f64(data_vec, scalar_vec);
        vst1q_f64(data.as_mut_ptr().add(i), result);
        i += 2;
    }

    while i < data.len() {
        data[i] *= scalar;
        i += 1;
    }
}

#[cfg(target_arch = "aarch64")]
unsafe fn add_vectors_f64_neon(a: &[f64], b: &[f64], result: &mut [f64]) {
    use std::arch::aarch64::*;

    let mut i = 0;

    while i + 2 <= a.len() {
        let a_vec = vld1q_f64(a.as_ptr().add(i));
        let b_vec = vld1q_f64(b.as_ptr().add(i));
        let result_vec = vaddq_f64(a_vec, b_vec);
        vst1q_f64(result.as_mut_ptr().add(i), result_vec);
        i += 2;
    }

    while i < a.len() {
        result[i] = a[i] + b[i];
        i += 1;
    }
}

#[cfg(target_arch = "aarch64")]
unsafe fn sub_vectors_f64_neon(a: &[f64], b: &[f64], result: &mut [f64]) {
    use std::arch::aarch64::*;

    let mut i = 0;

    while i + 2 <= a.len() {
        let a_vec = vld1q_f64(a.as_ptr().add(i));
        let b_vec = vld1q_f64(b.as_ptr().add(i));
        let result_vec = vsubq_f64(a_vec, b_vec);
        vst1q_f64(result.as_mut_ptr().add(i), result_vec);
        i += 2;
    }

    while i < a.len() {
        result[i] = a[i] - b[i];
        i += 1;
    }
}

#[cfg(target_arch = "aarch64")]
unsafe fn mean_f64_neon(data: &[f64]) -> f64 {
    use std::arch::aarch64::*;

    let mut sum = vdupq_n_f64(0.0);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = vld1q_f64(data.as_ptr().add(i));
        sum = vaddq_f64(sum, data_vec);
        i += 2;
    }

    // Horizontal add across lanes, then the scalar remainder.
    let mut scalar_sum = vaddvq_f64(sum);

    while i < data.len() {
        scalar_sum += data[i];
        i += 1;
    }

    scalar_sum / data.len() as f64
}

#[cfg(target_arch = "aarch64")]
unsafe fn variance_f64_neon(data: &[f64], mean: f64) -> f64 {
    use std::arch::aarch64::*;

    let mean_vec = vdupq_n_f64(mean);
    let mut sum = vdupq_n_f64(0.0);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = vld1q_f64(data.as_ptr().add(i));
        let diff = vsubq_f64(data_vec, mean_vec);
        let squared = vmulq_f64(diff, diff);
        sum = vaddq_f64(sum, squared);
        i += 2;
    }

    let mut scalar_sum = vaddvq_f64(sum);

    while i < data.len() {
        let diff = data[i] - mean;
        scalar_sum += diff * diff;
        i += 1;
    }

    scalar_sum / (data.len() - 1) as f64
}

#[cfg(target_arch = "aarch64")]
unsafe fn min_max_f64_neon(data: &[f64]) -> (f64, f64) {
    use std::arch::aarch64::*;

    let mut min_vec = vdupq_n_f64(data[0]);
    let mut max_vec = vdupq_n_f64(data[0]);
    let mut i = 0;

    while i + 2 <= data.len() {
        let data_vec = vld1q_f64(data.as_ptr().add(i));
        min_vec = vminq_f64(min_vec, data_vec);
        max_vec = vmaxq_f64(max_vec, data_vec);
        i += 2;
    }

    let mut final_min = vminvq_f64(min_vec);
    let mut final_max = vmaxvq_f64(max_vec);

    while i < data.len() {
        if data[i] < final_min {
            final_min = data[i];
        }
        if data[i] > final_max {
            final_max = data[i];
        }
        i += 1;
    }

    (final_min, final_max)
}

/// Helpers that apply the SIMD kernels to `ndarray` types.
pub mod ndarray_ops {
    use super::*;

    /// Adds `scalar` to every element of `array`, processing rows in
    /// parallel for large arrays when the `parallel` feature is enabled.
    pub fn add_scalar_array(array: &mut Array2<f64>, scalar: f64, config: &SimdConfig) {
        if config.use_parallel && array.len() > 1000 {
            #[cfg(feature = "parallel")]
            {
                use rayon::prelude::*;
                array
                    .axis_iter_mut(Axis(0))
                    .into_par_iter()
                    .for_each(|mut row| {
                        if let Some(slice) = row.as_slice_mut() {
                            add_scalar_f64_simd(slice, scalar, config);
                        } else {
                            // Non-contiguous row: fall back to elementwise.
                            for elem in row.iter_mut() {
                                *elem += scalar;
                            }
                        }
                    });
                return;
            }
        }

        for mut row in array.axis_iter_mut(Axis(0)) {
            if let Some(slice) = row.as_slice_mut() {
                add_scalar_f64_simd(slice, scalar, config);
            } else {
                // Non-contiguous row: fall back to elementwise.
                for elem in row.iter_mut() {
                    *elem += scalar;
                }
            }
        }
    }

    /// Multiplies every element of `array` by `scalar`, processing rows in
    /// parallel for large arrays when the `parallel` feature is enabled.
    pub fn mul_scalar_array(array: &mut Array2<f64>, scalar: f64, config: &SimdConfig) {
        if config.use_parallel && array.len() > 1000 {
            #[cfg(feature = "parallel")]
            {
                use rayon::prelude::*;
                array
                    .axis_iter_mut(Axis(0))
                    .into_par_iter()
                    .for_each(|mut row| {
                        if let Some(slice) = row.as_slice_mut() {
                            mul_scalar_f64_simd(slice, scalar, config);
                        } else {
                            // Non-contiguous row: fall back to elementwise.
                            for elem in row.iter_mut() {
                                *elem *= scalar;
                            }
                        }
                    });
                return;
            }
        }

        for mut row in array.axis_iter_mut(Axis(0)) {
            if let Some(slice) = row.as_slice_mut() {
                mul_scalar_f64_simd(slice, scalar, config);
            } else {
                for elem in row.iter_mut() {
                    *elem *= scalar;
                }
            }
        }
    }

    /// Per-column means; columns that are not contiguous in memory are
    /// reduced elementwise.
    pub fn column_means(array: &Array2<f64>, config: &SimdConfig) -> Array1<f64> {
        let mut means = Array1::zeros(array.ncols());

        for (j, mean_col) in means.iter_mut().enumerate() {
            let column = array.column(j);
            if let Some(slice) = column.as_slice() {
                *mean_col = mean_f64_simd(slice, config);
            } else {
                *mean_col = column.iter().sum::<f64>() / array.nrows() as f64;
            }
        }

        means
    }

    /// Per-column sample variances about the supplied `means`
    /// (denominator `n - 1`; columns with fewer than two rows yield `0.0`).
    pub fn column_variances(
        array: &Array2<f64>,
        means: &Array1<f64>,
        config: &SimdConfig,
    ) -> Array1<f64> {
        let mut variances = Array1::zeros(array.ncols());

        for (j, var_col) in variances.iter_mut().enumerate() {
            let column = array.column(j);
            if let Some(slice) = column.as_slice() {
                *var_col = variance_f64_simd(slice, means[j], config);
            } else if array.nrows() > 1 {
                let sum_sq_diff: f64 = column.iter().map(|x| (x - means[j]).powi(2)).sum();
                *var_col = sum_sq_diff / (array.nrows() - 1) as f64;
            }
        }

        variances
    }

    /// Per-column `(min, max)` pairs.
    pub fn column_min_max(array: &Array2<f64>, config: &SimdConfig) -> (Array1<f64>, Array1<f64>) {
        let mut mins = Array1::zeros(array.ncols());
        let mut maxs = Array1::zeros(array.ncols());

        for j in 0..array.ncols() {
            let column = array.column(j);
            if let Some(slice) = column.as_slice() {
                let (min_val, max_val) = min_max_f64_simd(slice, config);
                mins[j] = min_val;
                maxs[j] = max_val;
            } else {
                let mut min_val = column[0];
                let mut max_val = column[0];
                for &val in column.iter().skip(1) {
                    if val < min_val {
                        min_val = val;
                    }
                    if val > max_val {
                        max_val = val;
                    }
                }
                mins[j] = min_val;
                maxs[j] = max_val;
            }
        }

        (mins, maxs)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    #[test]
    fn test_simd_config() {
        let config = SimdConfig::default();
        assert!(config.enabled);
        assert_eq!(config.min_size_threshold, 32);
        assert!(config.use_parallel);
    }

    #[test]
    fn test_add_scalar_simd() {
        let config = SimdConfig::default();
        let mut data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let original = data.clone();

        add_scalar_f64_simd(&mut data, 10.0, &config);

        for (i, &val) in data.iter().enumerate() {
            assert_relative_eq!(val, original[i] + 10.0, epsilon = 1e-14);
        }
    }

    #[test]
    fn test_mul_scalar_simd() {
        let config = SimdConfig::default();
        let mut data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let original = data.clone();

        mul_scalar_f64_simd(&mut data, 2.5, &config);

        for (i, &val) in data.iter().enumerate() {
            assert_relative_eq!(val, original[i] * 2.5, epsilon = 1e-14);
        }
    }
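
    // Sketch test for `sub_scalar_f64_simd`: it negates the scalar and
    // reuses the add path, so results should match elementwise subtraction.
    #[test]
    fn test_sub_scalar_simd() {
        let config = SimdConfig::default();
        let mut data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let original = data.clone();

        sub_scalar_f64_simd(&mut data, 3.0, &config);

        for (i, &val) in data.iter().enumerate() {
            assert_relative_eq!(val, original[i] - 3.0, epsilon = 1e-14);
        }
    }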

    #[test]
    fn test_vector_operations_simd() {
        let config = SimdConfig::default();
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let b = vec![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
        let mut result = vec![0.0; 8];

        add_vectors_f64_simd(&a, &b, &mut result, &config);

        for (i, &val) in result.iter().enumerate() {
            assert_relative_eq!(val, a[i] + b[i], epsilon = 1e-14);
        }
    }
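
    // Companion sketch for `sub_vectors_f64_simd`, mirroring the addition
    // test above.
    #[test]
    fn test_sub_vectors_simd() {
        let config = SimdConfig::default();
        let a = vec![5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
        let b = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mut result = vec![0.0; 8];

        sub_vectors_f64_simd(&a, &b, &mut result, &config);

        for (i, &val) in result.iter().enumerate() {
            assert_relative_eq!(val, a[i] - b[i], epsilon = 1e-14);
        }
    }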

    #[test]
    fn test_mean_simd() {
        let config = SimdConfig::default();
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];

        let mean = mean_f64_simd(&data, &config);
        let expected = 5.5;

        assert_relative_eq!(mean, expected, epsilon = 1e-14);
    }

    #[test]
    fn test_variance_simd() {
        let config = SimdConfig::default();
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let mean = 3.0;

        let variance = variance_f64_simd(&data, mean, &config);
        let expected = 2.5;

        assert_relative_eq!(variance, expected, epsilon = 1e-14);
    }
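
    // Sketch for `div_scalar_f64_simd`: division is implemented as
    // multiplication by the reciprocal, and a zero divisor is defined as a
    // no-op, so both behaviors are checked.
    #[test]
    fn test_div_scalar_simd() {
        let config = SimdConfig::default();
        let mut data = vec![2.0, 4.0, 6.0, 8.0];

        div_scalar_f64_simd(&mut data, 2.0, &config);
        assert_relative_eq!(data[0], 1.0, epsilon = 1e-14);
        assert_relative_eq!(data[3], 4.0, epsilon = 1e-14);

        // Zero divisor: the slice must be left untouched.
        let before = data.clone();
        div_scalar_f64_simd(&mut data, 0.0, &config);
        assert_eq!(data, before);
    }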

    #[test]
    fn test_min_max_simd() {
        let config = SimdConfig::default();
        let data = vec![3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0];

        let (min_val, max_val) = min_max_f64_simd(&data, &config);

        assert_relative_eq!(min_val, 1.0, epsilon = 1e-14);
        assert_relative_eq!(max_val, 9.0, epsilon = 1e-14);
    }
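
    // Edge-case sketch: the reductions define explicit results for
    // degenerate inputs (empty mean is 0.0, empty min/max is NaN,
    // single-element variance is 0.0), per the guards in the public
    // functions.
    #[test]
    fn test_degenerate_inputs() {
        let config = SimdConfig::default();

        assert_relative_eq!(mean_f64_simd(&[], &config), 0.0, epsilon = 1e-14);

        let (min_val, max_val) = min_max_f64_simd(&[], &config);
        assert!(min_val.is_nan() && max_val.is_nan());

        assert_relative_eq!(
            variance_f64_simd(&[42.0], 42.0, &config),
            0.0,
            epsilon = 1e-14
        );
    }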

    #[test]
    fn test_ndarray_operations() {
        let config = SimdConfig::default();
        let mut array = Array2::from_shape_vec(
            (4, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
            ],
        )
        .expect("shape matches data length");

        let original = array.clone();
        ndarray_ops::add_scalar_array(&mut array, 5.0, &config);

        for (&new_val, &old_val) in array.iter().zip(original.iter()) {
            assert_relative_eq!(new_val, old_val + 5.0, epsilon = 1e-14);
        }

        let means = ndarray_ops::column_means(&original, &config);
        assert_relative_eq!(means[0], 5.5, epsilon = 1e-14);
        assert_relative_eq!(means[1], 6.5, epsilon = 1e-14);
        assert_relative_eq!(means[2], 7.5, epsilon = 1e-14);
    }
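
    // Sketch exercising the remaining column helpers on the same 4x3 matrix
    // layout used above.
    #[test]
    fn test_ndarray_column_stats() {
        let config = SimdConfig::default();
        let array = Array2::from_shape_vec(
            (4, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
            ],
        )
        .expect("shape matches data length");

        let means = ndarray_ops::column_means(&array, &config);
        let variances = ndarray_ops::column_variances(&array, &means, &config);
        // Each column is an arithmetic sequence with step 3, so every column
        // has sum of squared deviations (4.5^2 + 1.5^2) * 2 = 45, and sample
        // variance 45 / 3 = 15.
        for j in 0..3 {
            assert_relative_eq!(variances[j], 15.0, epsilon = 1e-12);
        }

        let (mins, maxs) = ndarray_ops::column_min_max(&array, &config);
        assert_relative_eq!(mins[0], 1.0, epsilon = 1e-14);
        assert_relative_eq!(maxs[2], 12.0, epsilon = 1e-14);
    }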

    #[test]
    fn test_disabled_simd() {
        let config = SimdConfig {
            enabled: false,
            ..Default::default()
        };

        let mut data = vec![1.0, 2.0, 3.0, 4.0];
        add_scalar_f64_simd(&mut data, 10.0, &config);

        assert_relative_eq!(data[0], 11.0, epsilon = 1e-14);
        assert_relative_eq!(data[1], 12.0, epsilon = 1e-14);
        assert_relative_eq!(data[2], 13.0, epsilon = 1e-14);
        assert_relative_eq!(data[3], 14.0, epsilon = 1e-14);
    }

    #[test]
    fn test_small_array_threshold() {
        let config = SimdConfig {
            min_size_threshold: 100,
            ..Default::default()
        };

        let mut data = vec![1.0, 2.0, 3.0, 4.0];
        add_scalar_f64_simd(&mut data, 10.0, &config);

        assert_relative_eq!(data[0], 11.0, epsilon = 1e-14);
        assert_relative_eq!(data[1], 12.0, epsilon = 1e-14);
        assert_relative_eq!(data[2], 13.0, epsilon = 1e-14);
        assert_relative_eq!(data[3], 14.0, epsilon = 1e-14);
    }
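
    // Sketch that pushes past `min_size_threshold` with a length that is not
    // a multiple of any vector width, so both the SIMD main loops and the
    // scalar remainder loops are exercised.
    #[test]
    fn test_large_odd_length() {
        let config = SimdConfig::default();
        let mut data: Vec<f64> = (0..101).map(|i| i as f64).collect();

        add_scalar_f64_simd(&mut data, 1.0, &config);

        for (i, &val) in data.iter().enumerate() {
            assert_relative_eq!(val, i as f64 + 1.0, epsilon = 1e-14);
        }

        // data is now 1..=101, whose mean is exactly 51.
        let mean = mean_f64_simd(&data, &config);
        assert_relative_eq!(mean, 51.0, epsilon = 1e-12);
    }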
}