1use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
8use num_traits::Zero;
9
10#[cfg(feature = "simd")]
11use crate::simd_ops_polynomial;
12
13pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
15 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
17
18 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
20
21 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
23
24 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
26
27 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
29
30 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);
32
33 fn simd_gemm(
35 alpha: Self,
36 a: &ArrayView2<Self>,
37 b: &ArrayView2<Self>,
38 beta: Self,
39 c: &mut Array2<Self>,
40 );
41
42 fn simd_norm(a: &ArrayView1<Self>) -> Self;
44
45 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
47
48 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
50
51 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;
53
54 fn simd_sum(a: &ArrayView1<Self>) -> Self;
56
57 fn simd_mean(a: &ArrayView1<Self>) -> Self;
59
60 fn simd_max_element(a: &ArrayView1<Self>) -> Self;
62
63 fn simd_min_element(a: &ArrayView1<Self>) -> Self;
65
66 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;
68
69 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
71
72 fn simd_fma_advanced_optimized(
74 a: &ArrayView1<Self>,
75 b: &ArrayView1<Self>,
76 c: &ArrayView1<Self>,
77 ) -> Array1<Self>;
78
79 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
81
82 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;
84
85 fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self>;
89
90 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;
92
93 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;
95
96 fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self>;
98
99 fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self>;
101
102 fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self>;
104
105 fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self>;
107
108 fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self>;
110
111 fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self>;
113
114 fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self>;
116
117 fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self>;
119
120 fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self>;
122
123 fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self>;
125
126 fn simd_round(a: &ArrayView1<Self>) -> Array1<Self>;
128
129 fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self>;
131
132 fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self>;
134
135 fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self>;
137
138 fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self>;
140
141 fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self>;
143
144 fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self>;
146
147 fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self>;
149
150 fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self>;
152
153 fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self>;
155
156 fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self>;
158
159 fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self>;
161
162 fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self>;
164
165 fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self>;
167
168 fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self>;
170
171 fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self>;
173
174 fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self>;
176
177 fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self>;
179
180 fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self>;
182
183 fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self>;
185
186 fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self>;
188
189 fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self>;
191
192 fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self>;
195
196 fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self>;
199
200 fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self>;
204
205 fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self>;
208
209 fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self>;
214
215 fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self>;
219
220 fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
225
226 fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self>;
232
233 fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self>;
239
240 fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self>;
246
247 fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self>;
253
254 fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
260
261 fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self>;
269
270 fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
277
278 fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self>;
285
286 fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self>;
292
293 fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
300
301 fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self>;
306
307 fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self>;
313
314 fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self>;
320
321 fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
329
330 fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
335
336 fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self>;
348
349 fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
362
363 fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
374
375 fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
386
387 fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
398
399 fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
410
411 fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self>;
422
423 fn simd_square(a: &ArrayView1<Self>) -> Array1<Self>;
433
434 fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self>;
444
445 fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>);
456
457 fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self>;
468
469 fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self>;
480
481 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;
483
484 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
486
487 fn simd_available() -> bool;
489
490 fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
492 Self::simd_sum(a)
493 }
494
495 fn simd_sub_f32_ultra(
497 a: &ArrayView1<Self>,
498 b: &ArrayView1<Self>,
499 result: &mut ArrayViewMut1<Self>,
500 );
501
502 fn simd_mul_f32_ultra(
504 a: &ArrayView1<Self>,
505 b: &ArrayView1<Self>,
506 result: &mut ArrayViewMut1<Self>,
507 );
508
509 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;
511
512 fn simd_div_f32_ultra(
514 a: &ArrayView1<Self>,
515 b: &ArrayView1<Self>,
516 result: &mut ArrayViewMut1<Self>,
517 );
518
519 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
521
522 fn simd_add_f32_ultra(
524 a: &ArrayView1<Self>,
525 b: &ArrayView1<Self>,
526 result: &mut ArrayViewMut1<Self>,
527 );
528
529 fn simd_fma_f32_ultra(
531 a: &ArrayView1<Self>,
532 b: &ArrayView1<Self>,
533 c: &ArrayView1<Self>,
534 result: &mut ArrayViewMut1<Self>,
535 );
536
537 fn simd_pow_f32_ultra(
539 a: &ArrayView1<Self>,
540 b: &ArrayView1<Self>,
541 result: &mut ArrayViewMut1<Self>,
542 );
543
544 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
546
547 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
549
550 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
552
553 fn simd_variance(a: &ArrayView1<Self>) -> Self;
555
556 fn simd_std(a: &ArrayView1<Self>) -> Self;
558
559 fn simd_norm_l1(a: &ArrayView1<Self>) -> Self;
561
562 fn simd_norm_linf(a: &ArrayView1<Self>) -> Self;
564
565 fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
567
568 fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
570
571 fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
573
574 fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
576
577 fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
579
580 fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
582
583 fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
585
586 fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize>;
588
589 fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize>;
591
592 fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self>;
594
595 fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self;
597
598 fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
600
601 fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self>;
603
604 fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self>;
606
607 fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self>;
609
610 fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self>;
612
613 fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self>;
615
616 fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
618
619 fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self>;
621
622 fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self>;
624}
625
626#[allow(dead_code)]
629fn lanczos_gamma_f32(z: f32) -> f32 {
630 const G: f32 = 7.0;
631 const SQRT_2PI: f32 = 2.5066282746310002; const LANCZOS_COEFFS: [f32; 9] = [
633 0.99999999999980993,
634 676.5203681218851,
635 -1259.1392167224028,
636 771.32342877765313,
637 -176.61502916214059,
638 12.507343278686905,
639 -0.13857109526572012,
640 9.9843695780195716e-6,
641 1.5056327351493116e-7,
642 ];
643
644 if z < 0.5 {
646 let pi = std::f32::consts::PI;
647 let sinpix = (pi * z).sin();
648 if sinpix.abs() < 1e-10 {
649 return f32::INFINITY;
650 }
651 return pi / (sinpix * lanczos_gamma_f32(1.0 - z));
652 }
653
654 let z_shifted = z - 1.0;
655 let mut acc = LANCZOS_COEFFS[0];
656 for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
657 acc += coeff / (z_shifted + i as f32);
658 }
659
660 let t = z_shifted + G + 0.5;
661 SQRT_2PI * acc * t.powf(z_shifted + 0.5) * (-t).exp()
662}
663
664#[allow(dead_code)]
666fn lanczos_gamma_f64(z: f64) -> f64 {
667 const G: f64 = 7.0;
668 const SQRT_2PI: f64 = 2.5066282746310002; const LANCZOS_COEFFS: [f64; 9] = [
670 0.999999999999809932,
671 676.5203681218851,
672 -1259.1392167224028,
673 771.32342877765313,
674 -176.61502916214059,
675 12.507343278686905,
676 -0.13857109526572012,
677 9.9843695780195716e-6,
678 1.5056327351493116e-7,
679 ];
680
681 if z < 0.5 {
683 let pi = std::f64::consts::PI;
684 let sinpix = (pi * z).sin();
685 if sinpix.abs() < 1e-14 {
686 return f64::INFINITY;
687 }
688 return pi / (sinpix * lanczos_gamma_f64(1.0 - z));
689 }
690
691 let z_shifted = z - 1.0;
692 let mut acc = LANCZOS_COEFFS[0];
693 for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
694 acc += coeff / (z_shifted + i as f64);
695 }
696
697 let t = z_shifted + G + 0.5;
698 SQRT_2PI * acc * t.powf(z_shifted + 0.5) * (-t).exp()
699}
700
701#[allow(dead_code)]
704fn digamma_f32(mut x: f32) -> f32 {
705 let pi = std::f32::consts::PI;
706
707 if x.is_nan() {
709 return f32::NAN;
710 }
711 if x.is_infinite() {
712 return if x > 0.0 { f32::INFINITY } else { f32::NAN };
713 }
714
715 if x <= 0.0 {
719 if x == x.floor() {
721 return f32::NAN;
722 }
723 return digamma_f32(1.0 - x) - pi / (pi * x).tan();
725 }
726
727 let mut result = 0.0;
728
729 while x < 6.0 {
732 result -= 1.0 / x;
733 x += 1.0;
734 }
735
736 result += x.ln() - 0.5 / x;
739 let x2 = x * x;
740 let x4 = x2 * x2;
741 let x6 = x4 * x2;
742 result -= 1.0 / (12.0 * x2);
743 result += 1.0 / (120.0 * x4);
744 result -= 1.0 / (252.0 * x6);
745
746 result
747}
748
749#[allow(dead_code)]
752fn digamma_f64(mut x: f64) -> f64 {
753 let pi = std::f64::consts::PI;
754
755 if x.is_nan() {
757 return f64::NAN;
758 }
759 if x.is_infinite() {
760 return if x > 0.0 { f64::INFINITY } else { f64::NAN };
761 }
762
763 if x <= 0.0 {
767 if x == x.floor() {
769 return f64::NAN;
770 }
771 return digamma_f64(1.0 - x) - pi / (pi * x).tan();
773 }
774
775 let mut result = 0.0;
776
777 while x < 8.0 {
780 result -= 1.0 / x;
781 x += 1.0;
782 }
783
784 result += x.ln() - 0.5 / x;
788 let x2 = x * x;
789 let x4 = x2 * x2;
790 let x6 = x4 * x2;
791 let x8 = x4 * x4;
792 let x10 = x8 * x2;
793 let x12 = x8 * x4;
794 result -= 1.0 / (12.0 * x2); result += 1.0 / (120.0 * x4); result -= 1.0 / (252.0 * x6); result += 1.0 / (240.0 * x8); result -= 5.0 / (660.0 * x10); result += 691.0 / (32760.0 * x12); result
803}
804
805fn trigamma_f32(mut x: f32) -> f32 {
812 let pi = std::f32::consts::PI;
813
814 if x.is_nan() {
816 return f32::NAN;
817 }
818 if x.is_infinite() {
819 return if x > 0.0 { 0.0 } else { f32::NAN };
820 }
821
822 if x <= 0.0 {
825 if x == x.floor() {
827 return f32::NAN;
828 }
829 let sin_pix = (pi * x).sin();
831 return (pi * pi) / (sin_pix * sin_pix) - trigamma_f32(1.0 - x);
832 }
833
834 let mut result = 0.0;
835
836 while x < 6.0 {
839 result += 1.0 / (x * x);
840 x += 1.0;
841 }
842
843 let x2 = x * x;
846 let x3 = x2 * x;
847 let x5 = x3 * x2;
848 let x7 = x5 * x2;
849
850 result += 1.0 / x;
851 result += 1.0 / (2.0 * x2);
852 result += 1.0 / (6.0 * x3);
853 result -= 1.0 / (30.0 * x5);
854 result += 1.0 / (42.0 * x7);
855
856 result
857}
858
859fn trigamma_f64(mut x: f64) -> f64 {
866 let pi = std::f64::consts::PI;
867
868 if x.is_nan() {
870 return f64::NAN;
871 }
872 if x.is_infinite() {
873 return if x > 0.0 { 0.0 } else { f64::NAN };
874 }
875
876 if x <= 0.0 {
879 if x == x.floor() {
881 return f64::NAN;
882 }
883 let sin_pix = (pi * x).sin();
885 return (pi * pi) / (sin_pix * sin_pix) - trigamma_f64(1.0 - x);
886 }
887
888 let mut result = 0.0;
889
890 while x < 8.0 {
893 result += 1.0 / (x * x);
894 x += 1.0;
895 }
896
897 let x2 = x * x;
901 let x3 = x2 * x;
902 let x5 = x3 * x2;
903 let x7 = x5 * x2;
904 let x9 = x7 * x2;
905 let x11 = x9 * x2;
906
907 result += 1.0 / x;
908 result += 1.0 / (2.0 * x2);
909 result += 1.0 / (6.0 * x3); result -= 1.0 / (30.0 * x5); result += 1.0 / (42.0 * x7); result -= 1.0 / (30.0 * x9); result += 5.0 / (66.0 * x11); result
916}
917
918fn ln_gamma_f32(z: f32) -> f32 {
924 const G: f32 = 7.0;
925 const LN_SQRT_2PI: f32 = 0.9189385332046727; const LANCZOS_COEFFS: [f32; 9] = [
927 0.99999999999980993,
928 676.5203681218851,
929 -1259.1392167224028,
930 771.32342877765313,
931 -176.61502916214059,
932 12.507343278686905,
933 -0.13857109526572012,
934 9.9843695780195716e-6,
935 1.5056327351493116e-7,
936 ];
937
938 if z.is_nan() {
940 return f32::NAN;
941 }
942 if z.is_infinite() {
943 return if z > 0.0 { f32::INFINITY } else { f32::NAN };
944 }
945 if z <= 0.0 && z == z.floor() {
946 return f32::INFINITY; }
948
949 if z < 0.5 {
952 let pi = std::f32::consts::PI;
953 let sinpiz = (pi * z).sin().abs();
954 if sinpiz < 1e-10 {
955 return f32::INFINITY;
956 }
957 return pi.ln() - sinpiz.ln() - ln_gamma_f32(1.0 - z);
958 }
959
960 let z_shifted = z - 1.0;
962 let mut sum = LANCZOS_COEFFS[0];
963 for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
964 sum += coeff / (z_shifted + i as f32);
965 }
966
967 let t = z_shifted + G + 0.5;
968 LN_SQRT_2PI + (z_shifted + 0.5) * t.ln() - t + sum.ln()
970}
971
972fn ln_gamma_f64(z: f64) -> f64 {
977 const G: f64 = 7.0;
978 const LN_SQRT_2PI: f64 = 0.9189385332046727; const LANCZOS_COEFFS: [f64; 9] = [
980 0.999999999999809932,
981 676.5203681218851,
982 -1259.1392167224028,
983 771.32342877765313,
984 -176.61502916214059,
985 12.507343278686905,
986 -0.13857109526572012,
987 9.9843695780195716e-6,
988 1.5056327351493116e-7,
989 ];
990
991 if z.is_nan() {
993 return f64::NAN;
994 }
995 if z.is_infinite() {
996 return if z > 0.0 { f64::INFINITY } else { f64::NAN };
997 }
998 if z <= 0.0 && z == z.floor() {
999 return f64::INFINITY; }
1001
1002 if z < 0.5 {
1005 let pi = std::f64::consts::PI;
1006 let sinpiz = (pi * z).sin().abs();
1007 if sinpiz < 1e-14 {
1008 return f64::INFINITY;
1009 }
1010 return pi.ln() - sinpiz.ln() - ln_gamma_f64(1.0 - z);
1011 }
1012
1013 let z_shifted = z - 1.0;
1015 let mut sum = LANCZOS_COEFFS[0];
1016 for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
1017 sum += coeff / (z_shifted + i as f64);
1018 }
1019
1020 let t = z_shifted + G + 0.5;
1021 LN_SQRT_2PI + (z_shifted + 0.5) * t.ln() - t + sum.ln()
1023}
1024
1025fn erf_f32(x: f32) -> f32 {
1030 if x.is_nan() {
1032 return f32::NAN;
1033 }
1034 if x.is_infinite() {
1035 return if x > 0.0 { 1.0 } else { -1.0 };
1036 }
1037
1038 let sign = if x >= 0.0 { 1.0 } else { -1.0 };
1040 let x = x.abs();
1041
1042 if x < 1e-10 {
1044 return sign * x * std::f32::consts::FRAC_2_SQRT_PI;
1045 }
1046
1047 const P: f32 = 0.3275911;
1051 const A1: f32 = 0.254829592;
1052 const A2: f32 = -0.284496736;
1053 const A3: f32 = 1.421413741;
1054 const A4: f32 = -1.453152027;
1055 const A5: f32 = 1.061405429;
1056
1057 let t = 1.0 / (1.0 + P * x);
1058 let t2 = t * t;
1059 let t3 = t2 * t;
1060 let t4 = t3 * t;
1061 let t5 = t4 * t;
1062
1063 let poly = A1 * t + A2 * t2 + A3 * t3 + A4 * t4 + A5 * t5;
1064 let result = 1.0 - poly * (-x * x).exp();
1065
1066 sign * result
1067}
1068
1069fn erf_f64(x: f64) -> f64 {
1074 if x.is_nan() {
1076 return f64::NAN;
1077 }
1078 if x.is_infinite() {
1079 return if x > 0.0 { 1.0 } else { -1.0 };
1080 }
1081
1082 let sign = if x >= 0.0 { 1.0 } else { -1.0 };
1084 let x = x.abs();
1085
1086 if x < 1e-20 {
1088 return sign * x * std::f64::consts::FRAC_2_SQRT_PI;
1089 }
1090
1091 if x > 6.0 {
1093 return sign;
1094 }
1095
1096 let result = if x < 0.25 {
1099 let x2 = x * x;
1102 let two_over_sqrt_pi = std::f64::consts::FRAC_2_SQRT_PI;
1103 let term = x
1104 * (1.0
1105 - x2 / 3.0
1106 * (1.0
1107 - x2 / 5.0
1108 * 0.5
1109 * (1.0
1110 - x2 / 7.0
1111 * (1.0 / 3.0)
1112 * (1.0
1113 - x2 / 9.0
1114 * 0.25
1115 * (1.0
1116 - x2 / 11.0
1117 * 0.2
1118 * (1.0
1119 - x2 / 13.0
1120 * (1.0 / 6.0)
1121 * (1.0 - x2 / 15.0 * (1.0 / 7.0))))))));
1122 two_over_sqrt_pi * term
1123 } else {
1124 const P: f64 = 0.3275911;
1127 const A1: f64 = 0.254829592;
1128 const A2: f64 = -0.284496736;
1129 const A3: f64 = 1.421413741;
1130 const A4: f64 = -1.453152027;
1131 const A5: f64 = 1.061405429;
1132
1133 let t = 1.0 / (1.0 + P * x);
1134 let poly = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5))));
1135 1.0 - poly * (-x * x).exp()
1136 };
1137
1138 sign * result
1139}
1140
1141fn erfc_f32(x: f32) -> f32 {
1145 if x.is_nan() {
1147 return f32::NAN;
1148 }
1149 if x.is_infinite() {
1150 return if x > 0.0 { 0.0 } else { 2.0 };
1151 }
1152
1153 if x < 0.0 {
1155 return 2.0 - erfc_f32(-x);
1156 }
1157
1158 if x > 10.0 {
1160 return 0.0;
1161 }
1162
1163 if x < 0.5 {
1165 return 1.0 - erf_f32(x);
1166 }
1167
1168 const P: f32 = 0.3275911;
1171 const A1: f32 = 0.254829592;
1172 const A2: f32 = -0.284496736;
1173 const A3: f32 = 1.421413741;
1174 const A4: f32 = -1.453152027;
1175 const A5: f32 = 1.061405429;
1176
1177 let t = 1.0 / (1.0 + P * x);
1178 let t2 = t * t;
1179 let t3 = t2 * t;
1180 let t4 = t3 * t;
1181 let t5 = t4 * t;
1182
1183 let poly = A1 * t + A2 * t2 + A3 * t3 + A4 * t4 + A5 * t5;
1184 poly * (-x * x).exp()
1185}
1186
1187fn erfc_f64(x: f64) -> f64 {
1191 if x.is_nan() {
1193 return f64::NAN;
1194 }
1195 if x.is_infinite() {
1196 return if x > 0.0 { 0.0 } else { 2.0 };
1197 }
1198
1199 if x < 0.0 {
1201 return 2.0 - erfc_f64(-x);
1202 }
1203
1204 if x > 27.0 {
1206 return 0.0;
1207 }
1208
1209 if x < 0.5 {
1211 return 1.0 - erf_f64(x);
1212 }
1213
1214 if x > 4.0 {
1218 let x2 = x * x;
1220 let inv_x2 = 1.0 / x2;
1221 let sqrt_pi = std::f64::consts::PI.sqrt();
1222
1223 let asymp = 1.0 - 0.5 * inv_x2 + 0.75 * inv_x2 * inv_x2 - 1.875 * inv_x2 * inv_x2 * inv_x2
1225 + 6.5625 * inv_x2 * inv_x2 * inv_x2 * inv_x2;
1226 return (-x2).exp() / (x * sqrt_pi) * asymp;
1227 }
1228
1229 const P: f64 = 0.3275911;
1231 const A1: f64 = 0.254829592;
1232 const A2: f64 = -0.284496736;
1233 const A3: f64 = 1.421413741;
1234 const A4: f64 = -1.453152027;
1235 const A5: f64 = 1.061405429;
1236
1237 let t = 1.0 / (1.0 + P * x);
1238 let poly = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5))));
1239 poly * (-x * x).exp()
1240}
1241
1242fn erfinv_f32(y: f32) -> f32 {
1247 if y.is_nan() {
1249 return f32::NAN;
1250 }
1251 if y <= -1.0 {
1252 return if y == -1.0 {
1253 f32::NEG_INFINITY
1254 } else {
1255 f32::NAN
1256 };
1257 }
1258 if y >= 1.0 {
1259 return if y == 1.0 { f32::INFINITY } else { f32::NAN };
1260 }
1261 if y == 0.0 {
1262 return 0.0;
1263 }
1264
1265 let sign = if y >= 0.0 { 1.0 } else { -1.0 };
1267 let y = y.abs();
1268
1269 let a = 0.147_f32;
1273 let two_over_pi_a = 2.0 / (std::f32::consts::PI * a);
1274 let ln_one_minus_y2 = (1.0 - y * y).ln();
1275
1276 let t1 = two_over_pi_a + 0.5 * ln_one_minus_y2;
1277 let t2 = (1.0 / a) * ln_one_minus_y2;
1278 let inner = (t1 * t1 - t2).sqrt() - t1;
1279
1280 let mut x = inner.sqrt();
1281
1282 let two_over_sqrt_pi = std::f32::consts::FRAC_2_SQRT_PI;
1285 for _ in 0..2 {
1286 let erf_x = erf_f32(x);
1287 let erf_deriv = two_over_sqrt_pi * (-x * x).exp();
1288 x -= (erf_x - y) / erf_deriv;
1289 }
1290
1291 sign * x
1292}
1293
1294fn erfinv_f64(y: f64) -> f64 {
1299 if y.is_nan() {
1301 return f64::NAN;
1302 }
1303 if y <= -1.0 {
1304 return if y == -1.0 {
1305 f64::NEG_INFINITY
1306 } else {
1307 f64::NAN
1308 };
1309 }
1310 if y >= 1.0 {
1311 return if y == 1.0 { f64::INFINITY } else { f64::NAN };
1312 }
1313 if y == 0.0 {
1314 return 0.0;
1315 }
1316
1317 let sign = if y >= 0.0 { 1.0 } else { -1.0 };
1319 let y = y.abs();
1320
1321 let a = 0.147_f64;
1324 let two_over_pi_a = 2.0 / (std::f64::consts::PI * a);
1325 let ln_one_minus_y2 = (1.0 - y * y).ln();
1326
1327 let t1 = two_over_pi_a + 0.5 * ln_one_minus_y2;
1328 let t2 = (1.0 / a) * ln_one_minus_y2;
1329 let inner = (t1 * t1 - t2).sqrt() - t1;
1330
1331 let mut x = inner.sqrt();
1332
1333 let two_over_sqrt_pi = std::f64::consts::FRAC_2_SQRT_PI;
1338 for _ in 0..5 {
1339 let erf_x = erf_f64(x);
1340 let f = erf_x - y;
1341 let exp_neg_x2 = (-x * x).exp();
1342 let f_prime = two_over_sqrt_pi * exp_neg_x2;
1343
1344 let newton_step = f / f_prime;
1346
1347 let f_double_prime = -2.0 * x * f_prime;
1349 let halley_correction = 1.0 / (1.0 - 0.5 * f * f_double_prime / (f_prime * f_prime));
1350
1351 x -= newton_step * halley_correction;
1352 }
1353
1354 sign * x
1355}
1356
1357fn erfcinv_f32(y: f32) -> f32 {
1362 if y.is_nan() {
1364 return f32::NAN;
1365 }
1366 if y <= 0.0 {
1367 return if y == 0.0 { f32::INFINITY } else { f32::NAN };
1368 }
1369 if y >= 2.0 {
1370 return if y == 2.0 {
1371 f32::NEG_INFINITY
1372 } else {
1373 f32::NAN
1374 };
1375 }
1376
1377 erfinv_f32(1.0 - y)
1379}
1380
1381fn erfcinv_f64(y: f64) -> f64 {
1387 if y.is_nan() {
1389 return f64::NAN;
1390 }
1391 if y <= 0.0 {
1392 return if y == 0.0 { f64::INFINITY } else { f64::NAN };
1393 }
1394 if y >= 2.0 {
1395 return if y == 2.0 {
1396 f64::NEG_INFINITY
1397 } else {
1398 f64::NAN
1399 };
1400 }
1401
1402 erfinv_f64(1.0 - y)
1405}
1406
1407fn sigmoid_f32(x: f32) -> f32 {
1412 if x.is_nan() {
1413 return f32::NAN;
1414 }
1415 if x >= 0.0 {
1419 let exp_neg_x = (-x).exp();
1420 1.0 / (1.0 + exp_neg_x)
1421 } else {
1422 let exp_x = x.exp();
1423 exp_x / (1.0 + exp_x)
1424 }
1425}
1426
1427fn sigmoid_f64(x: f64) -> f64 {
1432 if x.is_nan() {
1433 return f64::NAN;
1434 }
1435 if x >= 0.0 {
1439 let exp_neg_x = (-x).exp();
1440 1.0 / (1.0 + exp_neg_x)
1441 } else {
1442 let exp_x = x.exp();
1443 exp_x / (1.0 + exp_x)
1444 }
1445}
1446
1447fn gelu_f32(x: f32) -> f32 {
1452 if x.is_nan() {
1453 return f32::NAN;
1454 }
1455 let sqrt_2_inv = std::f32::consts::FRAC_1_SQRT_2; let erf_arg = x * sqrt_2_inv;
1458 x * 0.5 * (1.0 + erf_f32(erf_arg))
1459}
1460
1461fn gelu_f64(x: f64) -> f64 {
1466 if x.is_nan() {
1467 return f64::NAN;
1468 }
1469 let sqrt_2_inv = std::f64::consts::FRAC_1_SQRT_2; let erf_arg = x * sqrt_2_inv;
1472 x * 0.5 * (1.0 + erf_f64(erf_arg))
1473}
1474
1475fn swish_f32(x: f32) -> f32 {
1482 if x.is_nan() {
1483 return f32::NAN;
1484 }
1485 x * sigmoid_f32(x)
1488}
1489
1490fn swish_f64(x: f64) -> f64 {
1497 if x.is_nan() {
1498 return f64::NAN;
1499 }
1500 x * sigmoid_f64(x)
1503}
1504
1505fn softplus_f32(x: f32) -> f32 {
1512 if x.is_nan() {
1513 return f32::NAN;
1514 }
1515 if x > 20.0 {
1520 x
1522 } else if x < -20.0 {
1523 x.exp()
1525 } else {
1526 (1.0_f32 + x.exp()).ln()
1528 }
1529}
1530
1531fn softplus_f64(x: f64) -> f64 {
1538 if x.is_nan() {
1539 return f64::NAN;
1540 }
1541 if x > 34.0 {
1546 x
1548 } else if x < -34.0 {
1549 x.exp()
1551 } else {
1552 (1.0_f64 + x.exp()).ln()
1554 }
1555}
1556
1557fn mish_f32(x: f32) -> f32 {
1564 if x.is_nan() {
1565 return f32::NAN;
1566 }
1567 x * softplus_f32(x).tanh()
1570}
1571
1572fn mish_f64(x: f64) -> f64 {
1579 if x.is_nan() {
1580 return f64::NAN;
1581 }
1582 x * softplus_f64(x).tanh()
1585}
1586
1587fn elu_f32(x: f32, alpha: f32) -> f32 {
1595 if x.is_nan() {
1596 return f32::NAN;
1597 }
1598 if x >= 0.0 {
1599 x
1600 } else {
1601 alpha * (x.exp() - 1.0)
1602 }
1603}
1604
1605fn elu_f64(x: f64, alpha: f64) -> f64 {
1613 if x.is_nan() {
1614 return f64::NAN;
1615 }
1616 if x >= 0.0 {
1617 x
1618 } else {
1619 alpha * (x.exp() - 1.0)
1620 }
1621}
1622
1623const SELU_ALPHA: f64 = 1.6732632423543772;
1629const SELU_LAMBDA: f64 = 1.0507009873554805;
1630const SELU_ALPHA_F32: f32 = 1.6732632;
1631const SELU_LAMBDA_F32: f32 = 1.0507010;
1632
1633fn selu_f32(x: f32) -> f32 {
1641 if x.is_nan() {
1642 return f32::NAN;
1643 }
1644 if x > 0.0 {
1645 SELU_LAMBDA_F32 * x
1646 } else {
1647 SELU_LAMBDA_F32 * SELU_ALPHA_F32 * (x.exp() - 1.0)
1648 }
1649}
1650
1651fn selu_f64(x: f64) -> f64 {
1659 if x.is_nan() {
1660 return f64::NAN;
1661 }
1662 if x > 0.0 {
1663 SELU_LAMBDA * x
1664 } else {
1665 SELU_LAMBDA * SELU_ALPHA * (x.exp() - 1.0)
1666 }
1667}
1668
1669fn hardsigmoid_f32(x: f32) -> f32 {
1676 if x.is_nan() {
1677 return f32::NAN;
1678 }
1679 if x <= -3.0 {
1681 0.0
1682 } else if x >= 3.0 {
1683 1.0
1684 } else {
1685 (x + 3.0) / 6.0
1686 }
1687}
1688
1689fn hardsigmoid_f64(x: f64) -> f64 {
1696 if x.is_nan() {
1697 return f64::NAN;
1698 }
1699 if x <= -3.0 {
1701 0.0
1702 } else if x >= 3.0 {
1703 1.0
1704 } else {
1705 (x + 3.0) / 6.0
1706 }
1707}
1708
1709fn hardswish_f32(x: f32) -> f32 {
1716 if x.is_nan() {
1717 return f32::NAN;
1718 }
1719 if x <= -3.0 {
1721 0.0
1722 } else if x >= 3.0 {
1723 x
1724 } else {
1725 x * (x + 3.0) / 6.0
1726 }
1727}
1728
1729fn hardswish_f64(x: f64) -> f64 {
1736 if x.is_nan() {
1737 return f64::NAN;
1738 }
1739 if x <= -3.0 {
1741 0.0
1742 } else if x >= 3.0 {
1743 x
1744 } else {
1745 x * (x + 3.0) / 6.0
1746 }
1747}
1748
1749fn sinc_f32(x: f32) -> f32 {
1756 if x.is_nan() {
1757 return f32::NAN;
1758 }
1759 if x.abs() < 1e-7 {
1760 let pi_x = std::f32::consts::PI * x;
1763 1.0 - pi_x * pi_x / 6.0
1764 } else {
1765 let pi_x = std::f32::consts::PI * x;
1766 pi_x.sin() / pi_x
1767 }
1768}
1769
1770fn sinc_f64(x: f64) -> f64 {
1777 if x.is_nan() {
1778 return f64::NAN;
1779 }
1780 if x.abs() < 1e-15 {
1781 let pi_x = std::f64::consts::PI * x;
1784 let pi_x_sq = pi_x * pi_x;
1785 1.0 - pi_x_sq / 6.0 + pi_x_sq * pi_x_sq / 120.0
1786 } else {
1787 let pi_x = std::f64::consts::PI * x;
1788 pi_x.sin() / pi_x
1789 }
1790}
1791
1792impl SimdUnifiedOps for f32 {
1794 #[cfg(feature = "simd")]
1795 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1796 crate::simd::simd_add_f32(a, b)
1797 }
1798
1799 #[cfg(not(feature = "simd"))]
1800 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1801 (a + b).to_owned()
1802 }
1803
1804 #[cfg(feature = "simd")]
1805 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1806 crate::simd::simd_sub_f32(a, b)
1807 }
1808
1809 #[cfg(not(feature = "simd"))]
1810 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1811 (a - b).to_owned()
1812 }
1813
1814 #[cfg(feature = "simd")]
1815 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1816 crate::simd::simd_mul_f32(a, b)
1817 }
1818
1819 #[cfg(not(feature = "simd"))]
1820 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1821 (a * b).to_owned()
1822 }
1823
1824 #[cfg(feature = "simd")]
1825 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1826 crate::simd::simd_div_f32(a, b)
1827 }
1828
1829 #[cfg(not(feature = "simd"))]
1830 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1831 (a / b).to_owned()
1832 }
1833
1834 #[cfg(feature = "simd")]
1835 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
1836 crate::simd::simd_dot_f32(a, b)
1837 }
1838
1839 #[cfg(not(feature = "simd"))]
1840 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
1841 a.dot(b)
1842 }
1843
1844 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
1845 let m = a.nrows();
1846 let n = a.ncols();
1847
1848 assert_eq!(n, x.len());
1849 assert_eq!(m, y.len());
1850
1851 if beta == 0.0 {
1853 y.fill(0.0);
1854 } else if beta != 1.0 {
1855 y.mapv_inplace(|v| v * beta);
1856 }
1857
1858 for i in 0..m {
1860 let row = a.row(i);
1861 y[i] += Self::simd_dot(&row, x);
1862 }
1863 }
1864
1865 fn simd_gemm(
1866 alpha: Self,
1867 a: &ArrayView2<Self>,
1868 b: &ArrayView2<Self>,
1869 beta: Self,
1870 c: &mut Array2<Self>,
1871 ) {
1872 let m = a.nrows();
1873 let k = a.ncols();
1874 let n = b.ncols();
1875
1876 assert_eq!(k, b.nrows());
1877 assert_eq!((m, n), c.dim());
1878
1879 if beta == 0.0 {
1881 c.fill(0.0);
1882 } else if beta != 1.0 {
1883 c.mapv_inplace(|v| v * beta);
1884 }
1885
1886 const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
1889
1890 if n * k > GEMM_TRANSPOSE_THRESHOLD {
1891 let b_t = Self::simd_transpose_blocked(b);
1893
1894 for i in 0..m {
1895 let a_row = a.row(i);
1896 for j in 0..n {
1897 let b_row = b_t.row(j);
1899 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
1900 }
1901 }
1902 } else {
1903 for i in 0..m {
1905 let a_row = a.row(i);
1906 for j in 0..n {
1907 let b_col = b.column(j);
1908 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
1909 }
1910 }
1911 }
1912 }
1913
1914 #[cfg(feature = "simd")]
1915 fn simd_norm(a: &ArrayView1<Self>) -> Self {
1916 crate::simd::norms::simd_norm_l2_f32(a)
1917 }
1918
1919 #[cfg(not(feature = "simd"))]
1920 fn simd_norm(a: &ArrayView1<Self>) -> Self {
1921 a.iter().map(|&x| x * x).sum::<f32>().sqrt()
1922 }
1923
1924 #[cfg(feature = "simd")]
1925 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1926 crate::simd::simd_maximum_f32(a, b)
1927 }
1928
1929 #[cfg(not(feature = "simd"))]
1930 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1931 let mut result = Array1::zeros(a.len());
1932 for _i in 0..a.len() {
1933 result[0] = a[0].max(b[0]);
1934 }
1935 result
1936 }
1937
1938 #[cfg(feature = "simd")]
1939 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1940 crate::simd::simd_minimum_f32(a, b)
1941 }
1942
1943 #[cfg(not(feature = "simd"))]
1944 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1945 let mut result = Array1::zeros(a.len());
1946 for _i in 0..a.len() {
1947 result[0] = a[0].min(b[0]);
1948 }
1949 result
1950 }
1951
1952 #[cfg(feature = "simd")]
1953 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
1954 crate::simd::simd_scalar_mul_f32(a, scalar)
1955 }
1956
1957 #[cfg(not(feature = "simd"))]
1958 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
1959 a.mapv(|x| x * scalar)
1960 }
1961
1962 #[cfg(feature = "simd")]
1963 fn simd_sum(a: &ArrayView1<Self>) -> Self {
1964 crate::simd::simd_sum_f32(a)
1965 }
1966
1967 #[cfg(not(feature = "simd"))]
1968 fn simd_sum(a: &ArrayView1<Self>) -> Self {
1969 a.sum()
1970 }
1971
1972 fn simd_mean(a: &ArrayView1<Self>) -> Self {
1973 if a.is_empty() {
1974 0.0
1975 } else {
1976 Self::simd_sum(a) / (a.len() as f32)
1977 }
1978 }
1979
1980 #[cfg(feature = "simd")]
1981 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
1982 crate::simd::simd_max_f32(a)
1983 }
1984
1985 #[cfg(not(feature = "simd"))]
1986 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
1987 a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
1988 }
1989
1990 #[cfg(feature = "simd")]
1991 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
1992 crate::simd::simd_min_f32(a)
1993 }
1994
1995 #[cfg(not(feature = "simd"))]
1996 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
1997 a.fold(f32::INFINITY, |acc, &x| acc.min(x))
1998 }
1999
2000 #[cfg(feature = "simd")]
2001 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
2002 crate::simd::simd_fused_multiply_add_f32(a, b, c)
2003 }
2004
2005 #[cfg(not(feature = "simd"))]
2006 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
2007 let mut result = Array1::zeros(a.len());
2008 for _i in 0..a.len() {
2009 result[0] = a[0] * b[0] + c[0];
2010 }
2011 result
2012 }
2013
2014 #[cfg(feature = "simd")]
2015 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2016 crate::simd::simd_add_cache_optimized_f32(a, b)
2017 }
2018
2019 #[cfg(not(feature = "simd"))]
2020 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2021 a + b
2022 }
2023
2024 #[cfg(feature = "simd")]
2025 fn simd_fma_advanced_optimized(
2026 a: &ArrayView1<Self>,
2027 b: &ArrayView1<Self>,
2028 c: &ArrayView1<Self>,
2029 ) -> Array1<Self> {
2030 crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
2031 }
2032
2033 #[cfg(not(feature = "simd"))]
2034 fn simd_fma_advanced_optimized(
2035 a: &ArrayView1<Self>,
2036 b: &ArrayView1<Self>,
2037 c: &ArrayView1<Self>,
2038 ) -> Array1<Self> {
2039 let mut result = Array1::zeros(a.len());
2040 for _i in 0..a.len() {
2041 result[0] = a[0] * b[0] + c[0];
2042 }
2043 result
2044 }
2045
2046 #[cfg(feature = "simd")]
2047 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2048 crate::simd::simd_adaptive_add_f32(a, b)
2049 }
2050
2051 #[cfg(not(feature = "simd"))]
2052 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2053 a + b
2054 }
2055
2056 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
2057 a.t().to_owned()
2058 }
2059
2060 fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
2061 #[cfg(feature = "simd")]
2062 {
2063 crate::simd::simd_transpose_blocked_f32(a)
2064 }
2065 #[cfg(not(feature = "simd"))]
2066 {
2067 a.t().to_owned()
2068 }
2069 }
2070
2071 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
2072 a.iter().map(|&x| x * x).sum()
2073 }
2074
2075 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2076 Self::simd_mul(a, b)
2077 }
2078
2079 #[cfg(feature = "simd")]
2080 fn simd_available() -> bool {
2081 true
2082 }
2083
2084 #[cfg(not(feature = "simd"))]
2085 fn simd_available() -> bool {
2086 false
2087 }
2088
2089 fn simd_sub_f32_ultra(
2090 a: &ArrayView1<Self>,
2091 b: &ArrayView1<Self>,
2092 result: &mut ArrayViewMut1<Self>,
2093 ) {
2094 let sub_result = Self::simd_sub(a, b);
2095 result.assign(&sub_result);
2096 }
2097
2098 fn simd_mul_f32_ultra(
2099 a: &ArrayView1<Self>,
2100 b: &ArrayView1<Self>,
2101 result: &mut ArrayViewMut1<Self>,
2102 ) {
2103 let mul_result = Self::simd_mul(a, b);
2104 result.assign(&mul_result);
2105 }
2106
2107 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
2108 a.iter().map(|&x| x * x * x).sum()
2110 }
2111
2112 fn simd_div_f32_ultra(
2113 a: &ArrayView1<Self>,
2114 b: &ArrayView1<Self>,
2115 result: &mut ArrayViewMut1<Self>,
2116 ) {
2117 let div_result = Self::simd_div(a, b);
2118 result.assign(&div_result);
2119 }
2120
2121 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2122 let sin_result = a.mapv(|x| x.sin());
2123 result.assign(&sin_result);
2124 }
2125
2126 fn simd_add_f32_ultra(
2127 a: &ArrayView1<Self>,
2128 b: &ArrayView1<Self>,
2129 result: &mut ArrayViewMut1<Self>,
2130 ) {
2131 let add_result = Self::simd_add(a, b);
2132 result.assign(&add_result);
2133 }
2134
2135 fn simd_fma_f32_ultra(
2136 a: &ArrayView1<Self>,
2137 b: &ArrayView1<Self>,
2138 c: &ArrayView1<Self>,
2139 result: &mut ArrayViewMut1<Self>,
2140 ) {
2141 let fma_result = Self::simd_fma(a, b, c);
2142 result.assign(&fma_result);
2143 }
2144
2145 fn simd_pow_f32_ultra(
2146 a: &ArrayView1<Self>,
2147 b: &ArrayView1<Self>,
2148 result: &mut ArrayViewMut1<Self>,
2149 ) {
2150 let pow_result = a
2151 .iter()
2152 .zip(b.iter())
2153 .map(|(&x, &y)| x.powf(y))
2154 .collect::<Vec<_>>();
2155 result.assign(&Array1::from_vec(pow_result));
2156 }
2157
2158 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2159 let exp_result = a.mapv(|x| x.exp());
2160 result.assign(&exp_result);
2161 }
2162
2163 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2164 let cos_result = a.mapv(|x| x.cos());
2165 result.assign(&cos_result);
2166 }
2167
2168 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2169 Self::simd_dot(a, b)
2170 }
2171
2172 #[cfg(feature = "simd")]
2173 fn simd_variance(a: &ArrayView1<Self>) -> Self {
2174 crate::simd::simd_variance_f32(a)
2175 }
2176
2177 #[cfg(not(feature = "simd"))]
2178 fn simd_variance(a: &ArrayView1<Self>) -> Self {
2179 let mean = Self::simd_mean(a);
2180 let n = a.len() as f32;
2181 if n < 2.0 {
2182 return f32::NAN;
2183 }
2184 a.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / (n - 1.0)
2186 }
2187
2188 #[cfg(feature = "simd")]
2189 fn simd_std(a: &ArrayView1<Self>) -> Self {
2190 crate::simd::simd_std_f32(a)
2191 }
2192
2193 #[cfg(not(feature = "simd"))]
2194 fn simd_std(a: &ArrayView1<Self>) -> Self {
2195 Self::simd_variance(a).sqrt()
2196 }
2197
2198 #[cfg(feature = "simd")]
2199 fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
2200 crate::simd::simd_norm_l1_f32(a)
2201 }
2202
2203 #[cfg(not(feature = "simd"))]
2204 fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
2205 a.iter().map(|&x| x.abs()).sum()
2206 }
2207
2208 #[cfg(feature = "simd")]
2209 fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
2210 crate::simd::simd_norm_linf_f32(a)
2211 }
2212
2213 #[cfg(not(feature = "simd"))]
2214 fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
2215 a.iter().fold(0.0f32, |acc, &x| acc.max(x.abs()))
2216 }
2217
2218 #[cfg(feature = "simd")]
2219 fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2220 crate::simd::simd_cosine_similarity_f32(a, b)
2221 }
2222
2223 #[cfg(not(feature = "simd"))]
2224 fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2225 let dot = Self::simd_dot(a, b);
2226 let norm_a = Self::simd_norm(a);
2227 let norm_b = Self::simd_norm(b);
2228 dot / (norm_a * norm_b)
2229 }
2230
2231 #[cfg(feature = "simd")]
2232 fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2233 crate::simd::simd_distance_euclidean_f32(a, b)
2234 }
2235
2236 #[cfg(not(feature = "simd"))]
2237 fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2238 a.iter()
2239 .zip(b.iter())
2240 .map(|(&x, &y)| (x - y).powi(2))
2241 .sum::<f32>()
2242 .sqrt()
2243 }
2244
2245 #[cfg(feature = "simd")]
2246 fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2247 crate::simd::simd_distance_manhattan_f32(a, b)
2248 }
2249
2250 #[cfg(not(feature = "simd"))]
2251 fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2252 a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
2253 }
2254
2255 #[cfg(feature = "simd")]
2256 fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2257 crate::simd::simd_distance_chebyshev_f32(a, b)
2258 }
2259
2260 #[cfg(not(feature = "simd"))]
2261 fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2262 a.iter()
2263 .zip(b.iter())
2264 .fold(0.0f32, |acc, (&x, &y)| acc.max((x - y).abs()))
2265 }
2266
2267 #[cfg(feature = "simd")]
2268 fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2269 crate::simd::simd_distance_cosine_f32(a, b)
2270 }
2271
2272 #[cfg(not(feature = "simd"))]
2273 fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2274 1.0 - Self::simd_cosine_similarity(a, b)
2275 }
2276
2277 #[cfg(feature = "simd")]
2278 fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2279 crate::simd::simd_weighted_sum_f32(values, weights)
2280 }
2281
2282 #[cfg(not(feature = "simd"))]
2283 fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2284 values
2285 .iter()
2286 .zip(weights.iter())
2287 .map(|(&v, &w)| v * w)
2288 .sum()
2289 }
2290
2291 #[cfg(feature = "simd")]
2292 fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2293 crate::simd::simd_weighted_mean_f32(values, weights)
2294 }
2295
2296 #[cfg(not(feature = "simd"))]
2297 fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2298 let weighted_sum = Self::simd_weighted_sum(values, weights);
2299 let weight_sum: f32 = weights.iter().sum();
2300 weighted_sum / weight_sum
2301 }
2302
2303 #[cfg(feature = "simd")]
2304 fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
2305 crate::simd::simd_argmin_f32(a)
2306 }
2307
2308 #[cfg(not(feature = "simd"))]
2309 fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
2310 if a.is_empty() {
2311 return None;
2312 }
2313 let mut min_idx = 0;
2314 let mut min_val = a[0];
2315 for (i, &v) in a.iter().enumerate().skip(1) {
2316 if v < min_val {
2317 min_val = v;
2318 min_idx = i;
2319 }
2320 }
2321 Some(min_idx)
2322 }
2323
2324 #[cfg(feature = "simd")]
2325 fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
2326 crate::simd::simd_argmax_f32(a)
2327 }
2328
2329 #[cfg(not(feature = "simd"))]
2330 fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
2331 if a.is_empty() {
2332 return None;
2333 }
2334 let mut max_idx = 0;
2335 let mut max_val = a[0];
2336 for (i, &v) in a.iter().enumerate().skip(1) {
2337 if v > max_val {
2338 max_val = v;
2339 max_idx = i;
2340 }
2341 }
2342 Some(max_idx)
2343 }
2344
2345 #[cfg(feature = "simd")]
2346 fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
2347 crate::simd::simd_clip_f32(a, min_val, max_val)
2348 }
2349
2350 #[cfg(not(feature = "simd"))]
2351 fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
2352 a.mapv(|v| v.max(min_val).min(max_val))
2353 }
2354
2355 #[cfg(feature = "simd")]
2356 fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
2357 crate::simd::simd_log_sum_exp_f32(a)
2358 }
2359
2360 #[cfg(not(feature = "simd"))]
2361 fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
2362 if a.is_empty() {
2363 return f32::NEG_INFINITY;
2364 }
2365 let max_val = a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x));
2366 let sum_exp: f32 = a.iter().map(|&x| (x - max_val).exp()).sum();
2367 max_val + sum_exp.ln()
2368 }
2369
2370 #[cfg(feature = "simd")]
2371 fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2372 crate::simd::simd_softmax_f32(a)
2373 }
2374
2375 #[cfg(not(feature = "simd"))]
2376 fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2377 if a.is_empty() {
2378 return Array1::zeros(0);
2379 }
2380 let lse = Self::simd_log_sum_exp(a);
2381 a.mapv(|x| (x - lse).exp())
2382 }
2383
2384 #[cfg(feature = "simd")]
2385 fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
2386 crate::simd::simd_cumsum_f32(a)
2387 }
2388
2389 #[cfg(not(feature = "simd"))]
2390 fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
2391 if a.is_empty() {
2392 return Array1::zeros(0);
2393 }
2394 let mut cumsum = 0.0f32;
2395 a.mapv(|x| {
2396 cumsum += x;
2397 cumsum
2398 })
2399 }
2400
2401 #[cfg(feature = "simd")]
2402 fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
2403 crate::simd::simd_cumprod_f32(a)
2404 }
2405
2406 #[cfg(not(feature = "simd"))]
2407 fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
2408 if a.is_empty() {
2409 return Array1::zeros(0);
2410 }
2411 let mut cumprod = 1.0f32;
2412 a.mapv(|x| {
2413 cumprod *= x;
2414 cumprod
2415 })
2416 }
2417
2418 #[cfg(feature = "simd")]
2419 fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
2420 crate::simd::simd_diff_f32(a)
2421 }
2422
2423 #[cfg(not(feature = "simd"))]
2424 fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
2425 if a.len() <= 1 {
2426 return Array1::zeros(0);
2427 }
2428 Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
2429 }
2430
2431 #[cfg(feature = "simd")]
2432 fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
2433 crate::simd::simd_sign_f32(a)
2434 }
2435
2436 #[cfg(not(feature = "simd"))]
2437 fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
2438 a.mapv(|x| {
2439 if x > 0.0 {
2440 1.0
2441 } else if x < 0.0 {
2442 -1.0
2443 } else {
2444 0.0
2445 }
2446 })
2447 }
2448
2449 #[cfg(feature = "simd")]
2450 fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
2451 crate::simd::simd_relu_f32(a)
2452 }
2453
2454 #[cfg(not(feature = "simd"))]
2455 fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
2456 a.mapv(|x| x.max(0.0))
2457 }
2458
2459 #[cfg(feature = "simd")]
2460 fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2461 crate::simd::simd_leaky_relu_f32(a, alpha)
2462 }
2463
2464 #[cfg(not(feature = "simd"))]
2465 fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2466 a.mapv(|x| if x > 0.0 { x } else { alpha * x })
2467 }
2468
2469 #[cfg(feature = "simd")]
2470 fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
2471 crate::simd::simd_normalize_f32(a)
2472 }
2473
2474 #[cfg(not(feature = "simd"))]
2475 fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
2476 let norm: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
2477 if norm == 0.0 {
2478 return a.to_owned();
2479 }
2480 a.mapv(|x| x / norm)
2481 }
2482
2483 #[cfg(feature = "simd")]
2484 fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
2485 crate::simd::simd_standardize_f32(a)
2486 }
2487
2488 #[cfg(not(feature = "simd"))]
2489 fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
2490 if a.len() <= 1 {
2491 return Array1::zeros(a.len());
2492 }
2493 let mean: f32 = a.iter().sum::<f32>() / a.len() as f32;
2494 let variance: f32 =
2495 a.iter().map(|x| (x - mean) * (x - mean)).sum::<f32>() / (a.len() - 1) as f32;
2496 let std = variance.sqrt();
2497 if std == 0.0 {
2498 return Array1::zeros(a.len());
2499 }
2500 a.mapv(|x| (x - mean) / std)
2501 }
2502
2503 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
2504 a.mapv(|x| x.abs())
2505 }
2506
2507 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
2508 a.mapv(|x| x.sqrt())
2509 }
2510
2511 fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
2512 a.mapv(|x| x.exp())
2515 }
2516
2517 fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
2518 a.mapv(|x| x.ln())
2521 }
2522
2523 fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
2524 a.mapv(|x| x.sin())
2527 }
2528
2529 fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
2530 a.mapv(|x| x.cos())
2533 }
2534
2535 fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
2536 a.mapv(|x| x.tan())
2538 }
2539
2540 fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
2541 let exp_a = Self::simd_exp(a);
2544 let neg_a = Self::simd_scalar_mul(a, -1.0);
2545 let exp_neg_a = Self::simd_exp(&neg_a.view());
2546 let diff = Self::simd_sub(&exp_a.view(), &exp_neg_a.view());
2547 Self::simd_scalar_mul(&diff.view(), 0.5)
2548 }
2549
2550 fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
2551 let exp_a = Self::simd_exp(a);
2554 let neg_a = Self::simd_scalar_mul(a, -1.0);
2555 let exp_neg_a = Self::simd_exp(&neg_a.view());
2556 let sum = Self::simd_add(&exp_a.view(), &exp_neg_a.view());
2557 Self::simd_scalar_mul(&sum.view(), 0.5)
2558 }
2559
2560 fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
2561 #[cfg(feature = "simd")]
2562 {
2563 simd_ops_polynomial::simd_tanh_f32_poly(a)
2565 }
2566 #[cfg(not(feature = "simd"))]
2567 {
2568 a.mapv(|x| x.tanh())
2569 }
2570 }
2571
2572 fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
2573 #[cfg(feature = "simd")]
2574 {
2575 crate::simd::simd_floor_f32(a)
2576 }
2577 #[cfg(not(feature = "simd"))]
2578 {
2579 a.mapv(|x| x.floor())
2580 }
2581 }
2582
2583 fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
2584 #[cfg(feature = "simd")]
2585 {
2586 crate::simd::simd_ceil_f32(a)
2587 }
2588 #[cfg(not(feature = "simd"))]
2589 {
2590 a.mapv(|x| x.ceil())
2591 }
2592 }
2593
2594 fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
2595 #[cfg(feature = "simd")]
2596 {
2597 crate::simd::simd_round_f32(a)
2598 }
2599 #[cfg(not(feature = "simd"))]
2600 {
2601 a.mapv(|x| x.round())
2602 }
2603 }
2604
2605 fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
2606 a.mapv(|x| x.atan())
2607 }
2608
2609 fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
2610 a.mapv(|x| x.asin())
2611 }
2612
2613 fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
2614 a.mapv(|x| x.acos())
2615 }
2616
2617 fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
2618 y.iter()
2619 .zip(x.iter())
2620 .map(|(&y_val, &x_val)| y_val.atan2(x_val))
2621 .collect::<Vec<_>>()
2622 .into()
2623 }
2624
2625 fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
2626 const LOG10_E: f32 = std::f32::consts::LOG10_E; let ln_a = Self::simd_ln(a);
2629 Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
2630 }
2631
2632 fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
2633 const LOG2_E: f32 = std::f32::consts::LOG2_E; let ln_a = Self::simd_ln(a);
2636 Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
2637 }
2638
2639 #[cfg(feature = "simd")]
2640 fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
2641 crate::simd::simd_clip_f32(a, min, max)
2643 }
2644
2645 #[cfg(not(feature = "simd"))]
2646 fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
2647 a.mapv(|x| x.clamp(min, max))
2648 }
2649
2650 fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
2651 #[cfg(feature = "simd")]
2652 {
2653 let truncated = crate::simd::simd_trunc_f32(a);
2654 Self::simd_sub(a, &truncated.view())
2655 }
2656 #[cfg(not(feature = "simd"))]
2657 {
2658 a.mapv(|x| x.fract())
2659 }
2660 }
2661
2662 fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
2663 #[cfg(feature = "simd")]
2664 {
2665 crate::simd::simd_trunc_f32(a)
2666 }
2667 #[cfg(not(feature = "simd"))]
2668 {
2669 a.mapv(|x| x.trunc())
2670 }
2671 }
2672
2673 fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
2674 let ones = Array1::from_elem(a.len(), 1.0f32);
2676 Self::simd_div(&ones.view(), a)
2677 }
2678
2679 fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
2680 let ln_base = Self::simd_ln(base);
2683 let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
2684 Self::simd_exp(&scaled.view())
2685 }
2686
2687 fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
2688 let ln_base = Self::simd_ln(base);
2691 let scaled = Self::simd_mul(&ln_base.view(), exp);
2692 Self::simd_exp(&scaled.view())
2693 }
2694
2695 #[cfg(feature = "simd")]
2696 fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
2697 crate::simd::unary_powi::simd_powi_f32(base, n)
2698 }
2699
2700 #[cfg(not(feature = "simd"))]
2701 fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
2702 base.mapv(|x| x.powi(n))
2703 }
2704
2705 fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
2706 x.mapv(lanczos_gamma_f32)
2707 }
2708
2709 fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
2710 const LN2: f32 = std::f32::consts::LN_2;
2712 let scaled = Self::simd_scalar_mul(a, LN2);
2713 Self::simd_exp(&scaled.view())
2714 }
2715
2716 fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
2717 a.mapv(|x| x.cbrt())
2720 }
2721
2722 fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
2723 a.mapv(|x| x.ln_1p())
2725 }
2726
2727 fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
2728 a.mapv(|x| x.exp_m1())
2730 }
2731
2732 fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
2733 const DEG_TO_RAD: f32 = std::f32::consts::PI / 180.0;
2735 Self::simd_scalar_mul(a, DEG_TO_RAD)
2736 }
2737
2738 fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
2739 const RAD_TO_DEG: f32 = 180.0 / std::f32::consts::PI;
2741 Self::simd_scalar_mul(a, RAD_TO_DEG)
2742 }
2743
2744 fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
2745 a.mapv(digamma_f32)
2748 }
2749
2750 fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
2751 a.mapv(trigamma_f32)
2754 }
2755
2756 fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
2757 a.mapv(ln_gamma_f32)
2759 }
2760
2761 fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
2762 a.mapv(erf_f32)
2765 }
2766
2767 fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
2768 a.mapv(erfc_f32)
2771 }
2772
2773 fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
2774 a.mapv(erfinv_f32)
2777 }
2778
2779 fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
2780 a.mapv(erfcinv_f32)
2783 }
2784
2785 fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
2786 a.mapv(sigmoid_f32)
2790 }
2791
2792 fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
2793 a.mapv(gelu_f32)
2796 }
2797
2798 fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
2799 a.mapv(swish_f32)
2802 }
2803
2804 fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
2805 a.mapv(softplus_f32)
2808 }
2809
2810 fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
2811 a.mapv(mish_f32)
2814 }
2815
2816 fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2817 a.mapv(|x| elu_f32(x, alpha))
2820 }
2821
2822 fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
2823 a.mapv(selu_f32)
2826 }
2827
2828 fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
2829 a.mapv(hardsigmoid_f32)
2832 }
2833
2834 fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
2835 a.mapv(hardswish_f32)
2838 }
2839
2840 fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
2841 a.mapv(sinc_f32)
2844 }
2845
2846 fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2847 if a.is_empty() {
2851 return Array1::zeros(0);
2852 }
2853 let lse = Self::simd_log_sum_exp(a);
2854 a.mapv(|x| x - lse)
2855 }
2856
2857 fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
2858 a.mapv(|x| x.asinh())
2861 }
2862
2863 fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
2864 a.mapv(|x| x.acosh())
2867 }
2868
2869 fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
2870 a.mapv(|x| x.atanh())
2873 }
2874
2875 fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2876 let ln_gamma_a = Self::simd_ln_gamma(a);
2879 let ln_gamma_b = Self::simd_ln_gamma(b);
2880 let a_plus_b = Self::simd_add(a, b);
2881 let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
2882 Self::simd_sub(
2883 &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
2884 &ln_gamma_ab.view(),
2885 )
2886 }
2887
2888 fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2889 let ln_beta = Self::simd_ln_beta(a, b);
2892 Self::simd_exp(&ln_beta.view())
2893 }
2894
2895 fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
2896 if a.is_empty() || b.is_empty() {
2900 return Array1::zeros(0);
2901 }
2902 let diff = Self::simd_sub(b, a);
2903 let scaled = Self::simd_scalar_mul(&diff.view(), t);
2904 Self::simd_add(a, &scaled.view())
2905 }
2906
2907 fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
2908 if x.is_empty() {
2912 return Array1::zeros(0);
2913 }
2914 let range = edge1 - edge0;
2915 if range.abs() < Self::EPSILON {
2916 return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
2918 }
2919 x.mapv(|xi| {
2920 let t = ((xi - edge0) / range).clamp(0.0, 1.0);
2921 t * t * (3.0 - 2.0 * t)
2922 })
2923 }
2924
2925 fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
2926 if x.is_empty() || y.is_empty() {
2929 return Array1::zeros(0);
2930 }
2931 let len = x.len().min(y.len());
2932 Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
2933 }
2934
2935 fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
2936 if x.is_empty() || y.is_empty() {
2938 return Array1::zeros(0);
2939 }
2940 let len = x.len().min(y.len());
2941 Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
2942 }
2943
2944 fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
2945 if x.is_empty() {
2948 return Array1::zeros(0);
2949 }
2950 let range = edge1 - edge0;
2951 if range.abs() < Self::EPSILON {
2952 return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
2953 }
2954 x.mapv(|xi| {
2955 let t = ((xi - edge0) / range).clamp(0.0, 1.0);
2956 let t3 = t * t * t;
2958 t3 * (t * (t * 6.0 - 15.0) + 10.0)
2959 })
2960 }
2961
2962 fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2963 if a.is_empty() || b.is_empty() {
2966 return Array1::zeros(0);
2967 }
2968 let len = a.len().min(b.len());
2969 Array1::from_iter((0..len).map(|i| {
2970 let ai = a[i];
2971 let bi = b[i];
2972 let max_val = ai.max(bi);
2973 let diff = (ai - bi).abs();
2974 if diff > 50.0 {
2976 max_val
2977 } else {
2978 max_val + (1.0 + (-diff).exp()).ln()
2979 }
2980 }))
2981 }
2982
2983 fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
2984 if a.is_empty() {
2987 return Array1::zeros(0);
2988 }
2989 a.mapv(|p| {
2990 if p <= 0.0 {
2992 Self::NEG_INFINITY
2993 } else if p >= 1.0 {
2994 Self::INFINITY
2995 } else {
2996 (p / (1.0 - p)).ln()
2997 }
2998 })
2999 }
3000
3001 fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
3002 if a.is_empty() {
3004 return Array1::zeros(0);
3005 }
3006 a.mapv(|x| x * x)
3007 }
3008
3009 fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
3010 if a.is_empty() {
3012 return Array1::zeros(0);
3013 }
3014 a.mapv(|x| {
3015 if x <= 0.0 {
3016 if x == 0.0 {
3017 Self::INFINITY
3018 } else {
3019 Self::NAN
3020 }
3021 } else {
3022 1.0 / x.sqrt()
3023 }
3024 })
3025 }
3026
3027 fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
3028 if a.is_empty() {
3030 return (Array1::zeros(0), Array1::zeros(0));
3031 }
3032 let sin_result = a.mapv(|x| x.sin());
3033 let cos_result = a.mapv(|x| x.cos());
3034 (sin_result, cos_result)
3035 }
3036
3037 fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
3038 if a.is_empty() {
3041 return Array1::zeros(0);
3042 }
3043 a.mapv(|x| x.exp_m1())
3044 }
3045
3046 fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
3047 if a.is_empty() {
3050 return Array1::zeros(0);
3051 }
3052 a.mapv(|x| x.ln_1p())
3053 }
3054}
3055
3056impl SimdUnifiedOps for f64 {
3058 #[cfg(feature = "simd")]
3059 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3060 crate::simd::simd_add_f64(a, b)
3061 }
3062
3063 #[cfg(not(feature = "simd"))]
3064 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3065 (a + b).to_owned()
3066 }
3067
3068 #[cfg(feature = "simd")]
3069 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3070 crate::simd::simd_sub_f64(a, b)
3071 }
3072
3073 #[cfg(not(feature = "simd"))]
3074 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3075 (a - b).to_owned()
3076 }
3077
3078 #[cfg(feature = "simd")]
3079 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3080 crate::simd::simd_mul_f64(a, b)
3081 }
3082
3083 #[cfg(not(feature = "simd"))]
3084 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3085 (a * b).to_owned()
3086 }
3087
3088 #[cfg(feature = "simd")]
3089 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3090 crate::simd::simd_div_f64(a, b)
3091 }
3092
3093 #[cfg(not(feature = "simd"))]
3094 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3095 (a / b).to_owned()
3096 }
3097
3098 #[cfg(feature = "simd")]
3099 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3100 crate::simd::simd_dot_f64(a, b)
3101 }
3102
3103 #[cfg(not(feature = "simd"))]
3104 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3105 a.dot(b)
3106 }
3107
3108 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
3109 let m = a.nrows();
3110 let n = a.ncols();
3111
3112 assert_eq!(n, x.len());
3113 assert_eq!(m, y.len());
3114
3115 if beta == 0.0 {
3117 y.fill(0.0);
3118 } else if beta != 1.0 {
3119 y.mapv_inplace(|v| v * beta);
3120 }
3121
3122 for i in 0..m {
3124 let row = a.row(i);
3125 y[i] += Self::simd_dot(&row, x);
3126 }
3127 }
3128
3129 fn simd_gemm(
3130 alpha: Self,
3131 a: &ArrayView2<Self>,
3132 b: &ArrayView2<Self>,
3133 beta: Self,
3134 c: &mut Array2<Self>,
3135 ) {
3136 let m = a.nrows();
3137 let k = a.ncols();
3138 let n = b.ncols();
3139
3140 assert_eq!(k, b.nrows());
3141 assert_eq!((m, n), c.dim());
3142
3143 if beta == 0.0 {
3145 c.fill(0.0);
3146 } else if beta != 1.0 {
3147 c.mapv_inplace(|v| v * beta);
3148 }
3149
3150 const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
3153
3154 if n * k > GEMM_TRANSPOSE_THRESHOLD {
3155 let b_t = Self::simd_transpose_blocked(b);
3157
3158 for i in 0..m {
3159 let a_row = a.row(i);
3160 for j in 0..n {
3161 let b_row = b_t.row(j);
3163 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
3164 }
3165 }
3166 } else {
3167 for i in 0..m {
3169 let a_row = a.row(i);
3170 for j in 0..n {
3171 let b_col = b.column(j);
3172 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
3173 }
3174 }
3175 }
3176 }
3177
3178 #[cfg(feature = "simd")]
3179 fn simd_norm(a: &ArrayView1<Self>) -> Self {
3180 crate::simd::norms::simd_norm_l2_f64(a)
3181 }
3182
3183 #[cfg(not(feature = "simd"))]
3184 fn simd_norm(a: &ArrayView1<Self>) -> Self {
3185 a.iter().map(|&x| x * x).sum::<f64>().sqrt()
3186 }
3187
3188 #[cfg(feature = "simd")]
3189 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3190 crate::simd::simd_maximum_f64(a, b)
3191 }
3192
3193 #[cfg(not(feature = "simd"))]
3194 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3195 let mut result = Array1::zeros(a.len());
3196 for _i in 0..a.len() {
3197 result[0] = a[0].max(b[0]);
3198 }
3199 result
3200 }
3201
3202 #[cfg(feature = "simd")]
3203 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3204 crate::simd::simd_minimum_f64(a, b)
3205 }
3206
3207 #[cfg(not(feature = "simd"))]
3208 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3209 let mut result = Array1::zeros(a.len());
3210 for _i in 0..a.len() {
3211 result[0] = a[0].min(b[0]);
3212 }
3213 result
3214 }
3215
3216 #[cfg(feature = "simd")]
3217 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
3218 crate::simd::simd_scalar_mul_f64(a, scalar)
3219 }
3220
3221 #[cfg(not(feature = "simd"))]
3222 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
3223 a.mapv(|x| x * scalar)
3224 }
3225
3226 #[cfg(feature = "simd")]
3227 fn simd_sum(a: &ArrayView1<Self>) -> Self {
3228 crate::simd::simd_sum_f64(a)
3229 }
3230
3231 #[cfg(not(feature = "simd"))]
3232 fn simd_sum(a: &ArrayView1<Self>) -> Self {
3233 a.sum()
3234 }
3235
3236 fn simd_mean(a: &ArrayView1<Self>) -> Self {
3237 if a.is_empty() {
3238 0.0
3239 } else {
3240 Self::simd_sum(a) / (a.len() as f64)
3241 }
3242 }
3243
3244 #[cfg(feature = "simd")]
3245 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
3246 crate::simd::simd_max_f64(a)
3247 }
3248
3249 #[cfg(not(feature = "simd"))]
3250 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
3251 a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
3252 }
3253
3254 #[cfg(feature = "simd")]
3255 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
3256 crate::simd::simd_min_f64(a)
3257 }
3258
3259 #[cfg(not(feature = "simd"))]
3260 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
3261 a.fold(f64::INFINITY, |acc, &x| acc.min(x))
3262 }
3263
3264 #[cfg(feature = "simd")]
3265 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
3266 crate::simd::simd_fused_multiply_add_f64(a, b, c)
3267 }
3268
3269 #[cfg(not(feature = "simd"))]
3270 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
3271 let mut result = Array1::zeros(a.len());
3272 for _i in 0..a.len() {
3273 result[0] = a[0] * b[0] + c[0];
3274 }
3275 result
3276 }
3277
3278 #[cfg(feature = "simd")]
3279 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3280 crate::simd::simd_add_cache_optimized_f64(a, b)
3281 }
3282
3283 #[cfg(not(feature = "simd"))]
3284 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3285 a + b
3286 }
3287
3288 #[cfg(feature = "simd")]
3289 fn simd_fma_advanced_optimized(
3290 a: &ArrayView1<Self>,
3291 b: &ArrayView1<Self>,
3292 c: &ArrayView1<Self>,
3293 ) -> Array1<Self> {
3294 crate::simd::simd_fma_advanced_optimized_f64(a, b, c)
3295 }
3296
3297 #[cfg(not(feature = "simd"))]
3298 fn simd_fma_advanced_optimized(
3299 a: &ArrayView1<Self>,
3300 b: &ArrayView1<Self>,
3301 c: &ArrayView1<Self>,
3302 ) -> Array1<Self> {
3303 let mut result = Array1::zeros(a.len());
3304 for _i in 0..a.len() {
3305 result[0] = a[0] * b[0] + c[0];
3306 }
3307 result
3308 }
3309
3310 #[cfg(feature = "simd")]
3311 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3312 crate::simd::simd_adaptive_add_f64(a, b)
3313 }
3314
3315 #[cfg(not(feature = "simd"))]
3316 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3317 a + b
3318 }
3319
3320 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
3321 a.t().to_owned()
3322 }
3323
3324 fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
3325 #[cfg(feature = "simd")]
3326 {
3327 crate::simd::simd_transpose_blocked_f64(a)
3328 }
3329 #[cfg(not(feature = "simd"))]
3330 {
3331 a.t().to_owned()
3332 }
3333 }
3334
3335 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
3336 a.iter().map(|&x| x * x).sum()
3337 }
3338
3339 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3340 Self::simd_mul(a, b)
3341 }
3342
3343 #[cfg(feature = "simd")]
3344 fn simd_available() -> bool {
3345 true
3346 }
3347
3348 #[cfg(not(feature = "simd"))]
3349 fn simd_available() -> bool {
3350 false
3351 }
3352
3353 fn simd_sub_f32_ultra(
3354 a: &ArrayView1<Self>,
3355 b: &ArrayView1<Self>,
3356 result: &mut ArrayViewMut1<Self>,
3357 ) {
3358 let sub_result = Self::simd_sub(a, b);
3359 result.assign(&sub_result);
3360 }
3361
3362 fn simd_mul_f32_ultra(
3363 a: &ArrayView1<Self>,
3364 b: &ArrayView1<Self>,
3365 result: &mut ArrayViewMut1<Self>,
3366 ) {
3367 let mul_result = Self::simd_mul(a, b);
3368 result.assign(&mul_result);
3369 }
3370
3371 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
3372 a.iter().map(|&x| x * x * x).sum()
3374 }
3375
3376 fn simd_div_f32_ultra(
3377 a: &ArrayView1<Self>,
3378 b: &ArrayView1<Self>,
3379 result: &mut ArrayViewMut1<Self>,
3380 ) {
3381 let div_result = Self::simd_div(a, b);
3382 result.assign(&div_result);
3383 }
3384
3385 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3386 let sin_result = a.mapv(|x| x.sin());
3387 result.assign(&sin_result);
3388 }
3389
3390 fn simd_add_f32_ultra(
3391 a: &ArrayView1<Self>,
3392 b: &ArrayView1<Self>,
3393 result: &mut ArrayViewMut1<Self>,
3394 ) {
3395 let add_result = Self::simd_add(a, b);
3396 result.assign(&add_result);
3397 }
3398
3399 fn simd_fma_f32_ultra(
3400 a: &ArrayView1<Self>,
3401 b: &ArrayView1<Self>,
3402 c: &ArrayView1<Self>,
3403 result: &mut ArrayViewMut1<Self>,
3404 ) {
3405 let fma_result = Self::simd_fma(a, b, c);
3406 result.assign(&fma_result);
3407 }
3408
3409 fn simd_pow_f32_ultra(
3410 a: &ArrayView1<Self>,
3411 b: &ArrayView1<Self>,
3412 result: &mut ArrayViewMut1<Self>,
3413 ) {
3414 let pow_result = a
3415 .iter()
3416 .zip(b.iter())
3417 .map(|(&x, &y)| x.powf(y))
3418 .collect::<Vec<_>>();
3419 result.assign(&Array1::from_vec(pow_result));
3420 }
3421
3422 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3423 let exp_result = a.mapv(|x| x.exp());
3424 result.assign(&exp_result);
3425 }
3426
3427 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3428 let cos_result = a.mapv(|x| x.cos());
3429 result.assign(&cos_result);
3430 }
3431
3432 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3433 Self::simd_dot(a, b)
3434 }
3435
3436 #[cfg(feature = "simd")]
3437 fn simd_variance(a: &ArrayView1<Self>) -> Self {
3438 crate::simd::simd_variance_f64(a)
3439 }
3440
3441 #[cfg(not(feature = "simd"))]
3442 fn simd_variance(a: &ArrayView1<Self>) -> Self {
3443 let mean = Self::simd_mean(a);
3444 let n = a.len() as f64;
3445 if n < 2.0 {
3446 return f64::NAN;
3447 }
3448 a.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)
3450 }
3451
3452 #[cfg(feature = "simd")]
3453 fn simd_std(a: &ArrayView1<Self>) -> Self {
3454 crate::simd::simd_std_f64(a)
3455 }
3456
3457 #[cfg(not(feature = "simd"))]
3458 fn simd_std(a: &ArrayView1<Self>) -> Self {
3459 Self::simd_variance(a).sqrt()
3460 }
3461
3462 #[cfg(feature = "simd")]
3463 fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
3464 crate::simd::simd_norm_l1_f64(a)
3465 }
3466
3467 #[cfg(not(feature = "simd"))]
3468 fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
3469 a.iter().map(|&x| x.abs()).sum()
3470 }
3471
3472 #[cfg(feature = "simd")]
3473 fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
3474 crate::simd::simd_norm_linf_f64(a)
3475 }
3476
3477 #[cfg(not(feature = "simd"))]
3478 fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
3479 a.iter().fold(0.0f64, |acc, &x| acc.max(x.abs()))
3480 }
3481
3482 #[cfg(feature = "simd")]
3483 fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3484 crate::simd::simd_cosine_similarity_f64(a, b)
3485 }
3486
3487 #[cfg(not(feature = "simd"))]
3488 fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3489 let dot = Self::simd_dot(a, b);
3490 let norm_a = Self::simd_norm(a);
3491 let norm_b = Self::simd_norm(b);
3492 dot / (norm_a * norm_b)
3493 }
3494
3495 #[cfg(feature = "simd")]
3496 fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3497 crate::simd::simd_distance_euclidean_f64(a, b)
3498 }
3499
3500 #[cfg(not(feature = "simd"))]
3501 fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3502 a.iter()
3503 .zip(b.iter())
3504 .map(|(&x, &y)| (x - y).powi(2))
3505 .sum::<f64>()
3506 .sqrt()
3507 }
3508
3509 #[cfg(feature = "simd")]
3510 fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3511 crate::simd::simd_distance_manhattan_f64(a, b)
3512 }
3513
3514 #[cfg(not(feature = "simd"))]
3515 fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3516 a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
3517 }
3518
3519 #[cfg(feature = "simd")]
3520 fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3521 crate::simd::simd_distance_chebyshev_f64(a, b)
3522 }
3523
3524 #[cfg(not(feature = "simd"))]
3525 fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3526 a.iter()
3527 .zip(b.iter())
3528 .fold(0.0f64, |acc, (&x, &y)| acc.max((x - y).abs()))
3529 }
3530
3531 #[cfg(feature = "simd")]
3532 fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3533 crate::simd::simd_distance_cosine_f64(a, b)
3534 }
3535
3536 #[cfg(not(feature = "simd"))]
3537 fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3538 1.0 - Self::simd_cosine_similarity(a, b)
3539 }
3540
3541 #[cfg(feature = "simd")]
3542 fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3543 crate::simd::simd_weighted_sum_f64(values, weights)
3544 }
3545
3546 #[cfg(not(feature = "simd"))]
3547 fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3548 values
3549 .iter()
3550 .zip(weights.iter())
3551 .map(|(&v, &w)| v * w)
3552 .sum()
3553 }
3554
3555 #[cfg(feature = "simd")]
3556 fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3557 crate::simd::simd_weighted_mean_f64(values, weights)
3558 }
3559
3560 #[cfg(not(feature = "simd"))]
3561 fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3562 let weighted_sum = Self::simd_weighted_sum(values, weights);
3563 let weight_sum: f64 = weights.iter().sum();
3564 weighted_sum / weight_sum
3565 }
3566
3567 #[cfg(feature = "simd")]
3568 fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
3569 crate::simd::simd_argmin_f64(a)
3570 }
3571
3572 #[cfg(not(feature = "simd"))]
3573 fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
3574 if a.is_empty() {
3575 return None;
3576 }
3577 let mut min_idx = 0;
3578 let mut min_val = a[0];
3579 for (i, &v) in a.iter().enumerate().skip(1) {
3580 if v < min_val {
3581 min_val = v;
3582 min_idx = i;
3583 }
3584 }
3585 Some(min_idx)
3586 }
3587
3588 #[cfg(feature = "simd")]
3589 fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
3590 crate::simd::simd_argmax_f64(a)
3591 }
3592
3593 #[cfg(not(feature = "simd"))]
3594 fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
3595 if a.is_empty() {
3596 return None;
3597 }
3598 let mut max_idx = 0;
3599 let mut max_val = a[0];
3600 for (i, &v) in a.iter().enumerate().skip(1) {
3601 if v > max_val {
3602 max_val = v;
3603 max_idx = i;
3604 }
3605 }
3606 Some(max_idx)
3607 }
3608
3609 #[cfg(feature = "simd")]
3610 fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
3611 crate::simd::simd_clip_f64(a, min_val, max_val)
3612 }
3613
3614 #[cfg(not(feature = "simd"))]
3615 fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
3616 a.mapv(|v| v.max(min_val).min(max_val))
3617 }
3618
3619 #[cfg(feature = "simd")]
3620 fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
3621 crate::simd::simd_log_sum_exp_f64(a)
3622 }
3623
3624 #[cfg(not(feature = "simd"))]
3625 fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
3626 if a.is_empty() {
3627 return f64::NEG_INFINITY;
3628 }
3629 let max_val = a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));
3630 let sum_exp: f64 = a.iter().map(|&x| (x - max_val).exp()).sum();
3631 max_val + sum_exp.ln()
3632 }
3633
3634 #[cfg(feature = "simd")]
3635 fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
3636 crate::simd::simd_softmax_f64(a)
3637 }
3638
3639 #[cfg(not(feature = "simd"))]
3640 fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
3641 if a.is_empty() {
3642 return Array1::zeros(0);
3643 }
3644 let lse = Self::simd_log_sum_exp(a);
3645 a.mapv(|x| (x - lse).exp())
3646 }
3647
3648 #[cfg(feature = "simd")]
3649 fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
3650 crate::simd::simd_cumsum_f64(a)
3651 }
3652
3653 #[cfg(not(feature = "simd"))]
3654 fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
3655 if a.is_empty() {
3656 return Array1::zeros(0);
3657 }
3658 let mut cumsum = 0.0f64;
3659 a.mapv(|x| {
3660 cumsum += x;
3661 cumsum
3662 })
3663 }
3664
3665 #[cfg(feature = "simd")]
3666 fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
3667 crate::simd::simd_cumprod_f64(a)
3668 }
3669
3670 #[cfg(not(feature = "simd"))]
3671 fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
3672 if a.is_empty() {
3673 return Array1::zeros(0);
3674 }
3675 let mut cumprod = 1.0f64;
3676 a.mapv(|x| {
3677 cumprod *= x;
3678 cumprod
3679 })
3680 }
3681
3682 #[cfg(feature = "simd")]
3683 fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
3684 crate::simd::simd_diff_f64(a)
3685 }
3686
3687 #[cfg(not(feature = "simd"))]
3688 fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
3689 if a.len() <= 1 {
3690 return Array1::zeros(0);
3691 }
3692 Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
3693 }
3694
3695 #[cfg(feature = "simd")]
3696 fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
3697 crate::simd::simd_sign_f64(a)
3698 }
3699
3700 #[cfg(not(feature = "simd"))]
3701 fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
3702 a.mapv(|x| {
3703 if x > 0.0 {
3704 1.0
3705 } else if x < 0.0 {
3706 -1.0
3707 } else {
3708 0.0
3709 }
3710 })
3711 }
3712
3713 #[cfg(feature = "simd")]
3714 fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
3715 crate::simd::simd_relu_f64(a)
3716 }
3717
3718 #[cfg(not(feature = "simd"))]
3719 fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
3720 a.mapv(|x| x.max(0.0))
3721 }
3722
3723 #[cfg(feature = "simd")]
3724 fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
3725 crate::simd::simd_leaky_relu_f64(a, alpha)
3726 }
3727
3728 #[cfg(not(feature = "simd"))]
3729 fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
3730 a.mapv(|x| if x > 0.0 { x } else { alpha * x })
3731 }
3732
3733 #[cfg(feature = "simd")]
3734 fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
3735 crate::simd::simd_normalize_f64(a)
3736 }
3737
3738 #[cfg(not(feature = "simd"))]
3739 fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
3740 let norm: f64 = a.iter().map(|x| x * x).sum::<f64>().sqrt();
3741 if norm == 0.0 {
3742 return a.to_owned();
3743 }
3744 a.mapv(|x| x / norm)
3745 }
3746
3747 #[cfg(feature = "simd")]
3748 fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
3749 crate::simd::simd_standardize_f64(a)
3750 }
3751
3752 #[cfg(not(feature = "simd"))]
3753 fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
3754 if a.len() <= 1 {
3755 return Array1::zeros(a.len());
3756 }
3757 let mean: f64 = a.iter().sum::<f64>() / a.len() as f64;
3758 let variance: f64 =
3759 a.iter().map(|x| (x - mean) * (x - mean)).sum::<f64>() / (a.len() - 1) as f64;
3760 let std = variance.sqrt();
3761 if std == 0.0 {
3762 return Array1::zeros(a.len());
3763 }
3764 a.mapv(|x| (x - mean) / std)
3765 }
3766
3767 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
3768 a.mapv(|x| x.abs())
3769 }
3770
3771 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
3772 a.mapv(|x| x.sqrt())
3773 }
3774
3775 fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
3776 a.mapv(|x| x.exp())
3779 }
3780
3781 fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
3782 a.mapv(|x| x.ln())
3785 }
3786
3787 fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
3788 a.mapv(|x| x.sin())
3791 }
3792
3793 fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
3794 a.mapv(|x| x.cos())
3797 }
3798
3799 fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
3800 a.mapv(|x| x.tan())
3802 }
3803
3804 fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
3805 #[cfg(feature = "simd")]
3806 {
3807 simd_ops_polynomial::simd_sinh_f64_poly(a)
3808 }
3809 #[cfg(not(feature = "simd"))]
3810 {
3811 a.mapv(|x| x.sinh())
3812 }
3813 }
3814
3815 fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
3816 #[cfg(feature = "simd")]
3817 {
3818 simd_ops_polynomial::simd_cosh_f64_poly(a)
3819 }
3820 #[cfg(not(feature = "simd"))]
3821 {
3822 a.mapv(|x| x.cosh())
3823 }
3824 }
3825
3826 fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
3827 #[cfg(feature = "simd")]
3828 {
3829 simd_ops_polynomial::simd_tanh_f64_poly(a)
3831 }
3832 #[cfg(not(feature = "simd"))]
3833 {
3834 a.mapv(|x| x.tanh())
3835 }
3836 }
3837
3838 fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
3839 #[cfg(feature = "simd")]
3840 {
3841 crate::simd::simd_floor_f64(a)
3842 }
3843 #[cfg(not(feature = "simd"))]
3844 {
3845 a.mapv(|x| x.floor())
3846 }
3847 }
3848
3849 fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
3850 #[cfg(feature = "simd")]
3851 {
3852 crate::simd::simd_ceil_f64(a)
3853 }
3854 #[cfg(not(feature = "simd"))]
3855 {
3856 a.mapv(|x| x.ceil())
3857 }
3858 }
3859
3860 fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
3861 #[cfg(feature = "simd")]
3862 {
3863 crate::simd::simd_round_f64(a)
3864 }
3865 #[cfg(not(feature = "simd"))]
3866 {
3867 a.mapv(|x| x.round())
3868 }
3869 }
3870
3871 fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
3872 a.mapv(|x| x.atan())
3873 }
3874
3875 fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
3876 a.mapv(|x| x.asin())
3877 }
3878
3879 fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
3880 a.mapv(|x| x.acos())
3881 }
3882
3883 fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
3884 y.iter()
3885 .zip(x.iter())
3886 .map(|(&y_val, &x_val)| y_val.atan2(x_val))
3887 .collect::<Vec<_>>()
3888 .into()
3889 }
3890
3891 fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
3892 const LOG10_E: f64 = std::f64::consts::LOG10_E; let ln_a = Self::simd_ln(a);
3895 Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
3896 }
3897
3898 fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
3899 const LOG2_E: f64 = std::f64::consts::LOG2_E; let ln_a = Self::simd_ln(a);
3902 Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
3903 }
3904
3905 #[cfg(feature = "simd")]
3906 fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
3907 crate::simd::simd_clip_f64(a, min, max)
3909 }
3910
3911 #[cfg(not(feature = "simd"))]
3912 fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
3913 a.mapv(|x| x.clamp(min, max))
3914 }
3915
3916 fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
3917 #[cfg(feature = "simd")]
3918 {
3919 let truncated = crate::simd::simd_trunc_f64(a);
3920 Self::simd_sub(a, &truncated.view())
3921 }
3922 #[cfg(not(feature = "simd"))]
3923 {
3924 a.mapv(|x| x.fract())
3925 }
3926 }
3927
3928 fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
3929 #[cfg(feature = "simd")]
3930 {
3931 crate::simd::simd_trunc_f64(a)
3932 }
3933 #[cfg(not(feature = "simd"))]
3934 {
3935 a.mapv(|x| x.trunc())
3936 }
3937 }
3938
3939 fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
3940 let ones = Array1::from_elem(a.len(), 1.0f64);
3942 Self::simd_div(&ones.view(), a)
3943 }
3944
3945 fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
3946 let ln_base = Self::simd_ln(base);
3949 let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
3950 Self::simd_exp(&scaled.view())
3951 }
3952
3953 fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
3954 let ln_base = Self::simd_ln(base);
3957 let scaled = Self::simd_mul(&ln_base.view(), exp);
3958 Self::simd_exp(&scaled.view())
3959 }
3960
3961 #[cfg(feature = "simd")]
3962 fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
3963 crate::simd::unary_powi::simd_powi_f64(base, n)
3964 }
3965
3966 #[cfg(not(feature = "simd"))]
3967 fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
3968 base.mapv(|x| x.powi(n))
3969 }
3970
3971 fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
3972 x.mapv(lanczos_gamma_f64)
3973 }
3974
3975 fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
3976 const LN2: f64 = std::f64::consts::LN_2;
3978 let scaled = Self::simd_scalar_mul(a, LN2);
3979 Self::simd_exp(&scaled.view())
3980 }
3981
3982 fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
3983 a.mapv(|x| x.cbrt())
3986 }
3987
3988 fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
3989 a.mapv(|x| x.ln_1p())
3991 }
3992
3993 fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
3994 a.mapv(|x| x.exp_m1())
3996 }
3997
3998 fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
3999 const DEG_TO_RAD: f64 = std::f64::consts::PI / 180.0;
4001 Self::simd_scalar_mul(a, DEG_TO_RAD)
4002 }
4003
4004 fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
4005 const RAD_TO_DEG: f64 = 180.0 / std::f64::consts::PI;
4007 Self::simd_scalar_mul(a, RAD_TO_DEG)
4008 }
4009
4010 fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
4011 a.mapv(digamma_f64)
4014 }
4015
4016 fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
4017 a.mapv(trigamma_f64)
4020 }
4021
4022 fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
4023 a.mapv(ln_gamma_f64)
4025 }
4026
4027 fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
4028 a.mapv(erf_f64)
4031 }
4032
4033 fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
4034 a.mapv(erfc_f64)
4037 }
4038
4039 fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
4040 a.mapv(erfinv_f64)
4043 }
4044
4045 fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
4046 a.mapv(erfcinv_f64)
4049 }
4050
4051 fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
4052 a.mapv(sigmoid_f64)
4056 }
4057
4058 fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
4059 a.mapv(gelu_f64)
4062 }
4063
4064 fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
4065 a.mapv(swish_f64)
4068 }
4069
4070 fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
4071 a.mapv(softplus_f64)
4074 }
4075
4076 fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
4077 a.mapv(mish_f64)
4080 }
4081
4082 fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
4083 a.mapv(|x| elu_f64(x, alpha))
4086 }
4087
4088 fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
4089 a.mapv(selu_f64)
4092 }
4093
4094 fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
4095 a.mapv(hardsigmoid_f64)
4098 }
4099
4100 fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
4101 a.mapv(hardswish_f64)
4104 }
4105
4106 fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
4107 a.mapv(sinc_f64)
4110 }
4111
4112 fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
4113 if a.is_empty() {
4117 return Array1::zeros(0);
4118 }
4119 let lse = Self::simd_log_sum_exp(a);
4120 a.mapv(|x| x - lse)
4121 }
4122
4123 fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
4124 a.mapv(|x| x.asinh())
4127 }
4128
4129 fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
4130 a.mapv(|x| x.acosh())
4133 }
4134
4135 fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
4136 a.mapv(|x| x.atanh())
4139 }
4140
4141 fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4142 let ln_gamma_a = Self::simd_ln_gamma(a);
4145 let ln_gamma_b = Self::simd_ln_gamma(b);
4146 let a_plus_b = Self::simd_add(a, b);
4147 let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
4148 Self::simd_sub(
4149 &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
4150 &ln_gamma_ab.view(),
4151 )
4152 }
4153
4154 fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4155 let ln_beta = Self::simd_ln_beta(a, b);
4158 Self::simd_exp(&ln_beta.view())
4159 }
4160
4161 fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
4162 if a.is_empty() || b.is_empty() {
4165 return Array1::zeros(0);
4166 }
4167 let diff = Self::simd_sub(b, a);
4168 let scaled = Self::simd_scalar_mul(&diff.view(), t);
4169 Self::simd_add(a, &scaled.view())
4170 }
4171
4172 fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
4173 if x.is_empty() {
4177 return Array1::zeros(0);
4178 }
4179 let range = edge1 - edge0;
4180 if range.abs() < Self::EPSILON {
4181 return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
4183 }
4184 x.mapv(|xi| {
4185 let t = ((xi - edge0) / range).clamp(0.0, 1.0);
4186 t * t * (3.0 - 2.0 * t)
4187 })
4188 }
4189
4190 fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
4191 if x.is_empty() || y.is_empty() {
4194 return Array1::zeros(0);
4195 }
4196 let len = x.len().min(y.len());
4197 Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
4198 }
4199
4200 fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
4201 if x.is_empty() || y.is_empty() {
4203 return Array1::zeros(0);
4204 }
4205 let len = x.len().min(y.len());
4206 Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
4207 }
4208
4209 fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
4210 if x.is_empty() {
4213 return Array1::zeros(0);
4214 }
4215 let range = edge1 - edge0;
4216 if range.abs() < Self::EPSILON {
4217 return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
4218 }
4219 x.mapv(|xi| {
4220 let t = ((xi - edge0) / range).clamp(0.0, 1.0);
4221 let t3 = t * t * t;
4223 t3 * (t * (t * 6.0 - 15.0) + 10.0)
4224 })
4225 }
4226
4227 fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4228 if a.is_empty() || b.is_empty() {
4231 return Array1::zeros(0);
4232 }
4233 let len = a.len().min(b.len());
4234 Array1::from_iter((0..len).map(|i| {
4235 let ai = a[i];
4236 let bi = b[i];
4237 let max_val = ai.max(bi);
4238 let diff = (ai - bi).abs();
4239 if diff > 50.0 {
4241 max_val
4242 } else {
4243 max_val + (1.0 + (-diff).exp()).ln()
4244 }
4245 }))
4246 }
4247
4248 fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
4249 if a.is_empty() {
4252 return Array1::zeros(0);
4253 }
4254 a.mapv(|p| {
4255 if p <= 0.0 {
4257 Self::NEG_INFINITY
4258 } else if p >= 1.0 {
4259 Self::INFINITY
4260 } else {
4261 (p / (1.0 - p)).ln()
4262 }
4263 })
4264 }
4265
4266 fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
4267 if a.is_empty() {
4269 return Array1::zeros(0);
4270 }
4271 a.mapv(|x| x * x)
4272 }
4273
4274 fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
4275 if a.is_empty() {
4277 return Array1::zeros(0);
4278 }
4279 a.mapv(|x| {
4280 if x <= 0.0 {
4281 if x == 0.0 {
4282 Self::INFINITY
4283 } else {
4284 Self::NAN
4285 }
4286 } else {
4287 1.0 / x.sqrt()
4288 }
4289 })
4290 }
4291
4292 fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
4293 if a.is_empty() {
4295 return (Array1::zeros(0), Array1::zeros(0));
4296 }
4297 let sin_result = a.mapv(|x| x.sin());
4298 let cos_result = a.mapv(|x| x.cos());
4299 (sin_result, cos_result)
4300 }
4301
4302 fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
4303 if a.is_empty() {
4306 return Array1::zeros(0);
4307 }
4308 a.mapv(|x| x.exp_m1())
4309 }
4310
4311 fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
4312 if a.is_empty() {
4315 return Array1::zeros(0);
4316 }
4317 a.mapv(|x| x.ln_1p())
4318 }
4319}
4320
4321#[derive(Debug, Clone, Copy)]
4323pub struct PlatformCapabilities {
4324 pub simd_available: bool,
4325 pub gpu_available: bool,
4326 pub cuda_available: bool,
4327 pub opencl_available: bool,
4328 pub metal_available: bool,
4329 pub avx2_available: bool,
4330 pub avx512_available: bool,
4331 pub neon_available: bool,
4332}
4333
4334impl PlatformCapabilities {
4335 pub fn detect() -> Self {
4337 Self {
4338 simd_available: cfg!(feature = "simd"),
4339 gpu_available: cfg!(feature = "gpu"),
4340 cuda_available: cfg!(all(feature = "gpu", feature = "cuda")),
4341 opencl_available: cfg!(all(feature = "gpu", feature = "opencl")),
4342 metal_available: cfg!(all(feature = "gpu", feature = "metal", target_os = "macos")),
4343 avx2_available: cfg!(target_feature = "avx2"),
4344 avx512_available: cfg!(target_feature = "avx512f"),
4345 neon_available: cfg!(target_arch = "aarch64"),
4346 }
4347 }
4348
4349 pub fn summary(&self) -> String {
4351 let mut features = Vec::new();
4352
4353 if self.simd_available {
4354 features.push("SIMD");
4355 }
4356 if self.gpu_available {
4357 features.push("GPU");
4358 }
4359 if self.cuda_available {
4360 features.push("CUDA");
4361 }
4362 if self.opencl_available {
4363 features.push("OpenCL");
4364 }
4365 if self.metal_available {
4366 features.push("Metal");
4367 }
4368 if self.avx2_available {
4369 features.push("AVX2");
4370 }
4371 if self.avx512_available {
4372 features.push("AVX512");
4373 }
4374 if self.neon_available {
4375 features.push("NEON");
4376 }
4377
4378 if features.is_empty() {
4379 "No acceleration features available".to_string()
4380 } else {
4381 format!(
4382 "Available acceleration: {features}",
4383 features = features.join(", ")
4384 )
4385 }
4386 }
4387
4388 pub fn has_avx2(&self) -> bool {
4390 self.avx2_available
4391 }
4392
4393 pub fn has_avx512(&self) -> bool {
4395 self.avx512_available
4396 }
4397
4398 pub fn has_sse(&self) -> bool {
4400 self.simd_available || self.neon_available || self.avx2_available
4401 }
4402
4403 pub fn num_cores(&self) -> usize {
4405 std::thread::available_parallelism()
4406 .map(|n| n.get())
4407 .unwrap_or(1)
4408 }
4409
4410 pub fn cache_line_size(&self) -> usize {
4412 64
4416 }
4417}
4418
4419pub struct AutoOptimizer {
4421 capabilities: PlatformCapabilities,
4422}
4423
4424impl AutoOptimizer {
4425 pub fn new() -> Self {
4426 Self {
4427 capabilities: PlatformCapabilities::detect(),
4428 }
4429 }
4430
4431 pub fn should_use_gpu(&self, size: usize) -> bool {
4433 self.capabilities.gpu_available && size > 10000
4435 }
4436
4437 pub fn should_use_metal(&self, size: usize) -> bool {
4439 self.capabilities.metal_available && size > 1024
4442 }
4443
4444 pub fn should_use_simd(&self, size: usize) -> bool {
4446 self.capabilities.simd_available && size > 64
4448 }
4449
4450 pub fn select_gemm_impl(&self, m: usize, n: usize, k: usize) -> &'static str {
4452 let total_ops = m * n * k;
4453
4454 if self.capabilities.metal_available {
4456 if total_ops > 8192 {
4458 return "Metal";
4460 }
4461 }
4462
4463 if self.should_use_gpu(total_ops) {
4464 if self.capabilities.cuda_available {
4465 "CUDA"
4466 } else if self.capabilities.metal_available {
4467 "Metal"
4468 } else if self.capabilities.opencl_available {
4469 "OpenCL"
4470 } else {
4471 "GPU"
4472 }
4473 } else if self.should_use_simd(total_ops) {
4474 "SIMD"
4475 } else {
4476 "Scalar"
4477 }
4478 }
4479
4480 pub fn select_vector_impl(&self, size: usize) -> &'static str {
4482 if self.capabilities.metal_available && size > 1024 {
4484 return "Metal";
4485 }
4486
4487 if self.should_use_gpu(size) {
4488 if self.capabilities.cuda_available {
4489 "CUDA"
4490 } else if self.capabilities.metal_available {
4491 "Metal"
4492 } else if self.capabilities.opencl_available {
4493 "OpenCL"
4494 } else {
4495 "GPU"
4496 }
4497 } else if self.should_use_simd(size) {
4498 if self.capabilities.avx512_available {
4499 "AVX512"
4500 } else if self.capabilities.avx2_available {
4501 "AVX2"
4502 } else if self.capabilities.neon_available {
4503 "NEON"
4504 } else {
4505 "SIMD"
4506 }
4507 } else {
4508 "Scalar"
4509 }
4510 }
4511
4512 pub fn select_reduction_impl(&self, size: usize) -> &'static str {
4514 if self.capabilities.metal_available && size > 4096 {
4517 return "Metal";
4518 }
4519
4520 if self.should_use_gpu(size * 2) {
4521 if self.capabilities.cuda_available {
4523 "CUDA"
4524 } else if self.capabilities.metal_available {
4525 "Metal"
4526 } else {
4527 "GPU"
4528 }
4529 } else if self.should_use_simd(size) {
4530 "SIMD"
4531 } else {
4532 "Scalar"
4533 }
4534 }
4535
4536 pub fn select_fft_impl(&self, size: usize) -> &'static str {
4538 if self.capabilities.metal_available && size > 512 {
4541 return "Metal-MPS";
4542 }
4543
4544 if self.capabilities.cuda_available && size > 1024 {
4545 "cuFFT"
4546 } else if self.should_use_simd(size) {
4547 "SIMD"
4548 } else {
4549 "Scalar"
4550 }
4551 }
4552
4553 pub fn has_unified_memory(&self) -> bool {
4555 cfg!(all(target_os = "macos", target_arch = "aarch64"))
4556 }
4557
4558 pub fn recommend(&self, operation: &str, size: usize) -> String {
4560 let recommendation = match operation {
4561 "gemm" | "matmul" => self.select_gemm_impl(size, size, size),
4562 "vector" | "axpy" | "dot" => self.select_vector_impl(size),
4563 "reduction" | "sum" | "mean" => self.select_reduction_impl(size),
4564 "fft" => self.select_fft_impl(size),
4565 _ => "Scalar",
4566 };
4567
4568 if self.has_unified_memory() && recommendation == "Metal" {
4569 format!("{recommendation} (Unified Memory)")
4570 } else {
4571 recommendation.to_string()
4572 }
4573 }
4574}
4575
4576impl Default for AutoOptimizer {
4577 fn default() -> Self {
4578 Self::new()
4579 }
4580}
4581
4582pub fn simd_dot_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
4584 f32::simd_dot_f32_ultra(a, b)
4585}
4586
4587pub fn simd_fma_f32_ultra(
4589 a: &ArrayView1<f32>,
4590 b: &ArrayView1<f32>,
4591 c: &ArrayView1<f32>,
4592 result: &mut ArrayViewMut1<f32>,
4593) {
4594 f32::simd_fma_f32_ultra(a, b, c, result)
4595}
4596
4597pub fn simd_add_f32_adaptive(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
4599 f32::simd_add_adaptive(a, b)
4600}
4601
4602pub fn simd_mul_f32_hyperoptimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
4603 f32::simd_mul(a, b)
4604}
4605
4606pub fn simd_mul_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
4611 let a_array = Array1::from_vec(a.clone());
4612 let b_array = Array1::from_vec(b.clone());
4613 let mut result_array = Array1::from_vec(result.clone());
4614
4615 f32::simd_mul_f32_ultra(
4616 &a_array.view(),
4617 &b_array.view(),
4618 &mut result_array.view_mut(),
4619 );
4620 *result = result_array.to_vec();
4621}
4622
4623pub fn simd_add_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
4625 let a_array = Array1::from_vec(a.clone());
4626 let b_array = Array1::from_vec(b.clone());
4627 let mut result_array = Array1::from_vec(result.clone());
4628
4629 f32::simd_add_f32_ultra(
4630 &a_array.view(),
4631 &b_array.view(),
4632 &mut result_array.view_mut(),
4633 );
4634 *result = result_array.to_vec();
4635}
4636
4637pub fn simd_div_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
4639 let a_array = Array1::from_vec(a.clone());
4640 let b_array = Array1::from_vec(b.clone());
4641 let mut result_array = Array1::from_vec(result.clone());
4642
4643 f32::simd_div_f32_ultra(
4644 &a_array.view(),
4645 &b_array.view(),
4646 &mut result_array.view_mut(),
4647 );
4648 *result = result_array.to_vec();
4649}
4650
4651pub fn simd_sin_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4653 let a_array = Array1::from_vec(a.to_owned());
4654 let mut result_array = Array1::from_vec(result.clone());
4655
4656 f32::simd_sin_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4657 *result = result_array.to_vec();
4658}
4659
4660pub fn simd_sub_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4662 let a_array = Array1::from_vec(a.to_owned());
4663 let b_array = Array1::from_vec(b.to_owned());
4664 let mut result_array = Array1::from_vec(result.clone());
4665
4666 f32::simd_sub_f32_ultra(
4667 &a_array.view(),
4668 &b_array.view(),
4669 &mut result_array.view_mut(),
4670 );
4671 *result = result_array.to_vec();
4672}
4673
4674pub fn simd_fma_f32_ultra_vec(a: &[f32], b: &[f32], c: &[f32], result: &mut Vec<f32>) {
4676 let a_array = Array1::from_vec(a.to_owned());
4677 let b_array = Array1::from_vec(b.to_owned());
4678 let c_array = Array1::from_vec(c.to_owned());
4679 let mut result_array = Array1::from_vec(result.clone());
4680
4681 f32::simd_fma_f32_ultra(
4682 &a_array.view(),
4683 &b_array.view(),
4684 &c_array.view(),
4685 &mut result_array.view_mut(),
4686 );
4687 *result = result_array.to_vec();
4688}
4689
4690pub fn simd_pow_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4692 let a_array = Array1::from_vec(a.to_owned());
4693 let b_array = Array1::from_vec(b.to_owned());
4694 let mut result_array = Array1::from_vec(result.clone());
4695
4696 f32::simd_pow_f32_ultra(
4697 &a_array.view(),
4698 &b_array.view(),
4699 &mut result_array.view_mut(),
4700 );
4701 *result = result_array.to_vec();
4702}
4703
4704pub fn simd_exp_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4706 let a_array = Array1::from_vec(a.to_owned());
4707 let mut result_array = Array1::from_vec(result.clone());
4708
4709 f32::simd_exp_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4710 *result = result_array.to_vec();
4711}
4712
4713pub fn simd_cos_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4715 let a_array = Array1::from_vec(a.to_owned());
4716 let mut result_array = Array1::from_vec(result.clone());
4717
4718 f32::simd_cos_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4719 *result = result_array.to_vec();
4720}
4721
4722#[cfg(test)]
4723#[path = "simd_ops_tests.rs"]
4724mod tests;