Skip to main content

scirs2_core/simd_ops/
functions_4.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
6
7use super::functions::SimdUnifiedOps;
8use super::functions_2::{
9    digamma_f32, digamma_f64, erf_f32, erf_f64, erfc_f32, erfc_f64, erfinv_f32, lanczos_gamma_f32,
10    lanczos_gamma_f64, ln_gamma_f32, ln_gamma_f64, trigamma_f32, trigamma_f64,
11};
12use super::functions_3::{
13    elu_f32, elu_f64, erfcinv_f32, erfcinv_f64, erfinv_f64, gelu_f32, gelu_f64, hardsigmoid_f32,
14    hardsigmoid_f64, hardswish_f32, hardswish_f64, mish_f32, mish_f64, selu_f32, selu_f64,
15    sigmoid_f32, sigmoid_f64, sinc_f32, sinc_f64, softplus_f32, softplus_f64, swish_f32, swish_f64,
16};
17#[cfg(feature = "simd")]
18use crate::simd_ops_polynomial;
19
20impl SimdUnifiedOps for f32 {
    /// Built with the `simd` feature: delegates to `crate::simd::simd_add_f32`.
    #[cfg(feature = "simd")]
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_add_f32(a, b)
    }
25    #[cfg(not(feature = "simd"))]
26    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
27        (a + b).to_owned()
28    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_sub_f32`.
    #[cfg(feature = "simd")]
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_sub_f32(a, b)
    }
33    #[cfg(not(feature = "simd"))]
34    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
35        (a - b).to_owned()
36    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_mul_f32`.
    #[cfg(feature = "simd")]
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_mul_f32(a, b)
    }
41    #[cfg(not(feature = "simd"))]
42    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
43        (a * b).to_owned()
44    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_div_f32`.
    #[cfg(feature = "simd")]
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_div_f32(a, b)
    }
49    #[cfg(not(feature = "simd"))]
50    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
51        (a / b).to_owned()
52    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_dot_f32`.
    #[cfg(feature = "simd")]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_dot_f32(a, b)
    }
    /// Fallback without the `simd` feature: ndarray's own `dot` of `a` and `b`.
    #[cfg(not(feature = "simd"))]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.dot(b)
    }
61    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
62        let m = a.nrows();
63        let n = a.ncols();
64        assert_eq!(n, x.len());
65        assert_eq!(m, y.len());
66        if beta == 0.0 {
67            y.fill(0.0);
68        } else if beta != 1.0 {
69            y.mapv_inplace(|v| v * beta);
70        }
71        for i in 0..m {
72            let row = a.row(i);
73            y[i] += Self::simd_dot(&row, x);
74        }
75    }
76    fn simd_gemm(
77        alpha: Self,
78        a: &ArrayView2<Self>,
79        b: &ArrayView2<Self>,
80        beta: Self,
81        c: &mut Array2<Self>,
82    ) {
83        let m = a.nrows();
84        let k = a.ncols();
85        let n = b.ncols();
86        assert_eq!(k, b.nrows());
87        assert_eq!((m, n), c.dim());
88        if beta == 0.0 {
89            c.fill(0.0);
90        } else if beta != 1.0 {
91            c.mapv_inplace(|v| v * beta);
92        }
93        const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
94        if n * k > GEMM_TRANSPOSE_THRESHOLD {
95            let b_t = Self::simd_transpose_blocked(b);
96            for i in 0..m {
97                let a_row = a.row(i);
98                for j in 0..n {
99                    let b_row = b_t.row(j);
100                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
101                }
102            }
103        } else {
104            for i in 0..m {
105                let a_row = a.row(i);
106                for j in 0..n {
107                    let b_col = b.column(j);
108                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
109                }
110            }
111        }
112    }
    /// Built with the `simd` feature: delegates to `crate::simd::norms::simd_norm_l2_f32`.
    #[cfg(feature = "simd")]
    fn simd_norm(a: &ArrayView1<Self>) -> Self {
        crate::simd::norms::simd_norm_l2_f32(a)
    }
117    #[cfg(not(feature = "simd"))]
118    fn simd_norm(a: &ArrayView1<Self>) -> Self {
119        a.iter().map(|&x| x * x).sum::<f32>().sqrt()
120    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_maximum_f32`.
    #[cfg(feature = "simd")]
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_maximum_f32(a, b)
    }
125    #[cfg(not(feature = "simd"))]
126    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
127        let mut result = Array1::zeros(a.len());
128        for _i in 0..a.len() {
129            result[0] = a[0].max(b[0]);
130        }
131        result
132    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_minimum_f32`.
    #[cfg(feature = "simd")]
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_minimum_f32(a, b)
    }
137    #[cfg(not(feature = "simd"))]
138    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
139        let mut result = Array1::zeros(a.len());
140        for _i in 0..a.len() {
141            result[0] = a[0].min(b[0]);
142        }
143        result
144    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_scalar_mul_f32`.
    #[cfg(feature = "simd")]
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
        crate::simd::simd_scalar_mul_f32(a, scalar)
    }
149    #[cfg(not(feature = "simd"))]
150    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
151        a.mapv(|x| x * scalar)
152    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_sum_f32`.
    #[cfg(feature = "simd")]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_sum_f32(a)
    }
    /// Fallback without the `simd` feature: ndarray's `sum` over all elements.
    #[cfg(not(feature = "simd"))]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        a.sum()
    }
161    fn simd_mean(a: &ArrayView1<Self>) -> Self {
162        if a.is_empty() {
163            0.0
164        } else {
165            Self::simd_sum(a) / (a.len() as f32)
166        }
167    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_max_f32`.
    #[cfg(feature = "simd")]
    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_max_f32(a)
    }
172    #[cfg(not(feature = "simd"))]
173    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
174        a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
175    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_min_f32`.
    #[cfg(feature = "simd")]
    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_min_f32(a)
    }
180    #[cfg(not(feature = "simd"))]
181    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
182        a.fold(f32::INFINITY, |acc, &x| acc.min(x))
183    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_fused_multiply_add_f32`.
    #[cfg(feature = "simd")]
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_fused_multiply_add_f32(a, b, c)
    }
188    #[cfg(not(feature = "simd"))]
189    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
190        let mut result = Array1::zeros(a.len());
191        for _i in 0..a.len() {
192            result[0] = a[0] * b[0] + c[0];
193        }
194        result
195    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_add_cache_optimized_f32`.
    #[cfg(feature = "simd")]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_add_cache_optimized_f32(a, b)
    }
    /// Fallback without the `simd` feature: plain ndarray `a + b`.
    #[cfg(not(feature = "simd"))]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
    /// Built with the `simd` feature: delegates to
    /// `crate::simd::simd_fma_advanced_optimized_f32`.
    #[cfg(feature = "simd")]
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self> {
        crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
    }
212    #[cfg(not(feature = "simd"))]
213    fn simd_fma_advanced_optimized(
214        a: &ArrayView1<Self>,
215        b: &ArrayView1<Self>,
216        c: &ArrayView1<Self>,
217    ) -> Array1<Self> {
218        let mut result = Array1::zeros(a.len());
219        for _i in 0..a.len() {
220            result[0] = a[0] * b[0] + c[0];
221        }
222        result
223    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_adaptive_add_f32`.
    #[cfg(feature = "simd")]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_adaptive_add_f32(a, b)
    }
    /// Fallback without the `simd` feature: plain ndarray `a + b`.
    #[cfg(not(feature = "simd"))]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
    /// Returns an owned copy of the transposed view `a.t()`.
    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
        a.t().to_owned()
    }
    /// Transpose of `a`: `crate::simd::simd_transpose_blocked_f32` when the `simd`
    /// feature is on, otherwise an owned copy of `a.t()`.
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_transpose_blocked_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.t().to_owned()
        }
    }
245    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
246        a.iter().map(|&x| x * x).sum()
247    }
    /// Alias for [`Self::simd_mul`].
    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        Self::simd_mul(a, b)
    }
    /// Reports `true`: this build was compiled with the `simd` feature.
    #[cfg(feature = "simd")]
    fn simd_available() -> bool {
        true
    }
    /// Reports `false`: this build was compiled without the `simd` feature.
    #[cfg(not(feature = "simd"))]
    fn simd_available() -> bool {
        false
    }
259    fn simd_sub_f32_ultra(
260        a: &ArrayView1<Self>,
261        b: &ArrayView1<Self>,
262        result: &mut ArrayViewMut1<Self>,
263    ) {
264        let sub_result = Self::simd_sub(a, b);
265        result.assign(&sub_result);
266    }
267    fn simd_mul_f32_ultra(
268        a: &ArrayView1<Self>,
269        b: &ArrayView1<Self>,
270        result: &mut ArrayViewMut1<Self>,
271    ) {
272        let mul_result = Self::simd_mul(a, b);
273        result.assign(&mul_result);
274    }
275    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
276        a.iter().map(|&x| x * x * x).sum()
277    }
278    fn simd_div_f32_ultra(
279        a: &ArrayView1<Self>,
280        b: &ArrayView1<Self>,
281        result: &mut ArrayViewMut1<Self>,
282    ) {
283        let div_result = Self::simd_div(a, b);
284        result.assign(&div_result);
285    }
286    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
287        let sin_result = a.mapv(|x| x.sin());
288        result.assign(&sin_result);
289    }
290    fn simd_add_f32_ultra(
291        a: &ArrayView1<Self>,
292        b: &ArrayView1<Self>,
293        result: &mut ArrayViewMut1<Self>,
294    ) {
295        let add_result = Self::simd_add(a, b);
296        result.assign(&add_result);
297    }
298    fn simd_fma_f32_ultra(
299        a: &ArrayView1<Self>,
300        b: &ArrayView1<Self>,
301        c: &ArrayView1<Self>,
302        result: &mut ArrayViewMut1<Self>,
303    ) {
304        let fma_result = Self::simd_fma(a, b, c);
305        result.assign(&fma_result);
306    }
307    fn simd_pow_f32_ultra(
308        a: &ArrayView1<Self>,
309        b: &ArrayView1<Self>,
310        result: &mut ArrayViewMut1<Self>,
311    ) {
312        let pow_result = a
313            .iter()
314            .zip(b.iter())
315            .map(|(&x, &y)| x.powf(y))
316            .collect::<Vec<_>>();
317        result.assign(&Array1::from_vec(pow_result));
318    }
319    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
320        let exp_result = a.mapv(|x| x.exp());
321        result.assign(&exp_result);
322    }
323    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
324        let cos_result = a.mapv(|x| x.cos());
325        result.assign(&cos_result);
326    }
    /// Alias for [`Self::simd_dot`].
    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        Self::simd_dot(a, b)
    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_variance_f32`.
    #[cfg(feature = "simd")]
    fn simd_variance(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_variance_f32(a)
    }
334    #[cfg(not(feature = "simd"))]
335    fn simd_variance(a: &ArrayView1<Self>) -> Self {
336        let mean = Self::simd_mean(a);
337        let n = a.len() as f32;
338        if n < 2.0 {
339            return f32::NAN;
340        }
341        a.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / (n - 1.0)
342    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_std_f32`.
    #[cfg(feature = "simd")]
    fn simd_std(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_std_f32(a)
    }
    /// Fallback without the `simd` feature: square root of [`Self::simd_variance`].
    #[cfg(not(feature = "simd"))]
    fn simd_std(a: &ArrayView1<Self>) -> Self {
        Self::simd_variance(a).sqrt()
    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_norm_l1_f32`.
    #[cfg(feature = "simd")]
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_norm_l1_f32(a)
    }
355    #[cfg(not(feature = "simd"))]
356    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
357        a.iter().map(|&x| x.abs()).sum()
358    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_norm_linf_f32`.
    #[cfg(feature = "simd")]
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_norm_linf_f32(a)
    }
363    #[cfg(not(feature = "simd"))]
364    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
365        a.iter().fold(0.0f32, |acc, &x| acc.max(x.abs()))
366    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_cosine_similarity_f32`.
    #[cfg(feature = "simd")]
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_cosine_similarity_f32(a, b)
    }
371    #[cfg(not(feature = "simd"))]
372    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
373        let dot = Self::simd_dot(a, b);
374        let norm_a = Self::simd_norm(a);
375        let norm_b = Self::simd_norm(b);
376        dot / (norm_a * norm_b)
377    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_distance_euclidean_f32`.
    #[cfg(feature = "simd")]
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_euclidean_f32(a, b)
    }
382    #[cfg(not(feature = "simd"))]
383    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
384        a.iter()
385            .zip(b.iter())
386            .map(|(&x, &y)| (x - y).powi(2))
387            .sum::<f32>()
388            .sqrt()
389    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_distance_manhattan_f32`.
    #[cfg(feature = "simd")]
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_manhattan_f32(a, b)
    }
394    #[cfg(not(feature = "simd"))]
395    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
396        a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
397    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_distance_chebyshev_f32`.
    #[cfg(feature = "simd")]
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_chebyshev_f32(a, b)
    }
402    #[cfg(not(feature = "simd"))]
403    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
404        a.iter()
405            .zip(b.iter())
406            .fold(0.0f32, |acc, (&x, &y)| acc.max((x - y).abs()))
407    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_distance_cosine_f32`.
    #[cfg(feature = "simd")]
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_cosine_f32(a, b)
    }
    /// Fallback without the `simd` feature: `1 - simd_cosine_similarity(a, b)`.
    #[cfg(not(feature = "simd"))]
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        1.0 - Self::simd_cosine_similarity(a, b)
    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_weighted_sum_f32`.
    #[cfg(feature = "simd")]
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        crate::simd::simd_weighted_sum_f32(values, weights)
    }
420    #[cfg(not(feature = "simd"))]
421    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
422        values
423            .iter()
424            .zip(weights.iter())
425            .map(|(&v, &w)| v * w)
426            .sum()
427    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_weighted_mean_f32`.
    #[cfg(feature = "simd")]
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        crate::simd::simd_weighted_mean_f32(values, weights)
    }
432    #[cfg(not(feature = "simd"))]
433    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
434        let weighted_sum = Self::simd_weighted_sum(values, weights);
435        let weight_sum: f32 = weights.iter().sum();
436        weighted_sum / weight_sum
437    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_argmin_f32`.
    #[cfg(feature = "simd")]
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
        crate::simd::simd_argmin_f32(a)
    }
442    #[cfg(not(feature = "simd"))]
443    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
444        if a.is_empty() {
445            return None;
446        }
447        let mut min_idx = 0;
448        let mut min_val = a[0];
449        for (i, &v) in a.iter().enumerate().skip(1) {
450            if v < min_val {
451                min_val = v;
452                min_idx = i;
453            }
454        }
455        Some(min_idx)
456    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_argmax_f32`.
    #[cfg(feature = "simd")]
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
        crate::simd::simd_argmax_f32(a)
    }
461    #[cfg(not(feature = "simd"))]
462    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
463        if a.is_empty() {
464            return None;
465        }
466        let mut max_idx = 0;
467        let mut max_val = a[0];
468        for (i, &v) in a.iter().enumerate().skip(1) {
469            if v > max_val {
470                max_val = v;
471                max_idx = i;
472            }
473        }
474        Some(max_idx)
475    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_clip_f32`.
    #[cfg(feature = "simd")]
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
        crate::simd::simd_clip_f32(a, min_val, max_val)
    }
480    #[cfg(not(feature = "simd"))]
481    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
482        a.mapv(|v| v.max(min_val).min(max_val))
483    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_log_sum_exp_f32`.
    #[cfg(feature = "simd")]
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_log_sum_exp_f32(a)
    }
488    #[cfg(not(feature = "simd"))]
489    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
490        if a.is_empty() {
491            return f32::NEG_INFINITY;
492        }
493        let max_val = a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x));
494        let sum_exp: f32 = a.iter().map(|&x| (x - max_val).exp()).sum();
495        max_val + sum_exp.ln()
496    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_softmax_f32`.
    #[cfg(feature = "simd")]
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_softmax_f32(a)
    }
501    #[cfg(not(feature = "simd"))]
502    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
503        if a.is_empty() {
504            return Array1::zeros(0);
505        }
506        let lse = Self::simd_log_sum_exp(a);
507        a.mapv(|x| (x - lse).exp())
508    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_cumsum_f32`.
    #[cfg(feature = "simd")]
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_cumsum_f32(a)
    }
513    #[cfg(not(feature = "simd"))]
514    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
515        if a.is_empty() {
516            return Array1::zeros(0);
517        }
518        let mut cumsum = 0.0f32;
519        a.mapv(|x| {
520            cumsum += x;
521            cumsum
522        })
523    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_cumprod_f32`.
    #[cfg(feature = "simd")]
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_cumprod_f32(a)
    }
528    #[cfg(not(feature = "simd"))]
529    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
530        if a.is_empty() {
531            return Array1::zeros(0);
532        }
533        let mut cumprod = 1.0f32;
534        a.mapv(|x| {
535            cumprod *= x;
536            cumprod
537        })
538    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_diff_f32`.
    #[cfg(feature = "simd")]
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_diff_f32(a)
    }
543    #[cfg(not(feature = "simd"))]
544    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
545        if a.len() <= 1 {
546            return Array1::zeros(0);
547        }
548        Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
549    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_sign_f32`.
    #[cfg(feature = "simd")]
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_sign_f32(a)
    }
554    #[cfg(not(feature = "simd"))]
555    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
556        a.mapv(|x| {
557            if x > 0.0 {
558                1.0
559            } else if x < 0.0 {
560                -1.0
561            } else {
562                0.0
563            }
564        })
565    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_relu_f32`.
    #[cfg(feature = "simd")]
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_relu_f32(a)
    }
570    #[cfg(not(feature = "simd"))]
571    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
572        a.mapv(|x| x.max(0.0))
573    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_leaky_relu_f32`.
    #[cfg(feature = "simd")]
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
        crate::simd::simd_leaky_relu_f32(a, alpha)
    }
578    #[cfg(not(feature = "simd"))]
579    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
580        a.mapv(|x| if x > 0.0 { x } else { alpha * x })
581    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_normalize_f32`.
    #[cfg(feature = "simd")]
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_normalize_f32(a)
    }
586    #[cfg(not(feature = "simd"))]
587    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
588        let norm: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
589        if norm == 0.0 {
590            return a.to_owned();
591        }
592        a.mapv(|x| x / norm)
593    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_standardize_f32`.
    #[cfg(feature = "simd")]
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_standardize_f32(a)
    }
598    #[cfg(not(feature = "simd"))]
599    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
600        if a.len() <= 1 {
601            return Array1::zeros(a.len());
602        }
603        let mean: f32 = a.iter().sum::<f32>() / a.len() as f32;
604        let variance: f32 =
605            a.iter().map(|x| (x - mean) * (x - mean)).sum::<f32>() / (a.len() - 1) as f32;
606        let std = variance.sqrt();
607        if std == 0.0 {
608            return Array1::zeros(a.len());
609        }
610        a.mapv(|x| (x - mean) / std)
611    }
612    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
613        a.mapv(|x| x.abs())
614    }
615    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
616        a.mapv(|x| x.sqrt())
617    }
618    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
619        a.mapv(|x| x.exp())
620    }
621    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
622        a.mapv(|x| x.ln())
623    }
624    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
625        a.mapv(|x| x.sin())
626    }
627    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
628        a.mapv(|x| x.cos())
629    }
630    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
631        a.mapv(|x| x.tan())
632    }
633    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
634        let exp_a = Self::simd_exp(a);
635        let neg_a = Self::simd_scalar_mul(a, -1.0);
636        let exp_neg_a = Self::simd_exp(&neg_a.view());
637        let diff = Self::simd_sub(&exp_a.view(), &exp_neg_a.view());
638        Self::simd_scalar_mul(&diff.view(), 0.5)
639    }
640    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
641        let exp_a = Self::simd_exp(a);
642        let neg_a = Self::simd_scalar_mul(a, -1.0);
643        let exp_neg_a = Self::simd_exp(&neg_a.view());
644        let sum = Self::simd_add(&exp_a.view(), &exp_neg_a.view());
645        Self::simd_scalar_mul(&sum.view(), 0.5)
646    }
    /// Element-wise hyperbolic tangent: `simd_ops_polynomial::simd_tanh_f32_poly`
    /// when the `simd` feature is on, otherwise `f32::tanh` per element.
    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            simd_ops_polynomial::simd_tanh_f32_poly(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.tanh())
        }
    }
    /// Element-wise floor: `crate::simd::simd_floor_f32` when the `simd` feature is
    /// on, otherwise `f32::floor` per element.
    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_floor_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.floor())
        }
    }
    /// Element-wise ceiling: `crate::simd::simd_ceil_f32` when the `simd` feature is
    /// on, otherwise `f32::ceil` per element.
    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_ceil_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.ceil())
        }
    }
    /// Element-wise rounding: `crate::simd::simd_round_f32` when the `simd` feature
    /// is on, otherwise `f32::round` per element.
    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_round_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.round())
        }
    }
687    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
688        a.mapv(|x| x.atan())
689    }
690    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
691        a.mapv(|x| x.asin())
692    }
693    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
694        a.mapv(|x| x.acos())
695    }
696    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
697        y.iter()
698            .zip(x.iter())
699            .map(|(&y_val, &x_val)| y_val.atan2(x_val))
700            .collect::<Vec<_>>()
701            .into()
702    }
703    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
704        const LOG10_E: f32 = std::f32::consts::LOG10_E;
705        let ln_a = Self::simd_ln(a);
706        Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
707    }
708    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
709        const LOG2_E: f32 = std::f32::consts::LOG2_E;
710        let ln_a = Self::simd_ln(a);
711        Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
712    }
    /// Built with the `simd` feature: delegates to `crate::simd::simd_clip_f32`.
    #[cfg(feature = "simd")]
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
        crate::simd::simd_clip_f32(a, min, max)
    }
717    #[cfg(not(feature = "simd"))]
718    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
719        a.mapv(|x| x.clamp(min, max))
720    }
    /// Fractional part of each element: with the `simd` feature it is computed as
    /// `a - crate::simd::simd_trunc_f32(a)`, otherwise with `f32::fract`.
    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            let truncated = crate::simd::simd_trunc_f32(a);
            Self::simd_sub(a, &truncated.view())
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.fract())
        }
    }
    /// Element-wise truncation: `crate::simd::simd_trunc_f32` when the `simd`
    /// feature is on, otherwise `f32::trunc` per element.
    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_trunc_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.trunc())
        }
    }
742    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
743        let ones = Array1::from_elem(a.len(), 1.0f32);
744        Self::simd_div(&ones.view(), a)
745    }
746    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
747        let ln_base = Self::simd_ln(base);
748        let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
749        Self::simd_exp(&scaled.view())
750    }
751    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
752        let ln_base = Self::simd_ln(base);
753        let scaled = Self::simd_mul(&ln_base.view(), exp);
754        Self::simd_exp(&scaled.view())
755    }
    /// Built with the `simd` feature: delegates to `crate::simd::unary_powi::simd_powi_f32`.
    #[cfg(feature = "simd")]
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
        crate::simd::unary_powi::simd_powi_f32(base, n)
    }
760    #[cfg(not(feature = "simd"))]
761    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
762        base.mapv(|x| x.powi(n))
763    }
    /// Applies `lanczos_gamma_f32` to every element.
    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
        x.mapv(lanczos_gamma_f32)
    }
767    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
768        const LN2: f32 = std::f32::consts::LN_2;
769        let scaled = Self::simd_scalar_mul(a, LN2);
770        Self::simd_exp(&scaled.view())
771    }
772    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
773        a.mapv(|x| x.cbrt())
774    }
775    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
776        a.mapv(|x| x.ln_1p())
777    }
778    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
779        a.mapv(|x| x.exp_m1())
780    }
781    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
782        const DEG_TO_RAD: f32 = std::f32::consts::PI / 180.0;
783        Self::simd_scalar_mul(a, DEG_TO_RAD)
784    }
785    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
786        const RAD_TO_DEG: f32 = 180.0 / std::f32::consts::PI;
787        Self::simd_scalar_mul(a, RAD_TO_DEG)
788    }
    /// Applies `digamma_f32` to every element.
    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(digamma_f32)
    }
    /// Applies `trigamma_f32` to every element.
    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(trigamma_f32)
    }
    /// Applies `ln_gamma_f32` to every element.
    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(ln_gamma_f32)
    }
    /// Applies `erf_f32` to every element.
    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erf_f32)
    }
    /// Applies `erfc_f32` to every element.
    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfc_f32)
    }
    /// Applies `erfinv_f32` to every element.
    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfinv_f32)
    }
    /// Applies `erfcinv_f32` to every element.
    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfcinv_f32)
    }
    /// Applies `sigmoid_f32` to every element.
    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(sigmoid_f32)
    }
    /// Applies `gelu_f32` to every element.
    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(gelu_f32)
    }
    /// Applies `swish_f32` to every element.
    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(swish_f32)
    }
    /// Applies `softplus_f32` to every element.
    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(softplus_f32)
    }
    /// Applies `mish_f32` to every element.
    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(mish_f32)
    }
    /// Applies `elu_f32(x, alpha)` to every element.
    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
        a.mapv(|x| elu_f32(x, alpha))
    }
    /// Applies `selu_f32` to every element.
    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(selu_f32)
    }
    /// Applies `hardsigmoid_f32` to every element.
    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(hardsigmoid_f32)
    }
    /// Applies `hardswish_f32` to every element.
    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(hardswish_f32)
    }
    /// Applies `sinc_f32` to every element.
    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(sinc_f32)
    }
840    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
841        if a.is_empty() {
842            return Array1::zeros(0);
843        }
844        let lse = Self::simd_log_sum_exp(a);
845        a.mapv(|x| x - lse)
846    }
847    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
848        a.mapv(|x| x.asinh())
849    }
850    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
851        a.mapv(|x| x.acosh())
852    }
853    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
854        a.mapv(|x| x.atanh())
855    }
856    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
857        let ln_gamma_a = Self::simd_ln_gamma(a);
858        let ln_gamma_b = Self::simd_ln_gamma(b);
859        let a_plus_b = Self::simd_add(a, b);
860        let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
861        Self::simd_sub(
862            &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
863            &ln_gamma_ab.view(),
864        )
865    }
866    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
867        let ln_beta = Self::simd_ln_beta(a, b);
868        Self::simd_exp(&ln_beta.view())
869    }
870    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
871        if a.is_empty() || b.is_empty() {
872            return Array1::zeros(0);
873        }
874        let diff = Self::simd_sub(b, a);
875        let scaled = Self::simd_scalar_mul(&diff.view(), t);
876        Self::simd_add(a, &scaled.view())
877    }
878    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
879        if x.is_empty() {
880            return Array1::zeros(0);
881        }
882        let range = edge1 - edge0;
883        if range.abs() < Self::EPSILON {
884            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
885        }
886        x.mapv(|xi| {
887            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
888            t * t * (3.0 - 2.0 * t)
889        })
890    }
891    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
892        if x.is_empty() || y.is_empty() {
893            return Array1::zeros(0);
894        }
895        let len = x.len().min(y.len());
896        Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
897    }
898    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
899        if x.is_empty() || y.is_empty() {
900            return Array1::zeros(0);
901        }
902        let len = x.len().min(y.len());
903        Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
904    }
905    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
906        if x.is_empty() {
907            return Array1::zeros(0);
908        }
909        let range = edge1 - edge0;
910        if range.abs() < Self::EPSILON {
911            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
912        }
913        x.mapv(|xi| {
914            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
915            let t3 = t * t * t;
916            t3 * (t * (t * 6.0 - 15.0) + 10.0)
917        })
918    }
919    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
920        if a.is_empty() || b.is_empty() {
921            return Array1::zeros(0);
922        }
923        let len = a.len().min(b.len());
924        Array1::from_iter((0..len).map(|i| {
925            let ai = a[i];
926            let bi = b[i];
927            let max_val = ai.max(bi);
928            let diff = (ai - bi).abs();
929            if diff > 50.0 {
930                max_val
931            } else {
932                max_val + (1.0 + (-diff).exp()).ln()
933            }
934        }))
935    }
936    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
937        if a.is_empty() {
938            return Array1::zeros(0);
939        }
940        a.mapv(|p| {
941            if p <= 0.0 {
942                Self::NEG_INFINITY
943            } else if p >= 1.0 {
944                Self::INFINITY
945            } else {
946                (p / (1.0 - p)).ln()
947            }
948        })
949    }
950    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
951        if a.is_empty() {
952            return Array1::zeros(0);
953        }
954        a.mapv(|x| x * x)
955    }
956    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
957        if a.is_empty() {
958            return Array1::zeros(0);
959        }
960        a.mapv(|x| {
961            if x <= 0.0 {
962                if x == 0.0 {
963                    Self::INFINITY
964                } else {
965                    Self::NAN
966                }
967            } else {
968                1.0 / x.sqrt()
969            }
970        })
971    }
972    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
973        if a.is_empty() {
974            return (Array1::zeros(0), Array1::zeros(0));
975        }
976        let sin_result = a.mapv(|x| x.sin());
977        let cos_result = a.mapv(|x| x.cos());
978        (sin_result, cos_result)
979    }
980    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
981        if a.is_empty() {
982            return Array1::zeros(0);
983        }
984        a.mapv(|x| x.exp_m1())
985    }
986    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
987        if a.is_empty() {
988            return Array1::zeros(0);
989        }
990        a.mapv(|x| x.ln_1p())
991    }
992
993    // ============================================================================
994    // ZERO-ALLOCATION SIMD OPERATIONS (Phase 1: ToRSh SIMD Performance Fix)
995    // ============================================================================
996
/// Writes `a[i] + b[i]` into `output[i]` for every index, without allocating.
///
/// On x86_64 an AVX2 kernel is used when `avx2` is detected at runtime; on aarch64 a NEON
/// kernel is used when `neon` is detected. Otherwise a scalar loop runs.
///
/// # Panics
///
/// Panics if `a`, `b` and `output` do not all have the same length.
#[cfg(feature = "simd")]
fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
    /// AVX2 kernel: adds 8 lanes per iteration, then finishes the tail one element at a time.
    ///
    /// Kept in its own `#[target_feature]` function so the AVX intrinsics can be inlined
    /// into the loop; called from a function without the feature they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and all three slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn add_avx2(a: &[f32], b: &[f32], output: &mut [f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: all slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_add_ps(a_vec, b_vec));
                i += 8;
            }
            while i < len {
                *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Input arrays must have same length");
    assert_eq!(
        a.len(),
        output.len(),
        "Output buffer must match input length"
    );

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 was detected just above and the asserts guarantee equal lengths.
            unsafe { add_avx2(a, b, output) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the asserts guarantee all slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                // Process 4 f32s at a time with NEON
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    vst1q_f32(output.as_mut_ptr().add(i), vaddq_f32(a_vec, b_vec));
                    i += 4;
                }
                // Handle remaining elements
                while i < len {
                    *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for ((out, &x), &y) in output.iter_mut().zip(a).zip(b) {
        *out = x + y;
    }
}
1063
1064    #[cfg(not(feature = "simd"))]
1065    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1066        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1067        assert_eq!(
1068            a.len(),
1069            output.len(),
1070            "Output buffer must match input length"
1071        );
1072        for i in 0..a.len() {
1073            output[i] = a[i] + b[i];
1074        }
1075    }
1076
/// Writes `a[i] - b[i]` into `output[i]` for every index, without allocating.
///
/// On x86_64 an AVX2 kernel is used when `avx2` is detected at runtime; on aarch64 a NEON
/// kernel is used when `neon` is detected. Otherwise a scalar loop runs.
///
/// # Panics
///
/// Panics if `a`, `b` and `output` do not all have the same length.
#[cfg(feature = "simd")]
fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]) {
    /// AVX2 kernel: subtracts 8 lanes per iteration, then finishes the tail element-wise.
    ///
    /// Kept in its own `#[target_feature]` function so the AVX intrinsics can be inlined
    /// into the loop; called from a function without the feature they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and all three slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn sub_avx2(a: &[f32], b: &[f32], output: &mut [f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: all slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_sub_ps(a_vec, b_vec));
                i += 8;
            }
            while i < len {
                *output.get_unchecked_mut(i) = *a.get_unchecked(i) - *b.get_unchecked(i);
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Input arrays must have same length");
    assert_eq!(
        a.len(),
        output.len(),
        "Output buffer must match input length"
    );

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 was detected just above and the asserts guarantee equal lengths.
            unsafe { sub_avx2(a, b, output) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the asserts guarantee all slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    vst1q_f32(output.as_mut_ptr().add(i), vsubq_f32(a_vec, b_vec));
                    i += 4;
                }
                while i < len {
                    *output.get_unchecked_mut(i) = *a.get_unchecked(i) - *b.get_unchecked(i);
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for ((out, &x), &y) in output.iter_mut().zip(a).zip(b) {
        *out = x - y;
    }
}
1138
1139    #[cfg(not(feature = "simd"))]
1140    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1141        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1142        assert_eq!(
1143            a.len(),
1144            output.len(),
1145            "Output buffer must match input length"
1146        );
1147        for i in 0..a.len() {
1148            output[i] = a[i] - b[i];
1149        }
1150    }
1151
/// Writes `a[i] * b[i]` into `output[i]` for every index, without allocating.
///
/// On x86_64 an AVX2 kernel is used when `avx2` is detected at runtime; on aarch64 a NEON
/// kernel is used when `neon` is detected. Otherwise a scalar loop runs.
///
/// # Panics
///
/// Panics if `a`, `b` and `output` do not all have the same length.
#[cfg(feature = "simd")]
fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
    /// AVX2 kernel: multiplies 8 lanes per iteration, then finishes the tail element-wise.
    ///
    /// Kept in its own `#[target_feature]` function so the AVX intrinsics can be inlined
    /// into the loop; called from a function without the feature they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and all three slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn mul_avx2(a: &[f32], b: &[f32], output: &mut [f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: all slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_mul_ps(a_vec, b_vec));
                i += 8;
            }
            while i < len {
                *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Input arrays must have same length");
    assert_eq!(
        a.len(),
        output.len(),
        "Output buffer must match input length"
    );

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 was detected just above and the asserts guarantee equal lengths.
            unsafe { mul_avx2(a, b, output) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the asserts guarantee all slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    vst1q_f32(output.as_mut_ptr().add(i), vmulq_f32(a_vec, b_vec));
                    i += 4;
                }
                while i < len {
                    *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for ((out, &x), &y) in output.iter_mut().zip(a).zip(b) {
        *out = x * y;
    }
}
1213
1214    #[cfg(not(feature = "simd"))]
1215    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1216        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1217        assert_eq!(
1218            a.len(),
1219            output.len(),
1220            "Output buffer must match input length"
1221        );
1222        for i in 0..a.len() {
1223            output[i] = a[i] * b[i];
1224        }
1225    }
1226
1227    fn simd_div_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1228        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1229        assert_eq!(
1230            a.len(),
1231            output.len(),
1232            "Output buffer must match input length"
1233        );
1234        // Division doesn't benefit as much from SIMD due to higher latency
1235        for i in 0..a.len() {
1236            output[i] = a[i] / b[i];
1237        }
1238    }
1239
/// Adds `b[i]` to `a[i]` in place for every index.
///
/// On x86_64 an AVX2 kernel is used when `avx2` is detected at runtime; on aarch64 a NEON
/// kernel is used when `neon` is detected. Otherwise a scalar loop runs.
///
/// # Panics
///
/// Panics if `a` and `b` have different lengths.
#[cfg(feature = "simd")]
fn simd_add_inplace(a: &mut [Self], b: &[Self]) {
    /// AVX2 kernel: accumulates 8 lanes per iteration, then finishes the tail element-wise.
    ///
    /// Kept in its own `#[target_feature]` function so the AVX intrinsics can be inlined
    /// into the loop; called from a function without the feature they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and both slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn add_assign_avx2(a: &mut [f32], b: &[f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: both slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(a.as_mut_ptr().add(i), _mm256_add_ps(a_vec, b_vec));
                i += 8;
            }
            while i < len {
                *a.get_unchecked_mut(i) += *b.get_unchecked(i);
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Arrays must have same length");

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 was detected just above and the assert guarantees equal lengths.
            unsafe { add_assign_avx2(a, b) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the assert guarantees both slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    vst1q_f32(a.as_mut_ptr().add(i), vaddq_f32(a_vec, b_vec));
                    i += 4;
                }
                while i < len {
                    *a.get_unchecked_mut(i) += *b.get_unchecked(i);
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for (dst, &src) in a.iter_mut().zip(b) {
        *dst += src;
    }
}
1296
1297    #[cfg(not(feature = "simd"))]
1298    fn simd_add_inplace(a: &mut [Self], b: &[Self]) {
1299        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1300        for i in 0..a.len() {
1301            a[i] += b[i];
1302        }
1303    }
1304
1305    fn simd_sub_inplace(a: &mut [Self], b: &[Self]) {
1306        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1307        for i in 0..a.len() {
1308            a[i] -= b[i];
1309        }
1310    }
1311
/// Multiplies `a[i]` by `b[i]` in place for every index.
///
/// On x86_64 an AVX2 kernel is used when `avx2` is detected at runtime; on aarch64 a NEON
/// kernel is used when `neon` is detected. Otherwise a scalar loop runs.
///
/// # Panics
///
/// Panics if `a` and `b` have different lengths.
#[cfg(feature = "simd")]
fn simd_mul_inplace(a: &mut [Self], b: &[Self]) {
    /// AVX2 kernel: scales 8 lanes per iteration, then finishes the tail element-wise.
    ///
    /// Kept in its own `#[target_feature]` function so the AVX intrinsics can be inlined
    /// into the loop; called from a function without the feature they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and both slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn mul_assign_avx2(a: &mut [f32], b: &[f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: both slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(a.as_mut_ptr().add(i), _mm256_mul_ps(a_vec, b_vec));
                i += 8;
            }
            while i < len {
                *a.get_unchecked_mut(i) *= *b.get_unchecked(i);
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Arrays must have same length");

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 was detected just above and the assert guarantees equal lengths.
            unsafe { mul_assign_avx2(a, b) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the assert guarantees both slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    vst1q_f32(a.as_mut_ptr().add(i), vmulq_f32(a_vec, b_vec));
                    i += 4;
                }
                while i < len {
                    *a.get_unchecked_mut(i) *= *b.get_unchecked(i);
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for (dst, &src) in a.iter_mut().zip(b) {
        *dst *= src;
    }
}
1368
1369    #[cfg(not(feature = "simd"))]
1370    fn simd_mul_inplace(a: &mut [Self], b: &[Self]) {
1371        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1372        for i in 0..a.len() {
1373            a[i] *= b[i];
1374        }
1375    }
1376
1377    fn simd_div_inplace(a: &mut [Self], b: &[Self]) {
1378        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1379        for i in 0..a.len() {
1380            a[i] /= b[i];
1381        }
1382    }
1383
1384    fn simd_add_scalar_inplace(a: &mut [Self], scalar: Self) {
1385        for x in a.iter_mut() {
1386            *x += scalar;
1387        }
1388    }
1389
1390    fn simd_mul_scalar_inplace(a: &mut [Self], scalar: Self) {
1391        for x in a.iter_mut() {
1392            *x *= scalar;
1393        }
1394    }
1395
/// Writes the fused multiply-add `a[i] * b[i] + c[i]` into `output[i]` for every index.
///
/// On x86_64 an AVX2+FMA kernel is used when both `fma` and `avx2` are detected at
/// runtime; on aarch64 a NEON kernel is used when `neon` is detected. Otherwise a scalar
/// loop based on `f32::mul_add` runs.
///
/// # Panics
///
/// Panics if `a`, `b`, `c` and `output` do not all have the same length.
#[cfg(feature = "simd")]
fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]) {
    /// AVX2+FMA kernel: 8 fused multiply-adds per iteration, then an element-wise tail.
    ///
    /// Kept in its own `#[target_feature]` function so the intrinsics can be inlined into
    /// the loop; called from a function without the features they stay out-of-line.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2 and FMA, and all four slices must have the same length.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2,fma")]
    unsafe fn fma_avx2(a: &[f32], b: &[f32], c: &[f32], output: &mut [f32]) {
        use std::arch::x86_64::*;

        let len = a.len();
        let mut i = 0;
        // SAFETY: all slices have length `len` (caller contract). Vector accesses cover
        // `i..i + 8` with `i + 8 <= len`; scalar accesses use `i < len`. The unaligned
        // load/store intrinsics place no alignment requirement on the pointers.
        unsafe {
            while i + 8 <= len {
                let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                let c_vec = _mm256_loadu_ps(c.as_ptr().add(i));
                // FMA: a * b + c
                let result_vec = _mm256_fmadd_ps(a_vec, b_vec, c_vec);
                _mm256_storeu_ps(output.as_mut_ptr().add(i), result_vec);
                i += 8;
            }
            while i < len {
                *output.get_unchecked_mut(i) = a
                    .get_unchecked(i)
                    .mul_add(*b.get_unchecked(i), *c.get_unchecked(i));
                i += 1;
            }
        }
    }

    assert_eq!(a.len(), b.len(), "Input arrays must have same length");
    assert_eq!(a.len(), c.len(), "Input arrays must have same length");
    assert_eq!(
        a.len(),
        output.len(),
        "Output buffer must match input length"
    );

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("fma") && is_x86_feature_detected!("avx2") {
            // SAFETY: FMA and AVX2 were detected just above and the asserts guarantee
            // equal lengths.
            unsafe { fma_avx2(a, b, c, output) };
            return;
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::aarch64::*;

        if std::arch::is_aarch64_feature_detected!("neon") {
            let len = a.len();
            let mut i = 0;
            // SAFETY: NEON was detected and the asserts guarantee all slices have length
            // `len`. Vector accesses cover `i..i + 4` with `i + 4 <= len`; scalar accesses
            // use `i < len`.
            unsafe {
                while i + 4 <= len {
                    let a_vec = vld1q_f32(a.as_ptr().add(i));
                    let b_vec = vld1q_f32(b.as_ptr().add(i));
                    let c_vec = vld1q_f32(c.as_ptr().add(i));
                    // FMA: a * b + c (the accumulator is the first operand)
                    let result_vec = vfmaq_f32(c_vec, a_vec, b_vec);
                    vst1q_f32(output.as_mut_ptr().add(i), result_vec);
                    i += 4;
                }
                while i < len {
                    *output.get_unchecked_mut(i) = a
                        .get_unchecked(i)
                        .mul_add(*b.get_unchecked(i), *c.get_unchecked(i));
                    i += 1;
                }
            }
            return;
        }
    }

    // Scalar fallback
    for (((out, &x), &y), &z) in output.iter_mut().zip(a).zip(b).zip(c) {
        *out = x.mul_add(y, z);
    }
}
1466
1467    #[cfg(not(feature = "simd"))]
1468    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]) {
1469        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1470        assert_eq!(a.len(), c.len(), "Input arrays must have same length");
1471        assert_eq!(
1472            a.len(),
1473            output.len(),
1474            "Output buffer must match input length"
1475        );
1476        for i in 0..a.len() {
1477            output[i] = a[i].mul_add(b[i], c[i]);
1478        }
1479    }
1480}