Skip to main content

scirs2_core/simd_ops/
functions_5.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
6
7use super::functions::SimdUnifiedOps;
8use super::functions_2::{
9    digamma_f32, digamma_f64, erf_f32, erf_f64, erfc_f32, erfc_f64, erfinv_f32, lanczos_gamma_f32,
10    lanczos_gamma_f64, ln_gamma_f32, ln_gamma_f64, trigamma_f32, trigamma_f64,
11};
12use super::functions_3::{
13    elu_f32, elu_f64, erfcinv_f32, erfcinv_f64, erfinv_f64, gelu_f32, gelu_f64, hardsigmoid_f32,
14    hardsigmoid_f64, hardswish_f32, hardswish_f64, mish_f32, mish_f64, selu_f32, selu_f64,
15    sigmoid_f32, sigmoid_f64, sinc_f32, sinc_f64, softplus_f32, softplus_f64, swish_f32, swish_f64,
16};
17#[cfg(feature = "simd")]
18use crate::simd_ops_polynomial;
19
20impl SimdUnifiedOps for f64 {
21    #[cfg(feature = "simd")]
22    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
23        crate::simd::simd_add_f64(a, b)
24    }
25    #[cfg(not(feature = "simd"))]
26    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
27        (a + b).to_owned()
28    }
29    #[cfg(feature = "simd")]
30    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
31        crate::simd::simd_sub_f64(a, b)
32    }
33    #[cfg(not(feature = "simd"))]
34    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
35        (a - b).to_owned()
36    }
37    #[cfg(feature = "simd")]
38    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
39        crate::simd::simd_mul_f64(a, b)
40    }
41    #[cfg(not(feature = "simd"))]
42    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
43        (a * b).to_owned()
44    }
45    #[cfg(feature = "simd")]
46    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
47        crate::simd::simd_div_f64(a, b)
48    }
49    #[cfg(not(feature = "simd"))]
50    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
51        (a / b).to_owned()
52    }
    /// Dot product of `a` and `b`.
    ///
    /// With the `simd` feature this forwards to `crate::simd::simd_dot_f64`.
    #[cfg(feature = "simd")]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_dot_f64(a, b)
    }
    /// Dot product of `a` and `b` via ndarray's `dot`.
    #[cfg(not(feature = "simd"))]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.dot(b)
    }
61    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
62        let m = a.nrows();
63        let n = a.ncols();
64        assert_eq!(n, x.len());
65        assert_eq!(m, y.len());
66        if beta == 0.0 {
67            y.fill(0.0);
68        } else if beta != 1.0 {
69            y.mapv_inplace(|v| v * beta);
70        }
71        for i in 0..m {
72            let row = a.row(i);
73            y[i] += Self::simd_dot(&row, x);
74        }
75    }
76    fn simd_gemm(
77        alpha: Self,
78        a: &ArrayView2<Self>,
79        b: &ArrayView2<Self>,
80        beta: Self,
81        c: &mut Array2<Self>,
82    ) {
83        let m = a.nrows();
84        let k = a.ncols();
85        let n = b.ncols();
86        assert_eq!(k, b.nrows());
87        assert_eq!((m, n), c.dim());
88        if beta == 0.0 {
89            c.fill(0.0);
90        } else if beta != 1.0 {
91            c.mapv_inplace(|v| v * beta);
92        }
93        const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
94        if n * k > GEMM_TRANSPOSE_THRESHOLD {
95            let b_t = Self::simd_transpose_blocked(b);
96            for i in 0..m {
97                let a_row = a.row(i);
98                for j in 0..n {
99                    let b_row = b_t.row(j);
100                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
101                }
102            }
103        } else {
104            for i in 0..m {
105                let a_row = a.row(i);
106                for j in 0..n {
107                    let b_col = b.column(j);
108                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
109                }
110            }
111        }
112    }
113    #[cfg(feature = "simd")]
114    fn simd_norm(a: &ArrayView1<Self>) -> Self {
115        crate::simd::norms::simd_norm_l2_f64(a)
116    }
117    #[cfg(not(feature = "simd"))]
118    fn simd_norm(a: &ArrayView1<Self>) -> Self {
119        a.iter().map(|&x| x * x).sum::<f64>().sqrt()
120    }
121    #[cfg(feature = "simd")]
122    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
123        crate::simd::simd_maximum_f64(a, b)
124    }
125    #[cfg(not(feature = "simd"))]
126    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
127        let mut result = Array1::zeros(a.len());
128        for _i in 0..a.len() {
129            result[0] = a[0].max(b[0]);
130        }
131        result
132    }
133    #[cfg(feature = "simd")]
134    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
135        crate::simd::simd_minimum_f64(a, b)
136    }
137    #[cfg(not(feature = "simd"))]
138    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
139        let mut result = Array1::zeros(a.len());
140        for _i in 0..a.len() {
141            result[0] = a[0].min(b[0]);
142        }
143        result
144    }
145    #[cfg(feature = "simd")]
146    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
147        crate::simd::simd_scalar_mul_f64(a, scalar)
148    }
149    #[cfg(not(feature = "simd"))]
150    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
151        a.mapv(|x| x * scalar)
152    }
    /// Sum of all elements of `a`.
    ///
    /// With the `simd` feature this forwards to `crate::simd::simd_sum_f64`.
    #[cfg(feature = "simd")]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_sum_f64(a)
    }
    /// Sum of all elements of `a` via ndarray's `sum`.
    #[cfg(not(feature = "simd"))]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        a.sum()
    }
161    fn simd_mean(a: &ArrayView1<Self>) -> Self {
162        if a.is_empty() {
163            0.0
164        } else {
165            Self::simd_sum(a) / (a.len() as f64)
166        }
167    }
168    #[cfg(feature = "simd")]
169    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
170        crate::simd::simd_max_f64(a)
171    }
172    #[cfg(not(feature = "simd"))]
173    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
174        a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
175    }
176    #[cfg(feature = "simd")]
177    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
178        crate::simd::simd_min_f64(a)
179    }
180    #[cfg(not(feature = "simd"))]
181    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
182        a.fold(f64::INFINITY, |acc, &x| acc.min(x))
183    }
184    #[cfg(feature = "simd")]
185    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
186        crate::simd::simd_fused_multiply_add_f64(a, b, c)
187    }
188    #[cfg(not(feature = "simd"))]
189    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
190        let mut result = Array1::zeros(a.len());
191        for _i in 0..a.len() {
192            result[0] = a[0] * b[0] + c[0];
193        }
194        result
195    }
    /// Element-wise sum of `a` and `b`.
    ///
    /// With the `simd` feature this forwards to
    /// `crate::simd::simd_add_cache_optimized_f64`.
    #[cfg(feature = "simd")]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_add_cache_optimized_f64(a, b)
    }
    /// Element-wise sum of `a` and `b` using ndarray's `+` operator
    /// (no special handling in the fallback).
    #[cfg(not(feature = "simd"))]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
204    #[cfg(feature = "simd")]
205    fn simd_fma_advanced_optimized(
206        a: &ArrayView1<Self>,
207        b: &ArrayView1<Self>,
208        c: &ArrayView1<Self>,
209    ) -> Array1<Self> {
210        crate::simd::simd_fma_advanced_optimized_f64(a, b, c)
211    }
212    #[cfg(not(feature = "simd"))]
213    fn simd_fma_advanced_optimized(
214        a: &ArrayView1<Self>,
215        b: &ArrayView1<Self>,
216        c: &ArrayView1<Self>,
217    ) -> Array1<Self> {
218        let mut result = Array1::zeros(a.len());
219        for _i in 0..a.len() {
220            result[0] = a[0] * b[0] + c[0];
221        }
222        result
223    }
    /// Element-wise sum of `a` and `b`.
    ///
    /// With the `simd` feature this forwards to `crate::simd::simd_adaptive_add_f64`.
    #[cfg(feature = "simd")]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_adaptive_add_f64(a, b)
    }
    /// Element-wise sum of `a` and `b` using ndarray's `+` operator
    /// (no special handling in the fallback).
    #[cfg(not(feature = "simd"))]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
232    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
233        a.t().to_owned()
234    }
    /// Returns an owned transpose of `a`.
    ///
    /// With the `simd` feature this forwards to
    /// `crate::simd::simd_transpose_blocked_f64`; otherwise it copies the
    /// transposed view, exactly like `simd_transpose`.
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_transpose_blocked_f64(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.t().to_owned()
        }
    }
245    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
246        a.iter().map(|&x| x * x).sum()
247    }
248    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
249        Self::simd_mul(a, b)
250    }
251    #[cfg(feature = "simd")]
252    fn simd_available() -> bool {
253        true
254    }
255    #[cfg(not(feature = "simd"))]
256    fn simd_available() -> bool {
257        false
258    }
259    fn simd_sub_f32_ultra(
260        a: &ArrayView1<Self>,
261        b: &ArrayView1<Self>,
262        result: &mut ArrayViewMut1<Self>,
263    ) {
264        let sub_result = Self::simd_sub(a, b);
265        result.assign(&sub_result);
266    }
267    fn simd_mul_f32_ultra(
268        a: &ArrayView1<Self>,
269        b: &ArrayView1<Self>,
270        result: &mut ArrayViewMut1<Self>,
271    ) {
272        let mul_result = Self::simd_mul(a, b);
273        result.assign(&mul_result);
274    }
275    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
276        a.iter().map(|&x| x * x * x).sum()
277    }
278    fn simd_div_f32_ultra(
279        a: &ArrayView1<Self>,
280        b: &ArrayView1<Self>,
281        result: &mut ArrayViewMut1<Self>,
282    ) {
283        let div_result = Self::simd_div(a, b);
284        result.assign(&div_result);
285    }
286    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
287        let sin_result = a.mapv(|x| x.sin());
288        result.assign(&sin_result);
289    }
290    fn simd_add_f32_ultra(
291        a: &ArrayView1<Self>,
292        b: &ArrayView1<Self>,
293        result: &mut ArrayViewMut1<Self>,
294    ) {
295        let add_result = Self::simd_add(a, b);
296        result.assign(&add_result);
297    }
298    fn simd_fma_f32_ultra(
299        a: &ArrayView1<Self>,
300        b: &ArrayView1<Self>,
301        c: &ArrayView1<Self>,
302        result: &mut ArrayViewMut1<Self>,
303    ) {
304        let fma_result = Self::simd_fma(a, b, c);
305        result.assign(&fma_result);
306    }
307    fn simd_pow_f32_ultra(
308        a: &ArrayView1<Self>,
309        b: &ArrayView1<Self>,
310        result: &mut ArrayViewMut1<Self>,
311    ) {
312        let pow_result = a
313            .iter()
314            .zip(b.iter())
315            .map(|(&x, &y)| x.powf(y))
316            .collect::<Vec<_>>();
317        result.assign(&Array1::from_vec(pow_result));
318    }
319    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
320        let exp_result = a.mapv(|x| x.exp());
321        result.assign(&exp_result);
322    }
323    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
324        let cos_result = a.mapv(|x| x.cos());
325        result.assign(&cos_result);
326    }
327    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
328        Self::simd_dot(a, b)
329    }
330    #[cfg(feature = "simd")]
331    fn simd_variance(a: &ArrayView1<Self>) -> Self {
332        crate::simd::simd_variance_f64(a)
333    }
334    #[cfg(not(feature = "simd"))]
335    fn simd_variance(a: &ArrayView1<Self>) -> Self {
336        let mean = Self::simd_mean(a);
337        let n = a.len() as f64;
338        if n < 2.0 {
339            return f64::NAN;
340        }
341        a.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)
342    }
343    #[cfg(feature = "simd")]
344    fn simd_std(a: &ArrayView1<Self>) -> Self {
345        crate::simd::simd_std_f64(a)
346    }
347    #[cfg(not(feature = "simd"))]
348    fn simd_std(a: &ArrayView1<Self>) -> Self {
349        Self::simd_variance(a).sqrt()
350    }
351    #[cfg(feature = "simd")]
352    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
353        crate::simd::simd_norm_l1_f64(a)
354    }
355    #[cfg(not(feature = "simd"))]
356    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
357        a.iter().map(|&x| x.abs()).sum()
358    }
359    #[cfg(feature = "simd")]
360    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
361        crate::simd::simd_norm_linf_f64(a)
362    }
363    #[cfg(not(feature = "simd"))]
364    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
365        a.iter().fold(0.0f64, |acc, &x| acc.max(x.abs()))
366    }
367    #[cfg(feature = "simd")]
368    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
369        crate::simd::simd_cosine_similarity_f64(a, b)
370    }
371    #[cfg(not(feature = "simd"))]
372    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
373        let dot = Self::simd_dot(a, b);
374        let norm_a = Self::simd_norm(a);
375        let norm_b = Self::simd_norm(b);
376        dot / (norm_a * norm_b)
377    }
378    #[cfg(feature = "simd")]
379    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
380        crate::simd::simd_distance_euclidean_f64(a, b)
381    }
382    #[cfg(not(feature = "simd"))]
383    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
384        a.iter()
385            .zip(b.iter())
386            .map(|(&x, &y)| (x - y).powi(2))
387            .sum::<f64>()
388            .sqrt()
389    }
390    #[cfg(feature = "simd")]
391    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
392        crate::simd::simd_distance_manhattan_f64(a, b)
393    }
394    #[cfg(not(feature = "simd"))]
395    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
396        a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
397    }
398    #[cfg(feature = "simd")]
399    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
400        crate::simd::simd_distance_chebyshev_f64(a, b)
401    }
402    #[cfg(not(feature = "simd"))]
403    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
404        a.iter()
405            .zip(b.iter())
406            .fold(0.0f64, |acc, (&x, &y)| acc.max((x - y).abs()))
407    }
408    #[cfg(feature = "simd")]
409    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
410        crate::simd::simd_distance_cosine_f64(a, b)
411    }
412    #[cfg(not(feature = "simd"))]
413    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
414        1.0 - Self::simd_cosine_similarity(a, b)
415    }
416    #[cfg(feature = "simd")]
417    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
418        crate::simd::simd_weighted_sum_f64(values, weights)
419    }
420    #[cfg(not(feature = "simd"))]
421    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
422        values
423            .iter()
424            .zip(weights.iter())
425            .map(|(&v, &w)| v * w)
426            .sum()
427    }
428    #[cfg(feature = "simd")]
429    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
430        crate::simd::simd_weighted_mean_f64(values, weights)
431    }
432    #[cfg(not(feature = "simd"))]
433    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
434        let weighted_sum = Self::simd_weighted_sum(values, weights);
435        let weight_sum: f64 = weights.iter().sum();
436        weighted_sum / weight_sum
437    }
438    #[cfg(feature = "simd")]
439    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
440        crate::simd::simd_argmin_f64(a)
441    }
442    #[cfg(not(feature = "simd"))]
443    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
444        if a.is_empty() {
445            return None;
446        }
447        let mut min_idx = 0;
448        let mut min_val = a[0];
449        for (i, &v) in a.iter().enumerate().skip(1) {
450            if v < min_val {
451                min_val = v;
452                min_idx = i;
453            }
454        }
455        Some(min_idx)
456    }
457    #[cfg(feature = "simd")]
458    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
459        crate::simd::simd_argmax_f64(a)
460    }
461    #[cfg(not(feature = "simd"))]
462    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
463        if a.is_empty() {
464            return None;
465        }
466        let mut max_idx = 0;
467        let mut max_val = a[0];
468        for (i, &v) in a.iter().enumerate().skip(1) {
469            if v > max_val {
470                max_val = v;
471                max_idx = i;
472            }
473        }
474        Some(max_idx)
475    }
476    #[cfg(feature = "simd")]
477    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
478        crate::simd::simd_clip_f64(a, min_val, max_val)
479    }
480    #[cfg(not(feature = "simd"))]
481    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
482        a.mapv(|v| v.max(min_val).min(max_val))
483    }
484    #[cfg(feature = "simd")]
485    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
486        crate::simd::simd_log_sum_exp_f64(a)
487    }
488    #[cfg(not(feature = "simd"))]
489    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
490        if a.is_empty() {
491            return f64::NEG_INFINITY;
492        }
493        let max_val = a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));
494        let sum_exp: f64 = a.iter().map(|&x| (x - max_val).exp()).sum();
495        max_val + sum_exp.ln()
496    }
497    #[cfg(feature = "simd")]
498    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
499        crate::simd::simd_softmax_f64(a)
500    }
501    #[cfg(not(feature = "simd"))]
502    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
503        if a.is_empty() {
504            return Array1::zeros(0);
505        }
506        let lse = Self::simd_log_sum_exp(a);
507        a.mapv(|x| (x - lse).exp())
508    }
509    #[cfg(feature = "simd")]
510    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
511        crate::simd::simd_cumsum_f64(a)
512    }
513    #[cfg(not(feature = "simd"))]
514    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
515        if a.is_empty() {
516            return Array1::zeros(0);
517        }
518        let mut cumsum = 0.0f64;
519        a.mapv(|x| {
520            cumsum += x;
521            cumsum
522        })
523    }
524    #[cfg(feature = "simd")]
525    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
526        crate::simd::simd_cumprod_f64(a)
527    }
528    #[cfg(not(feature = "simd"))]
529    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
530        if a.is_empty() {
531            return Array1::zeros(0);
532        }
533        let mut cumprod = 1.0f64;
534        a.mapv(|x| {
535            cumprod *= x;
536            cumprod
537        })
538    }
539    #[cfg(feature = "simd")]
540    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
541        crate::simd::simd_diff_f64(a)
542    }
543    #[cfg(not(feature = "simd"))]
544    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
545        if a.len() <= 1 {
546            return Array1::zeros(0);
547        }
548        Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
549    }
550    #[cfg(feature = "simd")]
551    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
552        crate::simd::simd_sign_f64(a)
553    }
554    #[cfg(not(feature = "simd"))]
555    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
556        a.mapv(|x| {
557            if x > 0.0 {
558                1.0
559            } else if x < 0.0 {
560                -1.0
561            } else {
562                0.0
563            }
564        })
565    }
566    #[cfg(feature = "simd")]
567    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
568        crate::simd::simd_relu_f64(a)
569    }
570    #[cfg(not(feature = "simd"))]
571    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
572        a.mapv(|x| x.max(0.0))
573    }
574    #[cfg(feature = "simd")]
575    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
576        crate::simd::simd_leaky_relu_f64(a, alpha)
577    }
578    #[cfg(not(feature = "simd"))]
579    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
580        a.mapv(|x| if x > 0.0 { x } else { alpha * x })
581    }
582    #[cfg(feature = "simd")]
583    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
584        crate::simd::simd_normalize_f64(a)
585    }
586    #[cfg(not(feature = "simd"))]
587    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
588        let norm: f64 = a.iter().map(|x| x * x).sum::<f64>().sqrt();
589        if norm == 0.0 {
590            return a.to_owned();
591        }
592        a.mapv(|x| x / norm)
593    }
594    #[cfg(feature = "simd")]
595    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
596        crate::simd::simd_standardize_f64(a)
597    }
598    #[cfg(not(feature = "simd"))]
599    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
600        if a.len() <= 1 {
601            return Array1::zeros(a.len());
602        }
603        let mean: f64 = a.iter().sum::<f64>() / a.len() as f64;
604        let variance: f64 =
605            a.iter().map(|x| (x - mean) * (x - mean)).sum::<f64>() / (a.len() - 1) as f64;
606        let std = variance.sqrt();
607        if std == 0.0 {
608            return Array1::zeros(a.len());
609        }
610        a.mapv(|x| (x - mean) / std)
611    }
612    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
613        a.mapv(|x| x.abs())
614    }
615    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
616        a.mapv(|x| x.sqrt())
617    }
618    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
619        a.mapv(|x| x.exp())
620    }
621    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
622        a.mapv(|x| x.ln())
623    }
624    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
625        a.mapv(|x| x.sin())
626    }
627    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
628        a.mapv(|x| x.cos())
629    }
630    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
631        a.mapv(|x| x.tan())
632    }
633    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
634        #[cfg(feature = "simd")]
635        {
636            simd_ops_polynomial::simd_sinh_f64_poly(a)
637        }
638        #[cfg(not(feature = "simd"))]
639        {
640            a.mapv(|x| x.sinh())
641        }
642    }
643    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
644        #[cfg(feature = "simd")]
645        {
646            simd_ops_polynomial::simd_cosh_f64_poly(a)
647        }
648        #[cfg(not(feature = "simd"))]
649        {
650            a.mapv(|x| x.cosh())
651        }
652    }
653    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
654        #[cfg(feature = "simd")]
655        {
656            simd_ops_polynomial::simd_tanh_f64_poly(a)
657        }
658        #[cfg(not(feature = "simd"))]
659        {
660            a.mapv(|x| x.tanh())
661        }
662    }
663    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
664        #[cfg(feature = "simd")]
665        {
666            crate::simd::simd_floor_f64(a)
667        }
668        #[cfg(not(feature = "simd"))]
669        {
670            a.mapv(|x| x.floor())
671        }
672    }
673    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
674        #[cfg(feature = "simd")]
675        {
676            crate::simd::simd_ceil_f64(a)
677        }
678        #[cfg(not(feature = "simd"))]
679        {
680            a.mapv(|x| x.ceil())
681        }
682    }
683    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
684        #[cfg(feature = "simd")]
685        {
686            crate::simd::simd_round_f64(a)
687        }
688        #[cfg(not(feature = "simd"))]
689        {
690            a.mapv(|x| x.round())
691        }
692    }
693    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
694        a.mapv(|x| x.atan())
695    }
696    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
697        a.mapv(|x| x.asin())
698    }
699    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
700        a.mapv(|x| x.acos())
701    }
702    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
703        y.iter()
704            .zip(x.iter())
705            .map(|(&y_val, &x_val)| y_val.atan2(x_val))
706            .collect::<Vec<_>>()
707            .into()
708    }
709    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
710        const LOG10_E: f64 = std::f64::consts::LOG10_E;
711        let ln_a = Self::simd_ln(a);
712        Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
713    }
714    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
715        const LOG2_E: f64 = std::f64::consts::LOG2_E;
716        let ln_a = Self::simd_ln(a);
717        Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
718    }
719    #[cfg(feature = "simd")]
720    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
721        crate::simd::simd_clip_f64(a, min, max)
722    }
723    #[cfg(not(feature = "simd"))]
724    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
725        a.mapv(|x| x.clamp(min, max))
726    }
727    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
728        #[cfg(feature = "simd")]
729        {
730            let truncated = crate::simd::simd_trunc_f64(a);
731            Self::simd_sub(a, &truncated.view())
732        }
733        #[cfg(not(feature = "simd"))]
734        {
735            a.mapv(|x| x.fract())
736        }
737    }
738    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
739        #[cfg(feature = "simd")]
740        {
741            crate::simd::simd_trunc_f64(a)
742        }
743        #[cfg(not(feature = "simd"))]
744        {
745            a.mapv(|x| x.trunc())
746        }
747    }
748    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
749        let ones = Array1::from_elem(a.len(), 1.0f64);
750        Self::simd_div(&ones.view(), a)
751    }
752    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
753        let ln_base = Self::simd_ln(base);
754        let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
755        Self::simd_exp(&scaled.view())
756    }
757    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
758        let ln_base = Self::simd_ln(base);
759        let scaled = Self::simd_mul(&ln_base.view(), exp);
760        Self::simd_exp(&scaled.view())
761    }
762    #[cfg(feature = "simd")]
763    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
764        crate::simd::unary_powi::simd_powi_f64(base, n)
765    }
766    #[cfg(not(feature = "simd"))]
767    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
768        base.mapv(|x| x.powi(n))
769    }
770    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
771        x.mapv(lanczos_gamma_f64)
772    }
773    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
774        const LN2: f64 = std::f64::consts::LN_2;
775        let scaled = Self::simd_scalar_mul(a, LN2);
776        Self::simd_exp(&scaled.view())
777    }
778    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
779        a.mapv(|x| x.cbrt())
780    }
781    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
782        a.mapv(|x| x.ln_1p())
783    }
784    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
785        a.mapv(|x| x.exp_m1())
786    }
787    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
788        const DEG_TO_RAD: f64 = std::f64::consts::PI / 180.0;
789        Self::simd_scalar_mul(a, DEG_TO_RAD)
790    }
791    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
792        const RAD_TO_DEG: f64 = 180.0 / std::f64::consts::PI;
793        Self::simd_scalar_mul(a, RAD_TO_DEG)
794    }
795    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
796        a.mapv(digamma_f64)
797    }
798    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
799        a.mapv(trigamma_f64)
800    }
801    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
802        a.mapv(ln_gamma_f64)
803    }
804    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
805        a.mapv(erf_f64)
806    }
807    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
808        a.mapv(erfc_f64)
809    }
810    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
811        a.mapv(erfinv_f64)
812    }
813    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
814        a.mapv(erfcinv_f64)
815    }
816    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
817        a.mapv(sigmoid_f64)
818    }
819    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
820        a.mapv(gelu_f64)
821    }
822    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
823        a.mapv(swish_f64)
824    }
825    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
826        a.mapv(softplus_f64)
827    }
828    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
829        a.mapv(mish_f64)
830    }
831    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
832        a.mapv(|x| elu_f64(x, alpha))
833    }
834    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
835        a.mapv(selu_f64)
836    }
837    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
838        a.mapv(hardsigmoid_f64)
839    }
840    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
841        a.mapv(hardswish_f64)
842    }
843    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
844        a.mapv(sinc_f64)
845    }
846    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
847        if a.is_empty() {
848            return Array1::zeros(0);
849        }
850        let lse = Self::simd_log_sum_exp(a);
851        a.mapv(|x| x - lse)
852    }
853    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
854        a.mapv(|x| x.asinh())
855    }
856    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
857        a.mapv(|x| x.acosh())
858    }
859    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
860        a.mapv(|x| x.atanh())
861    }
862    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
863        let ln_gamma_a = Self::simd_ln_gamma(a);
864        let ln_gamma_b = Self::simd_ln_gamma(b);
865        let a_plus_b = Self::simd_add(a, b);
866        let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
867        Self::simd_sub(
868            &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
869            &ln_gamma_ab.view(),
870        )
871    }
872    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
873        let ln_beta = Self::simd_ln_beta(a, b);
874        Self::simd_exp(&ln_beta.view())
875    }
876    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
877        if a.is_empty() || b.is_empty() {
878            return Array1::zeros(0);
879        }
880        let diff = Self::simd_sub(b, a);
881        let scaled = Self::simd_scalar_mul(&diff.view(), t);
882        Self::simd_add(a, &scaled.view())
883    }
884    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
885        if x.is_empty() {
886            return Array1::zeros(0);
887        }
888        let range = edge1 - edge0;
889        if range.abs() < Self::EPSILON {
890            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
891        }
892        x.mapv(|xi| {
893            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
894            t * t * (3.0 - 2.0 * t)
895        })
896    }
897    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
898        if x.is_empty() || y.is_empty() {
899            return Array1::zeros(0);
900        }
901        let len = x.len().min(y.len());
902        Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
903    }
904    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
905        if x.is_empty() || y.is_empty() {
906            return Array1::zeros(0);
907        }
908        let len = x.len().min(y.len());
909        Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
910    }
911    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
912        if x.is_empty() {
913            return Array1::zeros(0);
914        }
915        let range = edge1 - edge0;
916        if range.abs() < Self::EPSILON {
917            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
918        }
919        x.mapv(|xi| {
920            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
921            let t3 = t * t * t;
922            t3 * (t * (t * 6.0 - 15.0) + 10.0)
923        })
924    }
925    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
926        if a.is_empty() || b.is_empty() {
927            return Array1::zeros(0);
928        }
929        let len = a.len().min(b.len());
930        Array1::from_iter((0..len).map(|i| {
931            let ai = a[i];
932            let bi = b[i];
933            let max_val = ai.max(bi);
934            let diff = (ai - bi).abs();
935            if diff > 50.0 {
936                max_val
937            } else {
938                max_val + (1.0 + (-diff).exp()).ln()
939            }
940        }))
941    }
942    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
943        if a.is_empty() {
944            return Array1::zeros(0);
945        }
946        a.mapv(|p| {
947            if p <= 0.0 {
948                Self::NEG_INFINITY
949            } else if p >= 1.0 {
950                Self::INFINITY
951            } else {
952                (p / (1.0 - p)).ln()
953            }
954        })
955    }
956    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
957        if a.is_empty() {
958            return Array1::zeros(0);
959        }
960        a.mapv(|x| x * x)
961    }
962    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
963        if a.is_empty() {
964            return Array1::zeros(0);
965        }
966        a.mapv(|x| {
967            if x <= 0.0 {
968                if x == 0.0 {
969                    Self::INFINITY
970                } else {
971                    Self::NAN
972                }
973            } else {
974                1.0 / x.sqrt()
975            }
976        })
977    }
978    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
979        if a.is_empty() {
980            return (Array1::zeros(0), Array1::zeros(0));
981        }
982        let sin_result = a.mapv(|x| x.sin());
983        let cos_result = a.mapv(|x| x.cos());
984        (sin_result, cos_result)
985    }
986    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
987        if a.is_empty() {
988            return Array1::zeros(0);
989        }
990        a.mapv(|x| x.exp_m1())
991    }
992    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
993        if a.is_empty() {
994            return Array1::zeros(0);
995        }
996        a.mapv(|x| x.ln_1p())
997    }
998
999    // ============================================================================
1000    // ZERO-ALLOCATION SIMD OPERATIONS (Phase 1: ToRSh SIMD Performance Fix)
1001    // ============================================================================
1002
1003    #[cfg(feature = "simd")]
1004    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1005        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1006        assert_eq!(
1007            a.len(),
1008            output.len(),
1009            "Output buffer must match input length"
1010        );
1011
1012        let len = a.len();
1013
1014        #[cfg(target_arch = "x86_64")]
1015        {
1016            use std::arch::x86_64::*;
1017
1018            if is_x86_feature_detected!("avx2") {
1019                unsafe {
1020                    let mut i = 0;
1021                    // Process 4 f64s at a time with AVX2
1022                    while i + 4 <= len {
1023                        let a_vec = _mm256_loadu_pd(a.as_ptr().add(i));
1024                        let b_vec = _mm256_loadu_pd(b.as_ptr().add(i));
1025                        let result_vec = _mm256_add_pd(a_vec, b_vec);
1026                        _mm256_storeu_pd(output.as_mut_ptr().add(i), result_vec);
1027                        i += 4;
1028                    }
1029                    while i < len {
1030                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
1031                        i += 1;
1032                    }
1033                }
1034                return;
1035            }
1036        }
1037
1038        #[cfg(target_arch = "aarch64")]
1039        {
1040            use std::arch::aarch64::*;
1041
1042            if std::arch::is_aarch64_feature_detected!("neon") {
1043                unsafe {
1044                    let mut i = 0;
1045                    // Process 2 f64s at a time with NEON
1046                    while i + 2 <= len {
1047                        let a_vec = vld1q_f64(a.as_ptr().add(i));
1048                        let b_vec = vld1q_f64(b.as_ptr().add(i));
1049                        let result_vec = vaddq_f64(a_vec, b_vec);
1050                        vst1q_f64(output.as_mut_ptr().add(i), result_vec);
1051                        i += 2;
1052                    }
1053                    while i < len {
1054                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
1055                        i += 1;
1056                    }
1057                }
1058                return;
1059            }
1060        }
1061
1062        for i in 0..len {
1063            output[i] = a[i] + b[i];
1064        }
1065    }
1066
1067    #[cfg(not(feature = "simd"))]
1068    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1069        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1070        assert_eq!(
1071            a.len(),
1072            output.len(),
1073            "Output buffer must match input length"
1074        );
1075        for i in 0..a.len() {
1076            output[i] = a[i] + b[i];
1077        }
1078    }
1079
1080    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1081        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1082        assert_eq!(
1083            a.len(),
1084            output.len(),
1085            "Output buffer must match input length"
1086        );
1087        for i in 0..a.len() {
1088            output[i] = a[i] - b[i];
1089        }
1090    }
1091
1092    #[cfg(feature = "simd")]
1093    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1094        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1095        assert_eq!(
1096            a.len(),
1097            output.len(),
1098            "Output buffer must match input length"
1099        );
1100
1101        let len = a.len();
1102
1103        #[cfg(target_arch = "x86_64")]
1104        {
1105            use std::arch::x86_64::*;
1106
1107            if is_x86_feature_detected!("avx2") {
1108                unsafe {
1109                    let mut i = 0;
1110                    while i + 4 <= len {
1111                        let a_vec = _mm256_loadu_pd(a.as_ptr().add(i));
1112                        let b_vec = _mm256_loadu_pd(b.as_ptr().add(i));
1113                        let result_vec = _mm256_mul_pd(a_vec, b_vec);
1114                        _mm256_storeu_pd(output.as_mut_ptr().add(i), result_vec);
1115                        i += 4;
1116                    }
1117                    while i < len {
1118                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
1119                        i += 1;
1120                    }
1121                }
1122                return;
1123            }
1124        }
1125
1126        #[cfg(target_arch = "aarch64")]
1127        {
1128            use std::arch::aarch64::*;
1129
1130            if std::arch::is_aarch64_feature_detected!("neon") {
1131                unsafe {
1132                    let mut i = 0;
1133                    while i + 2 <= len {
1134                        let a_vec = vld1q_f64(a.as_ptr().add(i));
1135                        let b_vec = vld1q_f64(b.as_ptr().add(i));
1136                        let result_vec = vmulq_f64(a_vec, b_vec);
1137                        vst1q_f64(output.as_mut_ptr().add(i), result_vec);
1138                        i += 2;
1139                    }
1140                    while i < len {
1141                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
1142                        i += 1;
1143                    }
1144                }
1145                return;
1146            }
1147        }
1148
1149        for i in 0..len {
1150            output[i] = a[i] * b[i];
1151        }
1152    }
1153
1154    #[cfg(not(feature = "simd"))]
1155    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1156        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1157        assert_eq!(
1158            a.len(),
1159            output.len(),
1160            "Output buffer must match input length"
1161        );
1162        for i in 0..a.len() {
1163            output[i] = a[i] * b[i];
1164        }
1165    }
1166
1167    fn simd_div_into(a: &[Self], b: &[Self], output: &mut [Self]) {
1168        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1169        assert_eq!(
1170            a.len(),
1171            output.len(),
1172            "Output buffer must match input length"
1173        );
1174        for i in 0..a.len() {
1175            output[i] = a[i] / b[i];
1176        }
1177    }
1178
1179    fn simd_add_inplace(a: &mut [Self], b: &[Self]) {
1180        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1181        for i in 0..a.len() {
1182            a[i] += b[i];
1183        }
1184    }
1185
1186    fn simd_sub_inplace(a: &mut [Self], b: &[Self]) {
1187        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1188        for i in 0..a.len() {
1189            a[i] -= b[i];
1190        }
1191    }
1192
1193    fn simd_mul_inplace(a: &mut [Self], b: &[Self]) {
1194        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1195        for i in 0..a.len() {
1196            a[i] *= b[i];
1197        }
1198    }
1199
1200    fn simd_div_inplace(a: &mut [Self], b: &[Self]) {
1201        assert_eq!(a.len(), b.len(), "Arrays must have same length");
1202        for i in 0..a.len() {
1203            a[i] /= b[i];
1204        }
1205    }
1206
1207    fn simd_add_scalar_inplace(a: &mut [Self], scalar: Self) {
1208        for x in a.iter_mut() {
1209            *x += scalar;
1210        }
1211    }
1212
1213    fn simd_mul_scalar_inplace(a: &mut [Self], scalar: Self) {
1214        for x in a.iter_mut() {
1215            *x *= scalar;
1216        }
1217    }
1218
1219    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]) {
1220        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
1221        assert_eq!(a.len(), c.len(), "Input arrays must have same length");
1222        assert_eq!(
1223            a.len(),
1224            output.len(),
1225            "Output buffer must match input length"
1226        );
1227        for i in 0..a.len() {
1228            output[i] = a[i].mul_add(b[i], c[i]);
1229        }
1230    }
1231}