Skip to main content

sklears_simd/vector/arithmetic_ops/
functions.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6use super::functions_2::{
7    abs_vec_avx2, abs_vec_avx512, add_vec_avx512, divide_vec_avx512, fma_fma_intrinsic,
8    multiply_vec_avx512, neg_vec_avx2, neg_vec_avx512, reciprocal_vec_avx2, reciprocal_vec_avx512,
9    scale_vec_avx2, scale_vec_avx512, square_vec_avx2, square_vec_avx512, subtract_vec_avx512,
10};
11#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
12use super::functions_2::{
13    abs_vec_neon, add_vec_neon, divide_vec_neon, fma_neon, multiply_vec_neon, neg_vec_neon,
14    reciprocal_vec_neon, scale_vec_neon, square_vec_neon, subtract_vec_neon,
15};
16
17/// SIMD-optimized element-wise vector addition
18///
19/// Computes c\[i\] = a\[i\] + b\[i\] for all elements using SIMD instructions
20/// when available. The operation is performed in-place on the output vector.
21///
22/// # Arguments
23/// * `a` - First input vector
24/// * `b` - Second input vector (must have same length as `a`)
25/// * `result` - Output vector (must have same length as `a` and `b`)
26///
27/// # Panics
28/// Panics if the vectors have different lengths
29///
30/// # Examples
31/// ```rust
32/// use sklears_simd::vector::arithmetic_ops::add_vec;
33///
34/// let a = vec![1.0, 2.0, 3.0, 4.0];
35/// let b = vec![5.0, 6.0, 7.0, 8.0];
36/// let mut result = vec![0.0; 4];
37///
38/// add_vec(&a, &b, &mut result);
39/// assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);
40/// ```
41pub fn add_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
42    assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
43    assert_eq!(
44        a.len(),
45        result.len(),
46        "Output vector must have the same length as input vectors"
47    );
48    if a.is_empty() {
49        return;
50    }
51    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
52    {
53        if crate::simd_feature_detected!("avx512f") {
54            unsafe { add_vec_avx512(a, b, result) };
55            return;
56        } else if crate::simd_feature_detected!("avx2") {
57            unsafe { add_vec_avx2(a, b, result) };
58            return;
59        } else if crate::simd_feature_detected!("sse2") {
60            unsafe { add_vec_sse2(a, b, result) };
61            return;
62        }
63    }
64    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
65    {
66        if std::arch::is_aarch64_feature_detected!("neon") {
67            unsafe { add_vec_neon(a, b, result) };
68            return;
69        }
70    }
71    add_vec_scalar(a, b, result);
72}
73/// SIMD-optimized element-wise vector subtraction
74///
75/// Computes c\[i\] = a\[i\] - b\[i\] for all elements using SIMD instructions.
76///
77/// # Arguments
78/// * `a` - First input vector (minuend)
79/// * `b` - Second input vector (subtrahend, must have same length as `a`)
80/// * `result` - Output vector (must have same length as inputs)
81///
82/// # Panics
83/// Panics if the vectors have different lengths
84///
85/// # Examples
86/// ```rust
87/// use sklears_simd::vector::arithmetic_ops::subtract_vec;
88///
89/// let a = vec![10.0, 8.0, 6.0, 4.0];
90/// let b = vec![3.0, 2.0, 1.0, 1.0];
91/// let mut result = vec![0.0; 4];
92///
93/// subtract_vec(&a, &b, &mut result);
94/// assert_eq!(result, vec![7.0, 6.0, 5.0, 3.0]);
95/// ```
96pub fn subtract_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
97    assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
98    assert_eq!(
99        a.len(),
100        result.len(),
101        "Output vector must have the same length as input vectors"
102    );
103    if a.is_empty() {
104        return;
105    }
106    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
107    {
108        if crate::simd_feature_detected!("avx512f") {
109            unsafe { subtract_vec_avx512(a, b, result) };
110            return;
111        } else if crate::simd_feature_detected!("avx2") {
112            unsafe { subtract_vec_avx2(a, b, result) };
113            return;
114        } else if crate::simd_feature_detected!("sse2") {
115            unsafe { subtract_vec_sse2(a, b, result) };
116            return;
117        }
118    }
119    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
120    {
121        if std::arch::is_aarch64_feature_detected!("neon") {
122            unsafe { subtract_vec_neon(a, b, result) };
123            return;
124        }
125    }
126    subtract_vec_scalar(a, b, result);
127}
128/// SIMD-optimized element-wise vector multiplication
129///
130/// Computes c\[i\] = a\[i\] * b\[i\] for all elements using SIMD instructions.
131///
132/// # Arguments
133/// * `a` - First input vector
134/// * `b` - Second input vector (must have same length as `a`)
135/// * `result` - Output vector (must have same length as inputs)
136///
137/// # Panics
138/// Panics if the vectors have different lengths
139///
140/// # Examples
141/// ```rust
142/// use sklears_simd::vector::arithmetic_ops::multiply_vec;
143///
144/// let a = vec![2.0, 3.0, 4.0, 5.0];
145/// let b = vec![3.0, 4.0, 5.0, 6.0];
146/// let mut result = vec![0.0; 4];
147///
148/// multiply_vec(&a, &b, &mut result);
149/// assert_eq!(result, vec![6.0, 12.0, 20.0, 30.0]);
150/// ```
151pub fn multiply_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
152    assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
153    assert_eq!(
154        a.len(),
155        result.len(),
156        "Output vector must have the same length as input vectors"
157    );
158    if a.is_empty() {
159        return;
160    }
161    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
162    {
163        if crate::simd_feature_detected!("avx512f") {
164            unsafe { multiply_vec_avx512(a, b, result) };
165            return;
166        } else if crate::simd_feature_detected!("avx2") {
167            unsafe { multiply_vec_avx2(a, b, result) };
168            return;
169        } else if crate::simd_feature_detected!("sse2") {
170            unsafe { multiply_vec_sse2(a, b, result) };
171            return;
172        }
173    }
174    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
175    {
176        if std::arch::is_aarch64_feature_detected!("neon") {
177            unsafe { multiply_vec_neon(a, b, result) };
178            return;
179        }
180    }
181    multiply_vec_scalar(a, b, result);
182}
183/// SIMD-optimized element-wise vector division
184///
185/// Computes c\[i\] = a\[i\] / b\[i\] for all elements using SIMD instructions.
186/// Division by zero results in infinity or NaN according to IEEE 754 standard.
187///
188/// # Arguments
189/// * `a` - First input vector (dividend)
190/// * `b` - Second input vector (divisor, must have same length as `a`)
191/// * `result` - Output vector (must have same length as inputs)
192///
193/// # Panics
194/// Panics if the vectors have different lengths
195///
196/// # Examples
197/// ```rust
198/// use sklears_simd::vector::arithmetic_ops::divide_vec;
199///
200/// let a = vec![12.0, 15.0, 20.0, 25.0];
201/// let b = vec![3.0, 3.0, 4.0, 5.0];
202/// let mut result = vec![0.0; 4];
203///
204/// divide_vec(&a, &b, &mut result);
205/// assert_eq!(result, vec![4.0, 5.0, 5.0, 5.0]);
206/// ```
207pub fn divide_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
208    assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
209    assert_eq!(
210        a.len(),
211        result.len(),
212        "Output vector must have the same length as input vectors"
213    );
214    if a.is_empty() {
215        return;
216    }
217    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
218    {
219        if crate::simd_feature_detected!("avx512f") {
220            unsafe { divide_vec_avx512(a, b, result) };
221            return;
222        } else if crate::simd_feature_detected!("avx2") {
223            unsafe { divide_vec_avx2(a, b, result) };
224            return;
225        } else if crate::simd_feature_detected!("sse2") {
226            unsafe { divide_vec_sse2(a, b, result) };
227            return;
228        }
229    }
230    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
231    {
232        if std::arch::is_aarch64_feature_detected!("neon") {
233            unsafe { divide_vec_neon(a, b, result) };
234            return;
235        }
236    }
237    divide_vec_scalar(a, b, result);
238}
239/// SIMD-optimized fused multiply-add operation
240///
241/// Computes a\[i\] = a\[i\] * b\[i\] + c\[i\] for all elements in-place on vector `a`.
242/// This operation provides maximum performance and precision when FMA instructions
243/// are available, as it performs multiplication and addition in a single step.
244///
245/// # Arguments
246/// * `a` - Input/output vector (will be modified in-place)
247/// * `b` - Multiplier vector (must have same length as `a`)
248/// * `c` - Addend vector (must have same length as `a`)
249///
250/// # Panics
251/// Panics if the vectors have different lengths
252///
253/// # Examples
254/// ```rust
255/// use sklears_simd::vector::arithmetic_ops::fma;
256///
257/// let mut a = vec![1.0, 2.0, 3.0, 4.0];
258/// let b = vec![2.0, 3.0, 4.0, 5.0];
259/// let c = vec![1.0, 1.0, 1.0, 1.0];
260///
261/// fma(&mut a, &b, &c);
262/// // a = a * b + c = [1*2+1, 2*3+1, 3*4+1, 4*5+1] = [3, 7, 13, 21]
263/// assert_eq!(a, vec![3.0, 7.0, 13.0, 21.0]);
264/// ```
265pub fn fma(a: &mut [f32], b: &[f32], c: &[f32]) {
266    assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
267    assert_eq!(a.len(), c.len(), "Input vectors must have the same length");
268    if a.is_empty() {
269        return;
270    }
271    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
272    {
273        if crate::simd_feature_detected!("fma") {
274            unsafe { fma_fma_intrinsic(a, b, c) };
275            return;
276        } else if crate::simd_feature_detected!("avx2") {
277            unsafe { fma_avx2(a, b, c) };
278            return;
279        } else if crate::simd_feature_detected!("sse2") {
280            unsafe { fma_sse2(a, b, c) };
281            return;
282        }
283    }
284    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
285    {
286        if std::arch::is_aarch64_feature_detected!("neon") {
287            unsafe { fma_neon(a, b, c) };
288            return;
289        }
290    }
291    fma_scalar(a, b, c);
292}
293/// SIMD-optimized vector scaling (scalar multiplication)
294///
295/// Computes result\[i\] = vector\[i\] * scalar for all elements.
296/// This is optimized for the common case of multiplying a vector by a scalar value.
297///
298/// # Arguments
299/// * `vector` - Input vector
300/// * `scalar` - Scalar value to multiply each element by
301/// * `result` - Output vector (must have same length as input)
302///
303/// # Panics
304/// Panics if input and output vectors have different lengths
305///
306/// # Examples
307/// ```rust
308/// use sklears_simd::vector::arithmetic_ops::scale_vec;
309///
310/// let vector = vec![1.0, 2.0, 3.0, 4.0];
311/// let mut result = vec![0.0; 4];
312///
313/// scale_vec(&vector, 2.5, &mut result);
314/// assert_eq!(result, vec![2.5, 5.0, 7.5, 10.0]);
315/// ```
316pub fn scale_vec(vector: &[f32], scalar: f32, result: &mut [f32]) {
317    assert_eq!(
318        vector.len(),
319        result.len(),
320        "Input and output vectors must have the same length"
321    );
322    if vector.is_empty() {
323        return;
324    }
325    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
326    {
327        if crate::simd_feature_detected!("avx512f") {
328            unsafe { scale_vec_avx512(vector, scalar, result) };
329            return;
330        } else if crate::simd_feature_detected!("avx2") {
331            unsafe { scale_vec_avx2(vector, scalar, result) };
332            return;
333        } else if crate::simd_feature_detected!("sse2") {
334            unsafe { scale_vec_sse2(vector, scalar, result) };
335            return;
336        }
337    }
338    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
339    {
340        if std::arch::is_aarch64_feature_detected!("neon") {
341            unsafe { scale_vec_neon(vector, scalar, result) };
342            return;
343        }
344    }
345    scale_vec_scalar(vector, scalar, result);
346}
347/// SIMD-optimized in-place vector scaling
348///
349/// Computes vector\[i\] = vector\[i\] * scalar for all elements in-place.
350/// This is more memory-efficient than the out-of-place version.
351///
352/// # Arguments
353/// * `vector` - Input/output vector (will be modified in-place)
354/// * `scalar` - Scalar value to multiply each element by
355///
356/// # Examples
357/// ```rust
358/// use sklears_simd::vector::arithmetic_ops::scale_vec_inplace;
359///
360/// let mut vector = vec![1.0, 2.0, 3.0, 4.0];
361/// scale_vec_inplace(&mut vector, 3.0);
362/// assert_eq!(vector, vec![3.0, 6.0, 9.0, 12.0]);
363/// ```
364pub fn scale_vec_inplace(vector: &mut [f32], scalar: f32) {
365    let len = vector.len();
366    if len == 0 {
367        return;
368    }
369    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
370    {
371        if crate::simd_feature_detected!("avx512f") {
372            unsafe {
373                let mut i = 0;
374                while i + 16 <= len {
375                    let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 16);
376                    let result_slice =
377                        core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 16);
378                    scale_vec_avx512(vec_slice, scalar, result_slice);
379                    i += 16;
380                }
381                while i < len {
382                    vector[i] *= scalar;
383                    i += 1;
384                }
385            }
386            return;
387        } else if crate::simd_feature_detected!("avx2") {
388            unsafe {
389                let mut i = 0;
390                while i + 8 <= len {
391                    let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 8);
392                    let result_slice =
393                        core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 8);
394                    scale_vec_avx2(vec_slice, scalar, result_slice);
395                    i += 8;
396                }
397                while i < len {
398                    vector[i] *= scalar;
399                    i += 1;
400                }
401            }
402            return;
403        } else if crate::simd_feature_detected!("sse2") {
404            unsafe {
405                let mut i = 0;
406                while i + 4 <= len {
407                    let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 4);
408                    let result_slice =
409                        core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 4);
410                    scale_vec_sse2(vec_slice, scalar, result_slice);
411                    i += 4;
412                }
413                while i < len {
414                    vector[i] *= scalar;
415                    i += 1;
416                }
417            }
418            return;
419        }
420    }
421    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
422    {
423        if std::arch::is_aarch64_feature_detected!("neon") {
424            unsafe {
425                let mut i = 0;
426                while i + 4 <= len {
427                    let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 4);
428                    let result_slice =
429                        core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 4);
430                    scale_vec_neon(vec_slice, scalar, result_slice);
431                    i += 4;
432                }
433                while i < len {
434                    vector[i] *= scalar;
435                    i += 1;
436                }
437            }
438            return;
439        }
440    }
441    for v in vector[..len].iter_mut() {
442        *v *= scalar;
443    }
444}
445/// SIMD-optimized vector absolute value
446///
447/// Computes result\[i\] = |vector\[i\]| for all elements.
448///
449/// # Arguments
450/// * `vector` - Input vector
451/// * `result` - Output vector (must have same length as input)
452///
453/// # Panics
454/// Panics if input and output vectors have different lengths
455///
456/// # Examples
457/// ```rust
458/// use sklears_simd::vector::arithmetic_ops::abs_vec;
459///
460/// let vector = vec![-2.0, -1.0, 0.0, 1.0, 2.0];
461/// let mut result = vec![0.0; 5];
462///
463/// abs_vec(&vector, &mut result);
464/// assert_eq!(result, vec![2.0, 1.0, 0.0, 1.0, 2.0]);
465/// ```
466pub fn abs_vec(vector: &[f32], result: &mut [f32]) {
467    assert_eq!(
468        vector.len(),
469        result.len(),
470        "Input and output vectors must have the same length"
471    );
472    if vector.is_empty() {
473        return;
474    }
475    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
476    {
477        if crate::simd_feature_detected!("avx512f") {
478            unsafe { abs_vec_avx512(vector, result) };
479            return;
480        } else if crate::simd_feature_detected!("avx2") {
481            unsafe { abs_vec_avx2(vector, result) };
482            return;
483        } else if crate::simd_feature_detected!("sse2") {
484            unsafe { abs_vec_sse2(vector, result) };
485            return;
486        }
487    }
488    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
489    {
490        if std::arch::is_aarch64_feature_detected!("neon") {
491            unsafe { abs_vec_neon(vector, result) };
492            return;
493        }
494    }
495    abs_vec_scalar(vector, result);
496}
497/// SIMD-optimized vector negation
498///
499/// Computes result\[i\] = -vector\[i\] for all elements.
500///
501/// # Arguments
502/// * `vector` - Input vector
503/// * `result` - Output vector (must have same length as input)
504///
505/// # Panics
506/// Panics if input and output vectors have different lengths
507///
508/// # Examples
509/// ```rust
510/// use sklears_simd::vector::arithmetic_ops::neg_vec;
511///
512/// let vector = vec![-2.0, -1.0, 0.0, 1.0, 2.0];
513/// let mut result = vec![0.0; 5];
514///
515/// neg_vec(&vector, &mut result);
516/// assert_eq!(result, vec![2.0, 1.0, 0.0, -1.0, -2.0]);
517/// ```
518pub fn neg_vec(vector: &[f32], result: &mut [f32]) {
519    assert_eq!(
520        vector.len(),
521        result.len(),
522        "Input and output vectors must have the same length"
523    );
524    if vector.is_empty() {
525        return;
526    }
527    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
528    {
529        if crate::simd_feature_detected!("avx512f") {
530            unsafe { neg_vec_avx512(vector, result) };
531            return;
532        } else if crate::simd_feature_detected!("avx2") {
533            unsafe { neg_vec_avx2(vector, result) };
534            return;
535        } else if crate::simd_feature_detected!("sse2") {
536            unsafe { neg_vec_sse2(vector, result) };
537            return;
538        }
539    }
540    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
541    {
542        if std::arch::is_aarch64_feature_detected!("neon") {
543            unsafe { neg_vec_neon(vector, result) };
544            return;
545        }
546    }
547    neg_vec_scalar(vector, result);
548}
549/// SIMD-optimized vector reciprocal
550///
551/// Computes result\[i\] = 1.0 / vector\[i\] for all elements.
552/// Division by zero results in infinity according to IEEE 754 standard.
553///
554/// # Arguments
555/// * `vector` - Input vector
556/// * `result` - Output vector (must have same length as input)
557///
558/// # Panics
559/// Panics if input and output vectors have different lengths
560///
561/// # Examples
562/// ```rust
563/// use sklears_simd::vector::arithmetic_ops::reciprocal_vec;
564///
565/// let vector = vec![1.0, 2.0, 4.0, 0.5];
566/// let mut result = vec![0.0; 4];
567///
568/// reciprocal_vec(&vector, &mut result);
569/// assert_eq!(result, vec![1.0, 0.5, 0.25, 2.0]);
570/// ```
571pub fn reciprocal_vec(vector: &[f32], result: &mut [f32]) {
572    assert_eq!(
573        vector.len(),
574        result.len(),
575        "Input and output vectors must have the same length"
576    );
577    if vector.is_empty() {
578        return;
579    }
580    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
581    {
582        if crate::simd_feature_detected!("avx512f") {
583            unsafe { reciprocal_vec_avx512(vector, result) };
584            return;
585        } else if crate::simd_feature_detected!("avx2") {
586            unsafe { reciprocal_vec_avx2(vector, result) };
587            return;
588        } else if crate::simd_feature_detected!("sse2") {
589            unsafe { reciprocal_vec_sse2(vector, result) };
590            return;
591        }
592    }
593    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
594    {
595        if std::arch::is_aarch64_feature_detected!("neon") {
596            unsafe { reciprocal_vec_neon(vector, result) };
597            return;
598        }
599    }
600    reciprocal_vec_scalar(vector, result);
601}
602/// SIMD-optimized vector squaring
603///
604/// Computes result\[i\] = vector\[i\] * vector\[i\] for all elements.
605///
606/// # Arguments
607/// * `vector` - Input vector
608/// * `result` - Output vector (must have same length as input)
609///
610/// # Panics
611/// Panics if input and output vectors have different lengths
612///
613/// # Examples
614/// ```rust
615/// use sklears_simd::vector::arithmetic_ops::square_vec;
616///
617/// let vector = vec![-3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0];
618/// let mut result = vec![0.0; 7];
619///
620/// square_vec(&vector, &mut result);
621/// assert_eq!(result, vec![9.0, 4.0, 1.0, 0.0, 1.0, 4.0, 9.0]);
622/// ```
623pub fn square_vec(vector: &[f32], result: &mut [f32]) {
624    assert_eq!(
625        vector.len(),
626        result.len(),
627        "Input and output vectors must have the same length"
628    );
629    if vector.is_empty() {
630        return;
631    }
632    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
633    {
634        if crate::simd_feature_detected!("avx512f") {
635            unsafe { square_vec_avx512(vector, result) };
636            return;
637        } else if crate::simd_feature_detected!("avx2") {
638            unsafe { square_vec_avx2(vector, result) };
639            return;
640        } else if crate::simd_feature_detected!("sse2") {
641            unsafe { square_vec_sse2(vector, result) };
642            return;
643        }
644    }
645    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
646    {
647        if std::arch::is_aarch64_feature_detected!("neon") {
648            unsafe { square_vec_neon(vector, result) };
649            return;
650        }
651    }
652    square_vec_scalar(vector, result);
653}
/// Portable fallback for `add_vec`: writes `a[i] + b[i]` into `result[i]`.
///
/// Exactly `a.len()` elements are processed; any extra elements of `b` or
/// `result` are ignored.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
fn add_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // One range check up front instead of three bounds checks per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x + y;
    }
}
/// Portable fallback for `subtract_vec`: writes `a[i] - b[i]` into `result[i]`.
///
/// Exactly `a.len()` elements are processed; any extra elements of `b` or
/// `result` are ignored.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
fn subtract_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // One range check up front instead of three bounds checks per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x - y;
    }
}
/// Portable fallback for `multiply_vec`: writes `a[i] * b[i]` into `result[i]`.
///
/// Exactly `a.len()` elements are processed; any extra elements of `b` or
/// `result` are ignored.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
fn multiply_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // One range check up front instead of three bounds checks per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x * y;
    }
}
/// Portable fallback for `divide_vec`: writes `a[i] / b[i]` into `result[i]`.
///
/// Exactly `a.len()` elements are processed; any extra elements of `b` or
/// `result` are ignored. Division by zero follows `f32` semantics (infinity
/// or NaN) rather than panicking.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
fn divide_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // One range check up front instead of three bounds checks per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x / y;
    }
}
/// Portable fallback for `fma`: overwrites `a[i]` with `a[i] * b[i] + c[i]`.
///
/// The product and the sum are two separate `f32` operations (not a fused
/// multiply-add). Extra elements of `b` or `c` beyond `a.len()` are ignored.
///
/// # Panics
/// Panics if `b` or `c` is shorter than `a`.
fn fma_scalar(a: &mut [f32], b: &[f32], c: &[f32]) {
    let n = a.len();
    // One range check up front instead of bounds checks on every element.
    let (b, c) = (&b[..n], &c[..n]);
    for ((acc, &mul), &add) in a.iter_mut().zip(b).zip(c) {
        *acc = *acc * mul + add;
    }
}
/// Portable fallback for `scale_vec`: writes `vector[i] * scalar` into `result[i]`.
///
/// Exactly `vector.len()` elements are processed; extra elements of `result`
/// are left untouched.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
fn scale_vec_scalar(vector: &[f32], scalar: f32, result: &mut [f32]) {
    // One range check up front instead of two bounds checks per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x * scalar;
    }
}
/// Portable fallback for `abs_vec`: writes `vector[i].abs()` into `result[i]`.
///
/// Exactly `vector.len()` elements are processed; extra elements of `result`
/// are left untouched.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
fn abs_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // One range check up front instead of two bounds checks per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x.abs();
    }
}
/// Portable fallback for `neg_vec`: writes `-vector[i]` into `result[i]`.
///
/// Exactly `vector.len()` elements are processed; extra elements of `result`
/// are left untouched.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
fn neg_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // One range check up front instead of two bounds checks per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = -x;
    }
}
/// Portable fallback for `reciprocal_vec`: writes `1.0 / vector[i]` into `result[i]`.
///
/// Exactly `vector.len()` elements are processed; extra elements of `result`
/// are left untouched. A zero element yields infinity rather than a panic.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
fn reciprocal_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // One range check up front instead of two bounds checks per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = 1.0 / x;
    }
}
/// Portable fallback for `square_vec`: writes `vector[i] * vector[i]` into `result[i]`.
///
/// Exactly `vector.len()` elements are processed; extra elements of `result`
/// are left untouched.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
fn square_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // One range check up front instead of two bounds checks per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x * x;
    }
}
/// SSE2 kernel for element-wise addition: `result[i] = a[i] + b[i]`.
///
/// Four lanes are handled per iteration with unaligned loads and stores; the
/// remaining `a.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn add_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm_loadu_ps(b.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_add_ps(lhs, rhs));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x + y;
    }
}
/// SSE2 kernel for element-wise subtraction: `result[i] = a[i] - b[i]`.
///
/// Four lanes are handled per iteration with unaligned loads and stores; the
/// remaining `a.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn subtract_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm_loadu_ps(b.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_sub_ps(lhs, rhs));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x - y;
    }
}
/// SSE2 kernel for element-wise multiplication: `result[i] = a[i] * b[i]`.
///
/// Four lanes are handled per iteration with unaligned loads and stores; the
/// remaining `a.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn multiply_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm_loadu_ps(b.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lhs, rhs));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x * y;
    }
}
/// SSE2 kernel for element-wise division: `result[i] = a[i] / b[i]`.
///
/// Four lanes are handled per iteration with unaligned loads and stores; the
/// remaining `a.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn divide_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm_loadu_ps(b.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_div_ps(lhs, rhs));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x / y;
    }
}
/// SSE2 kernel for in-place multiply-add: `a[i] = a[i] * b[i] + c[i]`.
///
/// SSE2 has no fused instruction, so each lane is a multiply followed by a
/// separate add. Four lanes are handled per iteration; the remaining
/// `a.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `b` or `c` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn fma_sse2(a: &mut [f32], b: &[f32], c: &[f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer loads below can
    // never leave `b` or `c`.
    let (b, c) = (&b[..n], &c[..n]);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and every slice holds at least `n` elements.
        unsafe {
            let acc = _mm_loadu_ps(a.as_ptr().add(i));
            let mul = _mm_loadu_ps(b.as_ptr().add(i));
            let add = _mm_loadu_ps(c.as_ptr().add(i));
            _mm_storeu_ps(a.as_mut_ptr().add(i), _mm_add_ps(_mm_mul_ps(acc, mul), add));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for ((acc, &mul), &add) in a[i..].iter_mut().zip(&b[i..]).zip(&c[i..]) {
        *acc = *acc * mul + add;
    }
}
/// SSE2 kernel for scaling: `result[i] = vector[i] * scalar`.
///
/// The scalar is broadcast once; four lanes are handled per iteration with
/// unaligned loads and stores, and the remaining `vector.len() % 4` elements
/// go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn scale_vec_sse2(vector: &[f32], scalar: f32, result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = vector.len();
    // Bound the destination to `n` once so the raw-pointer stores below can
    // never leave `result`.
    let result = &mut result[..n];
    let factor = _mm_set1_ps(scalar);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and both slices hold at least `n` elements.
        unsafe {
            let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lanes, factor));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for (out, &x) in result[i..].iter_mut().zip(&vector[i..]) {
        *out = x * scalar;
    }
}
/// SSE2 kernel for absolute value: `result[i] = |vector[i]|`.
///
/// The sign bit of each lane is cleared with a bitwise AND. Four lanes are
/// handled per iteration; the remaining `vector.len() % 4` elements go
/// through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn abs_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = vector.len();
    // Bound the destination to `n` once so the raw-pointer stores below can
    // never leave `result`.
    let result = &mut result[..n];
    // Every bit except the sign bit. Built from integer lanes so the pattern
    // (which is a NaN when read as `f32`) never travels through a float value.
    let keep_magnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7FFF_FFFF));

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and both slices hold at least `n` elements.
        unsafe {
            let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_and_ps(lanes, keep_magnitude));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for (out, &x) in result[i..].iter_mut().zip(&vector[i..]) {
        *out = x.abs();
    }
}
/// SSE2 kernel for negation: `result[i] = -vector[i]`.
///
/// The sign bit of each lane is flipped with a bitwise XOR. Four lanes are
/// handled per iteration; the remaining `vector.len() % 4` elements go
/// through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn neg_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = vector.len();
    // Bound the destination to `n` once so the raw-pointer stores below can
    // never leave `result`.
    let result = &mut result[..n];
    // `-0.0` has only the sign bit set (0x8000_0000).
    let sign_bit = _mm_set1_ps(-0.0);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and both slices hold at least `n` elements.
        unsafe {
            let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_xor_ps(lanes, sign_bit));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for (out, &x) in result[i..].iter_mut().zip(&vector[i..]) {
        *out = -x;
    }
}
/// SSE2 kernel for reciprocals: `result[i] = 1.0 / vector[i]`.
///
/// Uses a true division (`_mm_div_ps`), not the approximate reciprocal
/// instruction. Four lanes are handled per iteration; the remaining
/// `vector.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn reciprocal_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = vector.len();
    // Bound the destination to `n` once so the raw-pointer stores below can
    // never leave `result`.
    let result = &mut result[..n];
    // Loop-invariant numerator, broadcast once.
    let ones = _mm_set1_ps(1.0);

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and both slices hold at least `n` elements.
        unsafe {
            let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_div_ps(ones, lanes));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for (out, &x) in result[i..].iter_mut().zip(&vector[i..]) {
        *out = 1.0 / x;
    }
}
/// SSE2 kernel for squaring: `result[i] = vector[i] * vector[i]`.
///
/// Four lanes are handled per iteration with unaligned loads and stores; the
/// remaining `vector.len() % 4` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support SSE2.
///
/// # Panics
/// Panics if `result` is shorter than `vector`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn square_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = vector.len();
    // Bound the destination to `n` once so the raw-pointer stores below can
    // never leave `result`.
    let result = &mut result[..n];

    let mut i = 0;
    while i + 4 <= n {
        // SAFETY: `i + 4 <= n` and both slices hold at least `n` elements.
        unsafe {
            let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
            _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lanes, lanes));
        }
        i += 4;
    }
    // Scalar tail for the last `n % 4` elements.
    for (out, &x) in result[i..].iter_mut().zip(&vector[i..]) {
        *out = x * x;
    }
}
/// AVX2 kernel for element-wise addition: `result[i] = a[i] + b[i]`.
///
/// Eight lanes are handled per iteration with unaligned 256-bit loads and
/// stores; the remaining `a.len() % 8` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support AVX2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 8 <= n {
        // SAFETY: `i + 8 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
            _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_add_ps(lhs, rhs));
        }
        i += 8;
    }
    // Scalar tail for the last `n % 8` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x + y;
    }
}
/// AVX2 kernel for element-wise subtraction: `result[i] = a[i] - b[i]`.
///
/// Eight lanes are handled per iteration with unaligned 256-bit loads and
/// stores; the remaining `a.len() % 8` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support AVX2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn subtract_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 8 <= n {
        // SAFETY: `i + 8 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
            _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_sub_ps(lhs, rhs));
        }
        i += 8;
    }
    // Scalar tail for the last `n % 8` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x - y;
    }
}
/// AVX2 kernel for element-wise multiplication: `result[i] = a[i] * b[i]`.
///
/// Eight lanes are handled per iteration with unaligned 256-bit loads and
/// stores; the remaining `a.len() % 8` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support AVX2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn multiply_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 8 <= n {
        // SAFETY: `i + 8 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
            _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_mul_ps(lhs, rhs));
        }
        i += 8;
    }
    // Scalar tail for the last `n % 8` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x * y;
    }
}
/// AVX2 kernel for element-wise division: `result[i] = a[i] / b[i]`.
///
/// Eight lanes are handled per iteration with unaligned 256-bit loads and
/// stores; the remaining `a.len() % 8` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support AVX2.
///
/// # Panics
/// Panics if `b` or `result` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn divide_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer accesses below
    // can never leave `b` or `result`.
    let (b, result) = (&b[..n], &mut result[..n]);

    let mut i = 0;
    while i + 8 <= n {
        // SAFETY: `i + 8 <= n` and every slice holds at least `n` elements.
        unsafe {
            let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
            let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
            _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_div_ps(lhs, rhs));
        }
        i += 8;
    }
    // Scalar tail for the last `n % 8` elements.
    for ((out, &x), &y) in result[i..].iter_mut().zip(&a[i..]).zip(&b[i..]) {
        *out = x / y;
    }
}
/// AVX2 kernel for in-place multiply-add: `a[i] = a[i] * b[i] + c[i]`.
///
/// This kernel does not use the FMA instruction set: each lane is a multiply
/// followed by a separate add. Eight lanes are handled per iteration; the
/// remaining `a.len() % 8` elements go through a scalar loop.
///
/// # Safety
/// The executing CPU must support AVX2.
///
/// # Panics
/// Panics if `b` or `c` is shorter than `a`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn fma_avx2(a: &mut [f32], b: &[f32], c: &[f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the module
    // that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let n = a.len();
    // Bound the other operands to `n` once so the raw-pointer loads below can
    // never leave `b` or `c`.
    let (b, c) = (&b[..n], &c[..n]);

    let mut i = 0;
    while i + 8 <= n {
        // SAFETY: `i + 8 <= n` and every slice holds at least `n` elements.
        unsafe {
            let acc = _mm256_loadu_ps(a.as_ptr().add(i));
            let mul = _mm256_loadu_ps(b.as_ptr().add(i));
            let add = _mm256_loadu_ps(c.as_ptr().add(i));
            _mm256_storeu_ps(
                a.as_mut_ptr().add(i),
                _mm256_add_ps(_mm256_mul_ps(acc, mul), add),
            );
        }
        i += 8;
    }
    // Scalar tail for the last `n % 8` elements.
    for ((acc, &mul), &add) in a[i..].iter_mut().zip(&b[i..]).zip(&c[i..]) {
        *acc = *acc * mul + add;
    }
}