1#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6use super::functions_2::{
7 abs_vec_avx2, abs_vec_avx512, add_vec_avx512, divide_vec_avx512, fma_fma_intrinsic,
8 multiply_vec_avx512, neg_vec_avx2, neg_vec_avx512, reciprocal_vec_avx2, reciprocal_vec_avx512,
9 scale_vec_avx2, scale_vec_avx512, square_vec_avx2, square_vec_avx512, subtract_vec_avx512,
10};
11#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
12use super::functions_2::{
13 abs_vec_neon, add_vec_neon, divide_vec_neon, fma_neon, multiply_vec_neon, neg_vec_neon,
14 reciprocal_vec_neon, scale_vec_neon, square_vec_neon, subtract_vec_neon,
15};
16
17pub fn add_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
42 assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
43 assert_eq!(
44 a.len(),
45 result.len(),
46 "Output vector must have the same length as input vectors"
47 );
48 if a.is_empty() {
49 return;
50 }
51 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
52 {
53 if crate::simd_feature_detected!("avx512f") {
54 unsafe { add_vec_avx512(a, b, result) };
55 return;
56 } else if crate::simd_feature_detected!("avx2") {
57 unsafe { add_vec_avx2(a, b, result) };
58 return;
59 } else if crate::simd_feature_detected!("sse2") {
60 unsafe { add_vec_sse2(a, b, result) };
61 return;
62 }
63 }
64 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
65 {
66 if std::arch::is_aarch64_feature_detected!("neon") {
67 unsafe { add_vec_neon(a, b, result) };
68 return;
69 }
70 }
71 add_vec_scalar(a, b, result);
72}
73pub fn subtract_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
97 assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
98 assert_eq!(
99 a.len(),
100 result.len(),
101 "Output vector must have the same length as input vectors"
102 );
103 if a.is_empty() {
104 return;
105 }
106 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
107 {
108 if crate::simd_feature_detected!("avx512f") {
109 unsafe { subtract_vec_avx512(a, b, result) };
110 return;
111 } else if crate::simd_feature_detected!("avx2") {
112 unsafe { subtract_vec_avx2(a, b, result) };
113 return;
114 } else if crate::simd_feature_detected!("sse2") {
115 unsafe { subtract_vec_sse2(a, b, result) };
116 return;
117 }
118 }
119 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
120 {
121 if std::arch::is_aarch64_feature_detected!("neon") {
122 unsafe { subtract_vec_neon(a, b, result) };
123 return;
124 }
125 }
126 subtract_vec_scalar(a, b, result);
127}
128pub fn multiply_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
152 assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
153 assert_eq!(
154 a.len(),
155 result.len(),
156 "Output vector must have the same length as input vectors"
157 );
158 if a.is_empty() {
159 return;
160 }
161 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
162 {
163 if crate::simd_feature_detected!("avx512f") {
164 unsafe { multiply_vec_avx512(a, b, result) };
165 return;
166 } else if crate::simd_feature_detected!("avx2") {
167 unsafe { multiply_vec_avx2(a, b, result) };
168 return;
169 } else if crate::simd_feature_detected!("sse2") {
170 unsafe { multiply_vec_sse2(a, b, result) };
171 return;
172 }
173 }
174 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
175 {
176 if std::arch::is_aarch64_feature_detected!("neon") {
177 unsafe { multiply_vec_neon(a, b, result) };
178 return;
179 }
180 }
181 multiply_vec_scalar(a, b, result);
182}
183pub fn divide_vec(a: &[f32], b: &[f32], result: &mut [f32]) {
208 assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
209 assert_eq!(
210 a.len(),
211 result.len(),
212 "Output vector must have the same length as input vectors"
213 );
214 if a.is_empty() {
215 return;
216 }
217 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
218 {
219 if crate::simd_feature_detected!("avx512f") {
220 unsafe { divide_vec_avx512(a, b, result) };
221 return;
222 } else if crate::simd_feature_detected!("avx2") {
223 unsafe { divide_vec_avx2(a, b, result) };
224 return;
225 } else if crate::simd_feature_detected!("sse2") {
226 unsafe { divide_vec_sse2(a, b, result) };
227 return;
228 }
229 }
230 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
231 {
232 if std::arch::is_aarch64_feature_detected!("neon") {
233 unsafe { divide_vec_neon(a, b, result) };
234 return;
235 }
236 }
237 divide_vec_scalar(a, b, result);
238}
239pub fn fma(a: &mut [f32], b: &[f32], c: &[f32]) {
266 assert_eq!(a.len(), b.len(), "Input vectors must have the same length");
267 assert_eq!(a.len(), c.len(), "Input vectors must have the same length");
268 if a.is_empty() {
269 return;
270 }
271 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
272 {
273 if crate::simd_feature_detected!("fma") {
274 unsafe { fma_fma_intrinsic(a, b, c) };
275 return;
276 } else if crate::simd_feature_detected!("avx2") {
277 unsafe { fma_avx2(a, b, c) };
278 return;
279 } else if crate::simd_feature_detected!("sse2") {
280 unsafe { fma_sse2(a, b, c) };
281 return;
282 }
283 }
284 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
285 {
286 if std::arch::is_aarch64_feature_detected!("neon") {
287 unsafe { fma_neon(a, b, c) };
288 return;
289 }
290 }
291 fma_scalar(a, b, c);
292}
293pub fn scale_vec(vector: &[f32], scalar: f32, result: &mut [f32]) {
317 assert_eq!(
318 vector.len(),
319 result.len(),
320 "Input and output vectors must have the same length"
321 );
322 if vector.is_empty() {
323 return;
324 }
325 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
326 {
327 if crate::simd_feature_detected!("avx512f") {
328 unsafe { scale_vec_avx512(vector, scalar, result) };
329 return;
330 } else if crate::simd_feature_detected!("avx2") {
331 unsafe { scale_vec_avx2(vector, scalar, result) };
332 return;
333 } else if crate::simd_feature_detected!("sse2") {
334 unsafe { scale_vec_sse2(vector, scalar, result) };
335 return;
336 }
337 }
338 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
339 {
340 if std::arch::is_aarch64_feature_detected!("neon") {
341 unsafe { scale_vec_neon(vector, scalar, result) };
342 return;
343 }
344 }
345 scale_vec_scalar(vector, scalar, result);
346}
347pub fn scale_vec_inplace(vector: &mut [f32], scalar: f32) {
365 let len = vector.len();
366 if len == 0 {
367 return;
368 }
369 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
370 {
371 if crate::simd_feature_detected!("avx512f") {
372 unsafe {
373 let mut i = 0;
374 while i + 16 <= len {
375 let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 16);
376 let result_slice =
377 core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 16);
378 scale_vec_avx512(vec_slice, scalar, result_slice);
379 i += 16;
380 }
381 while i < len {
382 vector[i] *= scalar;
383 i += 1;
384 }
385 }
386 return;
387 } else if crate::simd_feature_detected!("avx2") {
388 unsafe {
389 let mut i = 0;
390 while i + 8 <= len {
391 let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 8);
392 let result_slice =
393 core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 8);
394 scale_vec_avx2(vec_slice, scalar, result_slice);
395 i += 8;
396 }
397 while i < len {
398 vector[i] *= scalar;
399 i += 1;
400 }
401 }
402 return;
403 } else if crate::simd_feature_detected!("sse2") {
404 unsafe {
405 let mut i = 0;
406 while i + 4 <= len {
407 let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 4);
408 let result_slice =
409 core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 4);
410 scale_vec_sse2(vec_slice, scalar, result_slice);
411 i += 4;
412 }
413 while i < len {
414 vector[i] *= scalar;
415 i += 1;
416 }
417 }
418 return;
419 }
420 }
421 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
422 {
423 if std::arch::is_aarch64_feature_detected!("neon") {
424 unsafe {
425 let mut i = 0;
426 while i + 4 <= len {
427 let vec_slice = core::slice::from_raw_parts(vector.as_ptr().add(i), 4);
428 let result_slice =
429 core::slice::from_raw_parts_mut(vector.as_mut_ptr().add(i), 4);
430 scale_vec_neon(vec_slice, scalar, result_slice);
431 i += 4;
432 }
433 while i < len {
434 vector[i] *= scalar;
435 i += 1;
436 }
437 }
438 return;
439 }
440 }
441 for v in vector[..len].iter_mut() {
442 *v *= scalar;
443 }
444}
445pub fn abs_vec(vector: &[f32], result: &mut [f32]) {
467 assert_eq!(
468 vector.len(),
469 result.len(),
470 "Input and output vectors must have the same length"
471 );
472 if vector.is_empty() {
473 return;
474 }
475 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
476 {
477 if crate::simd_feature_detected!("avx512f") {
478 unsafe { abs_vec_avx512(vector, result) };
479 return;
480 } else if crate::simd_feature_detected!("avx2") {
481 unsafe { abs_vec_avx2(vector, result) };
482 return;
483 } else if crate::simd_feature_detected!("sse2") {
484 unsafe { abs_vec_sse2(vector, result) };
485 return;
486 }
487 }
488 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
489 {
490 if std::arch::is_aarch64_feature_detected!("neon") {
491 unsafe { abs_vec_neon(vector, result) };
492 return;
493 }
494 }
495 abs_vec_scalar(vector, result);
496}
497pub fn neg_vec(vector: &[f32], result: &mut [f32]) {
519 assert_eq!(
520 vector.len(),
521 result.len(),
522 "Input and output vectors must have the same length"
523 );
524 if vector.is_empty() {
525 return;
526 }
527 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
528 {
529 if crate::simd_feature_detected!("avx512f") {
530 unsafe { neg_vec_avx512(vector, result) };
531 return;
532 } else if crate::simd_feature_detected!("avx2") {
533 unsafe { neg_vec_avx2(vector, result) };
534 return;
535 } else if crate::simd_feature_detected!("sse2") {
536 unsafe { neg_vec_sse2(vector, result) };
537 return;
538 }
539 }
540 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
541 {
542 if std::arch::is_aarch64_feature_detected!("neon") {
543 unsafe { neg_vec_neon(vector, result) };
544 return;
545 }
546 }
547 neg_vec_scalar(vector, result);
548}
549pub fn reciprocal_vec(vector: &[f32], result: &mut [f32]) {
572 assert_eq!(
573 vector.len(),
574 result.len(),
575 "Input and output vectors must have the same length"
576 );
577 if vector.is_empty() {
578 return;
579 }
580 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
581 {
582 if crate::simd_feature_detected!("avx512f") {
583 unsafe { reciprocal_vec_avx512(vector, result) };
584 return;
585 } else if crate::simd_feature_detected!("avx2") {
586 unsafe { reciprocal_vec_avx2(vector, result) };
587 return;
588 } else if crate::simd_feature_detected!("sse2") {
589 unsafe { reciprocal_vec_sse2(vector, result) };
590 return;
591 }
592 }
593 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
594 {
595 if std::arch::is_aarch64_feature_detected!("neon") {
596 unsafe { reciprocal_vec_neon(vector, result) };
597 return;
598 }
599 }
600 reciprocal_vec_scalar(vector, result);
601}
602pub fn square_vec(vector: &[f32], result: &mut [f32]) {
624 assert_eq!(
625 vector.len(),
626 result.len(),
627 "Input and output vectors must have the same length"
628 );
629 if vector.is_empty() {
630 return;
631 }
632 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
633 {
634 if crate::simd_feature_detected!("avx512f") {
635 unsafe { square_vec_avx512(vector, result) };
636 return;
637 } else if crate::simd_feature_detected!("avx2") {
638 unsafe { square_vec_avx2(vector, result) };
639 return;
640 } else if crate::simd_feature_detected!("sse2") {
641 unsafe { square_vec_sse2(vector, result) };
642 return;
643 }
644 }
645 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
646 {
647 if std::arch::is_aarch64_feature_detected!("neon") {
648 unsafe { square_vec_neon(vector, result) };
649 return;
650 }
651 }
652 square_vec_scalar(vector, result);
653}
/// Scalar fallback for element-wise addition: `result[i] = a[i] + b[i]` for
/// every index of `a`. Elements of `result` beyond `a.len()` are left untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `b` or `result` is shorter than `a`.
fn add_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // Reslice once so the length check happens up front instead of per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x + y;
    }
}
/// Scalar fallback for element-wise subtraction: `result[i] = a[i] - b[i]` for
/// every index of `a`. Elements of `result` beyond `a.len()` are left untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `b` or `result` is shorter than `a`.
fn subtract_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // Reslice once so the length check happens up front instead of per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x - y;
    }
}
/// Scalar fallback for element-wise multiplication: `result[i] = a[i] * b[i]`
/// for every index of `a`. Elements of `result` beyond `a.len()` are left
/// untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `b` or `result` is shorter than `a`.
fn multiply_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // Reslice once so the length check happens up front instead of per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x * y;
    }
}
/// Scalar fallback for element-wise division: `result[i] = a[i] / b[i]` for
/// every index of `a`. Division follows `f32` semantics (no zero check).
/// Elements of `result` beyond `a.len()` are left untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `b` or `result` is shorter than `a`.
fn divide_vec_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    // Reslice once so the length check happens up front instead of per element.
    let (b, result) = (&b[..n], &mut result[..n]);
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x / y;
    }
}
/// Scalar fallback for the in-place multiply-add `a[i] = a[i] * b[i] + c[i]`.
///
/// The multiply and the add are separate `f32` operations (not fused).
///
/// # Panics
///
/// Panics, before modifying `a`, if `b` or `c` is shorter than `a`.
fn fma_scalar(a: &mut [f32], b: &[f32], c: &[f32]) {
    let n = a.len();
    // Reslice once so the length check happens up front instead of per element.
    let (b, c) = (&b[..n], &c[..n]);
    for ((acc, &factor), &addend) in a.iter_mut().zip(b).zip(c) {
        *acc = *acc * factor + addend;
    }
}
/// Scalar fallback for scaling: `result[i] = vector[i] * scalar` for every
/// index of `vector`. Elements of `result` beyond `vector.len()` are left
/// untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `result` is shorter than `vector`.
fn scale_vec_scalar(vector: &[f32], scalar: f32, result: &mut [f32]) {
    // Reslice once so the length check happens up front instead of per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x * scalar;
    }
}
/// Scalar fallback for absolute value: `result[i] = vector[i].abs()` for every
/// index of `vector`. Elements of `result` beyond `vector.len()` are left
/// untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `result` is shorter than `vector`.
fn abs_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // Reslice once so the length check happens up front instead of per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x.abs();
    }
}
/// Scalar fallback for negation: `result[i] = -vector[i]` for every index of
/// `vector`. Elements of `result` beyond `vector.len()` are left untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `result` is shorter than `vector`.
fn neg_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // Reslice once so the length check happens up front instead of per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = -x;
    }
}
/// Scalar fallback for the reciprocal: `result[i] = 1.0 / vector[i]` for every
/// index of `vector`. Division follows `f32` semantics (no zero check).
/// Elements of `result` beyond `vector.len()` are left untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `result` is shorter than `vector`.
fn reciprocal_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // Reslice once so the length check happens up front instead of per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = 1.0 / x;
    }
}
/// Scalar fallback for squaring: `result[i] = vector[i] * vector[i]` for every
/// index of `vector`. Elements of `result` beyond `vector.len()` are left
/// untouched.
///
/// # Panics
///
/// Panics, before writing anything, if `result` is shorter than `vector`.
fn square_vec_scalar(vector: &[f32], result: &mut [f32]) {
    // Reslice once so the length check happens up front instead of per element.
    let result = &mut result[..vector.len()];
    for (out, &x) in result.iter_mut().zip(vector) {
        *out = x * x;
    }
}
/// SSE2 kernel for element-wise addition: `result[i] = a[i] + b[i]`.
///
/// Handles four lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 4` elements with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn add_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let lhs = _mm_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm_loadu_ps(b.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_add_ps(lhs, rhs));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] + b[i];
        i += 1;
    }
}
/// SSE2 kernel for element-wise subtraction: `result[i] = a[i] - b[i]`.
///
/// Handles four lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 4` elements with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn subtract_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let lhs = _mm_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm_loadu_ps(b.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_sub_ps(lhs, rhs));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] - b[i];
        i += 1;
    }
}
/// SSE2 kernel for element-wise multiplication: `result[i] = a[i] * b[i]`.
///
/// Handles four lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 4` elements with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn multiply_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let lhs = _mm_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm_loadu_ps(b.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lhs, rhs));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] * b[i];
        i += 1;
    }
}
/// SSE2 kernel for element-wise division: `result[i] = a[i] / b[i]`.
///
/// Handles four lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 4` elements with scalar code. Zero
/// divisors are not checked.
///
/// # Safety
///
/// The CPU must support SSE2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn divide_vec_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let lhs = _mm_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm_loadu_ps(b.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_div_ps(lhs, rhs));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] / b[i];
        i += 1;
    }
}
/// SSE2 kernel for the in-place multiply-add `a[i] = a[i] * b[i] + c[i]`.
///
/// SSE2 has no fused instruction, so this is a separate multiply followed by
/// an add. Four lanes are handled per iteration with unaligned loads/stores;
/// the remaining `a.len() % 4` elements are finished with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `b` and `c` must each hold at least
/// `a.len()` elements because the vector loop reads them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn fma_sse2(a: &mut [f32], b: &[f32], c: &[f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && c.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let acc = _mm_loadu_ps(a.as_ptr().add(i));
        let factor = _mm_loadu_ps(b.as_ptr().add(i));
        let addend = _mm_loadu_ps(c.as_ptr().add(i));
        let updated = _mm_add_ps(_mm_mul_ps(acc, factor), addend);
        _mm_storeu_ps(a.as_mut_ptr().add(i), updated);
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        a[i] = a[i] * b[i] + c[i];
        i += 1;
    }
}
/// SSE2 kernel for scaling: `result[i] = vector[i] * scalar`.
///
/// The scalar is broadcast to all four lanes once. Four lanes are handled per
/// iteration with unaligned loads/stores, and the remaining
/// `vector.len() % 4` elements are finished with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `result` must hold at least `vector.len()`
/// elements because the vector loop writes through a raw pointer.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn scale_vec_sse2(vector: &[f32], scalar: f32, result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = vector.len();
    debug_assert!(result.len() >= len);

    let factor = _mm_set1_ps(scalar);
    let mut i = 0;
    while i + 4 <= len {
        let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lanes, factor));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = vector[i] * scalar;
        i += 1;
    }
}
/// SSE2 kernel for absolute value: `result[i] = |vector[i]|`.
///
/// The vector loop clears the sign bit by AND-ing each lane with
/// `0x7FFF_FFFF`. The remaining `vector.len() % 4` elements use `f32::abs`.
///
/// # Safety
///
/// The CPU must support SSE2, and `result` must hold at least `vector.len()`
/// elements because the vector loop writes through a raw pointer.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn abs_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = vector.len();
    debug_assert!(result.len() >= len);

    // Every bit set except the IEEE-754 sign bit.
    let clear_sign = _mm_set1_ps(f32::from_bits(0x7FFFFFFF));
    let mut i = 0;
    while i + 4 <= len {
        let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_and_ps(lanes, clear_sign));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = vector[i].abs();
        i += 1;
    }
}
/// SSE2 kernel for negation: `result[i] = -vector[i]`.
///
/// The vector loop flips the sign bit by XOR-ing each lane with
/// `0x8000_0000`. The remaining `vector.len() % 4` elements use unary minus.
///
/// # Safety
///
/// The CPU must support SSE2, and `result` must hold at least `vector.len()`
/// elements because the vector loop writes through a raw pointer.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn neg_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = vector.len();
    debug_assert!(result.len() >= len);

    // Only the IEEE-754 sign bit set.
    let flip_sign = _mm_set1_ps(f32::from_bits(0x80000000));
    let mut i = 0;
    while i + 4 <= len {
        let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_xor_ps(lanes, flip_sign));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = -vector[i];
        i += 1;
    }
}
/// SSE2 kernel for the reciprocal: `result[i] = 1.0 / vector[i]`.
///
/// Uses a full-precision divide (`_mm_div_ps`), not an approximate reciprocal
/// instruction. Four lanes are handled per iteration; the remaining
/// `vector.len() % 4` elements are finished with scalar code. Zero inputs are
/// not checked.
///
/// # Safety
///
/// The CPU must support SSE2, and `result` must hold at least `vector.len()`
/// elements because the vector loop writes through a raw pointer.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn reciprocal_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = vector.len();
    debug_assert!(result.len() >= len);

    // Loop-invariant numerator, broadcast once.
    let ones = _mm_set1_ps(1.0);
    let mut i = 0;
    while i + 4 <= len {
        let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_div_ps(ones, lanes));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = 1.0 / vector[i];
        i += 1;
    }
}
/// SSE2 kernel for squaring: `result[i] = vector[i] * vector[i]`.
///
/// Handles four lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `vector.len() % 4` elements with scalar code.
///
/// # Safety
///
/// The CPU must support SSE2, and `result` must hold at least `vector.len()`
/// elements because the vector loop writes through a raw pointer.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn square_vec_sse2(vector: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = vector.len();
    debug_assert!(result.len() >= len);

    let mut i = 0;
    while i + 4 <= len {
        let lanes = _mm_loadu_ps(vector.as_ptr().add(i));
        _mm_storeu_ps(result.as_mut_ptr().add(i), _mm_mul_ps(lanes, lanes));
        i += 4;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = vector[i] * vector[i];
        i += 1;
    }
}
/// 256-bit kernel (gated on `avx2`) for element-wise addition:
/// `result[i] = a[i] + b[i]`.
///
/// Handles eight lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 8` elements with scalar code.
///
/// # Safety
///
/// The CPU must support AVX2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 8 <= len {
        let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
        _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_add_ps(lhs, rhs));
        i += 8;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] + b[i];
        i += 1;
    }
}
/// 256-bit kernel (gated on `avx2`) for element-wise subtraction:
/// `result[i] = a[i] - b[i]`.
///
/// Handles eight lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 8` elements with scalar code.
///
/// # Safety
///
/// The CPU must support AVX2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn subtract_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 8 <= len {
        let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
        _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_sub_ps(lhs, rhs));
        i += 8;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] - b[i];
        i += 1;
    }
}
/// 256-bit kernel (gated on `avx2`) for element-wise multiplication:
/// `result[i] = a[i] * b[i]`.
///
/// Handles eight lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 8` elements with scalar code.
///
/// # Safety
///
/// The CPU must support AVX2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn multiply_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 8 <= len {
        let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
        _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_mul_ps(lhs, rhs));
        i += 8;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] * b[i];
        i += 1;
    }
}
/// 256-bit kernel (gated on `avx2`) for element-wise division:
/// `result[i] = a[i] / b[i]`.
///
/// Handles eight lanes per iteration with unaligned loads/stores, then
/// finishes the remaining `a.len() % 8` elements with scalar code. Zero
/// divisors are not checked.
///
/// # Safety
///
/// The CPU must support AVX2, and `b` and `result` must each hold at least
/// `a.len()` elements because the vector loop accesses them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn divide_vec_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && result.len() >= len);

    let mut i = 0;
    while i + 8 <= len {
        let lhs = _mm256_loadu_ps(a.as_ptr().add(i));
        let rhs = _mm256_loadu_ps(b.as_ptr().add(i));
        _mm256_storeu_ps(result.as_mut_ptr().add(i), _mm256_div_ps(lhs, rhs));
        i += 8;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        result[i] = a[i] / b[i];
        i += 1;
    }
}
/// 256-bit kernel (gated on `avx2`) for the in-place multiply-add
/// `a[i] = a[i] * b[i] + c[i]`.
///
/// This path is not fused: it issues a separate multiply and add. Eight lanes
/// are handled per iteration with unaligned loads/stores; the remaining
/// `a.len() % 8` elements are finished with scalar code.
///
/// # Safety
///
/// The CPU must support AVX2, and `b` and `c` must each hold at least
/// `a.len()` elements because the vector loop reads them through raw
/// pointers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn fma_avx2(a: &mut [f32], b: &[f32], c: &[f32]) {
    // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
    // intrinsics module that matches the target.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let len = a.len();
    debug_assert!(b.len() >= len && c.len() >= len);

    let mut i = 0;
    while i + 8 <= len {
        let acc = _mm256_loadu_ps(a.as_ptr().add(i));
        let factor = _mm256_loadu_ps(b.as_ptr().add(i));
        let addend = _mm256_loadu_ps(c.as_ptr().add(i));
        let updated = _mm256_add_ps(_mm256_mul_ps(acc, factor), addend);
        _mm256_storeu_ps(a.as_mut_ptr().add(i), updated);
        i += 8;
    }
    // Scalar tail for the elements that do not fill a whole vector.
    while i < len {
        a[i] = a[i] * b[i] + c[i];
        i += 1;
    }
}