1#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
24use std::arch::is_aarch64_feature_detected;
25
26pub fn sin_vec(input: &[f32], result: &mut [f32]) {
52 assert_eq!(
53 input.len(),
54 result.len(),
55 "Input and output vectors must have the same length"
56 );
57
58 if input.is_empty() {
59 return;
60 }
61
62 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
63 {
64 if crate::simd_feature_detected!("avx512f") {
65 unsafe { sin_vec_avx512(input, result) };
66 return;
67 } else if crate::simd_feature_detected!("avx2") {
68 unsafe { sin_vec_avx2(input, result) };
69 return;
70 } else if crate::simd_feature_detected!("sse2") {
71 unsafe { sin_vec_sse2(input, result) };
72 return;
73 }
74 }
75
76 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
77 {
78 if is_aarch64_feature_detected!("neon") {
79 unsafe { sin_vec_neon(input, result) };
80 return;
81 }
82 }
83
84 sin_vec_scalar(input, result);
85}
86
87pub fn cos_vec(input: &[f32], result: &mut [f32]) {
113 assert_eq!(
114 input.len(),
115 result.len(),
116 "Input and output vectors must have the same length"
117 );
118
119 if input.is_empty() {
120 return;
121 }
122
123 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
124 {
125 if crate::simd_feature_detected!("avx512f") {
126 unsafe { cos_vec_avx512(input, result) };
127 return;
128 } else if crate::simd_feature_detected!("avx2") {
129 unsafe { cos_vec_avx2(input, result) };
130 return;
131 } else if crate::simd_feature_detected!("sse2") {
132 unsafe { cos_vec_sse2(input, result) };
133 return;
134 }
135 }
136
137 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
138 {
139 if is_aarch64_feature_detected!("neon") {
140 unsafe { cos_vec_neon(input, result) };
141 return;
142 }
143 }
144
145 cos_vec_scalar(input, result);
146}
147
148pub fn tan_vec(input: &[f32], result: &mut [f32]) {
173 assert_eq!(
174 input.len(),
175 result.len(),
176 "Input and output vectors must have the same length"
177 );
178
179 if input.is_empty() {
180 return;
181 }
182
183 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
184 {
185 if crate::simd_feature_detected!("avx512f") {
186 unsafe { tan_vec_avx512(input, result) };
187 return;
188 } else if crate::simd_feature_detected!("avx2") {
189 unsafe { tan_vec_avx2(input, result) };
190 return;
191 } else if crate::simd_feature_detected!("sse2") {
192 unsafe { tan_vec_sse2(input, result) };
193 return;
194 }
195 }
196
197 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
198 {
199 if is_aarch64_feature_detected!("neon") {
200 unsafe { tan_vec_neon(input, result) };
201 return;
202 }
203 }
204
205 tan_vec_scalar(input, result);
206}
207
208pub fn exp_vec(input: &[f32], result: &mut [f32]) {
232 assert_eq!(
233 input.len(),
234 result.len(),
235 "Input and output vectors must have the same length"
236 );
237
238 if input.is_empty() {
239 return;
240 }
241
242 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
243 {
244 if crate::simd_feature_detected!("avx512f") {
245 unsafe { exp_vec_avx512(input, result) };
246 return;
247 } else if crate::simd_feature_detected!("avx2") {
248 unsafe { exp_vec_avx2(input, result) };
249 return;
250 } else if crate::simd_feature_detected!("sse2") {
251 unsafe { exp_vec_sse2(input, result) };
252 return;
253 }
254 }
255
256 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
257 {
258 if is_aarch64_feature_detected!("neon") {
259 unsafe { exp_vec_neon(input, result) };
260 return;
261 }
262 }
263
264 exp_vec_scalar(input, result);
265}
266
267pub fn ln_vec(input: &[f32], result: &mut [f32]) {
292 assert_eq!(
293 input.len(),
294 result.len(),
295 "Input and output vectors must have the same length"
296 );
297
298 if input.is_empty() {
299 return;
300 }
301
302 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
303 {
304 if crate::simd_feature_detected!("avx512f") {
305 unsafe { ln_vec_avx512(input, result) };
306 return;
307 } else if crate::simd_feature_detected!("avx2") {
308 unsafe { ln_vec_avx2(input, result) };
309 return;
310 } else if crate::simd_feature_detected!("sse2") {
311 unsafe { ln_vec_sse2(input, result) };
312 return;
313 }
314 }
315
316 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
317 {
318 if is_aarch64_feature_detected!("neon") {
319 unsafe { ln_vec_neon(input, result) };
320 return;
321 }
322 }
323
324 ln_vec_scalar(input, result);
325}
326
327pub fn sqrt_vec(input: &[f32], result: &mut [f32]) {
354 assert_eq!(
355 input.len(),
356 result.len(),
357 "Input and output vectors must have the same length"
358 );
359
360 if input.is_empty() {
361 return;
362 }
363
364 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
365 {
366 if crate::simd_feature_detected!("avx512f") {
367 unsafe { sqrt_vec_avx512(input, result) };
368 return;
369 } else if crate::simd_feature_detected!("avx2") {
370 unsafe { sqrt_vec_avx2(input, result) };
371 return;
372 } else if crate::simd_feature_detected!("sse2") {
373 unsafe { sqrt_vec_sse2(input, result) };
374 return;
375 }
376 }
377
378 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
379 {
380 if is_aarch64_feature_detected!("neon") {
381 unsafe { sqrt_vec_neon(input, result) };
382 return;
383 }
384 }
385
386 sqrt_vec_scalar(input, result);
387}
388
389pub fn pow_vec(base: &[f32], exponent: &[f32], result: &mut [f32]) {
416 assert_eq!(
417 base.len(),
418 exponent.len(),
419 "Input vectors must have the same length"
420 );
421 assert_eq!(
422 base.len(),
423 result.len(),
424 "Output vector must have the same length as input vectors"
425 );
426
427 if base.is_empty() {
428 return;
429 }
430
431 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
432 {
433 if crate::simd_feature_detected!("avx512f") {
434 unsafe { pow_vec_avx512(base, exponent, result) };
435 return;
436 } else if crate::simd_feature_detected!("avx2") {
437 unsafe { pow_vec_avx2(base, exponent, result) };
438 return;
439 } else if crate::simd_feature_detected!("sse2") {
440 unsafe { pow_vec_sse2(base, exponent, result) };
441 return;
442 }
443 }
444
445 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
446 {
447 if is_aarch64_feature_detected!("neon") {
448 unsafe { pow_vec_neon(base, exponent, result) };
449 return;
450 }
451 }
452
453 pow_vec_scalar(base, exponent, result);
454}
455
456pub fn square_vec(input: &[f32], result: &mut [f32]) {
482 assert_eq!(
483 input.len(),
484 result.len(),
485 "Input and output vectors must have the same length"
486 );
487
488 if input.is_empty() {
489 return;
490 }
491
492 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
493 {
494 if crate::simd_feature_detected!("avx512f") {
495 unsafe { square_vec_avx512(input, result) };
496 return;
497 } else if crate::simd_feature_detected!("avx2") {
498 unsafe { square_vec_avx2(input, result) };
499 return;
500 } else if crate::simd_feature_detected!("sse2") {
501 unsafe { square_vec_sse2(input, result) };
502 return;
503 }
504 }
505
506 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
507 {
508 if is_aarch64_feature_detected!("neon") {
509 unsafe { square_vec_neon(input, result) };
510 return;
511 }
512 }
513
514 square_vec_scalar(input, result);
515}
516
517pub fn reciprocal_vec(input: &[f32], result: &mut [f32]) {
543 assert_eq!(
544 input.len(),
545 result.len(),
546 "Input and output vectors must have the same length"
547 );
548
549 if input.is_empty() {
550 return;
551 }
552
553 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
554 {
555 if crate::simd_feature_detected!("avx512f") {
556 unsafe { reciprocal_vec_avx512(input, result) };
557 return;
558 } else if crate::simd_feature_detected!("avx2") {
559 unsafe { reciprocal_vec_avx2(input, result) };
560 return;
561 } else if crate::simd_feature_detected!("sse2") {
562 unsafe { reciprocal_vec_sse2(input, result) };
563 return;
564 }
565 }
566
567 #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
568 {
569 if is_aarch64_feature_detected!("neon") {
570 unsafe { reciprocal_vec_neon(input, result) };
571 return;
572 }
573 }
574
575 reciprocal_vec_scalar(input, result);
576}
577
578fn sin_vec_scalar(input: &[f32], result: &mut [f32]) {
583 for i in 0..input.len() {
584 result[i] = input[i].sin();
585 }
586}
587
588fn cos_vec_scalar(input: &[f32], result: &mut [f32]) {
589 for i in 0..input.len() {
590 result[i] = input[i].cos();
591 }
592}
593
594fn tan_vec_scalar(input: &[f32], result: &mut [f32]) {
595 for i in 0..input.len() {
596 result[i] = input[i].tan();
597 }
598}
599
600fn exp_vec_scalar(input: &[f32], result: &mut [f32]) {
601 for i in 0..input.len() {
602 result[i] = input[i].exp();
603 }
604}
605
606fn ln_vec_scalar(input: &[f32], result: &mut [f32]) {
607 for i in 0..input.len() {
608 result[i] = input[i].ln();
609 }
610}
611
612fn sqrt_vec_scalar(input: &[f32], result: &mut [f32]) {
613 for i in 0..input.len() {
614 result[i] = input[i].sqrt();
615 }
616}
617
618fn pow_vec_scalar(base: &[f32], exponent: &[f32], result: &mut [f32]) {
619 for i in 0..base.len() {
620 result[i] = base[i].powf(exponent[i]);
621 }
622}
623
624fn square_vec_scalar(input: &[f32], result: &mut [f32]) {
625 for i in 0..input.len() {
626 result[i] = input[i] * input[i];
627 }
628}
629
630fn reciprocal_vec_scalar(input: &[f32], result: &mut [f32]) {
631 for i in 0..input.len() {
632 result[i] = 1.0 / input[i];
633 }
634}
635
636#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
641#[target_feature(enable = "sse2")]
642unsafe fn sin_vec_sse2(input: &[f32], result: &mut [f32]) {
643 let mut i = 0;
644
645 while i < input.len() {
649 result[i] = input[i].sin();
650 i += 1;
651 }
652}
653
654#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
655#[target_feature(enable = "sse2")]
656unsafe fn cos_vec_sse2(input: &[f32], result: &mut [f32]) {
657 let mut i = 0;
658
659 while i < input.len() {
660 result[i] = input[i].cos();
661 i += 1;
662 }
663}
664
665#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
666#[target_feature(enable = "sse2")]
667unsafe fn tan_vec_sse2(input: &[f32], result: &mut [f32]) {
668 let mut i = 0;
669
670 while i < input.len() {
671 result[i] = input[i].tan();
672 i += 1;
673 }
674}
675
676#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
677#[target_feature(enable = "sse2")]
678unsafe fn exp_vec_sse2(input: &[f32], result: &mut [f32]) {
679 let mut i = 0;
680
681 while i < input.len() {
682 result[i] = input[i].exp();
683 i += 1;
684 }
685}
686
687#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
688#[target_feature(enable = "sse2")]
689unsafe fn ln_vec_sse2(input: &[f32], result: &mut [f32]) {
690 let mut i = 0;
691
692 while i < input.len() {
693 result[i] = input[i].ln();
694 i += 1;
695 }
696}
697
698#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
699#[target_feature(enable = "sse2")]
700unsafe fn sqrt_vec_sse2(input: &[f32], result: &mut [f32]) {
701 #[cfg(feature = "no-std")]
702 use core::arch::x86_64::*;
703 #[cfg(not(feature = "no-std"))]
704 use core::arch::x86_64::*;
705
706 let mut i = 0;
707
708 while i + 4 <= input.len() {
710 let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
711 let result_vec = _mm_sqrt_ps(input_vec);
712 _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
713 i += 4;
714 }
715
716 while i < input.len() {
718 result[i] = input[i].sqrt();
719 i += 1;
720 }
721}
722
723#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
724#[target_feature(enable = "sse2")]
725unsafe fn pow_vec_sse2(base: &[f32], exponent: &[f32], result: &mut [f32]) {
726 let mut i = 0;
727
728 while i < base.len() {
729 result[i] = base[i].powf(exponent[i]);
730 i += 1;
731 }
732}
733
734#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
735#[target_feature(enable = "sse2")]
736unsafe fn square_vec_sse2(input: &[f32], result: &mut [f32]) {
737 #[cfg(feature = "no-std")]
738 use core::arch::x86_64::*;
739 #[cfg(not(feature = "no-std"))]
740 use core::arch::x86_64::*;
741
742 let mut i = 0;
743
744 while i + 4 <= input.len() {
746 let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
747 let result_vec = _mm_mul_ps(input_vec, input_vec);
748 _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
749 i += 4;
750 }
751
752 while i < input.len() {
754 result[i] = input[i] * input[i];
755 i += 1;
756 }
757}
758
759#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
760#[target_feature(enable = "sse2")]
761unsafe fn reciprocal_vec_sse2(input: &[f32], result: &mut [f32]) {
762 #[cfg(feature = "no-std")]
763 use core::arch::x86_64::*;
764 #[cfg(not(feature = "no-std"))]
765 use core::arch::x86_64::*;
766
767 let mut i = 0;
768
769 while i + 4 <= input.len() {
771 let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
772 let ones = _mm_set1_ps(1.0);
773 let result_vec = _mm_div_ps(ones, input_vec);
774 _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
775 i += 4;
776 }
777
778 while i < input.len() {
780 result[i] = 1.0 / input[i];
781 i += 1;
782 }
783}
784
785#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
790#[target_feature(enable = "avx2")]
791unsafe fn sin_vec_avx2(input: &[f32], result: &mut [f32]) {
792 let mut i = 0;
793
794 while i < input.len() {
795 result[i] = input[i].sin();
796 i += 1;
797 }
798}
799
800#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
801#[target_feature(enable = "avx2")]
802unsafe fn cos_vec_avx2(input: &[f32], result: &mut [f32]) {
803 let mut i = 0;
804
805 while i < input.len() {
806 result[i] = input[i].cos();
807 i += 1;
808 }
809}
810
811#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
812#[target_feature(enable = "avx2")]
813unsafe fn tan_vec_avx2(input: &[f32], result: &mut [f32]) {
814 let mut i = 0;
815
816 while i < input.len() {
817 result[i] = input[i].tan();
818 i += 1;
819 }
820}
821
822#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
823#[target_feature(enable = "avx2")]
824unsafe fn exp_vec_avx2(input: &[f32], result: &mut [f32]) {
825 let mut i = 0;
826
827 while i < input.len() {
828 result[i] = input[i].exp();
829 i += 1;
830 }
831}
832
833#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
834#[target_feature(enable = "avx2")]
835unsafe fn ln_vec_avx2(input: &[f32], result: &mut [f32]) {
836 let mut i = 0;
837
838 while i < input.len() {
839 result[i] = input[i].ln();
840 i += 1;
841 }
842}
843
844#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
845#[target_feature(enable = "avx2")]
846unsafe fn sqrt_vec_avx2(input: &[f32], result: &mut [f32]) {
847 #[cfg(feature = "no-std")]
848 use core::arch::x86_64::*;
849 #[cfg(not(feature = "no-std"))]
850 use core::arch::x86_64::*;
851
852 let mut i = 0;
853
854 while i + 8 <= input.len() {
856 let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
857 let result_vec = _mm256_sqrt_ps(input_vec);
858 _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
859 i += 8;
860 }
861
862 while i < input.len() {
864 result[i] = input[i].sqrt();
865 i += 1;
866 }
867}
868
869#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
870#[target_feature(enable = "avx2")]
871unsafe fn pow_vec_avx2(base: &[f32], exponent: &[f32], result: &mut [f32]) {
872 let mut i = 0;
873
874 while i < base.len() {
875 result[i] = base[i].powf(exponent[i]);
876 i += 1;
877 }
878}
879
880#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
881#[target_feature(enable = "avx2")]
882unsafe fn square_vec_avx2(input: &[f32], result: &mut [f32]) {
883 #[cfg(feature = "no-std")]
884 use core::arch::x86_64::*;
885 #[cfg(not(feature = "no-std"))]
886 use core::arch::x86_64::*;
887
888 let mut i = 0;
889
890 while i + 8 <= input.len() {
892 let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
893 let result_vec = _mm256_mul_ps(input_vec, input_vec);
894 _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
895 i += 8;
896 }
897
898 while i < input.len() {
900 result[i] = input[i] * input[i];
901 i += 1;
902 }
903}
904
905#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
906#[target_feature(enable = "avx2")]
907unsafe fn reciprocal_vec_avx2(input: &[f32], result: &mut [f32]) {
908 #[cfg(feature = "no-std")]
909 use core::arch::x86_64::*;
910 #[cfg(not(feature = "no-std"))]
911 use core::arch::x86_64::*;
912
913 let mut i = 0;
914
915 while i + 8 <= input.len() {
917 let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
918 let ones = _mm256_set1_ps(1.0);
919 let result_vec = _mm256_div_ps(ones, input_vec);
920 _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
921 i += 8;
922 }
923
924 while i < input.len() {
926 result[i] = 1.0 / input[i];
927 i += 1;
928 }
929}
930
931#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
936#[target_feature(enable = "avx512f")]
937unsafe fn sin_vec_avx512(input: &[f32], result: &mut [f32]) {
938 let mut i = 0;
939
940 while i < input.len() {
941 result[i] = input[i].sin();
942 i += 1;
943 }
944}
945
946#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
947#[target_feature(enable = "avx512f")]
948unsafe fn cos_vec_avx512(input: &[f32], result: &mut [f32]) {
949 let mut i = 0;
950
951 while i < input.len() {
952 result[i] = input[i].cos();
953 i += 1;
954 }
955}
956
957#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
958#[target_feature(enable = "avx512f")]
959unsafe fn tan_vec_avx512(input: &[f32], result: &mut [f32]) {
960 let mut i = 0;
961
962 while i < input.len() {
963 result[i] = input[i].tan();
964 i += 1;
965 }
966}
967
968#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
969#[target_feature(enable = "avx512f")]
970unsafe fn exp_vec_avx512(input: &[f32], result: &mut [f32]) {
971 let mut i = 0;
972
973 while i < input.len() {
974 result[i] = input[i].exp();
975 i += 1;
976 }
977}
978
979#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
980#[target_feature(enable = "avx512f")]
981unsafe fn ln_vec_avx512(input: &[f32], result: &mut [f32]) {
982 let mut i = 0;
983
984 while i < input.len() {
985 result[i] = input[i].ln();
986 i += 1;
987 }
988}
989
990#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
991#[target_feature(enable = "avx512f")]
992unsafe fn sqrt_vec_avx512(input: &[f32], result: &mut [f32]) {
993 #[cfg(feature = "no-std")]
994 use core::arch::x86_64::*;
995 #[cfg(not(feature = "no-std"))]
996 use core::arch::x86_64::*;
997
998 let mut i = 0;
999
1000 while i + 16 <= input.len() {
1002 let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1003 let result_vec = _mm512_sqrt_ps(input_vec);
1004 _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1005 i += 16;
1006 }
1007
1008 while i < input.len() {
1010 result[i] = input[i].sqrt();
1011 i += 1;
1012 }
1013}
1014
1015#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1016#[target_feature(enable = "avx512f")]
1017unsafe fn pow_vec_avx512(base: &[f32], exponent: &[f32], result: &mut [f32]) {
1018 let mut i = 0;
1019
1020 while i < base.len() {
1021 result[i] = base[i].powf(exponent[i]);
1022 i += 1;
1023 }
1024}
1025
1026#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1027#[target_feature(enable = "avx512f")]
1028unsafe fn square_vec_avx512(input: &[f32], result: &mut [f32]) {
1029 #[cfg(feature = "no-std")]
1030 use core::arch::x86_64::*;
1031 #[cfg(not(feature = "no-std"))]
1032 use core::arch::x86_64::*;
1033
1034 let mut i = 0;
1035
1036 while i + 16 <= input.len() {
1038 let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1039 let result_vec = _mm512_mul_ps(input_vec, input_vec);
1040 _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1041 i += 16;
1042 }
1043
1044 while i < input.len() {
1046 result[i] = input[i] * input[i];
1047 i += 1;
1048 }
1049}
1050
1051#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1052#[target_feature(enable = "avx512f")]
1053unsafe fn reciprocal_vec_avx512(input: &[f32], result: &mut [f32]) {
1054 #[cfg(feature = "no-std")]
1055 use core::arch::x86_64::*;
1056 #[cfg(not(feature = "no-std"))]
1057 use core::arch::x86_64::*;
1058
1059 let mut i = 0;
1060
1061 while i + 16 <= input.len() {
1063 let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1064 let ones = _mm512_set1_ps(1.0);
1065 let result_vec = _mm512_div_ps(ones, input_vec);
1066 _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1067 i += 16;
1068 }
1069
1070 while i < input.len() {
1072 result[i] = 1.0 / input[i];
1073 i += 1;
1074 }
1075}
1076
1077#[cfg(target_arch = "aarch64")]
1082#[allow(dead_code)] #[target_feature(enable = "neon")]
1084unsafe fn sin_vec_neon(input: &[f32], result: &mut [f32]) {
1085 let mut i = 0;
1086
1087 while i < input.len() {
1088 result[i] = input[i].sin();
1089 i += 1;
1090 }
1091}
1092
1093#[cfg(target_arch = "aarch64")]
1094#[allow(dead_code)] #[target_feature(enable = "neon")]
1096unsafe fn cos_vec_neon(input: &[f32], result: &mut [f32]) {
1097 let mut i = 0;
1098
1099 while i < input.len() {
1100 result[i] = input[i].cos();
1101 i += 1;
1102 }
1103}
1104
1105#[cfg(target_arch = "aarch64")]
1106#[allow(dead_code)] #[target_feature(enable = "neon")]
1108unsafe fn tan_vec_neon(input: &[f32], result: &mut [f32]) {
1109 let mut i = 0;
1110
1111 while i < input.len() {
1112 result[i] = input[i].tan();
1113 i += 1;
1114 }
1115}
1116
1117#[cfg(target_arch = "aarch64")]
1118#[allow(dead_code)] #[target_feature(enable = "neon")]
1120unsafe fn exp_vec_neon(input: &[f32], result: &mut [f32]) {
1121 let mut i = 0;
1122
1123 while i < input.len() {
1124 result[i] = input[i].exp();
1125 i += 1;
1126 }
1127}
1128
1129#[cfg(target_arch = "aarch64")]
1130#[allow(dead_code)] #[target_feature(enable = "neon")]
1132unsafe fn ln_vec_neon(input: &[f32], result: &mut [f32]) {
1133 let mut i = 0;
1134
1135 while i < input.len() {
1136 result[i] = input[i].ln();
1137 i += 1;
1138 }
1139}
1140
1141#[cfg(target_arch = "aarch64")]
1142#[allow(dead_code)] #[target_feature(enable = "neon")]
1144unsafe fn sqrt_vec_neon(input: &[f32], result: &mut [f32]) {
1145 use core::arch::aarch64::*;
1146
1147 let mut i = 0;
1148
1149 while i + 4 <= input.len() {
1151 let input_vec = vld1q_f32(input.as_ptr().add(i));
1152 let result_vec = vsqrtq_f32(input_vec);
1153 vst1q_f32(result.as_mut_ptr().add(i), result_vec);
1154 i += 4;
1155 }
1156
1157 while i < input.len() {
1159 result[i] = input[i].sqrt();
1160 i += 1;
1161 }
1162}
1163
1164#[cfg(target_arch = "aarch64")]
1165#[allow(dead_code)] #[target_feature(enable = "neon")]
1167unsafe fn pow_vec_neon(base: &[f32], exponent: &[f32], result: &mut [f32]) {
1168 let mut i = 0;
1169
1170 while i < base.len() {
1171 result[i] = base[i].powf(exponent[i]);
1172 i += 1;
1173 }
1174}
1175
1176#[cfg(target_arch = "aarch64")]
1177#[allow(dead_code)] #[target_feature(enable = "neon")]
1179unsafe fn square_vec_neon(input: &[f32], result: &mut [f32]) {
1180 use core::arch::aarch64::*;
1181
1182 let mut i = 0;
1183
1184 while i + 4 <= input.len() {
1186 let input_vec = vld1q_f32(input.as_ptr().add(i));
1187 let result_vec = vmulq_f32(input_vec, input_vec);
1188 vst1q_f32(result.as_mut_ptr().add(i), result_vec);
1189 i += 4;
1190 }
1191
1192 while i < input.len() {
1194 result[i] = input[i] * input[i];
1195 i += 1;
1196 }
1197}
1198
1199#[cfg(target_arch = "aarch64")]
1200#[allow(dead_code)] #[target_feature(enable = "neon")]
1202unsafe fn reciprocal_vec_neon(input: &[f32], result: &mut [f32]) {
1203 use core::arch::aarch64::*;
1204
1205 let mut i = 0;
1206
1207 while i + 4 <= input.len() {
1209 let input_vec = vld1q_f32(input.as_ptr().add(i));
1210 let mut estimate = vrecpeq_f32(input_vec);
1211 estimate = vmulq_f32(vrecpsq_f32(input_vec, estimate), estimate);
1213 estimate = vmulq_f32(vrecpsq_f32(input_vec, estimate), estimate);
1214 vst1q_f32(result.as_mut_ptr().add(i), estimate);
1215 i += 4;
1216 }
1217
1218 while i < input.len() {
1220 result[i] = 1.0 / input[i];
1221 i += 1;
1222 }
1223}
1224
1225#[allow(non_snake_case)]
1226#[cfg(all(test, not(feature = "no-std")))]
1227mod tests {
1228 use super::*;
1229
1230 #[cfg(feature = "no-std")]
1231 use alloc::{vec, vec::Vec};
1232
1233 #[cfg(feature = "no-std")]
1234 #[allow(unused_imports)]
1235 use core::f32::consts::{E, PI};
1236 #[cfg(not(feature = "no-std"))]
1237 use std::f32::consts::{E, PI};
1238
1239 const EPSILON: f32 = 1e-6;
1240
1241 #[test]
1242 fn test_sin_vec() {
1243 let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
1244 let mut result = vec![0.0; 4];
1245
1246 sin_vec(&input, &mut result);
1247 assert!((result[0] - 0.0).abs() < EPSILON);
1248 assert!((result[1] - 1.0).abs() < EPSILON);
1249 assert!((result[2] - 0.0).abs() < EPSILON);
1250 assert!((result[3] + 1.0).abs() < EPSILON);
1251
1252 let empty_input: Vec<f32> = vec![];
1254 let mut empty_result: Vec<f32> = vec![];
1255 sin_vec(&empty_input, &mut empty_result);
1256 assert_eq!(empty_result, Vec::<f32>::new());
1257 }
1258
1259 #[test]
1260 fn test_cos_vec() {
1261 let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
1262 let mut result = vec![0.0; 4];
1263
1264 cos_vec(&input, &mut result);
1265 assert!((result[0] - 1.0).abs() < EPSILON);
1266 assert!((result[1] - 0.0).abs() < EPSILON);
1267 assert!((result[2] + 1.0).abs() < EPSILON);
1268 assert!((result[3] - 0.0).abs() < EPSILON);
1269 }
1270
1271 #[test]
1272 fn test_tan_vec() {
1273 let input = vec![0.0, PI / 4.0, -PI / 4.0];
1274 let mut result = vec![0.0; 3];
1275
1276 tan_vec(&input, &mut result);
1277 assert!((result[0] - 0.0).abs() < EPSILON);
1278 assert!((result[1] - 1.0).abs() < EPSILON);
1279 assert!((result[2] + 1.0).abs() < EPSILON);
1280 }
1281
1282 #[test]
1283 fn test_exp_vec() {
1284 let input = vec![0.0, 1.0, 2.0, -1.0];
1285 let mut result = vec![0.0; 4];
1286
1287 exp_vec(&input, &mut result);
1288 assert!((result[0] - 1.0).abs() < EPSILON);
1289 assert!((result[1] - E).abs() < EPSILON);
1290 assert!((result[2] - (E * E)).abs() < 1e-5);
1291 assert!((result[3] - (1.0 / E)).abs() < EPSILON);
1292 }
1293
1294 #[test]
1295 fn test_ln_vec() {
1296 let input = vec![1.0, E, E * E, 1.0 / E];
1297 let mut result = vec![0.0; 4];
1298
1299 ln_vec(&input, &mut result);
1300 assert!((result[0] - 0.0).abs() < EPSILON);
1301 assert!((result[1] - 1.0).abs() < EPSILON);
1302 assert!((result[2] - 2.0).abs() < EPSILON);
1303 assert!((result[3] + 1.0).abs() < EPSILON);
1304
1305 let negative_input = vec![-1.0];
1307 let mut negative_result = vec![0.0; 1];
1308 ln_vec(&negative_input, &mut negative_result);
1309 assert!(negative_result[0].is_nan());
1310 }
1311
1312 #[test]
1313 fn test_sqrt_vec() {
1314 let input = vec![0.0, 1.0, 4.0, 9.0, 16.0];
1315 let mut result = vec![0.0; 5];
1316
1317 sqrt_vec(&input, &mut result);
1318 assert!((result[0] - 0.0).abs() < EPSILON);
1319 assert!((result[1] - 1.0).abs() < EPSILON);
1320 assert!((result[2] - 2.0).abs() < EPSILON);
1321 assert!((result[3] - 3.0).abs() < EPSILON);
1322 assert!((result[4] - 4.0).abs() < EPSILON);
1323
1324 let negative_input = vec![-1.0];
1326 let mut negative_result = vec![0.0; 1];
1327 sqrt_vec(&negative_input, &mut negative_result);
1328 assert!(negative_result[0].is_nan());
1329 }
1330
1331 #[test]
1332 fn test_pow_vec() {
1333 let base = vec![2.0, 3.0, 4.0, 5.0];
1334 let exponent = vec![2.0, 3.0, 0.5, -1.0];
1335 let mut result = vec![0.0; 4];
1336
1337 pow_vec(&base, &exponent, &mut result);
1338 assert!((result[0] - 4.0).abs() < EPSILON); assert!((result[1] - 27.0).abs() < EPSILON); assert!((result[2] - 2.0).abs() < EPSILON); assert!((result[3] - 0.2).abs() < EPSILON); let empty_base: Vec<f32> = vec![];
1345 let empty_exp: Vec<f32> = vec![];
1346 let mut empty_result: Vec<f32> = vec![];
1347 pow_vec(&empty_base, &empty_exp, &mut empty_result);
1348 assert_eq!(empty_result, Vec::<f32>::new());
1349 }
1350
1351 #[test]
1352 fn test_square_vec() {
1353 let input = vec![1.0, 2.0, 3.0, -4.0];
1354 let mut result = vec![0.0; 4];
1355
1356 square_vec(&input, &mut result);
1357 assert!((result[0] - 1.0).abs() < EPSILON);
1358 assert!((result[1] - 4.0).abs() < EPSILON);
1359 assert!((result[2] - 9.0).abs() < EPSILON);
1360 assert!((result[3] - 16.0).abs() < EPSILON);
1361 }
1362
1363 #[test]
1364 fn test_reciprocal_vec() {
1365 let input = vec![1.0, 2.0, 4.0, 0.5];
1366 let mut result = vec![0.0; 4];
1367
1368 reciprocal_vec(&input, &mut result);
1369 assert!((result[0] - 1.0).abs() < EPSILON);
1370 assert!((result[1] - 0.5).abs() < EPSILON);
1371 assert!((result[2] - 0.25).abs() < EPSILON);
1372 assert!((result[3] - 2.0).abs() < EPSILON);
1373
1374 let zero_input = vec![0.0];
1376 let mut zero_result = vec![0.0; 1];
1377 reciprocal_vec(&zero_input, &mut zero_result);
1378 assert_eq!(zero_result[0], f32::INFINITY);
1379 }
1380
1381 #[test]
1382 fn test_special_values() {
1383 let large_input = vec![100.0];
1385 let mut large_result = vec![0.0; 1];
1386 exp_vec(&large_input, &mut large_result);
1387 assert_eq!(large_result[0], f32::INFINITY);
1388
1389 let zero_input = vec![0.0];
1391 let mut zero_result = vec![0.0; 1];
1392 ln_vec(&zero_input, &mut zero_result);
1393 assert_eq!(zero_result[0], f32::NEG_INFINITY);
1394
1395 let inf_input = vec![f32::INFINITY];
1397 let mut inf_result = vec![0.0; 1];
1398 sqrt_vec(&inf_input, &mut inf_result);
1399 assert_eq!(inf_result[0], f32::INFINITY);
1400 }
1401
1402 #[test]
1403 fn test_mathematical_identities() {
1404 let angles = vec![0.5, 1.0, 1.5, 2.0];
1405 let mut sin_results = vec![0.0; 4];
1406 let mut cos_results = vec![0.0; 4];
1407
1408 sin_vec(&angles, &mut sin_results);
1409 cos_vec(&angles, &mut cos_results);
1410
1411 for i in 0..4 {
1413 let sin_sq = sin_results[i] * sin_results[i];
1414 let cos_sq = cos_results[i] * cos_results[i];
1415 assert!((sin_sq + cos_sq - 1.0).abs() < 1e-6);
1416 }
1417
1418 let positive_values = vec![1.0, 2.0, 3.0, 4.0];
1420 let mut ln_results = vec![0.0; 4];
1421 let mut exp_ln_results = vec![0.0; 4];
1422
1423 ln_vec(&positive_values, &mut ln_results);
1424 exp_vec(&ln_results, &mut exp_ln_results);
1425
1426 for i in 0..4 {
1427 assert!((exp_ln_results[i] - positive_values[i]).abs() < 1e-5);
1428 }
1429 }
1430
1431 #[test]
1432 fn test_large_vectors() {
1433 let size = 100;
1434 let input: Vec<f32> = (0..size).map(|i| (i as f32) / 10.0).collect();
1435 let mut result = vec![0.0; size];
1436
1437 sqrt_vec(&input, &mut result);
1438
1439 for (i, &val) in result.iter().enumerate().take(size) {
1440 let expected = ((i as f32) / 10.0).sqrt();
1441 assert!((val - expected).abs() < 1e-6);
1442 }
1443 }
1444
1445 #[test]
1446 #[should_panic(expected = "Input and output vectors must have the same length")]
1447 fn test_sin_vec_dimension_mismatch() {
1448 let input = vec![1.0, 2.0, 3.0];
1449 let mut result = vec![0.0; 2];
1450 sin_vec(&input, &mut result);
1451 }
1452
1453 #[test]
1454 #[should_panic(expected = "Input vectors must have the same length")]
1455 fn test_pow_vec_input_dimension_mismatch() {
1456 let base = vec![1.0, 2.0, 3.0];
1457 let exponent = vec![2.0, 3.0];
1458 let mut result = vec![0.0; 3];
1459 pow_vec(&base, &exponent, &mut result);
1460 }
1461
1462 #[test]
1463 #[should_panic(expected = "Output vector must have the same length as input vectors")]
1464 fn test_pow_vec_output_dimension_mismatch() {
1465 let base = vec![1.0, 2.0, 3.0];
1466 let exponent = vec![2.0, 3.0, 4.0];
1467 let mut result = vec![0.0; 2];
1468 pow_vec(&base, &exponent, &mut result);
1469 }
1470}