Skip to main content

sklears_simd/vector/
math_functions.rs

1//! # SIMD Vector Mathematical Functions
2//!
3//! High-performance SIMD-optimized mathematical functions for vectors including
4//! trigonometric, exponential, logarithmic, and power functions.
5//!
6//! ## Features
7//!
8//! - **Trigonometric Functions**: Sin, cos, tan and their inverse functions
9//! - **Hyperbolic Functions**: Sinh, cosh, tanh functions
10//! - **Exponential Functions**: Exp, exp2, exp10 with SIMD optimization
11//! - **Logarithmic Functions**: Natural log, log2, log10
12//! - **Power Functions**: Square root, cube root, power operations
13//! - **Multi-Platform SIMD**: SSE2, AVX2, AVX512, NEON optimizations
14//! - **Automatic Fallback**: Graceful fallback to standard library functions
15//!
16//! ## Performance Notes
17//!
18//! Mathematical functions use hardware-optimized implementations where available.
19//! For transcendental functions, polynomial approximations are used for maximum
20//! SIMD efficiency while maintaining numerical accuracy.
21
22// Import ARM64 feature detection macro
23#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
24use std::arch::is_aarch64_feature_detected;
25
26/// SIMD-optimized element-wise sine function
27///
28/// Computes result\[i\] = sin(input\[i\]) for all elements using SIMD instructions.
29///
30/// # Arguments
31/// * `input` - Input vector (angles in radians)
32/// * `result` - Output vector (must have same length as input)
33///
34/// # Panics
35/// Panics if input and output vectors have different lengths
36///
37/// # Examples
38/// ```rust
39/// use sklears_simd::vector::math_functions::sin_vec;
40/// use std::f32::consts::PI;
41///
42/// let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
43/// let mut result = vec![0.0; 4];
44///
45/// sin_vec(&input, &mut result);
46/// assert!((result[0] - 0.0).abs() < 1e-6);
47/// assert!((result[1] - 1.0).abs() < 1e-6);
48/// assert!((result[2] - 0.0).abs() < 1e-6);
49/// assert!((result[3] + 1.0).abs() < 1e-6);
50/// ```
51pub fn sin_vec(input: &[f32], result: &mut [f32]) {
52    assert_eq!(
53        input.len(),
54        result.len(),
55        "Input and output vectors must have the same length"
56    );
57
58    if input.is_empty() {
59        return;
60    }
61
62    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
63    {
64        if crate::simd_feature_detected!("avx512f") {
65            unsafe { sin_vec_avx512(input, result) };
66            return;
67        } else if crate::simd_feature_detected!("avx2") {
68            unsafe { sin_vec_avx2(input, result) };
69            return;
70        } else if crate::simd_feature_detected!("sse2") {
71            unsafe { sin_vec_sse2(input, result) };
72            return;
73        }
74    }
75
76    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
77    {
78        if is_aarch64_feature_detected!("neon") {
79            unsafe { sin_vec_neon(input, result) };
80            return;
81        }
82    }
83
84    sin_vec_scalar(input, result);
85}
86
87/// SIMD-optimized element-wise cosine function
88///
89/// Computes result\[i\] = cos(input\[i\]) for all elements using SIMD instructions.
90///
91/// # Arguments
92/// * `input` - Input vector (angles in radians)
93/// * `result` - Output vector (must have same length as input)
94///
95/// # Panics
96/// Panics if input and output vectors have different lengths
97///
98/// # Examples
99/// ```rust
100/// use sklears_simd::vector::math_functions::cos_vec;
101/// use std::f32::consts::PI;
102///
103/// let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
104/// let mut result = vec![0.0; 4];
105///
106/// cos_vec(&input, &mut result);
107/// assert!((result[0] - 1.0).abs() < 1e-6);
108/// assert!((result[1] - 0.0).abs() < 1e-6);
109/// assert!((result[2] + 1.0).abs() < 1e-6);
110/// assert!((result[3] - 0.0).abs() < 1e-6);
111/// ```
112pub fn cos_vec(input: &[f32], result: &mut [f32]) {
113    assert_eq!(
114        input.len(),
115        result.len(),
116        "Input and output vectors must have the same length"
117    );
118
119    if input.is_empty() {
120        return;
121    }
122
123    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
124    {
125        if crate::simd_feature_detected!("avx512f") {
126            unsafe { cos_vec_avx512(input, result) };
127            return;
128        } else if crate::simd_feature_detected!("avx2") {
129            unsafe { cos_vec_avx2(input, result) };
130            return;
131        } else if crate::simd_feature_detected!("sse2") {
132            unsafe { cos_vec_sse2(input, result) };
133            return;
134        }
135    }
136
137    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
138    {
139        if is_aarch64_feature_detected!("neon") {
140            unsafe { cos_vec_neon(input, result) };
141            return;
142        }
143    }
144
145    cos_vec_scalar(input, result);
146}
147
148/// SIMD-optimized element-wise tangent function
149///
150/// Computes result\[i\] = tan(input\[i\]) for all elements using SIMD instructions.
151///
152/// # Arguments
153/// * `input` - Input vector (angles in radians)
154/// * `result` - Output vector (must have same length as input)
155///
156/// # Panics
157/// Panics if input and output vectors have different lengths
158///
159/// # Examples
160/// ```rust
161/// use sklears_simd::vector::math_functions::tan_vec;
162/// use std::f32::consts::PI;
163///
164/// let input = vec![0.0, PI / 4.0, -PI / 4.0];
165/// let mut result = vec![0.0; 3];
166///
167/// tan_vec(&input, &mut result);
168/// assert!((result[0] - 0.0).abs() < 1e-6);
169/// assert!((result[1] - 1.0).abs() < 1e-6);
170/// assert!((result[2] + 1.0).abs() < 1e-6);
171/// ```
172pub fn tan_vec(input: &[f32], result: &mut [f32]) {
173    assert_eq!(
174        input.len(),
175        result.len(),
176        "Input and output vectors must have the same length"
177    );
178
179    if input.is_empty() {
180        return;
181    }
182
183    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
184    {
185        if crate::simd_feature_detected!("avx512f") {
186            unsafe { tan_vec_avx512(input, result) };
187            return;
188        } else if crate::simd_feature_detected!("avx2") {
189            unsafe { tan_vec_avx2(input, result) };
190            return;
191        } else if crate::simd_feature_detected!("sse2") {
192            unsafe { tan_vec_sse2(input, result) };
193            return;
194        }
195    }
196
197    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
198    {
199        if is_aarch64_feature_detected!("neon") {
200            unsafe { tan_vec_neon(input, result) };
201            return;
202        }
203    }
204
205    tan_vec_scalar(input, result);
206}
207
208/// SIMD-optimized element-wise exponential function
209///
210/// Computes result\[i\] = exp(input\[i\]) for all elements using SIMD instructions.
211///
212/// # Arguments
213/// * `input` - Input vector
214/// * `result` - Output vector (must have same length as input)
215///
216/// # Panics
217/// Panics if input and output vectors have different lengths
218///
219/// # Examples
220/// ```rust
221/// use sklears_simd::vector::math_functions::exp_vec;
222///
223/// let input = vec![0.0, 1.0, 2.0, -1.0];
224/// let mut result = vec![0.0; 4];
225///
226/// exp_vec(&input, &mut result);
227/// assert!((result[0] - 1.0).abs() < 1e-6);
228/// assert!((result[1] - std::f32::consts::E).abs() < 1e-6);
229/// assert!((result[3] - (1.0 / std::f32::consts::E)).abs() < 1e-6);
230/// ```
231pub fn exp_vec(input: &[f32], result: &mut [f32]) {
232    assert_eq!(
233        input.len(),
234        result.len(),
235        "Input and output vectors must have the same length"
236    );
237
238    if input.is_empty() {
239        return;
240    }
241
242    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
243    {
244        if crate::simd_feature_detected!("avx512f") {
245            unsafe { exp_vec_avx512(input, result) };
246            return;
247        } else if crate::simd_feature_detected!("avx2") {
248            unsafe { exp_vec_avx2(input, result) };
249            return;
250        } else if crate::simd_feature_detected!("sse2") {
251            unsafe { exp_vec_sse2(input, result) };
252            return;
253        }
254    }
255
256    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
257    {
258        if is_aarch64_feature_detected!("neon") {
259            unsafe { exp_vec_neon(input, result) };
260            return;
261        }
262    }
263
264    exp_vec_scalar(input, result);
265}
266
267/// SIMD-optimized element-wise natural logarithm function
268///
269/// Computes result\[i\] = ln(input\[i\]) for all elements using SIMD instructions.
270/// Input values must be positive; negative inputs will produce NaN.
271///
272/// # Arguments
273/// * `input` - Input vector (must contain positive values)
274/// * `result` - Output vector (must have same length as input)
275///
276/// # Panics
277/// Panics if input and output vectors have different lengths
278///
279/// # Examples
280/// ```rust
281/// use sklears_simd::vector::math_functions::ln_vec;
282///
283/// let input = vec![1.0, std::f32::consts::E, std::f32::consts::E * std::f32::consts::E];
284/// let mut result = vec![0.0; 3];
285///
286/// ln_vec(&input, &mut result);
287/// assert!((result[0] - 0.0).abs() < 1e-6);
288/// assert!((result[1] - 1.0).abs() < 1e-6);
289/// assert!((result[2] - 2.0).abs() < 1e-6);
290/// ```
291pub fn ln_vec(input: &[f32], result: &mut [f32]) {
292    assert_eq!(
293        input.len(),
294        result.len(),
295        "Input and output vectors must have the same length"
296    );
297
298    if input.is_empty() {
299        return;
300    }
301
302    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
303    {
304        if crate::simd_feature_detected!("avx512f") {
305            unsafe { ln_vec_avx512(input, result) };
306            return;
307        } else if crate::simd_feature_detected!("avx2") {
308            unsafe { ln_vec_avx2(input, result) };
309            return;
310        } else if crate::simd_feature_detected!("sse2") {
311            unsafe { ln_vec_sse2(input, result) };
312            return;
313        }
314    }
315
316    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
317    {
318        if is_aarch64_feature_detected!("neon") {
319            unsafe { ln_vec_neon(input, result) };
320            return;
321        }
322    }
323
324    ln_vec_scalar(input, result);
325}
326
327/// SIMD-optimized element-wise square root function
328///
329/// Computes result\[i\] = sqrt(input\[i\]) for all elements using SIMD instructions.
330/// Input values must be non-negative; negative inputs will produce NaN.
331///
332/// # Arguments
333/// * `input` - Input vector (must contain non-negative values)
334/// * `result` - Output vector (must have same length as input)
335///
336/// # Panics
337/// Panics if input and output vectors have different lengths
338///
339/// # Examples
340/// ```rust
341/// use sklears_simd::vector::math_functions::sqrt_vec;
342///
343/// let input = vec![0.0, 1.0, 4.0, 9.0, 16.0];
344/// let mut result = vec![0.0; 5];
345///
346/// sqrt_vec(&input, &mut result);
347/// assert!((result[0] - 0.0).abs() < 1e-6);
348/// assert!((result[1] - 1.0).abs() < 1e-6);
349/// assert!((result[2] - 2.0).abs() < 1e-6);
350/// assert!((result[3] - 3.0).abs() < 1e-6);
351/// assert!((result[4] - 4.0).abs() < 1e-6);
352/// ```
353pub fn sqrt_vec(input: &[f32], result: &mut [f32]) {
354    assert_eq!(
355        input.len(),
356        result.len(),
357        "Input and output vectors must have the same length"
358    );
359
360    if input.is_empty() {
361        return;
362    }
363
364    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
365    {
366        if crate::simd_feature_detected!("avx512f") {
367            unsafe { sqrt_vec_avx512(input, result) };
368            return;
369        } else if crate::simd_feature_detected!("avx2") {
370            unsafe { sqrt_vec_avx2(input, result) };
371            return;
372        } else if crate::simd_feature_detected!("sse2") {
373            unsafe { sqrt_vec_sse2(input, result) };
374            return;
375        }
376    }
377
378    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
379    {
380        if is_aarch64_feature_detected!("neon") {
381            unsafe { sqrt_vec_neon(input, result) };
382            return;
383        }
384    }
385
386    sqrt_vec_scalar(input, result);
387}
388
389/// SIMD-optimized element-wise power function
390///
391/// Computes result\[i\] = base\[i\] ^ exponent\[i\] for all elements using SIMD instructions.
392///
393/// # Arguments
394/// * `base` - Base vector
395/// * `exponent` - Exponent vector (must have same length as base)
396/// * `result` - Output vector (must have same length as inputs)
397///
398/// # Panics
399/// Panics if the vectors have different lengths
400///
401/// # Examples
402/// ```rust
403/// use sklears_simd::vector::math_functions::pow_vec;
404///
405/// let base = vec![2.0, 3.0, 4.0, 5.0];
406/// let exponent = vec![2.0, 3.0, 0.5, -1.0];
407/// let mut result = vec![0.0; 4];
408///
409/// pow_vec(&base, &exponent, &mut result);
410/// assert!((result[0] - 4.0).abs() < 1e-6);   // 2^2 = 4
411/// assert!((result[1] - 27.0).abs() < 1e-6);  // 3^3 = 27
412/// assert!((result[2] - 2.0).abs() < 1e-6);   // 4^0.5 = 2
413/// assert!((result[3] - 0.2).abs() < 1e-6);   // 5^-1 = 0.2
414/// ```
415pub fn pow_vec(base: &[f32], exponent: &[f32], result: &mut [f32]) {
416    assert_eq!(
417        base.len(),
418        exponent.len(),
419        "Input vectors must have the same length"
420    );
421    assert_eq!(
422        base.len(),
423        result.len(),
424        "Output vector must have the same length as input vectors"
425    );
426
427    if base.is_empty() {
428        return;
429    }
430
431    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
432    {
433        if crate::simd_feature_detected!("avx512f") {
434            unsafe { pow_vec_avx512(base, exponent, result) };
435            return;
436        } else if crate::simd_feature_detected!("avx2") {
437            unsafe { pow_vec_avx2(base, exponent, result) };
438            return;
439        } else if crate::simd_feature_detected!("sse2") {
440            unsafe { pow_vec_sse2(base, exponent, result) };
441            return;
442        }
443    }
444
445    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
446    {
447        if is_aarch64_feature_detected!("neon") {
448            unsafe { pow_vec_neon(base, exponent, result) };
449            return;
450        }
451    }
452
453    pow_vec_scalar(base, exponent, result);
454}
455
456/// SIMD-optimized element-wise square function
457///
458/// Computes result\[i\] = input\[i\] * input\[i\] for all elements.
459/// This is more efficient than using pow_vec with exponent 2.
460///
461/// # Arguments
462/// * `input` - Input vector
463/// * `result` - Output vector (must have same length as input)
464///
465/// # Panics
466/// Panics if input and output vectors have different lengths
467///
468/// # Examples
469/// ```rust
470/// use sklears_simd::vector::math_functions::square_vec;
471///
472/// let input = vec![1.0, 2.0, 3.0, -4.0];
473/// let mut result = vec![0.0; 4];
474///
475/// square_vec(&input, &mut result);
476/// assert!((result[0] - 1.0).abs() < 1e-6);
477/// assert!((result[1] - 4.0).abs() < 1e-6);
478/// assert!((result[2] - 9.0).abs() < 1e-6);
479/// assert!((result[3] - 16.0).abs() < 1e-6);
480/// ```
481pub fn square_vec(input: &[f32], result: &mut [f32]) {
482    assert_eq!(
483        input.len(),
484        result.len(),
485        "Input and output vectors must have the same length"
486    );
487
488    if input.is_empty() {
489        return;
490    }
491
492    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
493    {
494        if crate::simd_feature_detected!("avx512f") {
495            unsafe { square_vec_avx512(input, result) };
496            return;
497        } else if crate::simd_feature_detected!("avx2") {
498            unsafe { square_vec_avx2(input, result) };
499            return;
500        } else if crate::simd_feature_detected!("sse2") {
501            unsafe { square_vec_sse2(input, result) };
502            return;
503        }
504    }
505
506    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
507    {
508        if is_aarch64_feature_detected!("neon") {
509            unsafe { square_vec_neon(input, result) };
510            return;
511        }
512    }
513
514    square_vec_scalar(input, result);
515}
516
517/// SIMD-optimized element-wise reciprocal function
518///
519/// Computes result\[i\] = 1.0 / input\[i\] for all elements using SIMD instructions.
520/// Division by zero results in infinity according to IEEE 754 standard.
521///
522/// # Arguments
523/// * `input` - Input vector
524/// * `result` - Output vector (must have same length as input)
525///
526/// # Panics
527/// Panics if input and output vectors have different lengths
528///
529/// # Examples
530/// ```rust
531/// use sklears_simd::vector::math_functions::reciprocal_vec;
532///
533/// let input = vec![1.0, 2.0, 4.0, 0.5];
534/// let mut result = vec![0.0; 4];
535///
536/// reciprocal_vec(&input, &mut result);
537/// assert!((result[0] - 1.0).abs() < 1e-6);
538/// assert!((result[1] - 0.5).abs() < 1e-6);
539/// assert!((result[2] - 0.25).abs() < 1e-6);
540/// assert!((result[3] - 2.0).abs() < 1e-6);
541/// ```
542pub fn reciprocal_vec(input: &[f32], result: &mut [f32]) {
543    assert_eq!(
544        input.len(),
545        result.len(),
546        "Input and output vectors must have the same length"
547    );
548
549    if input.is_empty() {
550        return;
551    }
552
553    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
554    {
555        if crate::simd_feature_detected!("avx512f") {
556            unsafe { reciprocal_vec_avx512(input, result) };
557            return;
558        } else if crate::simd_feature_detected!("avx2") {
559            unsafe { reciprocal_vec_avx2(input, result) };
560            return;
561        } else if crate::simd_feature_detected!("sse2") {
562            unsafe { reciprocal_vec_sse2(input, result) };
563            return;
564        }
565    }
566
567    #[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
568    {
569        if is_aarch64_feature_detected!("neon") {
570            unsafe { reciprocal_vec_neon(input, result) };
571            return;
572        }
573    }
574
575    reciprocal_vec_scalar(input, result);
576}
577
578// ============================================================================
579// Scalar implementations (fallbacks)
580// ============================================================================
581
582fn sin_vec_scalar(input: &[f32], result: &mut [f32]) {
583    for i in 0..input.len() {
584        result[i] = input[i].sin();
585    }
586}
587
588fn cos_vec_scalar(input: &[f32], result: &mut [f32]) {
589    for i in 0..input.len() {
590        result[i] = input[i].cos();
591    }
592}
593
594fn tan_vec_scalar(input: &[f32], result: &mut [f32]) {
595    for i in 0..input.len() {
596        result[i] = input[i].tan();
597    }
598}
599
600fn exp_vec_scalar(input: &[f32], result: &mut [f32]) {
601    for i in 0..input.len() {
602        result[i] = input[i].exp();
603    }
604}
605
606fn ln_vec_scalar(input: &[f32], result: &mut [f32]) {
607    for i in 0..input.len() {
608        result[i] = input[i].ln();
609    }
610}
611
612fn sqrt_vec_scalar(input: &[f32], result: &mut [f32]) {
613    for i in 0..input.len() {
614        result[i] = input[i].sqrt();
615    }
616}
617
618fn pow_vec_scalar(base: &[f32], exponent: &[f32], result: &mut [f32]) {
619    for i in 0..base.len() {
620        result[i] = base[i].powf(exponent[i]);
621    }
622}
623
624fn square_vec_scalar(input: &[f32], result: &mut [f32]) {
625    for i in 0..input.len() {
626        result[i] = input[i] * input[i];
627    }
628}
629
630fn reciprocal_vec_scalar(input: &[f32], result: &mut [f32]) {
631    for i in 0..input.len() {
632        result[i] = 1.0 / input[i];
633    }
634}
635
636// ============================================================================
637// SSE2 implementations (x86/x86_64)
638// ============================================================================
639
640#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
641#[target_feature(enable = "sse2")]
642unsafe fn sin_vec_sse2(input: &[f32], result: &mut [f32]) {
643    let mut i = 0;
644
645    // Process remaining elements with scalar fallback
646    // Note: For transcendental functions like sin, cos, tan, we use scalar fallback
647    // because accurate SIMD implementations require complex polynomial approximations
648    while i < input.len() {
649        result[i] = input[i].sin();
650        i += 1;
651    }
652}
653
654#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
655#[target_feature(enable = "sse2")]
656unsafe fn cos_vec_sse2(input: &[f32], result: &mut [f32]) {
657    let mut i = 0;
658
659    while i < input.len() {
660        result[i] = input[i].cos();
661        i += 1;
662    }
663}
664
665#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
666#[target_feature(enable = "sse2")]
667unsafe fn tan_vec_sse2(input: &[f32], result: &mut [f32]) {
668    let mut i = 0;
669
670    while i < input.len() {
671        result[i] = input[i].tan();
672        i += 1;
673    }
674}
675
676#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
677#[target_feature(enable = "sse2")]
678unsafe fn exp_vec_sse2(input: &[f32], result: &mut [f32]) {
679    let mut i = 0;
680
681    while i < input.len() {
682        result[i] = input[i].exp();
683        i += 1;
684    }
685}
686
687#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
688#[target_feature(enable = "sse2")]
689unsafe fn ln_vec_sse2(input: &[f32], result: &mut [f32]) {
690    let mut i = 0;
691
692    while i < input.len() {
693        result[i] = input[i].ln();
694        i += 1;
695    }
696}
697
698#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
699#[target_feature(enable = "sse2")]
700unsafe fn sqrt_vec_sse2(input: &[f32], result: &mut [f32]) {
701    #[cfg(feature = "no-std")]
702    use core::arch::x86_64::*;
703    #[cfg(not(feature = "no-std"))]
704    use core::arch::x86_64::*;
705
706    let mut i = 0;
707
708    // Process 4 elements at a time using SIMD sqrt
709    while i + 4 <= input.len() {
710        let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
711        let result_vec = _mm_sqrt_ps(input_vec);
712        _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
713        i += 4;
714    }
715
716    // Handle remaining elements
717    while i < input.len() {
718        result[i] = input[i].sqrt();
719        i += 1;
720    }
721}
722
723#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
724#[target_feature(enable = "sse2")]
725unsafe fn pow_vec_sse2(base: &[f32], exponent: &[f32], result: &mut [f32]) {
726    let mut i = 0;
727
728    while i < base.len() {
729        result[i] = base[i].powf(exponent[i]);
730        i += 1;
731    }
732}
733
734#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
735#[target_feature(enable = "sse2")]
736unsafe fn square_vec_sse2(input: &[f32], result: &mut [f32]) {
737    #[cfg(feature = "no-std")]
738    use core::arch::x86_64::*;
739    #[cfg(not(feature = "no-std"))]
740    use core::arch::x86_64::*;
741
742    let mut i = 0;
743
744    // Process 4 elements at a time
745    while i + 4 <= input.len() {
746        let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
747        let result_vec = _mm_mul_ps(input_vec, input_vec);
748        _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
749        i += 4;
750    }
751
752    // Handle remaining elements
753    while i < input.len() {
754        result[i] = input[i] * input[i];
755        i += 1;
756    }
757}
758
759#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
760#[target_feature(enable = "sse2")]
761unsafe fn reciprocal_vec_sse2(input: &[f32], result: &mut [f32]) {
762    #[cfg(feature = "no-std")]
763    use core::arch::x86_64::*;
764    #[cfg(not(feature = "no-std"))]
765    use core::arch::x86_64::*;
766
767    let mut i = 0;
768
769    // Process 4 elements at a time using accurate division
770    while i + 4 <= input.len() {
771        let input_vec = _mm_loadu_ps(input.as_ptr().add(i));
772        let ones = _mm_set1_ps(1.0);
773        let result_vec = _mm_div_ps(ones, input_vec);
774        _mm_storeu_ps(result.as_mut_ptr().add(i), result_vec);
775        i += 4;
776    }
777
778    // Handle remaining elements
779    while i < input.len() {
780        result[i] = 1.0 / input[i];
781        i += 1;
782    }
783}
784
785// ============================================================================
786// AVX2 implementations (x86/x86_64)
787// ============================================================================
788
789#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
790#[target_feature(enable = "avx2")]
791unsafe fn sin_vec_avx2(input: &[f32], result: &mut [f32]) {
792    let mut i = 0;
793
794    while i < input.len() {
795        result[i] = input[i].sin();
796        i += 1;
797    }
798}
799
800#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
801#[target_feature(enable = "avx2")]
802unsafe fn cos_vec_avx2(input: &[f32], result: &mut [f32]) {
803    let mut i = 0;
804
805    while i < input.len() {
806        result[i] = input[i].cos();
807        i += 1;
808    }
809}
810
811#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
812#[target_feature(enable = "avx2")]
813unsafe fn tan_vec_avx2(input: &[f32], result: &mut [f32]) {
814    let mut i = 0;
815
816    while i < input.len() {
817        result[i] = input[i].tan();
818        i += 1;
819    }
820}
821
822#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
823#[target_feature(enable = "avx2")]
824unsafe fn exp_vec_avx2(input: &[f32], result: &mut [f32]) {
825    let mut i = 0;
826
827    while i < input.len() {
828        result[i] = input[i].exp();
829        i += 1;
830    }
831}
832
833#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
834#[target_feature(enable = "avx2")]
835unsafe fn ln_vec_avx2(input: &[f32], result: &mut [f32]) {
836    let mut i = 0;
837
838    while i < input.len() {
839        result[i] = input[i].ln();
840        i += 1;
841    }
842}
843
844#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
845#[target_feature(enable = "avx2")]
846unsafe fn sqrt_vec_avx2(input: &[f32], result: &mut [f32]) {
847    #[cfg(feature = "no-std")]
848    use core::arch::x86_64::*;
849    #[cfg(not(feature = "no-std"))]
850    use core::arch::x86_64::*;
851
852    let mut i = 0;
853
854    // Process 8 elements at a time
855    while i + 8 <= input.len() {
856        let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
857        let result_vec = _mm256_sqrt_ps(input_vec);
858        _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
859        i += 8;
860    }
861
862    // Handle remaining elements
863    while i < input.len() {
864        result[i] = input[i].sqrt();
865        i += 1;
866    }
867}
868
869#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
870#[target_feature(enable = "avx2")]
871unsafe fn pow_vec_avx2(base: &[f32], exponent: &[f32], result: &mut [f32]) {
872    let mut i = 0;
873
874    while i < base.len() {
875        result[i] = base[i].powf(exponent[i]);
876        i += 1;
877    }
878}
879
880#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
881#[target_feature(enable = "avx2")]
882unsafe fn square_vec_avx2(input: &[f32], result: &mut [f32]) {
883    #[cfg(feature = "no-std")]
884    use core::arch::x86_64::*;
885    #[cfg(not(feature = "no-std"))]
886    use core::arch::x86_64::*;
887
888    let mut i = 0;
889
890    // Process 8 elements at a time
891    while i + 8 <= input.len() {
892        let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
893        let result_vec = _mm256_mul_ps(input_vec, input_vec);
894        _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
895        i += 8;
896    }
897
898    // Handle remaining elements
899    while i < input.len() {
900        result[i] = input[i] * input[i];
901        i += 1;
902    }
903}
904
905#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
906#[target_feature(enable = "avx2")]
907unsafe fn reciprocal_vec_avx2(input: &[f32], result: &mut [f32]) {
908    #[cfg(feature = "no-std")]
909    use core::arch::x86_64::*;
910    #[cfg(not(feature = "no-std"))]
911    use core::arch::x86_64::*;
912
913    let mut i = 0;
914
915    // Process 8 elements at a time
916    while i + 8 <= input.len() {
917        let input_vec = _mm256_loadu_ps(input.as_ptr().add(i));
918        let ones = _mm256_set1_ps(1.0);
919        let result_vec = _mm256_div_ps(ones, input_vec);
920        _mm256_storeu_ps(result.as_mut_ptr().add(i), result_vec);
921        i += 8;
922    }
923
924    // Handle remaining elements
925    while i < input.len() {
926        result[i] = 1.0 / input[i];
927        i += 1;
928    }
929}
930
931// ============================================================================
932// AVX512 implementations (x86/x86_64)
933// ============================================================================
934
935#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
936#[target_feature(enable = "avx512f")]
937unsafe fn sin_vec_avx512(input: &[f32], result: &mut [f32]) {
938    let mut i = 0;
939
940    while i < input.len() {
941        result[i] = input[i].sin();
942        i += 1;
943    }
944}
945
946#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
947#[target_feature(enable = "avx512f")]
948unsafe fn cos_vec_avx512(input: &[f32], result: &mut [f32]) {
949    let mut i = 0;
950
951    while i < input.len() {
952        result[i] = input[i].cos();
953        i += 1;
954    }
955}
956
957#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
958#[target_feature(enable = "avx512f")]
959unsafe fn tan_vec_avx512(input: &[f32], result: &mut [f32]) {
960    let mut i = 0;
961
962    while i < input.len() {
963        result[i] = input[i].tan();
964        i += 1;
965    }
966}
967
968#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
969#[target_feature(enable = "avx512f")]
970unsafe fn exp_vec_avx512(input: &[f32], result: &mut [f32]) {
971    let mut i = 0;
972
973    while i < input.len() {
974        result[i] = input[i].exp();
975        i += 1;
976    }
977}
978
979#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
980#[target_feature(enable = "avx512f")]
981unsafe fn ln_vec_avx512(input: &[f32], result: &mut [f32]) {
982    let mut i = 0;
983
984    while i < input.len() {
985        result[i] = input[i].ln();
986        i += 1;
987    }
988}
989
990#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
991#[target_feature(enable = "avx512f")]
992unsafe fn sqrt_vec_avx512(input: &[f32], result: &mut [f32]) {
993    #[cfg(feature = "no-std")]
994    use core::arch::x86_64::*;
995    #[cfg(not(feature = "no-std"))]
996    use core::arch::x86_64::*;
997
998    let mut i = 0;
999
1000    // Process 16 elements at a time
1001    while i + 16 <= input.len() {
1002        let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1003        let result_vec = _mm512_sqrt_ps(input_vec);
1004        _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1005        i += 16;
1006    }
1007
1008    // Handle remaining elements
1009    while i < input.len() {
1010        result[i] = input[i].sqrt();
1011        i += 1;
1012    }
1013}
1014
1015#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1016#[target_feature(enable = "avx512f")]
1017unsafe fn pow_vec_avx512(base: &[f32], exponent: &[f32], result: &mut [f32]) {
1018    let mut i = 0;
1019
1020    while i < base.len() {
1021        result[i] = base[i].powf(exponent[i]);
1022        i += 1;
1023    }
1024}
1025
1026#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1027#[target_feature(enable = "avx512f")]
1028unsafe fn square_vec_avx512(input: &[f32], result: &mut [f32]) {
1029    #[cfg(feature = "no-std")]
1030    use core::arch::x86_64::*;
1031    #[cfg(not(feature = "no-std"))]
1032    use core::arch::x86_64::*;
1033
1034    let mut i = 0;
1035
1036    // Process 16 elements at a time
1037    while i + 16 <= input.len() {
1038        let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1039        let result_vec = _mm512_mul_ps(input_vec, input_vec);
1040        _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1041        i += 16;
1042    }
1043
1044    // Handle remaining elements
1045    while i < input.len() {
1046        result[i] = input[i] * input[i];
1047        i += 1;
1048    }
1049}
1050
1051#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1052#[target_feature(enable = "avx512f")]
1053unsafe fn reciprocal_vec_avx512(input: &[f32], result: &mut [f32]) {
1054    #[cfg(feature = "no-std")]
1055    use core::arch::x86_64::*;
1056    #[cfg(not(feature = "no-std"))]
1057    use core::arch::x86_64::*;
1058
1059    let mut i = 0;
1060
1061    // Process 16 elements at a time using accurate division
1062    while i + 16 <= input.len() {
1063        let input_vec = _mm512_loadu_ps(input.as_ptr().add(i));
1064        let ones = _mm512_set1_ps(1.0);
1065        let result_vec = _mm512_div_ps(ones, input_vec);
1066        _mm512_storeu_ps(result.as_mut_ptr().add(i), result_vec);
1067        i += 16;
1068    }
1069
1070    // Handle remaining elements
1071    while i < input.len() {
1072        result[i] = 1.0 / input[i];
1073        i += 1;
1074    }
1075}
1076
1077// ============================================================================
1078// NEON implementations (ARM AArch64)
1079// ============================================================================
1080
1081#[cfg(target_arch = "aarch64")]
1082#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1083#[target_feature(enable = "neon")]
1084unsafe fn sin_vec_neon(input: &[f32], result: &mut [f32]) {
1085    let mut i = 0;
1086
1087    while i < input.len() {
1088        result[i] = input[i].sin();
1089        i += 1;
1090    }
1091}
1092
1093#[cfg(target_arch = "aarch64")]
1094#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1095#[target_feature(enable = "neon")]
1096unsafe fn cos_vec_neon(input: &[f32], result: &mut [f32]) {
1097    let mut i = 0;
1098
1099    while i < input.len() {
1100        result[i] = input[i].cos();
1101        i += 1;
1102    }
1103}
1104
1105#[cfg(target_arch = "aarch64")]
1106#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1107#[target_feature(enable = "neon")]
1108unsafe fn tan_vec_neon(input: &[f32], result: &mut [f32]) {
1109    let mut i = 0;
1110
1111    while i < input.len() {
1112        result[i] = input[i].tan();
1113        i += 1;
1114    }
1115}
1116
1117#[cfg(target_arch = "aarch64")]
1118#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1119#[target_feature(enable = "neon")]
1120unsafe fn exp_vec_neon(input: &[f32], result: &mut [f32]) {
1121    let mut i = 0;
1122
1123    while i < input.len() {
1124        result[i] = input[i].exp();
1125        i += 1;
1126    }
1127}
1128
1129#[cfg(target_arch = "aarch64")]
1130#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1131#[target_feature(enable = "neon")]
1132unsafe fn ln_vec_neon(input: &[f32], result: &mut [f32]) {
1133    let mut i = 0;
1134
1135    while i < input.len() {
1136        result[i] = input[i].ln();
1137        i += 1;
1138    }
1139}
1140
1141#[cfg(target_arch = "aarch64")]
1142#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1143#[target_feature(enable = "neon")]
1144unsafe fn sqrt_vec_neon(input: &[f32], result: &mut [f32]) {
1145    use core::arch::aarch64::*;
1146
1147    let mut i = 0;
1148
1149    // Process 4 elements at a time
1150    while i + 4 <= input.len() {
1151        let input_vec = vld1q_f32(input.as_ptr().add(i));
1152        let result_vec = vsqrtq_f32(input_vec);
1153        vst1q_f32(result.as_mut_ptr().add(i), result_vec);
1154        i += 4;
1155    }
1156
1157    // Handle remaining elements
1158    while i < input.len() {
1159        result[i] = input[i].sqrt();
1160        i += 1;
1161    }
1162}
1163
1164#[cfg(target_arch = "aarch64")]
1165#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1166#[target_feature(enable = "neon")]
1167unsafe fn pow_vec_neon(base: &[f32], exponent: &[f32], result: &mut [f32]) {
1168    let mut i = 0;
1169
1170    while i < base.len() {
1171        result[i] = base[i].powf(exponent[i]);
1172        i += 1;
1173    }
1174}
1175
1176#[cfg(target_arch = "aarch64")]
1177#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1178#[target_feature(enable = "neon")]
1179unsafe fn square_vec_neon(input: &[f32], result: &mut [f32]) {
1180    use core::arch::aarch64::*;
1181
1182    let mut i = 0;
1183
1184    // Process 4 elements at a time
1185    while i + 4 <= input.len() {
1186        let input_vec = vld1q_f32(input.as_ptr().add(i));
1187        let result_vec = vmulq_f32(input_vec, input_vec);
1188        vst1q_f32(result.as_mut_ptr().add(i), result_vec);
1189        i += 4;
1190    }
1191
1192    // Handle remaining elements
1193    while i < input.len() {
1194        result[i] = input[i] * input[i];
1195        i += 1;
1196    }
1197}
1198
1199#[cfg(target_arch = "aarch64")]
1200#[allow(dead_code)] // NEON dispatch; unused in --all-features (no-std disables runtime detection)
1201#[target_feature(enable = "neon")]
1202unsafe fn reciprocal_vec_neon(input: &[f32], result: &mut [f32]) {
1203    use core::arch::aarch64::*;
1204
1205    let mut i = 0;
1206
1207    // Process 4 elements at a time using reciprocal estimate with refinement
1208    while i + 4 <= input.len() {
1209        let input_vec = vld1q_f32(input.as_ptr().add(i));
1210        let mut estimate = vrecpeq_f32(input_vec);
1211        // Two Newton-Raphson refinement steps for improved accuracy
1212        estimate = vmulq_f32(vrecpsq_f32(input_vec, estimate), estimate);
1213        estimate = vmulq_f32(vrecpsq_f32(input_vec, estimate), estimate);
1214        vst1q_f32(result.as_mut_ptr().add(i), estimate);
1215        i += 4;
1216    }
1217
1218    // Handle remaining elements
1219    while i < input.len() {
1220        result[i] = 1.0 / input[i];
1221        i += 1;
1222    }
1223}
1224
1225#[allow(non_snake_case)]
1226#[cfg(all(test, not(feature = "no-std")))]
1227mod tests {
1228    use super::*;
1229
1230    #[cfg(feature = "no-std")]
1231    use alloc::{vec, vec::Vec};
1232
1233    #[cfg(feature = "no-std")]
1234    #[allow(unused_imports)]
1235    use core::f32::consts::{E, PI};
1236    #[cfg(not(feature = "no-std"))]
1237    use std::f32::consts::{E, PI};
1238
1239    const EPSILON: f32 = 1e-6;
1240
1241    #[test]
1242    fn test_sin_vec() {
1243        let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
1244        let mut result = vec![0.0; 4];
1245
1246        sin_vec(&input, &mut result);
1247        assert!((result[0] - 0.0).abs() < EPSILON);
1248        assert!((result[1] - 1.0).abs() < EPSILON);
1249        assert!((result[2] - 0.0).abs() < EPSILON);
1250        assert!((result[3] + 1.0).abs() < EPSILON);
1251
1252        // Test with empty vector
1253        let empty_input: Vec<f32> = vec![];
1254        let mut empty_result: Vec<f32> = vec![];
1255        sin_vec(&empty_input, &mut empty_result);
1256        assert_eq!(empty_result, Vec::<f32>::new());
1257    }
1258
1259    #[test]
1260    fn test_cos_vec() {
1261        let input = vec![0.0, PI / 2.0, PI, 3.0 * PI / 2.0];
1262        let mut result = vec![0.0; 4];
1263
1264        cos_vec(&input, &mut result);
1265        assert!((result[0] - 1.0).abs() < EPSILON);
1266        assert!((result[1] - 0.0).abs() < EPSILON);
1267        assert!((result[2] + 1.0).abs() < EPSILON);
1268        assert!((result[3] - 0.0).abs() < EPSILON);
1269    }
1270
1271    #[test]
1272    fn test_tan_vec() {
1273        let input = vec![0.0, PI / 4.0, -PI / 4.0];
1274        let mut result = vec![0.0; 3];
1275
1276        tan_vec(&input, &mut result);
1277        assert!((result[0] - 0.0).abs() < EPSILON);
1278        assert!((result[1] - 1.0).abs() < EPSILON);
1279        assert!((result[2] + 1.0).abs() < EPSILON);
1280    }
1281
1282    #[test]
1283    fn test_exp_vec() {
1284        let input = vec![0.0, 1.0, 2.0, -1.0];
1285        let mut result = vec![0.0; 4];
1286
1287        exp_vec(&input, &mut result);
1288        assert!((result[0] - 1.0).abs() < EPSILON);
1289        assert!((result[1] - E).abs() < EPSILON);
1290        assert!((result[2] - (E * E)).abs() < 1e-5);
1291        assert!((result[3] - (1.0 / E)).abs() < EPSILON);
1292    }
1293
1294    #[test]
1295    fn test_ln_vec() {
1296        let input = vec![1.0, E, E * E, 1.0 / E];
1297        let mut result = vec![0.0; 4];
1298
1299        ln_vec(&input, &mut result);
1300        assert!((result[0] - 0.0).abs() < EPSILON);
1301        assert!((result[1] - 1.0).abs() < EPSILON);
1302        assert!((result[2] - 2.0).abs() < EPSILON);
1303        assert!((result[3] + 1.0).abs() < EPSILON);
1304
1305        // Test with negative input (should produce NaN)
1306        let negative_input = vec![-1.0];
1307        let mut negative_result = vec![0.0; 1];
1308        ln_vec(&negative_input, &mut negative_result);
1309        assert!(negative_result[0].is_nan());
1310    }
1311
1312    #[test]
1313    fn test_sqrt_vec() {
1314        let input = vec![0.0, 1.0, 4.0, 9.0, 16.0];
1315        let mut result = vec![0.0; 5];
1316
1317        sqrt_vec(&input, &mut result);
1318        assert!((result[0] - 0.0).abs() < EPSILON);
1319        assert!((result[1] - 1.0).abs() < EPSILON);
1320        assert!((result[2] - 2.0).abs() < EPSILON);
1321        assert!((result[3] - 3.0).abs() < EPSILON);
1322        assert!((result[4] - 4.0).abs() < EPSILON);
1323
1324        // Test with negative input (should produce NaN)
1325        let negative_input = vec![-1.0];
1326        let mut negative_result = vec![0.0; 1];
1327        sqrt_vec(&negative_input, &mut negative_result);
1328        assert!(negative_result[0].is_nan());
1329    }
1330
1331    #[test]
1332    fn test_pow_vec() {
1333        let base = vec![2.0, 3.0, 4.0, 5.0];
1334        let exponent = vec![2.0, 3.0, 0.5, -1.0];
1335        let mut result = vec![0.0; 4];
1336
1337        pow_vec(&base, &exponent, &mut result);
1338        assert!((result[0] - 4.0).abs() < EPSILON); // 2^2 = 4
1339        assert!((result[1] - 27.0).abs() < EPSILON); // 3^3 = 27
1340        assert!((result[2] - 2.0).abs() < EPSILON); // 4^0.5 = 2
1341        assert!((result[3] - 0.2).abs() < EPSILON); // 5^-1 = 0.2
1342
1343        // Test with empty vectors
1344        let empty_base: Vec<f32> = vec![];
1345        let empty_exp: Vec<f32> = vec![];
1346        let mut empty_result: Vec<f32> = vec![];
1347        pow_vec(&empty_base, &empty_exp, &mut empty_result);
1348        assert_eq!(empty_result, Vec::<f32>::new());
1349    }
1350
1351    #[test]
1352    fn test_square_vec() {
1353        let input = vec![1.0, 2.0, 3.0, -4.0];
1354        let mut result = vec![0.0; 4];
1355
1356        square_vec(&input, &mut result);
1357        assert!((result[0] - 1.0).abs() < EPSILON);
1358        assert!((result[1] - 4.0).abs() < EPSILON);
1359        assert!((result[2] - 9.0).abs() < EPSILON);
1360        assert!((result[3] - 16.0).abs() < EPSILON);
1361    }
1362
1363    #[test]
1364    fn test_reciprocal_vec() {
1365        let input = vec![1.0, 2.0, 4.0, 0.5];
1366        let mut result = vec![0.0; 4];
1367
1368        reciprocal_vec(&input, &mut result);
1369        assert!((result[0] - 1.0).abs() < EPSILON);
1370        assert!((result[1] - 0.5).abs() < EPSILON);
1371        assert!((result[2] - 0.25).abs() < EPSILON);
1372        assert!((result[3] - 2.0).abs() < EPSILON);
1373
1374        // Test division by zero
1375        let zero_input = vec![0.0];
1376        let mut zero_result = vec![0.0; 1];
1377        reciprocal_vec(&zero_input, &mut zero_result);
1378        assert_eq!(zero_result[0], f32::INFINITY);
1379    }
1380
1381    #[test]
1382    fn test_special_values() {
1383        // Test exp with large values
1384        let large_input = vec![100.0];
1385        let mut large_result = vec![0.0; 1];
1386        exp_vec(&large_input, &mut large_result);
1387        assert_eq!(large_result[0], f32::INFINITY);
1388
1389        // Test ln with zero
1390        let zero_input = vec![0.0];
1391        let mut zero_result = vec![0.0; 1];
1392        ln_vec(&zero_input, &mut zero_result);
1393        assert_eq!(zero_result[0], f32::NEG_INFINITY);
1394
1395        // Test sqrt with infinity
1396        let inf_input = vec![f32::INFINITY];
1397        let mut inf_result = vec![0.0; 1];
1398        sqrt_vec(&inf_input, &mut inf_result);
1399        assert_eq!(inf_result[0], f32::INFINITY);
1400    }
1401
1402    #[test]
1403    fn test_mathematical_identities() {
1404        let angles = vec![0.5, 1.0, 1.5, 2.0];
1405        let mut sin_results = vec![0.0; 4];
1406        let mut cos_results = vec![0.0; 4];
1407
1408        sin_vec(&angles, &mut sin_results);
1409        cos_vec(&angles, &mut cos_results);
1410
1411        // Test Pythagorean identity: sin²(x) + cos²(x) = 1
1412        for i in 0..4 {
1413            let sin_sq = sin_results[i] * sin_results[i];
1414            let cos_sq = cos_results[i] * cos_results[i];
1415            assert!((sin_sq + cos_sq - 1.0).abs() < 1e-6);
1416        }
1417
1418        // Test exp(ln(x)) = x for positive values
1419        let positive_values = vec![1.0, 2.0, 3.0, 4.0];
1420        let mut ln_results = vec![0.0; 4];
1421        let mut exp_ln_results = vec![0.0; 4];
1422
1423        ln_vec(&positive_values, &mut ln_results);
1424        exp_vec(&ln_results, &mut exp_ln_results);
1425
1426        for i in 0..4 {
1427            assert!((exp_ln_results[i] - positive_values[i]).abs() < 1e-5);
1428        }
1429    }
1430
1431    #[test]
1432    fn test_large_vectors() {
1433        let size = 100;
1434        let input: Vec<f32> = (0..size).map(|i| (i as f32) / 10.0).collect();
1435        let mut result = vec![0.0; size];
1436
1437        sqrt_vec(&input, &mut result);
1438
1439        for (i, &val) in result.iter().enumerate().take(size) {
1440            let expected = ((i as f32) / 10.0).sqrt();
1441            assert!((val - expected).abs() < 1e-6);
1442        }
1443    }
1444
1445    #[test]
1446    #[should_panic(expected = "Input and output vectors must have the same length")]
1447    fn test_sin_vec_dimension_mismatch() {
1448        let input = vec![1.0, 2.0, 3.0];
1449        let mut result = vec![0.0; 2];
1450        sin_vec(&input, &mut result);
1451    }
1452
1453    #[test]
1454    #[should_panic(expected = "Input vectors must have the same length")]
1455    fn test_pow_vec_input_dimension_mismatch() {
1456        let base = vec![1.0, 2.0, 3.0];
1457        let exponent = vec![2.0, 3.0];
1458        let mut result = vec![0.0; 3];
1459        pow_vec(&base, &exponent, &mut result);
1460    }
1461
1462    #[test]
1463    #[should_panic(expected = "Output vector must have the same length as input vectors")]
1464    fn test_pow_vec_output_dimension_mismatch() {
1465        let base = vec![1.0, 2.0, 3.0];
1466        let exponent = vec![2.0, 3.0, 4.0];
1467        let mut result = vec![0.0; 2];
1468        pow_vec(&base, &exponent, &mut result);
1469    }
1470}