scirs2_core/simd_ops.rs

//! Unified SIMD operations abstraction layer
//!
//! This module provides a comprehensive abstraction layer for all SIMD operations
//! used across the scirs2 ecosystem. All modules should use these operations
//! instead of implementing their own SIMD code.

use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
use num_traits::Zero;

#[cfg(feature = "simd")]
use crate::simd_ops_polynomial;

/// Unified SIMD operations trait
pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
    /// Element-wise addition
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise subtraction
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise multiplication
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise division
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Dot product
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Matrix-vector multiplication (GEMV)
    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);

    /// Matrix-matrix multiplication (GEMM)
    fn simd_gemm(
        alpha: Self,
        a: &ArrayView2<Self>,
        b: &ArrayView2<Self>,
        beta: Self,
        c: &mut Array2<Self>,
    );

    /// Vector norm (L2)
    fn simd_norm(a: &ArrayView1<Self>) -> Self;

    /// Element-wise maximum
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise minimum
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Scalar multiplication
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;

    /// Sum reduction
    fn simd_sum(a: &ArrayView1<Self>) -> Self;

    /// Mean reduction
    fn simd_mean(a: &ArrayView1<Self>) -> Self;

    /// Find maximum element
    fn simd_max_element(a: &ArrayView1<Self>) -> Self;

    /// Find minimum element
    fn simd_min_element(a: &ArrayView1<Self>) -> Self;

    /// Fused multiply-add: a * b + c
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;

    /// Enhanced cache-optimized addition for large arrays
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Advanced-optimized fused multiply-add for maximum performance
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self>;

    /// Adaptive SIMD operation that selects optimal implementation
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Matrix transpose
    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;

    /// Cache-optimized blocked matrix transpose
    /// Uses L1 cache-friendly block sizes for improved memory access patterns.
    /// Expected 3-5x speedup for large matrices (>512x512).
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self>;

    /// Element-wise absolute value
    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise square root
    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise exponential (e^x)
    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise natural logarithm (ln(x))
    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise sine (sin(x))
    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise cosine (cos(x))
    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise tangent (tan(x))
    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise hyperbolic sine (sinh(x))
    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise hyperbolic cosine (cosh(x))
    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise hyperbolic tangent (tanh(x))
    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise floor (largest integer <= x)
    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise ceiling (smallest integer >= x)
    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise rounding to nearest integer
    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise arctangent (atan(x))
    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise arcsine (asin(x))
    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise arccosine (acos(x))
    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise two-argument arctangent (atan2(y, x))
    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise base-10 logarithm (log10(x))
    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise base-2 logarithm (log2(x))
    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise clamp (constrain values to [min, max])
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self>;

    /// Element-wise fractional part (x - trunc(x))
    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise trunc (round toward zero)
    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise reciprocal (1/x)
    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise power with scalar exponent (base^exp)
    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self>;

    /// Element-wise power with array exponent (`base[i]^exp[i]`)
    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise power with integer exponent (base^n)
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self>;

    /// Element-wise gamma function Γ(x)
    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise 2^x (base-2 exponential)
    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise cube root (cbrt(x) = x^(1/3))
    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise ln(1+x) (numerically stable for small x)
    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise exp(x)-1 (numerically stable for small x)
    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise conversion from degrees to radians (x * π / 180)
    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise conversion from radians to degrees (x * 180 / π)
    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise digamma function ψ(x) = d/dx ln(Γ(x))
    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise trigamma function ψ'(x) = d²/dx² ln(Γ(x))
    /// The second derivative of log-gamma, critical for Fisher information in Bayesian inference.
    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise log-gamma function ln(Γ(x))
    /// More numerically stable than computing gamma(x).ln() - used extensively in statistical distributions.
    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise error function erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
    /// Critical for normal distribution CDF: Φ(x) = 0.5 * (1 + erf(x/√2))
    /// Properties: erf(0)=0, erf(∞)=1, erf(-x)=-erf(x)
    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise complementary error function erfc(x) = 1 - erf(x)
    /// More numerically stable than 1 - erf(x) for large x
    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise inverse error function erfinv(y) such that erf(erfinv(y)) = y
    /// Critical for inverse normal CDF (probit function): Φ⁻¹(p) = √2 * erfinv(2p - 1)
    /// Domain: (-1, 1), Range: (-∞, ∞)
    /// Properties: erfinv(0)=0, erfinv(-y)=-erfinv(y) (odd function)
    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise inverse complementary error function erfcinv(y) such that erfc(erfcinv(y)) = y
    /// More numerically stable than erfinv(1-y) for y close to 0
    /// Domain: (0, 2), Range: (-∞, ∞)
    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise sigmoid (logistic) function: σ(x) = 1 / (1 + exp(-x))
    /// Critical for neural networks, logistic regression, and probability modeling
    /// Range: (0, 1), σ(0) = 0.5, σ(-∞) = 0, σ(+∞) = 1
    /// Properties: σ(-x) = 1 - σ(x), derivative σ'(x) = σ(x)(1 - σ(x))
    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise GELU (Gaussian Error Linear Unit) activation function
    /// GELU(x) = x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
    /// Where Φ(x) is the standard normal CDF
    /// Critical for Transformer models (BERT, GPT, etc.)
    /// Properties: GELU(0) = 0, smooth approximation of ReLU
    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise Swish (SiLU - Sigmoid Linear Unit) activation function
    /// Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
    /// Self-gated activation discovered via neural architecture search
    /// Used in EfficientNet, GPT-NeoX, and many modern architectures
    /// Properties: smooth, non-monotonic, self-gating, unbounded above
    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise Softplus activation function
    /// Softplus(x) = ln(1 + exp(x))
    /// Smooth approximation of ReLU
    /// Used in probabilistic models, Bayesian deep learning, smooth counting
    /// Properties: softplus(0) = ln(2) ≈ 0.693, always positive, derivative = sigmoid(x)
    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise Mish activation function
    /// Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
    /// Self-regularized non-monotonic activation function
    /// Used in YOLOv4, modern object detection, and neural architectures
    /// Properties: smooth, non-monotonic, Mish(0) = 0, unbounded above
    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise ELU (Exponential Linear Unit) activation function
    /// ELU(x, α) = x if x >= 0, α * (exp(x) - 1) if x < 0
    /// Helps with vanishing gradients and faster learning
    /// Used in deep neural networks for smoother outputs
    /// Properties: smooth, continuous derivative, bounded below by -α
    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;

    /// SELU activation function (Scaled Exponential Linear Unit)
    ///
    /// SELU(x) = λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
    /// where λ ≈ 1.0507 and α ≈ 1.6733 (fixed constants)
    /// Self-normalizing: preserves mean=0, variance=1 through layers
    /// Used in Self-Normalizing Neural Networks (SNNs)
    /// Eliminates need for BatchNorm when using LeCun Normal initialization
    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Hardsigmoid activation function
    ///
    /// Hardsigmoid(x) = clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of sigmoid
    /// Used in MobileNetV3 for efficient inference
    /// Avoids expensive exp() computation
    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Hardswish activation function
    ///
    /// Hardswish(x) = x * hardsigmoid(x) = x * clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of Swish
    /// Used in MobileNetV3 for efficient inference
    /// Avoids expensive exp() computation while maintaining self-gating
    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Sinc function (normalized)
    ///
    /// sinc(x) = sin(πx) / (πx) for x ≠ 0, sinc(0) = 1
    /// Critical for signal processing, windowing, interpolation
    /// Properties: sinc(n) = 0 for all non-zero integers n
    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Log-softmax function for numerically stable probability computation
    ///
    /// log_softmax(x_i) = x_i - log(Σ_j exp(x_j))
    /// Critical for neural networks, especially cross-entropy loss
    /// More numerically stable than computing log(softmax(x))
    /// Used in Transformers, LLMs, and classification networks
    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Inverse hyperbolic sine: asinh(x) = ln(x + √(x² + 1))
    ///
    /// Domain: (-∞, +∞), Range: (-∞, +∞)
    /// Used in: hyperbolic geometry, conformal mapping, special relativity (rapidity)
    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Inverse hyperbolic cosine: acosh(x) = ln(x + √(x² - 1))
    ///
    /// Domain: [1, +∞), Range: [0, +∞)
    /// Returns NaN for x < 1
    /// Used in: hyperbolic geometry, distance calculations, special relativity
    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1+x)/(1-x))
    ///
    /// Domain: (-1, 1), Range: (-∞, +∞)
    /// Returns ±∞ at x = ±1, NaN for |x| > 1
    /// Used in: statistical transformations (Fisher's z), probability
    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Beta function: B(a, b) = Γ(a)Γ(b)/Γ(a+b)
    ///
    /// The beta function is fundamental for:
    /// - Beta distribution (Bayesian priors)
    /// - Binomial coefficients: C(n,k) = 1 / ((n+1) · B(n-k+1, k+1))
    /// - Statistical hypothesis testing
    /// - Incomplete beta function (regularized)
    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Log-Beta function: ln(B(a, b)) = ln(Γ(a)) + ln(Γ(b)) - ln(Γ(a+b))
    ///
    /// More numerically stable than computing B(a,b) for large arguments.
    /// Returns ln(B(a,b)) for each pair of inputs.
    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Linear interpolation: lerp(a, b, t) = a + t * (b - a) = a * (1 - t) + b * t
    ///
    /// Computes element-wise linear interpolation between arrays `a` and `b`
    /// using interpolation parameter `t`. When t=0, returns a; when t=1, returns b.
    ///
    /// Critical for:
    /// - Animation blending (skeletal animation, morph targets)
    /// - Quaternion SLERP approximation (for small angles)
    /// - Gradient computation in neural networks
    /// - Smooth parameter transitions
    /// - Color blending and image processing
    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self>;

    /// Smoothstep interpolation: smoothstep(edge0, edge1, x)
    ///
    /// Returns smooth Hermite interpolation between 0 and 1 when edge0 < x < edge1.
    /// - Returns 0 if x <= edge0
    /// - Returns 1 if x >= edge1
    /// - Returns smooth curve: 3t² - 2t³ where t = (x - edge0) / (edge1 - edge0)
    ///
    /// Critical for:
    /// - Shader programming (lighting, transitions)
    /// - Activation function variants
    /// - Smooth threshold functions
    /// - Anti-aliasing and blending
    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;

    /// Hypotenuse: hypot(x, y) = sqrt(x² + y²)
    ///
    /// Computes element-wise hypotenuse without overflow/underflow issues.
    /// Uses the standard library implementation which handles extreme values.
    ///
    /// Critical for:
    /// - Distance calculations in 2D/3D
    /// - Computing vector magnitudes
    /// - Graphics and physics simulations
    /// - Complex number modulus: |a+bi| = hypot(a, b)
    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;

    /// Copysign: copysign(x, y) returns x with the sign of y
    ///
    /// For each element, returns the magnitude of x with the sign of y.
    /// - copysign(1.0, -2.0) = -1.0
    /// - copysign(-3.0, 4.0) = 3.0
    ///
    /// Critical for:
    /// - Sign manipulation in numerical algorithms
    /// - Implementing special functions (e.g., reflection formula)
    /// - Gradient sign propagation
    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;

    /// Smootherstep (Ken Perlin's improved smoothstep): 6t⁵ - 15t⁴ + 10t³
    ///
    /// An improved version of smoothstep with second-order continuous derivatives.
    /// The first AND second derivatives are zero at the boundaries.
    ///
    /// Critical for:
    /// - Perlin noise and procedural generation
    /// - High-quality animation easing
    /// - Shader programming (better lighting transitions)
    /// - Gradient-based optimization (smoother loss landscapes)
    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;

    /// Logaddexp: log(exp(a) + exp(b)) computed in a numerically stable way
    ///
    /// Uses the identity: log(exp(a) + exp(b)) = max(a,b) + log(1 + exp(-|a-b|))
    /// This avoids overflow/underflow for large positive or negative values.
    ///
    /// Critical for:
    /// - Log-probability computations (Bayesian inference)
    /// - Log-likelihood calculations in ML
    /// - Hidden Markov Model forward/backward algorithms
    /// - Neural network loss functions (cross-entropy)
    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Logit function: log(p / (1-p)) - inverse of sigmoid
    ///
    /// Maps probabilities in (0, 1) to log-odds in (-∞, +∞).
    /// The logit function is the inverse of the sigmoid (logistic) function.
    ///
    /// Critical for:
    /// - Logistic regression (log-odds interpretation)
    /// - Probability calibration
    /// - Converting probabilities to unbounded space for optimization
    /// - Statistical modeling (link functions)
    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Element-wise square: x²
    ///
    /// More efficient than simd_pow(x, 2); unlike simd_mul(x, x), it performs a
    /// single multiplication per element over one input stream.
    ///
    /// Critical for:
    /// - Variance computation: E\[X²\] - E\[X\]²
    /// - Distance calculations: ||a - b||² = (a - b)²
    /// - Neural network loss functions (MSE)
    /// - Physics simulations (kinetic energy: ½mv²)
    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Inverse square root: 1/sqrt(x)
    ///
    /// More efficient than simd_div(1, simd_sqrt(x)) for normalization operations.
    ///
    /// Critical for:
    /// - Vector normalization: v * rsqrt(dot(v,v))
    /// - Graphics (lighting, physics simulations)
    /// - Layer normalization in neural networks
    /// - Quaternion normalization
    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Simultaneous sin and cos: returns (sin(x), cos(x))
    ///
    /// More efficient than calling sin and cos separately when both are needed.
    /// Returns a tuple of two arrays.
    ///
    /// Critical for:
    /// - Rotation matrices (2D and 3D)
    /// - Fourier transforms
    /// - Wave simulations
    /// - Animation and physics
    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>);

    /// Numerically stable exp(x) - 1
    ///
    /// Returns exp(x) - 1 accurately for small x values where exp(x) ≈ 1.
    /// For small x, the direct calculation exp(x) - 1 suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Financial calculations (compound interest for small rates)
    /// - Numerical integration of differential equations
    /// - Statistical distributions (Poisson, exponential)
    /// - Machine learning (softplus, log-sum-exp)
    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Numerically stable ln(1 + x)
    ///
    /// Returns ln(1 + x) accurately for small x values where 1 + x ≈ 1.
    /// For small x, the direct calculation ln(1 + x) suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Log-probability calculations (log(1 - p) for small p)
    /// - Numerical integration
    /// - Statistical distributions
    /// - Machine learning (binary cross-entropy loss)
    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Sum of squares
    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;

    /// Element-wise multiplication (alias for simd_mul)
    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;

    /// Check if SIMD is available for this type
    fn simd_available() -> bool;

    /// Ultra-optimized sum reduction (alias for simd_sum for compatibility)
    fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
        Self::simd_sum(a)
    }

    /// Ultra-optimized subtraction (alias for simd_sub for compatibility)
    fn simd_sub_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized multiplication (alias for simd_mul for compatibility)
    fn simd_mul_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized cubes sum (power 3 sum)
    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;

    /// Ultra-optimized division (alias for simd_div for compatibility)
    fn simd_div_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized sine function
    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);

    /// Ultra-optimized addition (alias for simd_add for compatibility)
    fn simd_add_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized fused multiply-add
    fn simd_fma_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized power function
    fn simd_pow_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );

    /// Ultra-optimized exponential function
    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);

    /// Ultra-optimized cosine function
    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);

    /// Ultra-optimized dot product
    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Variance (population variance)
    fn simd_variance(a: &ArrayView1<Self>) -> Self;

    /// Standard deviation
    fn simd_std(a: &ArrayView1<Self>) -> Self;

    /// L1 norm (Manhattan norm)
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self;

    /// L∞ norm (Chebyshev norm / max absolute)
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self;

    /// Cosine similarity between two vectors
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Euclidean distance between two vectors
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Manhattan distance between two vectors
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Chebyshev distance between two vectors
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Cosine distance (1 - cosine_similarity)
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;

    /// Weighted sum
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;

    /// Weighted mean
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;

    /// Find index of minimum element (argmin)
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize>;

    /// Find index of maximum element (argmax)
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize>;

    /// Clip values to [min_val, max_val] range
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self>;

    /// Log-sum-exp for numerically stable softmax computation
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self;

    /// Softmax for probability distribution (softmax = exp(x - log_sum_exp(x)))
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Cumulative sum
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Cumulative product
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self>;

    /// First-order difference (`a[i+1] - a[i]`)
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Sign function: returns -1 for negative, 0 for zero, +1 for positive
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self>;

    /// ReLU activation: max(0, x)
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Leaky ReLU: x if x > 0 else alpha * x
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;

    /// L2 normalization (unit vector)
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self>;

    /// Standardization: (x - mean) / std
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self>;
}
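
// A minimal usage sketch of the trait above (an illustration, not part of the
// API): it assumes the `SimdUnifiedOps` impls for f64 provided by this module
// and uses arbitrary example data.
#[cfg(test)]
mod trait_usage_sketch {
    use super::SimdUnifiedOps;
    use ndarray::array;

    #[test]
    fn elementwise_add_and_dot() {
        let a = array![1.0_f64, 2.0, 3.0];
        let b = array![4.0_f64, 5.0, 6.0];
        // Element-wise addition: [5, 7, 9]
        assert_eq!(f64::simd_add(&a.view(), &b.view()), array![5.0, 7.0, 9.0]);
        // Dot product: 1*4 + 2*5 + 3*6 = 32
        assert!((f64::simd_dot(&a.view(), &b.view()) - 32.0).abs() < 1e-12);
    }
}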

// Lanczos approximation helper functions for gamma
/// Lanczos approximation for gamma function (f32)
#[allow(dead_code)]
fn lanczos_gamma_f32(z: f32) -> f32 {
    const G: f32 = 7.0;
    const SQRT_2PI: f32 = 2.5066282746310002; // sqrt(2*PI)
    const LANCZOS_COEFFS: [f32; 9] = [
        0.99999999999980993,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    // For z < 0.5, use reflection formula: Γ(z) = π / (sin(πz) · Γ(1-z))
    if z < 0.5 {
        let pi = std::f32::consts::PI;
        let sinpix = (pi * z).sin();
        if sinpix.abs() < 1e-10 {
            return f32::INFINITY;
        }
        return pi / (sinpix * lanczos_gamma_f32(1.0 - z));
    }

    let z_shifted = z - 1.0;
    let mut acc = LANCZOS_COEFFS[0];
    for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
        acc += coeff / (z_shifted + i as f32);
    }

    let t = z_shifted + G + 0.5;
    SQRT_2PI * acc * t.powf(z_shifted + 0.5) * (-t).exp()
}

/// Lanczos approximation for gamma function (f64)
#[allow(dead_code)]
fn lanczos_gamma_f64(z: f64) -> f64 {
    const G: f64 = 7.0;
    const SQRT_2PI: f64 = 2.5066282746310002; // sqrt(2*PI)
    const LANCZOS_COEFFS: [f64; 9] = [
        0.999999999999809932,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    // For z < 0.5, use reflection formula: Γ(z) = π / (sin(πz) · Γ(1-z))
    if z < 0.5 {
        let pi = std::f64::consts::PI;
        let sinpix = (pi * z).sin();
        if sinpix.abs() < 1e-14 {
            return f64::INFINITY;
        }
        return pi / (sinpix * lanczos_gamma_f64(1.0 - z));
    }

    let z_shifted = z - 1.0;
    let mut acc = LANCZOS_COEFFS[0];
    for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
        acc += coeff / (z_shifted + i as f64);
    }

    let t = z_shifted + G + 0.5;
    SQRT_2PI * acc * t.powf(z_shifted + 0.5) * (-t).exp()
}
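
// Sanity sketch for the Lanczos helpers above: Γ(n) = (n-1)! for positive
// integers, Γ(0.5) = √π, and Γ(-0.5) = -2√π via the reflection branch.
// Tolerances are loose assumptions for each precision.
#[cfg(test)]
mod lanczos_gamma_sketch {
    use super::{lanczos_gamma_f32, lanczos_gamma_f64};

    #[test]
    fn gamma_known_values() {
        let sqrt_pi = std::f64::consts::PI.sqrt();
        // Γ(5) = 4! = 24
        assert!((lanczos_gamma_f64(5.0) - 24.0).abs() < 1e-9);
        assert!((lanczos_gamma_f32(5.0) - 24.0).abs() < 1e-2);
        // Γ(0.5) = √π; Γ(-0.5) = -2√π exercises the reflection formula
        assert!((lanczos_gamma_f64(0.5) - sqrt_pi).abs() < 1e-10);
        assert!((lanczos_gamma_f64(-0.5) + 2.0 * sqrt_pi).abs() < 1e-9);
    }
}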

/// Digamma function (psi) - logarithmic derivative of gamma function (f32)
/// ψ(x) = d/dx ln(Γ(x)) = Γ'(x) / Γ(x)
#[allow(dead_code)]
fn digamma_f32(mut x: f32) -> f32 {
    let pi = std::f32::consts::PI;

    // Handle special cases
    if x.is_nan() {
        return f32::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { f32::INFINITY } else { f32::NAN };
    }

    // Handle negative x using reflection formula:
    // ψ(-x) = ψ(x) + π·cot(πx) + 1/x
    // Or equivalently: ψ(1-x) - ψ(x) = π·cot(πx)
    if x <= 0.0 {
        // Check if x is a negative integer (pole)
        if x == x.floor() {
            return f32::NAN;
        }
        // Use reflection: ψ(x) = ψ(1-x) - π·cot(πx)
        return digamma_f32(1.0 - x) - pi / (pi * x).tan();
    }

    let mut result = 0.0;

    // Use recurrence relation ψ(x+1) = ψ(x) + 1/x to shift to large x
    // ψ(x) = ψ(x+n) - 1/(x+n-1) - 1/(x+n-2) - ... - 1/x
    while x < 6.0 {
        result -= 1.0 / x;
        x += 1.0;
    }

    // Asymptotic expansion for large x:
    // ψ(x) ≈ ln(x) - 1/(2x) - 1/(12x²) + 1/(120x⁴) - 1/(252x⁶) + ...
    result += x.ln() - 0.5 / x;
    let x2 = x * x;
    let x4 = x2 * x2;
    let x6 = x4 * x2;
    result -= 1.0 / (12.0 * x2);
    result += 1.0 / (120.0 * x4);
    result -= 1.0 / (252.0 * x6);

    result
}

/// Digamma function (psi) - logarithmic derivative of gamma function (f64)
/// ψ(x) = d/dx ln(Γ(x)) = Γ'(x) / Γ(x)
#[allow(dead_code)]
fn digamma_f64(mut x: f64) -> f64 {
    let pi = std::f64::consts::PI;

    // Handle special cases
    if x.is_nan() {
        return f64::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { f64::INFINITY } else { f64::NAN };
    }

    // Handle negative x using reflection formula:
    // ψ(-x) = ψ(x) + π·cot(πx) + 1/x
    // Or equivalently: ψ(1-x) - ψ(x) = π·cot(πx)
    if x <= 0.0 {
        // Check if x is a negative integer (pole)
        if x == x.floor() {
            return f64::NAN;
        }
        // Use reflection: ψ(x) = ψ(1-x) - π·cot(πx)
        return digamma_f64(1.0 - x) - pi / (pi * x).tan();
    }

    let mut result = 0.0;

    // Use recurrence relation ψ(x+1) = ψ(x) + 1/x to shift to large x
    // ψ(x) = ψ(x+n) - 1/(x+n-1) - 1/(x+n-2) - ... - 1/x
    while x < 8.0 {
        result -= 1.0 / x;
        x += 1.0;
    }

    // Asymptotic expansion for large x (more terms for f64):
    // ψ(x) ≈ ln(x) - 1/(2x) - Σ B_{2k}/(2k·x^{2k}) for k=1,2,...
    // where B_{2k} are Bernoulli numbers
    result += x.ln() - 0.5 / x;
    let x2 = x * x;
    let x4 = x2 * x2;
    let x6 = x4 * x2;
    let x8 = x4 * x4;
    let x10 = x8 * x2;
    let x12 = x8 * x4;
    // B_2 = 1/6, B_4 = -1/30, B_6 = 1/42, B_8 = -1/30, B_10 = 5/66, B_12 = -691/2730
    result -= 1.0 / (12.0 * x2); // B_2 / (2 * x^2)
    result += 1.0 / (120.0 * x4); // -B_4 / (4 * x^4)
    result -= 1.0 / (252.0 * x6); // B_6 / (6 * x^6)
    result += 1.0 / (240.0 * x8); // -B_8 / (8 * x^8)
    result -= 5.0 / (660.0 * x10); // B_10 / (10 * x^10)
    result += 691.0 / (32760.0 * x12); // -B_12 / (12 * x^12)

    result
}
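
// Sanity sketch for the digamma helpers above: ψ(1) = -γ and the recurrence
// ψ(x+1) = ψ(x) + 1/x. The Euler–Mascheroni constant below is assumed from
// standard tables.
#[cfg(test)]
mod digamma_sketch {
    use super::digamma_f64;

    #[test]
    fn digamma_known_values() {
        const EULER_GAMMA: f64 = 0.577_215_664_901_532_9;
        // ψ(1) = -γ
        assert!((digamma_f64(1.0) + EULER_GAMMA).abs() < 1e-12);
        // Recurrence: ψ(3.5) - ψ(2.5) = 1/2.5
        assert!((digamma_f64(3.5) - digamma_f64(2.5) - 0.4).abs() < 1e-12);
    }
}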

/// Trigamma function ψ'(x) = d²/dx² ln(Γ(x)) for f32
///
/// Uses:
/// 1. Reflection formula for x < 0.5: ψ'(1-x) + ψ'(x) = π²/sin²(πx)
/// 2. Recurrence relation for small x: ψ'(x+1) = ψ'(x) - 1/x²
/// 3. Asymptotic expansion for large x
fn trigamma_f32(mut x: f32) -> f32 {
    let pi = std::f32::consts::PI;

    // Handle special cases
    if x.is_nan() {
        return f32::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 0.0 } else { f32::NAN };
    }

    // Handle negative x using reflection formula:
    // ψ'(1-x) + ψ'(x) = π²/sin²(πx)
    if x <= 0.0 {
        // Check if x is a negative integer (pole)
        if x == x.floor() {
            return f32::NAN;
        }
        // Use reflection: ψ'(x) = π²/sin²(πx) - ψ'(1-x)
        let sin_pix = (pi * x).sin();
        return (pi * pi) / (sin_pix * sin_pix) - trigamma_f32(1.0 - x);
    }

    let mut result = 0.0;

    // Use recurrence relation ψ'(x+1) = ψ'(x) - 1/x² to shift to large x
    // ψ'(x) = ψ'(x+n) + 1/x² + 1/(x+1)² + ... + 1/(x+n-1)²
    while x < 6.0 {
        result += 1.0 / (x * x);
        x += 1.0;
    }

    // Asymptotic expansion for large x:
    // ψ'(x) ≈ 1/x + 1/(2x²) + 1/(6x³) - 1/(30x⁵) + 1/(42x⁷) - ...
    let x2 = x * x;
    let x3 = x2 * x;
    let x5 = x3 * x2;
    let x7 = x5 * x2;

    result += 1.0 / x;
    result += 1.0 / (2.0 * x2);
    result += 1.0 / (6.0 * x3);
    result -= 1.0 / (30.0 * x5);
    result += 1.0 / (42.0 * x7);

    result
}

/// Trigamma function ψ'(x) = d²/dx² ln(Γ(x)) for f64
///
/// Uses:
/// 1. Reflection formula for x < 0.5: ψ'(1-x) + ψ'(x) = π²/sin²(πx)
/// 2. Recurrence relation for small x: ψ'(x+1) = ψ'(x) - 1/x²
/// 3. Asymptotic expansion for large x (more terms for f64 precision)
fn trigamma_f64(mut x: f64) -> f64 {
    let pi = std::f64::consts::PI;

    // Handle special cases
    if x.is_nan() {
        return f64::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 0.0 } else { f64::NAN };
    }

    // Handle negative x using reflection formula:
    // ψ'(1-x) + ψ'(x) = π²/sin²(πx)
    if x <= 0.0 {
        // Check if x is a negative integer (pole)
        if x == x.floor() {
            return f64::NAN;
        }
        // Use reflection: ψ'(x) = π²/sin²(πx) - ψ'(1-x)
        let sin_pix = (pi * x).sin();
        return (pi * pi) / (sin_pix * sin_pix) - trigamma_f64(1.0 - x);
    }

    let mut result = 0.0;

    // Use recurrence relation ψ'(x+1) = ψ'(x) - 1/x² to shift to large x
    // ψ'(x) = ψ'(x+n) + 1/x² + 1/(x+1)² + ... + 1/(x+n-1)²
    while x < 8.0 {
        result += 1.0 / (x * x);
        x += 1.0;
    }

    // Asymptotic expansion for large x (more terms for f64):
    // ψ'(x) ≈ 1/x + 1/(2x²) + Σ B_{2k}/x^{2k+1} for k=1,2,...
    // where B_{2k} are Bernoulli numbers: B_2=1/6, B_4=-1/30, B_6=1/42, B_8=-1/30
    let x2 = x * x;
    let x3 = x2 * x;
    let x5 = x3 * x2;
    let x7 = x5 * x2;
    let x9 = x7 * x2;
    let x11 = x9 * x2;

    result += 1.0 / x;
    result += 1.0 / (2.0 * x2);
    result += 1.0 / (6.0 * x3); // B_2 = 1/6
    result -= 1.0 / (30.0 * x5); // B_4 = -1/30
    result += 1.0 / (42.0 * x7); // B_6 = 1/42
    result -= 1.0 / (30.0 * x9); // B_8 = -1/30
    result += 5.0 / (66.0 * x11); // B_10 = 5/66

    result
}
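
// Sanity sketch for the trigamma helpers above: ψ'(1) = π²/6 (the Basel sum),
// and the reflection identity ψ'(x) + ψ'(1-x) = π²/sin²(πx).
#[cfg(test)]
mod trigamma_sketch {
    use super::trigamma_f64;

    #[test]
    fn trigamma_known_values() {
        let pi = std::f64::consts::PI;
        // ψ'(1) = π²/6
        assert!((trigamma_f64(1.0) - pi * pi / 6.0).abs() < 1e-11);
        // Reflection at x = -0.5: ψ'(-0.5) + ψ'(1.5) = π²/sin²(-π/2) = π²
        assert!((trigamma_f64(-0.5) + trigamma_f64(1.5) - pi * pi).abs() < 1e-10);
    }
}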

/// Log-gamma function ln(Γ(x)) for f32
///
/// More numerically stable than gamma(x).ln() since it avoids overflow.
/// Uses Lanczos approximation: ln(Γ(z)) = ln(√(2π)) + (z-0.5)*ln(t) - t + ln(sum)
/// where t = z + g - 0.5
fn ln_gamma_f32(z: f32) -> f32 {
    const G: f32 = 7.0;
    const LN_SQRT_2PI: f32 = 0.9189385332046727; // ln(sqrt(2*PI))
    const LANCZOS_COEFFS: [f32; 9] = [
        0.99999999999980993,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    // Handle special cases
    if z.is_nan() {
        return f32::NAN;
    }
    if z.is_infinite() {
        return if z > 0.0 { f32::INFINITY } else { f32::NAN };
    }
    if z <= 0.0 && z == z.floor() {
        return f32::INFINITY; // Poles at non-positive integers
    }

    // For z < 0.5, use reflection formula:
    // ln(Γ(z)) = ln(π) - ln(sin(πz)) - ln(Γ(1-z))
    if z < 0.5 {
        let pi = std::f32::consts::PI;
        let sinpiz = (pi * z).sin().abs();
        if sinpiz < 1e-10 {
            return f32::INFINITY;
        }
        return pi.ln() - sinpiz.ln() - ln_gamma_f32(1.0 - z);
    }

    // Lanczos approximation for z >= 0.5
    let z_shifted = z - 1.0;
    let mut sum = LANCZOS_COEFFS[0];
    for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
        sum += coeff / (z_shifted + i as f32);
    }

    let t = z_shifted + G + 0.5;
    // ln(Γ(z)) = ln(√(2π)) + (z-0.5)*ln(t) - t + ln(sum)
    LN_SQRT_2PI + (z_shifted + 0.5) * t.ln() - t + sum.ln()
}

/// Log-gamma function ln(Γ(x)) for f64
///
/// More numerically stable than gamma(x).ln() since it avoids overflow.
/// Uses Lanczos approximation with higher precision coefficients.
fn ln_gamma_f64(z: f64) -> f64 {
    const G: f64 = 7.0;
    const LN_SQRT_2PI: f64 = 0.9189385332046727; // ln(sqrt(2*PI))
    const LANCZOS_COEFFS: [f64; 9] = [
        0.999999999999809932,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    // Handle special cases
    if z.is_nan() {
        return f64::NAN;
    }
    if z.is_infinite() {
        return if z > 0.0 { f64::INFINITY } else { f64::NAN };
    }
    if z <= 0.0 && z == z.floor() {
        return f64::INFINITY; // Poles at non-positive integers
    }

    // For z < 0.5, use reflection formula:
    // ln(Γ(z)) = ln(π) - ln(sin(πz)) - ln(Γ(1-z))
    if z < 0.5 {
        let pi = std::f64::consts::PI;
        let sinpiz = (pi * z).sin().abs();
        if sinpiz < 1e-14 {
            return f64::INFINITY;
        }
        return pi.ln() - sinpiz.ln() - ln_gamma_f64(1.0 - z);
    }

    // Lanczos approximation for z >= 0.5
    let z_shifted = z - 1.0;
    let mut sum = LANCZOS_COEFFS[0];
    for (i, &coeff) in LANCZOS_COEFFS.iter().enumerate().skip(1) {
        sum += coeff / (z_shifted + i as f64);
    }

    let t = z_shifted + G + 0.5;
    // ln(Γ(z)) = ln(√(2π)) + (z-0.5)*ln(t) - t + ln(sum)
    LN_SQRT_2PI + (z_shifted + 0.5) * t.ln() - t + sum.ln()
}
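
// Sanity sketch for the log-gamma helpers above: ln Γ(10) = ln(9!) and
// ln Γ(0.5) = ln √π. Tolerances are loose assumptions.
#[cfg(test)]
mod ln_gamma_sketch {
    use super::ln_gamma_f64;

    #[test]
    fn ln_gamma_known_values() {
        // ln Γ(10) = ln(9!) = ln(362880)
        assert!((ln_gamma_f64(10.0) - 362880.0_f64.ln()).abs() < 1e-10);
        // ln Γ(0.5) = ln √π = 0.5 · ln π
        assert!((ln_gamma_f64(0.5) - 0.5 * std::f64::consts::PI.ln()).abs() < 1e-10);
    }
}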

/// Error function erf(x) for f32
///
/// Uses Abramowitz & Stegun approximation (equation 7.1.26)
/// Maximum error: ~1.5×10⁻⁷
fn erf_f32(x: f32) -> f32 {
    // Handle special cases
    if x.is_nan() {
        return f32::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 1.0 } else { -1.0 };
    }

    // erf is an odd function: erf(-x) = -erf(x)
    let sign = if x >= 0.0 { 1.0 } else { -1.0 };
    let x = x.abs();

    // For very small x, use series approximation erf(x) ≈ 2x/√π
    if x < 1e-10 {
        return sign * x * std::f32::consts::FRAC_2_SQRT_PI;
    }

    // Abramowitz & Stegun approximation (7.1.26)
    // erf(x) ≈ 1 - (a₁t + a₂t² + a₃t³ + a₄t⁴ + a₅t⁵)e^(-x²)
    // where t = 1/(1 + px), p = 0.3275911
    const P: f32 = 0.3275911;
    const A1: f32 = 0.254829592;
    const A2: f32 = -0.284496736;
    const A3: f32 = 1.421413741;
    const A4: f32 = -1.453152027;
    const A5: f32 = 1.061405429;

    let t = 1.0 / (1.0 + P * x);
    let t2 = t * t;
    let t3 = t2 * t;
    let t4 = t3 * t;
    let t5 = t4 * t;

    let poly = A1 * t + A2 * t2 + A3 * t3 + A4 * t4 + A5 * t5;
    let result = 1.0 - poly * (-x * x).exp();

    sign * result
}

/// Error function erf(x) for f64
///
/// Uses a truncated Maclaurin series for small |x| and the Abramowitz &
/// Stegun approximation elsewhere.
/// Maximum error: ~1×10⁻¹⁴ for |x| < 0.25, ~1.5×10⁻⁷ for larger |x|
fn erf_f64(x: f64) -> f64 {
    // Handle special cases
    if x.is_nan() {
        return f64::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 1.0 } else { -1.0 };
    }

    // erf is an odd function: erf(-x) = -erf(x)
    let sign = if x >= 0.0 { 1.0 } else { -1.0 };
    let x = x.abs();

    // For very small x, use series approximation erf(x) ≈ 2x/√π
    if x < 1e-20 {
        return sign * x * std::f64::consts::FRAC_2_SQRT_PI;
    }

    // For large x, erf approaches 1
    if x > 6.0 {
        return sign;
    }

    // Use different approximations based on x range for best accuracy
    let result = if x < 0.25 {
        // For small x, sum the Maclaurin series
        // erf(x) = (2/√π) * (x - x³/3 + x⁵/10 - x⁷/42 + x⁹/216 - ...)
        // via the term recurrence t_{n+1} = t_n · (-x²)(2n+1) / ((n+1)(2n+3))
        let x2 = x * x;
        let mut term = x;
        let mut sum = x;
        for n in 0..7 {
            let nf = n as f64;
            term *= -x2 * (2.0 * nf + 1.0) / ((nf + 1.0) * (2.0 * nf + 3.0));
            sum += term;
        }
        std::f64::consts::FRAC_2_SQRT_PI * sum
    } else {
        // For medium to large x, use the five-term Abramowitz & Stegun
        // approximation (7.1.26), with the same coefficients as the f32 path
        const P: f64 = 0.3275911;
        const A1: f64 = 0.254829592;
        const A2: f64 = -0.284496736;
        const A3: f64 = 1.421413741;
        const A4: f64 = -1.453152027;
        const A5: f64 = 1.061405429;

        let t = 1.0 / (1.0 + P * x);
        let poly = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5))));
        1.0 - poly * (-x * x).exp()
    };

    sign * result
}
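
// Sanity sketch for the erf helpers above: odd symmetry and a tabulated
// value; the tolerance at x = 1 reflects the ~1.5×10⁻⁷ accuracy of the
// Abramowitz & Stegun branch.
#[cfg(test)]
mod erf_sketch {
    use super::erf_f64;

    #[test]
    fn erf_known_values() {
        assert_eq!(erf_f64(0.0), 0.0);
        // Odd symmetry: erf(-x) = -erf(x)
        assert!((erf_f64(-0.7) + erf_f64(0.7)).abs() < 1e-15);
        // Tabulated: erf(1) ≈ 0.8427007929497149
        assert!((erf_f64(1.0) - 0.842_700_792_949_714_9).abs() < 1e-6);
    }
}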

/// Complementary error function erfc(x) = 1 - erf(x) for f32
///
/// More numerically stable than 1 - erf(x) for large x
fn erfc_f32(x: f32) -> f32 {
    // Handle special cases
    if x.is_nan() {
        return f32::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 0.0 } else { 2.0 };
    }

    // For negative x, use erfc(-x) = 2 - erfc(x)
    if x < 0.0 {
        return 2.0 - erfc_f32(-x);
    }

    // For large x, erfc(x) → 0 very quickly
    if x > 10.0 {
        return 0.0;
    }

    // For small x, compute 1 - erf(x) directly
    if x < 0.5 {
        return 1.0 - erf_f32(x);
    }

    // Abramowitz & Stegun approximation for erfc
    // erfc(x) = (a₁t + a₂t² + a₃t³ + a₄t⁴ + a₅t⁵)e^(-x²)
    const P: f32 = 0.3275911;
    const A1: f32 = 0.254829592;
    const A2: f32 = -0.284496736;
    const A3: f32 = 1.421413741;
    const A4: f32 = -1.453152027;
    const A5: f32 = 1.061405429;

    let t = 1.0 / (1.0 + P * x);
    let t2 = t * t;
    let t3 = t2 * t;
    let t4 = t3 * t;
    let t5 = t4 * t;

    let poly = A1 * t + A2 * t2 + A3 * t3 + A4 * t4 + A5 * t5;
    poly * (-x * x).exp()
}

/// Complementary error function erfc(x) = 1 - erf(x) for f64
///
/// More numerically stable than 1 - erf(x) for large x
fn erfc_f64(x: f64) -> f64 {
    // Handle special cases
    if x.is_nan() {
        return f64::NAN;
    }
    if x.is_infinite() {
        return if x > 0.0 { 0.0 } else { 2.0 };
    }

    // For negative x, use erfc(-x) = 2 - erfc(x)
    if x < 0.0 {
        return 2.0 - erfc_f64(-x);
    }

    // For very large x, erfc(x) → 0
    if x > 27.0 {
        return 0.0;
    }

    // For small x, compute 1 - erf(x) directly
    if x < 0.5 {
        return 1.0 - erf_f64(x);
    }

    // Use continued fraction for medium to large x
    // erfc(x) = exp(-x²)/√π * 1/(x + 1/(2x + 2/(x + 3/(2x + ...))))
    // This is more accurate than the polynomial for larger x
    if x > 4.0 {
        // Asymptotic expansion for large x
        let x2 = x * x;
        let inv_x2 = 1.0 / x2;
        let sqrt_pi = std::f64::consts::PI.sqrt();

        // erfc(x) ≈ exp(-x²)/(x√π) * (1 - 1/(2x²) + 3/(4x⁴) - 15/(8x⁶) + ...)
        let asymp = 1.0 - 0.5 * inv_x2 + 0.75 * inv_x2 * inv_x2 - 1.875 * inv_x2 * inv_x2 * inv_x2
            + 6.5625 * inv_x2 * inv_x2 * inv_x2 * inv_x2;
        return (-x2).exp() / (x * sqrt_pi) * asymp;
    }

    // Abramowitz & Stegun approximation for medium x
    const P: f64 = 0.3275911;
    const A1: f64 = 0.254829592;
    const A2: f64 = -0.284496736;
    const A3: f64 = 1.421413741;
    const A4: f64 = -1.453152027;
    const A5: f64 = 1.061405429;

    let t = 1.0 / (1.0 + P * x);
    let poly = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5))));
    poly * (-x * x).exp()
}
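
// Consistency sketch for the erfc helpers above: erf(x) + erfc(x) = 1 on the
// shared range, and erfc(-x) = 2 - erfc(x). Tolerances are assumptions
// matching the underlying approximations.
#[cfg(test)]
mod erfc_sketch {
    use super::{erf_f64, erfc_f64};

    #[test]
    fn erfc_consistency() {
        for &x in &[0.1, 0.7, 1.5, 3.0] {
            assert!((erf_f64(x) + erfc_f64(x) - 1.0).abs() < 1e-6);
        }
        assert!((erfc_f64(-1.0) - (2.0 - erfc_f64(1.0))).abs() < 1e-15);
    }
}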

/// Inverse error function erfinv(y) for f32
///
/// Uses Winitzki's approximation followed by Newton-Raphson refinement
/// Domain: (-1, 1), returns x such that erf(x) = y
fn erfinv_f32(y: f32) -> f32 {
    // Handle special cases
    if y.is_nan() {
        return f32::NAN;
    }
    if y <= -1.0 {
        return if y == -1.0 {
            f32::NEG_INFINITY
        } else {
            f32::NAN
        };
    }
    if y >= 1.0 {
        return if y == 1.0 { f32::INFINITY } else { f32::NAN };
    }
    if y == 0.0 {
        return 0.0;
    }

    // erfinv is odd: erfinv(-y) = -erfinv(y)
    let sign = if y >= 0.0 { 1.0 } else { -1.0 };
    let y = y.abs();

    // Winitzki approximation for initial guess
    // erfinv(y) ≈ sign(y) * sqrt(sqrt((4/π + ay²)/(1+ay²))² - (4/π + ay²)/(1+ay²) + 2/πa * ln(1-y²))
    // where a ≈ 0.147 for good accuracy
    let a = 0.147_f32;
    let two_over_pi_a = 2.0 / (std::f32::consts::PI * a);
    let ln_one_minus_y2 = (1.0 - y * y).ln();

    let t1 = two_over_pi_a + 0.5 * ln_one_minus_y2;
    let t2 = (1.0 / a) * ln_one_minus_y2;
    let inner = (t1 * t1 - t2).sqrt() - t1;

    let mut x = inner.sqrt();

    // Newton-Raphson refinement: x_new = x - (erf(x) - y) / erf'(x)
    // erf'(x) = 2/sqrt(π) * exp(-x²)
    let two_over_sqrt_pi = std::f32::consts::FRAC_2_SQRT_PI;
    for _ in 0..2 {
        let erf_x = erf_f32(x);
        let erf_deriv = two_over_sqrt_pi * (-x * x).exp();
        x -= (erf_x - y) / erf_deriv;
    }

    sign * x
}

/// Inverse error function erfinv(y) for f64
///
/// Uses Winitzki's approximation followed by Halley's method refinement
/// Domain: (-1, 1), returns x such that erf(x) = y
fn erfinv_f64(y: f64) -> f64 {
    // Handle special cases
    if y.is_nan() {
        return f64::NAN;
    }
    if y <= -1.0 {
        return if y == -1.0 {
            f64::NEG_INFINITY
        } else {
            f64::NAN
        };
    }
    if y >= 1.0 {
        return if y == 1.0 { f64::INFINITY } else { f64::NAN };
    }
    if y == 0.0 {
        return 0.0;
    }

    // erfinv is odd: erfinv(-y) = -erfinv(y)
    let sign = if y >= 0.0 { 1.0 } else { -1.0 };
    let y = y.abs();

    // Winitzki approximation for initial guess
    // Higher precision constant for f64
    let a = 0.147_f64;
    let two_over_pi_a = 2.0 / (std::f64::consts::PI * a);
    let ln_one_minus_y2 = (1.0 - y * y).ln();

    let t1 = two_over_pi_a + 0.5 * ln_one_minus_y2;
    let t2 = (1.0 / a) * ln_one_minus_y2;
    let inner = (t1 * t1 - t2).sqrt() - t1;

    let mut x = inner.sqrt();

    // Halley's method for faster convergence (cubic)
    // x_new = x - f(x)/f'(x) * (1 + f(x)*f''(x)/(2*f'(x)²))⁻¹
    // For f(x) = erf(x) - y: f' = 2/√π * e^(-x²), f'' = -4x/√π * e^(-x²)
    // Use 5 iterations for ~1e-14 accuracy (f64 precision)
    let two_over_sqrt_pi = std::f64::consts::FRAC_2_SQRT_PI;
    for _ in 0..5 {
        let erf_x = erf_f64(x);
        let f = erf_x - y;
        let exp_neg_x2 = (-x * x).exp();
        let f_prime = two_over_sqrt_pi * exp_neg_x2;

        // Newton step
        let newton_step = f / f_prime;

        // Halley correction factor
        let f_double_prime = -2.0 * x * f_prime;
        let halley_correction = 1.0 / (1.0 - 0.5 * f * f_double_prime / (f_prime * f_prime));

        x -= newton_step * halley_correction;
    }

    sign * x
}
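
// Round-trip sketch for the inverse error function above: after the Halley
// refinement, erf_f64(erfinv_f64(y)) reproduces y to well below the test
// tolerance, since the iteration converges on this module's own erf_f64.
#[cfg(test)]
mod erfinv_sketch {
    use super::{erf_f64, erfinv_f64};

    #[test]
    fn erfinv_round_trip() {
        for &y in &[-0.9, -0.5, 0.1, 0.5, 0.9] {
            let x = erfinv_f64(y);
            assert!((erf_f64(x) - y).abs() < 1e-9);
        }
        // Odd symmetry: erfinv(-y) = -erfinv(y)
        assert!((erfinv_f64(-0.3) + erfinv_f64(0.3)).abs() < 1e-12);
    }
}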

/// Inverse complementary error function erfcinv(y) for f32
///
/// erfcinv(y) = erfinv(1 - y)
/// Domain: (0, 2), returns x such that erfc(x) = y
fn erfcinv_f32(y: f32) -> f32 {
    // Handle special cases
    if y.is_nan() {
        return f32::NAN;
    }
    if y <= 0.0 {
        return if y == 0.0 { f32::INFINITY } else { f32::NAN };
    }
    if y >= 2.0 {
        return if y == 2.0 {
            f32::NEG_INFINITY
        } else {
            f32::NAN
        };
    }

    // Use erfinv(1 - y) for all valid inputs
    erfinv_f32(1.0 - y)
}

/// Inverse complementary error function erfcinv(y) for f64
///
/// Implemented as erfcinv(y) = erfinv(1 - y); note that forming 1 - y loses
/// precision for y very close to 0.
/// Domain: (0, 2), returns x such that erfc(x) = y
fn erfcinv_f64(y: f64) -> f64 {
    // Handle special cases
    if y.is_nan() {
        return f64::NAN;
    }
    if y <= 0.0 {
        return if y == 0.0 { f64::INFINITY } else { f64::NAN };
    }
    if y >= 2.0 {
        return if y == 2.0 {
            f64::NEG_INFINITY
        } else {
            f64::NAN
        };
    }

    // Use erfinv for all values: erfcinv(y) = erfinv(1 - y)
    // erfinv_f64 already handles all cases with good accuracy via Halley's method
    erfinv_f64(1.0 - y)
}

/// Sigmoid (logistic) function for f32
///
/// σ(x) = 1 / (1 + exp(-x))
/// Numerically stable implementation that avoids overflow
fn sigmoid_f32(x: f32) -> f32 {
    if x.is_nan() {
        return f32::NAN;
    }
    // Numerically stable sigmoid:
    // For x >= 0: σ(x) = 1 / (1 + exp(-x))
    // For x < 0: σ(x) = exp(x) / (1 + exp(x))
    if x >= 0.0 {
        let exp_neg_x = (-x).exp();
        1.0 / (1.0 + exp_neg_x)
    } else {
        let exp_x = x.exp();
        exp_x / (1.0 + exp_x)
    }
}

/// Sigmoid (logistic) function for f64
///
/// σ(x) = 1 / (1 + exp(-x))
/// Numerically stable implementation that avoids overflow
fn sigmoid_f64(x: f64) -> f64 {
    if x.is_nan() {
        return f64::NAN;
    }
    // Numerically stable sigmoid:
    // For x >= 0: σ(x) = 1 / (1 + exp(-x))
    // For x < 0: σ(x) = exp(x) / (1 + exp(x))
    if x >= 0.0 {
        let exp_neg_x = (-x).exp();
        1.0 / (1.0 + exp_neg_x)
    } else {
        let exp_x = x.exp();
        exp_x / (1.0 + exp_x)
    }
}
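
// Stability sketch for the sigmoid helpers above: extreme inputs saturate
// without overflow or NaN, and σ(-x) = 1 - σ(x).
#[cfg(test)]
mod sigmoid_sketch {
    use super::sigmoid_f64;

    #[test]
    fn sigmoid_stability_and_symmetry() {
        assert_eq!(sigmoid_f64(0.0), 0.5);
        // exp(±1000) never appears in the stable formulation, so no overflow
        assert_eq!(sigmoid_f64(1000.0), 1.0);
        assert_eq!(sigmoid_f64(-1000.0), 0.0);
        assert!((sigmoid_f64(2.0) + sigmoid_f64(-2.0) - 1.0).abs() < 1e-15);
    }
}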
1446
1447/// GELU (Gaussian Error Linear Unit) for f32
1448///
1449/// GELU(x) = x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
1450/// Critical for Transformer models (BERT, GPT, etc.)
1451fn gelu_f32(x: f32) -> f32 {
1452    if x.is_nan() {
1453        return f32::NAN;
1454    }
1455    // GELU(x) = x * 0.5 * (1 + erf(x / √2))
1456    let sqrt_2_inv = std::f32::consts::FRAC_1_SQRT_2; // 1/√2
1457    let erf_arg = x * sqrt_2_inv;
1458    x * 0.5 * (1.0 + erf_f32(erf_arg))
1459}
1460
1461/// GELU (Gaussian Error Linear Unit) for f64
1462///
1463/// GELU(x) = x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
1464/// Critical for Transformer models (BERT, GPT, etc.)
1465fn gelu_f64(x: f64) -> f64 {
1466    if x.is_nan() {
1467        return f64::NAN;
1468    }
1469    // GELU(x) = x * 0.5 * (1 + erf(x / √2))
1470    let sqrt_2_inv = std::f64::consts::FRAC_1_SQRT_2; // 1/√2
1471    let erf_arg = x * sqrt_2_inv;
1472    x * 0.5 * (1.0 + erf_f64(erf_arg))
1473}
1474
1475/// Swish (SiLU - Sigmoid Linear Unit) for f32
1476///
1477/// Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
1478/// Self-gated activation discovered via neural architecture search
1479/// Used in EfficientNet, GPT-NeoX, and many modern architectures
1480/// Properties: smooth, non-monotonic, self-gating, unbounded above
1481fn swish_f32(x: f32) -> f32 {
1482    if x.is_nan() {
1483        return f32::NAN;
1484    }
1485    // Swish(x) = x * sigmoid(x)
1486    // Leverages the numerically stable sigmoid implementation
1487    x * sigmoid_f32(x)
1488}
1489
1490/// Swish (SiLU - Sigmoid Linear Unit) for f64
1491///
1492/// Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
1493/// Self-gated activation discovered via neural architecture search
1494/// Used in EfficientNet, GPT-NeoX, and many modern architectures
1495/// Properties: smooth, non-monotonic, self-gating, unbounded above
1496fn swish_f64(x: f64) -> f64 {
1497    if x.is_nan() {
1498        return f64::NAN;
1499    }
1500    // Swish(x) = x * sigmoid(x)
1501    // Leverages the numerically stable sigmoid implementation
1502    x * sigmoid_f64(x)
1503}
1504
1505/// Softplus for f32
1506///
1507/// Softplus(x) = ln(1 + exp(x))
1508/// Smooth approximation of ReLU
1509/// Used in probabilistic models, Bayesian deep learning, smooth counting
1510/// Properties: softplus(0) = ln(2), always positive, approaches ReLU for x → +∞
1511fn softplus_f32(x: f32) -> f32 {
1512    if x.is_nan() {
1513        return f32::NAN;
1514    }
1515    // Numerically stable implementation:
1516    // For large positive x: ln(1 + exp(x)) ≈ x (exp(x) dominates)
1517    // For large negative x: ln(1 + exp(x)) ≈ exp(x) ≈ 0
1518    // Use threshold based on machine precision
1519    if x > 20.0 {
1520        // For x > 20, exp(x) >> 1, so ln(1 + exp(x)) ≈ x
1521        x
1522    } else if x < -20.0 {
1523        // For x < -20, exp(x) ≈ 0, so ln(1 + exp(x)) ≈ 0
1524        x.exp()
1525    } else {
1526        // Standard computation
1527        (1.0_f32 + x.exp()).ln()
1528    }
1529}
1530
1531/// Softplus for f64
1532///
1533/// Softplus(x) = ln(1 + exp(x))
1534/// Smooth approximation of ReLU
1535/// Used in probabilistic models, Bayesian deep learning, smooth counting
1536/// Properties: softplus(0) = ln(2), always positive, approaches ReLU for x → +∞
1537fn softplus_f64(x: f64) -> f64 {
1538    if x.is_nan() {
1539        return f64::NAN;
1540    }
1541    // Numerically stable implementation:
1542    // For large positive x: ln(1 + exp(x)) ≈ x (exp(x) dominates)
1543    // For large negative x: ln(1 + exp(x)) ≈ exp(x) ≈ 0
1544    // Use threshold based on machine precision
1545    if x > 34.0 {
1546        // For x > 34, exp(x) >> 1, so ln(1 + exp(x)) ≈ x
1547        x
1548    } else if x < -34.0 {
1549        // For x < -34, exp(x) ≈ 0, so ln(1 + exp(x)) ≈ 0
1550        x.exp()
1551    } else {
1552        // Standard computation
1553        (1.0_f64 + x.exp()).ln()
1554    }
1555}
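
// Sketch verifying the softplus branch thresholds are seamless: at the ±34
// cutovers the dropped term is ~exp(-34) ≈ 1.7e-15, i.e. below f64 precision at
// that magnitude, so the three branches meet continuously.
#[cfg(test)]
mod softplus_threshold_checks {
    use super::*;

    #[test]
    fn branches_agree_at_cutovers() {
        // softplus(0) = ln(2)
        assert!((softplus_f64(0.0) - std::f64::consts::LN_2).abs() < 1e-15);
        // Straddle the +34 cutover: the two branches differ only by ~exp(-34)
        let below = softplus_f64(33.999_999);
        let above = softplus_f64(34.000_001);
        assert!((above - below).abs() < 1e-5);
        // Deep negative tail returns exp(x) directly
        assert!((softplus_f64(-40.0) - (-40.0_f64).exp()).abs() < 1e-30);
    }
}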
1556
1557/// Mish activation for f32
1558///
1559/// Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
1560/// Self-regularized non-monotonic activation function
1561/// Used in YOLOv4, modern object detection, and neural architectures
1562/// Properties: smooth, non-monotonic, self-gating, unbounded above
1563fn mish_f32(x: f32) -> f32 {
1564    if x.is_nan() {
1565        return f32::NAN;
1566    }
1567    // Mish(x) = x * tanh(softplus(x))
1568    // Leverage existing softplus for numerical stability
1569    x * softplus_f32(x).tanh()
1570}
1571
1572/// Mish activation for f64
1573///
1574/// Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
1575/// Self-regularized non-monotonic activation function
1576/// Used in YOLOv4, modern object detection, and neural architectures
1577/// Properties: smooth, non-monotonic, self-gating, unbounded above
1578fn mish_f64(x: f64) -> f64 {
1579    if x.is_nan() {
1580        return f64::NAN;
1581    }
1582    // Mish(x) = x * tanh(softplus(x))
1583    // Leverage existing softplus for numerical stability
1584    x * softplus_f64(x).tanh()
1585}
1586
1587/// ELU (Exponential Linear Unit) for f32
1588///
1589/// ELU(x, α) = x if x >= 0
1590/// ELU(x, α) = α * (exp(x) - 1) if x < 0
1591/// Helps with vanishing gradients and faster learning
1592/// Used in deep neural networks for smoother outputs
1593/// Properties: smooth, continuous derivative, bounded below by -α
1594fn elu_f32(x: f32, alpha: f32) -> f32 {
1595    if x.is_nan() {
1596        return f32::NAN;
1597    }
1598    if x >= 0.0 {
1599        x
1600    } else {
1601        alpha * (x.exp() - 1.0)
1602    }
1603}
1604
1605/// ELU (Exponential Linear Unit) for f64
1606///
1607/// ELU(x, α) = x if x >= 0
1608/// ELU(x, α) = α * (exp(x) - 1) if x < 0
1609/// Helps with vanishing gradients and faster learning
1610/// Used in deep neural networks for smoother outputs
1611/// Properties: smooth, continuous derivative, bounded below by -α
1612fn elu_f64(x: f64, alpha: f64) -> f64 {
1613    if x.is_nan() {
1614        return f64::NAN;
1615    }
1616    if x >= 0.0 {
1617        x
1618    } else {
1619        alpha * (x.exp() - 1.0)
1620    }
1621}
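
// Spot-check for ELU: both pieces give exactly 0 at x = 0 (so the function is
// continuous), the negative branch is bounded below by -alpha, and alpha scales
// only the negative side.
#[cfg(test)]
mod elu_spot_checks {
    use super::*;

    #[test]
    fn continuity_and_bounds() {
        assert_eq!(elu_f32(0.0, 1.0), 0.0);
        assert_eq!(elu_f32(3.0, 1.0), 3.0); // identity for x >= 0
        let y = elu_f64(-5.0, 1.0); // exp(-5) - 1 ≈ -0.9933
        assert!(y > -1.0 && y < -0.99);
        assert!((elu_f64(-5.0, 2.0) - 2.0 * y).abs() < 1e-15); // alpha scaling
    }
}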
1622
1623/// SELU (Scaled Exponential Linear Unit) constants
1624///
1625/// These constants are derived from the self-normalizing property:
1626/// For inputs with mean 0 and variance 1, the outputs will also have
1627/// mean 0 and variance 1 (approximately) when using LeCun Normal initialization.
1628const SELU_ALPHA: f64 = 1.6732632423543772;
1629const SELU_LAMBDA: f64 = 1.0507009873554805;
1630const SELU_ALPHA_F32: f32 = 1.6732632;
1631const SELU_LAMBDA_F32: f32 = 1.0507010;
1632
1633/// SELU (Scaled Exponential Linear Unit) for f32
1634///
1635/// SELU(x) = λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
1636/// where λ ≈ 1.0507 and α ≈ 1.6733
1637/// Self-normalizing activation: preserves mean=0, variance=1 through layers
1638/// Used in Self-Normalizing Neural Networks (SNNs)
1639/// Properties: automatic normalization, no BatchNorm needed
1640fn selu_f32(x: f32) -> f32 {
1641    if x.is_nan() {
1642        return f32::NAN;
1643    }
1644    if x > 0.0 {
1645        SELU_LAMBDA_F32 * x
1646    } else {
1647        SELU_LAMBDA_F32 * SELU_ALPHA_F32 * (x.exp() - 1.0)
1648    }
1649}
1650
1651/// SELU (Scaled Exponential Linear Unit) for f64
1652///
1653/// SELU(x) = λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
1654/// where λ ≈ 1.0507 and α ≈ 1.6733
1655/// Self-normalizing activation: preserves mean=0, variance=1 through layers
1656/// Used in Self-Normalizing Neural Networks (SNNs)
1657/// Properties: automatic normalization, no BatchNorm needed
1658fn selu_f64(x: f64) -> f64 {
1659    if x.is_nan() {
1660        return f64::NAN;
1661    }
1662    if x > 0.0 {
1663        SELU_LAMBDA * x
1664    } else {
1665        SELU_LAMBDA * SELU_ALPHA * (x.exp() - 1.0)
1666    }
1667}
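
// Sketch of the SELU fixed-point properties: continuous at 0, identity scaled by
// λ on the positive side, and a negative tail that saturates at -λ·α ≈ -1.7581.
#[cfg(test)]
mod selu_spot_checks {
    use super::*;

    #[test]
    fn continuity_and_saturation() {
        assert_eq!(selu_f64(0.0), 0.0); // x <= 0 branch: λ·α·(e⁰ - 1) = 0
        assert!((selu_f64(1.0) - SELU_LAMBDA).abs() < 1e-15);
        let floor = -SELU_LAMBDA * SELU_ALPHA;
        assert!((selu_f64(-50.0) - floor).abs() < 1e-12);
    }
}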
1668
1669/// Hardsigmoid for f32
1670///
1671/// Hardsigmoid(x) = clip((x + 3) / 6, 0, 1)
1672/// Piecewise linear approximation of sigmoid
1673/// Used in MobileNetV3 for efficient inference
1674/// Properties: hardsigmoid(0) = 0.5, linear in [-3, 3], saturates outside
1675fn hardsigmoid_f32(x: f32) -> f32 {
1676    if x.is_nan() {
1677        return f32::NAN;
1678    }
1679    // Piecewise linear: 0 for x <= -3, 1 for x >= 3, linear in between
1680    if x <= -3.0 {
1681        0.0
1682    } else if x >= 3.0 {
1683        1.0
1684    } else {
1685        (x + 3.0) / 6.0
1686    }
1687}
1688
1689/// Hardsigmoid for f64
1690///
1691/// Hardsigmoid(x) = clip((x + 3) / 6, 0, 1)
1692/// Piecewise linear approximation of sigmoid
1693/// Used in MobileNetV3 for efficient inference
1694/// Properties: hardsigmoid(0) = 0.5, linear in [-3, 3], saturates outside
1695fn hardsigmoid_f64(x: f64) -> f64 {
1696    if x.is_nan() {
1697        return f64::NAN;
1698    }
1699    // Piecewise linear: 0 for x <= -3, 1 for x >= 3, linear in between
1700    if x <= -3.0 {
1701        0.0
1702    } else if x >= 3.0 {
1703        1.0
1704    } else {
1705        (x + 3.0) / 6.0
1706    }
1707}
1708
1709/// Hardswish for f32
1710///
1711/// Hardswish(x) = x * hardsigmoid(x) = x * clip((x + 3) / 6, 0, 1)
1712/// Piecewise linear approximation of Swish
1713/// Used in MobileNetV3 for efficient inference
1714/// Properties: hardswish(0) = 0, continuous but with derivative kinks at x = ±3, self-gating
1715fn hardswish_f32(x: f32) -> f32 {
1716    if x.is_nan() {
1717        return f32::NAN;
1718    }
1719    // Piecewise: 0 for x <= -3, x for x >= 3, x*(x+3)/6 in between
1720    if x <= -3.0 {
1721        0.0
1722    } else if x >= 3.0 {
1723        x
1724    } else {
1725        x * (x + 3.0) / 6.0
1726    }
1727}
1728
1729/// Hardswish for f64
1730///
1731/// Hardswish(x) = x * hardsigmoid(x) = x * clip((x + 3) / 6, 0, 1)
1732/// Piecewise linear approximation of Swish
1733/// Used in MobileNetV3 for efficient inference
1734/// Properties: hardswish(0) = 0, continuous but with derivative kinks at x = ±3, self-gating
1735fn hardswish_f64(x: f64) -> f64 {
1736    if x.is_nan() {
1737        return f64::NAN;
1738    }
1739    // Piecewise: 0 for x <= -3, x for x >= 3, x*(x+3)/6 in between
1740    if x <= -3.0 {
1741        0.0
1742    } else if x >= 3.0 {
1743        x
1744    } else {
1745        x * (x + 3.0) / 6.0
1746    }
1747}
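
// Numerical illustration of the doc note above: hardswish is continuous at the
// seams, but its one-sided slopes at x = 3 differ ((2x + 3)/6 = 1.5 from the
// left vs. 1.0 from the right), so it is not C¹ there.
#[cfg(test)]
mod hardswish_kink_checks {
    use super::*;

    #[test]
    fn continuous_but_kinked_at_three() {
        assert_eq!(hardswish_f64(3.0), 3.0); // both pieces agree at the seam
        let h = 1e-6;
        let left = (hardswish_f64(3.0) - hardswish_f64(3.0 - h)) / h;
        let right = (hardswish_f64(3.0 + h) - hardswish_f64(3.0)) / h;
        assert!((left - 1.5).abs() < 1e-3);
        assert!((right - 1.0).abs() < 1e-3);
    }
}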
1748
1749/// Sinc function for f32 (normalized)
1750///
1751/// sinc(x) = sin(πx) / (πx) for x ≠ 0
1752/// sinc(0) = 1 (by L'Hôpital's rule)
1753/// Critical for signal processing, windowing, interpolation
1754/// Properties: sinc(n) = 0 for all non-zero integers n
1755fn sinc_f32(x: f32) -> f32 {
1756    if x.is_nan() {
1757        return f32::NAN;
1758    }
1759    if x.abs() < 1e-7 {
1760        // For very small x, sin(πx) ≈ πx, so sinc(x) ≈ 1
1761        // Use Taylor expansion: sinc(x) = 1 - (πx)²/6 + O(x⁴)
1762        let pi_x = std::f32::consts::PI * x;
1763        1.0 - pi_x * pi_x / 6.0
1764    } else {
1765        let pi_x = std::f32::consts::PI * x;
1766        pi_x.sin() / pi_x
1767    }
1768}
1769
1770/// Sinc function for f64 (normalized)
1771///
1772/// sinc(x) = sin(πx) / (πx) for x ≠ 0
1773/// sinc(0) = 1 (by L'Hôpital's rule)
1774/// Critical for signal processing, windowing, interpolation
1775/// Properties: sinc(n) = 0 for all non-zero integers n
1776fn sinc_f64(x: f64) -> f64 {
1777    if x.is_nan() {
1778        return f64::NAN;
1779    }
1780    if x.abs() < 1e-15 {
1781        // For very small x, sin(πx) ≈ πx, so sinc(x) ≈ 1
1782        // Use Taylor expansion: sinc(x) = 1 - (πx)²/6 + (πx)⁴/120 + O(x⁶)
1783        let pi_x = std::f64::consts::PI * x;
1784        let pi_x_sq = pi_x * pi_x;
1785        1.0 - pi_x_sq / 6.0 + pi_x_sq * pi_x_sq / 120.0
1786    } else {
1787        let pi_x = std::f64::consts::PI * x;
1788        pi_x.sin() / pi_x
1789    }
1790}
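
// Spot-check for the normalized sinc: sinc(0) = 1 via the Taylor branch, sinc
// vanishes at non-zero integers, and sinc(1/2) = 2/π.
#[cfg(test)]
mod sinc_spot_checks {
    use super::*;

    #[test]
    fn known_values() {
        assert_eq!(sinc_f64(0.0), 1.0);
        assert!(sinc_f64(1.0).abs() < 1e-15);
        assert!(sinc_f64(3.0).abs() < 1e-15);
        assert!((sinc_f64(0.5) - 2.0 / std::f64::consts::PI).abs() < 1e-15);
    }
}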
1791
1792// Implementation for f32
1793impl SimdUnifiedOps for f32 {
1794    #[cfg(feature = "simd")]
1795    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1796        crate::simd::simd_add_f32(a, b)
1797    }
1798
1799    #[cfg(not(feature = "simd"))]
1800    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1801        a + b
1802    }
1803
1804    #[cfg(feature = "simd")]
1805    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1806        crate::simd::simd_sub_f32(a, b)
1807    }
1808
1809    #[cfg(not(feature = "simd"))]
1810    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1811        a - b
1812    }
1813
1814    #[cfg(feature = "simd")]
1815    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1816        crate::simd::simd_mul_f32(a, b)
1817    }
1818
1819    #[cfg(not(feature = "simd"))]
1820    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1821        a * b
1822    }
1823
1824    #[cfg(feature = "simd")]
1825    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1826        crate::simd::simd_div_f32(a, b)
1827    }
1828
1829    #[cfg(not(feature = "simd"))]
1830    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1831        a / b
1832    }
1833
1834    #[cfg(feature = "simd")]
1835    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
1836        crate::simd::simd_dot_f32(a, b)
1837    }
1838
1839    #[cfg(not(feature = "simd"))]
1840    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
1841        a.dot(b)
1842    }
1843
1844    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
1845        let m = a.nrows();
1846        let n = a.ncols();
1847
1848        assert_eq!(n, x.len());
1849        assert_eq!(m, y.len());
1850
1851        // Scale y by beta
1852        if beta == 0.0 {
1853            y.fill(0.0);
1854        } else if beta != 1.0 {
1855            y.mapv_inplace(|v| v * beta);
1856        }
1857
1858        // Compute matrix-vector product
1859        for i in 0..m {
1860            let row = a.row(i);
1861            y[i] += Self::simd_dot(&row, x);
1862        }
1863    }
1864
1865    fn simd_gemm(
1866        alpha: Self,
1867        a: &ArrayView2<Self>,
1868        b: &ArrayView2<Self>,
1869        beta: Self,
1870        c: &mut Array2<Self>,
1871    ) {
1872        let m = a.nrows();
1873        let k = a.ncols();
1874        let n = b.ncols();
1875
1876        assert_eq!(k, b.nrows());
1877        assert_eq!((m, n), c.dim());
1878
1879        // Scale C by beta
1880        if beta == 0.0 {
1881            c.fill(0.0);
1882        } else if beta != 1.0 {
1883            c.mapv_inplace(|v| v * beta);
1884        }
1885
1886        // Use blocked transpose for large matrices to improve cache efficiency
1887        // Threshold: n * k > 4096 (amortize transpose cost)
1888        const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
1889
1890        if n * k > GEMM_TRANSPOSE_THRESHOLD {
1891            // Pre-transpose B for cache-efficient row-wise access
1892            let b_t = Self::simd_transpose_blocked(b);
1893
1894            for i in 0..m {
1895                let a_row = a.row(i);
1896                for j in 0..n {
1897                    // Access b_t row-wise (contiguous memory)
1898                    let b_row = b_t.row(j);
1899                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
1900                }
1901            }
1902        } else {
1903            // Small matrices: use column access (overhead of transpose not worth it)
1904            for i in 0..m {
1905                let a_row = a.row(i);
1906                for j in 0..n {
1907                    let b_col = b.column(j);
1908                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
1909                }
1910            }
1911        }
1912    }
1913
1914    #[cfg(feature = "simd")]
1915    fn simd_norm(a: &ArrayView1<Self>) -> Self {
1916        crate::simd::norms::simd_norm_l2_f32(a)
1917    }
1918
1919    #[cfg(not(feature = "simd"))]
1920    fn simd_norm(a: &ArrayView1<Self>) -> Self {
1921        a.iter().map(|&x| x * x).sum::<f32>().sqrt()
1922    }
1923
1924    #[cfg(feature = "simd")]
1925    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1926        crate::simd::simd_maximum_f32(a, b)
1927    }
1928
1929    #[cfg(not(feature = "simd"))]
1930    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1931        let mut result = Array1::zeros(a.len());
1932        for i in 0..a.len() {
1933            result[i] = a[i].max(b[i]);
1934        }
1935        result
1936    }
1937
1938    #[cfg(feature = "simd")]
1939    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1940        crate::simd::simd_minimum_f32(a, b)
1941    }
1942
1943    #[cfg(not(feature = "simd"))]
1944    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
1945        let mut result = Array1::zeros(a.len());
1946        for i in 0..a.len() {
1947            result[i] = a[i].min(b[i]);
1948        }
1949        result
1950    }
1951
1952    #[cfg(feature = "simd")]
1953    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
1954        crate::simd::simd_scalar_mul_f32(a, scalar)
1955    }
1956
1957    #[cfg(not(feature = "simd"))]
1958    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
1959        a.mapv(|x| x * scalar)
1960    }
1961
1962    #[cfg(feature = "simd")]
1963    fn simd_sum(a: &ArrayView1<Self>) -> Self {
1964        crate::simd::simd_sum_f32(a)
1965    }
1966
1967    #[cfg(not(feature = "simd"))]
1968    fn simd_sum(a: &ArrayView1<Self>) -> Self {
1969        a.sum()
1970    }
1971
1972    fn simd_mean(a: &ArrayView1<Self>) -> Self {
1973        if a.is_empty() {
1974            0.0
1975        } else {
1976            Self::simd_sum(a) / (a.len() as f32)
1977        }
1978    }
1979
1980    #[cfg(feature = "simd")]
1981    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
1982        crate::simd::simd_max_f32(a)
1983    }
1984
1985    #[cfg(not(feature = "simd"))]
1986    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
1987        a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
1988    }
1989
1990    #[cfg(feature = "simd")]
1991    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
1992        crate::simd::simd_min_f32(a)
1993    }
1994
1995    #[cfg(not(feature = "simd"))]
1996    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
1997        a.fold(f32::INFINITY, |acc, &x| acc.min(x))
1998    }
1999
2000    #[cfg(feature = "simd")]
2001    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
2002        crate::simd::simd_fused_multiply_add_f32(a, b, c)
2003    }
2004
2005    #[cfg(not(feature = "simd"))]
2006    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
2007        let mut result = Array1::zeros(a.len());
2008        for i in 0..a.len() {
2009            result[i] = a[i].mul_add(b[i], c[i]);
2010        }
2011        result
2012    }
2013
2014    #[cfg(feature = "simd")]
2015    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2016        crate::simd::simd_add_cache_optimized_f32(a, b)
2017    }
2018
2019    #[cfg(not(feature = "simd"))]
2020    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2021        a + b
2022    }
2023
2024    #[cfg(feature = "simd")]
2025    fn simd_fma_advanced_optimized(
2026        a: &ArrayView1<Self>,
2027        b: &ArrayView1<Self>,
2028        c: &ArrayView1<Self>,
2029    ) -> Array1<Self> {
2030        crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
2031    }
2032
2033    #[cfg(not(feature = "simd"))]
2034    fn simd_fma_advanced_optimized(
2035        a: &ArrayView1<Self>,
2036        b: &ArrayView1<Self>,
2037        c: &ArrayView1<Self>,
2038    ) -> Array1<Self> {
2039        let mut result = Array1::zeros(a.len());
2040        for i in 0..a.len() {
2041            result[i] = a[i].mul_add(b[i], c[i]);
2042        }
2043        result
2044    }
2045
2046    #[cfg(feature = "simd")]
2047    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2048        crate::simd::simd_adaptive_add_f32(a, b)
2049    }
2050
2051    #[cfg(not(feature = "simd"))]
2052    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2053        a + b
2054    }
2055
2056    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
2057        a.t().to_owned()
2058    }
2059
2060    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
2061        #[cfg(feature = "simd")]
2062        {
2063            crate::simd::simd_transpose_blocked_f32(a)
2064        }
2065        #[cfg(not(feature = "simd"))]
2066        {
2067            a.t().to_owned()
2068        }
2069    }
2070
2071    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
2072        a.iter().map(|&x| x * x).sum()
2073    }
2074
2075    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2076        Self::simd_mul(a, b)
2077    }
2078
2079    #[cfg(feature = "simd")]
2080    fn simd_available() -> bool {
2081        true
2082    }
2083
2084    #[cfg(not(feature = "simd"))]
2085    fn simd_available() -> bool {
2086        false
2087    }
2088
2089    fn simd_sub_f32_ultra(
2090        a: &ArrayView1<Self>,
2091        b: &ArrayView1<Self>,
2092        result: &mut ArrayViewMut1<Self>,
2093    ) {
2094        let sub_result = Self::simd_sub(a, b);
2095        result.assign(&sub_result);
2096    }
2097
2098    fn simd_mul_f32_ultra(
2099        a: &ArrayView1<Self>,
2100        b: &ArrayView1<Self>,
2101        result: &mut ArrayViewMut1<Self>,
2102    ) {
2103        let mul_result = Self::simd_mul(a, b);
2104        result.assign(&mul_result);
2105    }
2106
2107    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
2108        // Calculate sum of cubes: sum(x^3)
2109        a.iter().map(|&x| x * x * x).sum()
2110    }
2111
2112    fn simd_div_f32_ultra(
2113        a: &ArrayView1<Self>,
2114        b: &ArrayView1<Self>,
2115        result: &mut ArrayViewMut1<Self>,
2116    ) {
2117        let div_result = Self::simd_div(a, b);
2118        result.assign(&div_result);
2119    }
2120
2121    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2122        let sin_result = a.mapv(|x| x.sin());
2123        result.assign(&sin_result);
2124    }
2125
2126    fn simd_add_f32_ultra(
2127        a: &ArrayView1<Self>,
2128        b: &ArrayView1<Self>,
2129        result: &mut ArrayViewMut1<Self>,
2130    ) {
2131        let add_result = Self::simd_add(a, b);
2132        result.assign(&add_result);
2133    }
2134
2135    fn simd_fma_f32_ultra(
2136        a: &ArrayView1<Self>,
2137        b: &ArrayView1<Self>,
2138        c: &ArrayView1<Self>,
2139        result: &mut ArrayViewMut1<Self>,
2140    ) {
2141        let fma_result = Self::simd_fma(a, b, c);
2142        result.assign(&fma_result);
2143    }
2144
2145    fn simd_pow_f32_ultra(
2146        a: &ArrayView1<Self>,
2147        b: &ArrayView1<Self>,
2148        result: &mut ArrayViewMut1<Self>,
2149    ) {
2150        let pow_result = a
2151            .iter()
2152            .zip(b.iter())
2153            .map(|(&x, &y)| x.powf(y))
2154            .collect::<Vec<_>>();
2155        result.assign(&Array1::from_vec(pow_result));
2156    }
2157
2158    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2159        let exp_result = a.mapv(|x| x.exp());
2160        result.assign(&exp_result);
2161    }
2162
2163    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
2164        let cos_result = a.mapv(|x| x.cos());
2165        result.assign(&cos_result);
2166    }
2167
2168    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2169        Self::simd_dot(a, b)
2170    }
2171
2172    #[cfg(feature = "simd")]
2173    fn simd_variance(a: &ArrayView1<Self>) -> Self {
2174        crate::simd::simd_variance_f32(a)
2175    }
2176
2177    #[cfg(not(feature = "simd"))]
2178    fn simd_variance(a: &ArrayView1<Self>) -> Self {
2179        let mean = Self::simd_mean(a);
2180        let n = a.len() as f32;
2181        if n < 2.0 {
2182            return f32::NAN;
2183        }
2184        // Sample variance with Bessel's correction (n-1)
2185        a.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / (n - 1.0)
2186    }
2187
2188    #[cfg(feature = "simd")]
2189    fn simd_std(a: &ArrayView1<Self>) -> Self {
2190        crate::simd::simd_std_f32(a)
2191    }
2192
2193    #[cfg(not(feature = "simd"))]
2194    fn simd_std(a: &ArrayView1<Self>) -> Self {
2195        Self::simd_variance(a).sqrt()
2196    }
2197
2198    #[cfg(feature = "simd")]
2199    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
2200        crate::simd::simd_norm_l1_f32(a)
2201    }
2202
2203    #[cfg(not(feature = "simd"))]
2204    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
2205        a.iter().map(|&x| x.abs()).sum()
2206    }
2207
2208    #[cfg(feature = "simd")]
2209    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
2210        crate::simd::simd_norm_linf_f32(a)
2211    }
2212
2213    #[cfg(not(feature = "simd"))]
2214    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
2215        a.iter().fold(0.0f32, |acc, &x| acc.max(x.abs()))
2216    }
2217
2218    #[cfg(feature = "simd")]
2219    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2220        crate::simd::simd_cosine_similarity_f32(a, b)
2221    }
2222
2223    #[cfg(not(feature = "simd"))]
2224    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2225        let dot = Self::simd_dot(a, b);
2226        let norm_a = Self::simd_norm(a);
2227        let norm_b = Self::simd_norm(b);
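        // Note: a zero-norm input makes this 0/0 = NaN; callers should guard if that matters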
2228        dot / (norm_a * norm_b)
2229    }
2230
2231    #[cfg(feature = "simd")]
2232    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2233        crate::simd::simd_distance_euclidean_f32(a, b)
2234    }
2235
2236    #[cfg(not(feature = "simd"))]
2237    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2238        a.iter()
2239            .zip(b.iter())
2240            .map(|(&x, &y)| (x - y).powi(2))
2241            .sum::<f32>()
2242            .sqrt()
2243    }
2244
2245    #[cfg(feature = "simd")]
2246    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2247        crate::simd::simd_distance_manhattan_f32(a, b)
2248    }
2249
2250    #[cfg(not(feature = "simd"))]
2251    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2252        a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
2253    }
2254
2255    #[cfg(feature = "simd")]
2256    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2257        crate::simd::simd_distance_chebyshev_f32(a, b)
2258    }
2259
2260    #[cfg(not(feature = "simd"))]
2261    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2262        a.iter()
2263            .zip(b.iter())
2264            .fold(0.0f32, |acc, (&x, &y)| acc.max((x - y).abs()))
2265    }
2266
2267    #[cfg(feature = "simd")]
2268    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2269        crate::simd::simd_distance_cosine_f32(a, b)
2270    }
2271
2272    #[cfg(not(feature = "simd"))]
2273    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
2274        1.0 - Self::simd_cosine_similarity(a, b)
2275    }
2276
2277    #[cfg(feature = "simd")]
2278    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2279        crate::simd::simd_weighted_sum_f32(values, weights)
2280    }
2281
2282    #[cfg(not(feature = "simd"))]
2283    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2284        values
2285            .iter()
2286            .zip(weights.iter())
2287            .map(|(&v, &w)| v * w)
2288            .sum()
2289    }
2290
2291    #[cfg(feature = "simd")]
2292    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2293        crate::simd::simd_weighted_mean_f32(values, weights)
2294    }
2295
2296    #[cfg(not(feature = "simd"))]
2297    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
2298        let weighted_sum = Self::simd_weighted_sum(values, weights);
2299        let weight_sum: f32 = weights.iter().sum();
2300        weighted_sum / weight_sum
2301    }
2302
2303    #[cfg(feature = "simd")]
2304    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
2305        crate::simd::simd_argmin_f32(a)
2306    }
2307
2308    #[cfg(not(feature = "simd"))]
2309    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
2310        if a.is_empty() {
2311            return None;
2312        }
2313        let mut min_idx = 0;
2314        let mut min_val = a[0];
2315        for (i, &v) in a.iter().enumerate().skip(1) {
2316            if v < min_val {
2317                min_val = v;
2318                min_idx = i;
2319            }
2320        }
2321        Some(min_idx)
2322    }
2323
2324    #[cfg(feature = "simd")]
2325    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
2326        crate::simd::simd_argmax_f32(a)
2327    }
2328
2329    #[cfg(not(feature = "simd"))]
2330    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
2331        if a.is_empty() {
2332            return None;
2333        }
2334        let mut max_idx = 0;
2335        let mut max_val = a[0];
2336        for (i, &v) in a.iter().enumerate().skip(1) {
2337            if v > max_val {
2338                max_val = v;
2339                max_idx = i;
2340            }
2341        }
2342        Some(max_idx)
2343    }
2344
2345    #[cfg(feature = "simd")]
2346    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
2347        crate::simd::simd_clip_f32(a, min_val, max_val)
2348    }
2349
2350    #[cfg(not(feature = "simd"))]
2351    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
2352        a.mapv(|v| v.max(min_val).min(max_val))
2353    }
2354
2355    #[cfg(feature = "simd")]
2356    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
2357        crate::simd::simd_log_sum_exp_f32(a)
2358    }
2359
2360    #[cfg(not(feature = "simd"))]
2361    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
2362        if a.is_empty() {
2363            return f32::NEG_INFINITY;
2364        }
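        // Max-shift trick: e.g. for [1000.0, 1000.0] the naive sum of exps overflows,
        // while this path returns 1000 + ln(2) ≈ 1000.693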
2365        let max_val = a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x));
2366        let sum_exp: f32 = a.iter().map(|&x| (x - max_val).exp()).sum();
2367        max_val + sum_exp.ln()
2368    }
2369
2370    #[cfg(feature = "simd")]
2371    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2372        crate::simd::simd_softmax_f32(a)
2373    }
2374
2375    #[cfg(not(feature = "simd"))]
2376    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2377        if a.is_empty() {
2378            return Array1::zeros(0);
2379        }
2380        let lse = Self::simd_log_sum_exp(a);
2381        a.mapv(|x| (x - lse).exp())
2382    }
2383
2384    #[cfg(feature = "simd")]
2385    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
2386        crate::simd::simd_cumsum_f32(a)
2387    }
2388
2389    #[cfg(not(feature = "simd"))]
2390    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
2391        if a.is_empty() {
2392            return Array1::zeros(0);
2393        }
2394        let mut cumsum = 0.0f32;
2395        a.mapv(|x| {
2396            cumsum += x;
2397            cumsum
2398        })
2399    }
2400
2401    #[cfg(feature = "simd")]
2402    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
2403        crate::simd::simd_cumprod_f32(a)
2404    }
2405
2406    #[cfg(not(feature = "simd"))]
2407    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
2408        if a.is_empty() {
2409            return Array1::zeros(0);
2410        }
2411        let mut cumprod = 1.0f32;
2412        a.mapv(|x| {
2413            cumprod *= x;
2414            cumprod
2415        })
2416    }
2417
2418    #[cfg(feature = "simd")]
2419    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
2420        crate::simd::simd_diff_f32(a)
2421    }
2422
2423    #[cfg(not(feature = "simd"))]
2424    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
2425        if a.len() <= 1 {
2426            return Array1::zeros(0);
2427        }
2428        Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
2429    }
2430
2431    #[cfg(feature = "simd")]
2432    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
2433        crate::simd::simd_sign_f32(a)
2434    }
2435
2436    #[cfg(not(feature = "simd"))]
2437    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
2438        a.mapv(|x| {
2439            if x > 0.0 {
2440                1.0
2441            } else if x < 0.0 {
2442                -1.0
2443            } else {
2444                0.0
2445            }
2446        })
2447    }
2448
2449    #[cfg(feature = "simd")]
2450    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
2451        crate::simd::simd_relu_f32(a)
2452    }
2453
2454    #[cfg(not(feature = "simd"))]
2455    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
2456        a.mapv(|x| x.max(0.0))
2457    }
2458
2459    #[cfg(feature = "simd")]
2460    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2461        crate::simd::simd_leaky_relu_f32(a, alpha)
2462    }
2463
2464    #[cfg(not(feature = "simd"))]
2465    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2466        a.mapv(|x| if x > 0.0 { x } else { alpha * x })
2467    }
2468
2469    #[cfg(feature = "simd")]
2470    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
2471        crate::simd::simd_normalize_f32(a)
2472    }
2473
2474    #[cfg(not(feature = "simd"))]
2475    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
2476        let norm: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
2477        if norm == 0.0 {
2478            return a.to_owned();
2479        }
2480        a.mapv(|x| x / norm)
2481    }
2482
2483    #[cfg(feature = "simd")]
2484    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
2485        crate::simd::simd_standardize_f32(a)
2486    }
2487
2488    #[cfg(not(feature = "simd"))]
2489    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
2490        if a.len() <= 1 {
2491            return Array1::zeros(a.len());
2492        }
2493        let mean: f32 = a.iter().sum::<f32>() / a.len() as f32;
2494        let variance: f32 =
2495            a.iter().map(|x| (x - mean) * (x - mean)).sum::<f32>() / (a.len() - 1) as f32;
2496        let std = variance.sqrt();
2497        if std == 0.0 {
2498            return Array1::zeros(a.len());
2499        }
2500        a.mapv(|x| (x - mean) / std)
2501    }
2502
2503    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
2504        a.mapv(|x| x.abs())
2505    }
2506
2507    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
2508        a.mapv(|x| x.sqrt())
2509    }
2510
2511    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
2512        // Note: SIMD polynomial approximation available via crate::simd::simd_exp_f32
2513        // for ~5-10x speedup with 10^-7 accuracy. Keeping scalar for trait compatibility.
2514        a.mapv(|x| x.exp())
2515    }
2516
2517    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
2518        // Note: SIMD logarithm available via crate::simd::simd_ln_f32
2519        // for speedup with ~0.05 absolute error. Keeping scalar for trait compatibility.
2520        a.mapv(|x| x.ln())
2521    }
2522
2523    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
2524        // Note: SIMD polynomial approximation available via crate::simd::simd_sin_f32
2525        // for ~5x speedup with 10^-4 accuracy. Keeping scalar for trait compatibility.
2526        a.mapv(|x| x.sin())
2527    }
2528
2529    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
2530        // Note: SIMD cosine available via crate::simd::simd_cos_f32
2531        // for ~5x speedup with 10^-4 accuracy. Keeping scalar for trait compatibility.
2532        a.mapv(|x| x.cos())
2533    }
2534
2535    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
2536        // Note: Can use SIMD via sin/cos: crate::simd::simd_sin_f32 / crate::simd::simd_cos_f32
2537        a.mapv(|x| x.tan())
2538    }
2539
2540    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
2541        // sinh(x) = (exp(x) - exp(-x)) / 2
2542        // Using SIMD exp for acceleration
2543        let exp_a = Self::simd_exp(a);
2544        let neg_a = Self::simd_scalar_mul(a, -1.0);
2545        let exp_neg_a = Self::simd_exp(&neg_a.view());
2546        let diff = Self::simd_sub(&exp_a.view(), &exp_neg_a.view());
2547        Self::simd_scalar_mul(&diff.view(), 0.5)
2548    }
2549
2550    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
2551        // cosh(x) = (exp(x) + exp(-x)) / 2
2552        // Using SIMD exp for acceleration
2553        let exp_a = Self::simd_exp(a);
2554        let neg_a = Self::simd_scalar_mul(a, -1.0);
2555        let exp_neg_a = Self::simd_exp(&neg_a.view());
2556        let sum = Self::simd_add(&exp_a.view(), &exp_neg_a.view());
2557        Self::simd_scalar_mul(&sum.view(), 0.5)
2558    }
2559
2560    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
2561        #[cfg(feature = "simd")]
2562        {
2563            // Use polynomial approximation (good accuracy, fast)
2564            simd_ops_polynomial::simd_tanh_f32_poly(a)
2565        }
2566        #[cfg(not(feature = "simd"))]
2567        {
2568            a.mapv(|x| x.tanh())
2569        }
2570    }
2571
2572    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
2573        #[cfg(feature = "simd")]
2574        {
2575            crate::simd::simd_floor_f32(a)
2576        }
2577        #[cfg(not(feature = "simd"))]
2578        {
2579            a.mapv(|x| x.floor())
2580        }
2581    }
2582
2583    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
2584        #[cfg(feature = "simd")]
2585        {
2586            crate::simd::simd_ceil_f32(a)
2587        }
2588        #[cfg(not(feature = "simd"))]
2589        {
2590            a.mapv(|x| x.ceil())
2591        }
2592    }
2593
2594    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
2595        #[cfg(feature = "simd")]
2596        {
2597            crate::simd::simd_round_f32(a)
2598        }
2599        #[cfg(not(feature = "simd"))]
2600        {
2601            a.mapv(|x| x.round())
2602        }
2603    }
2604
2605    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
2606        a.mapv(|x| x.atan())
2607    }
2608
2609    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
2610        a.mapv(|x| x.asin())
2611    }
2612
2613    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
2614        a.mapv(|x| x.acos())
2615    }
2616
2617    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
2618        y.iter()
2619            .zip(x.iter())
2620            .map(|(&y_val, &x_val)| y_val.atan2(x_val))
2621            .collect::<Vec<_>>()
2622            .into()
2623    }
2624
2625    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
2626        // log10(x) = ln(x) * (1/ln(10)) - uses SIMD ln
2627        const LOG10_E: f32 = std::f32::consts::LOG10_E; // 1/ln(10)
2628        let ln_a = Self::simd_ln(a);
2629        Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
2630    }
2631
2632    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
2633        // log2(x) = ln(x) * (1/ln(2)) - uses SIMD ln
2634        const LOG2_E: f32 = std::f32::consts::LOG2_E; // 1/ln(2)
2635        let ln_a = Self::simd_ln(a);
2636        Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
2637    }
2638
2639    #[cfg(feature = "simd")]
2640    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
2641        // Use SIMD-accelerated clip (AVX2/SSE/NEON)
2642        crate::simd::simd_clip_f32(a, min, max)
2643    }
2644
2645    #[cfg(not(feature = "simd"))]
2646    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
2647        a.mapv(|x| x.clamp(min, max))
2648    }
2649
2650    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
2651        #[cfg(feature = "simd")]
2652        {
2653            let truncated = crate::simd::simd_trunc_f32(a);
2654            Self::simd_sub(a, &truncated.view())
2655        }
2656        #[cfg(not(feature = "simd"))]
2657        {
2658            a.mapv(|x| x.fract())
2659        }
2660    }
2661
2662    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
2663        #[cfg(feature = "simd")]
2664        {
2665            crate::simd::simd_trunc_f32(a)
2666        }
2667        #[cfg(not(feature = "simd"))]
2668        {
2669            a.mapv(|x| x.trunc())
2670        }
2671    }
2672
2673    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
2674        // Optimized SIMD reciprocal: 1/x using SIMD division
2675        let ones = Array1::from_elem(a.len(), 1.0f32);
2676        Self::simd_div(&ones.view(), a)
2677    }
2678
2679    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
2680        // Optimized SIMD powf: base^exp = exp(exp * ln(base))
2681        // Uses SIMD-accelerated exp and ln operations
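        // Caveat: the exp/ln identity only holds for base > 0; negative bases fall
        // outside ln's domain and yield NaN even for integral exponents (unlike the
        // scalar powf). The same applies to simd_pow below.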
2682        let ln_base = Self::simd_ln(base);
2683        let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
2684        Self::simd_exp(&scaled.view())
2685    }
2686
2687    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
2688        // Optimized SIMD pow: base[i]^exp[i] = exp(exp[i] * ln(base[i]))
2689        // Uses SIMD-accelerated exp, ln, and mul operations
2690        let ln_base = Self::simd_ln(base);
2691        let scaled = Self::simd_mul(&ln_base.view(), exp);
2692        Self::simd_exp(&scaled.view())
2693    }
2694
2695    #[cfg(feature = "simd")]
2696    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
2697        crate::simd::unary_powi::simd_powi_f32(base, n)
2698    }
2699
2700    #[cfg(not(feature = "simd"))]
2701    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
2702        base.mapv(|x| x.powi(n))
2703    }
2704
2705    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
2706        x.mapv(lanczos_gamma_f32)
2707    }
2708
2709    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
2710        // 2^x = exp(x * ln(2))
2711        const LN2: f32 = std::f32::consts::LN_2;
2712        let scaled = Self::simd_scalar_mul(a, LN2);
2713        Self::simd_exp(&scaled.view())
2714    }
2715
2716    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
2717        // Cube root: x^(1/3)
2718        // Handle negative numbers: cbrt(-x) = -cbrt(x)
2719        a.mapv(|x| x.cbrt())
2720    }
2721
2722    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
2723        // ln(1+x) - numerically stable for small x
2724        a.mapv(|x| x.ln_1p())
2725    }
2726
2727    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
2728        // exp(x)-1 - numerically stable for small x
2729        a.mapv(|x| x.exp_m1())
2730    }
2731
2732    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
2733        // degrees to radians: x * π / 180
2734        const DEG_TO_RAD: f32 = std::f32::consts::PI / 180.0;
2735        Self::simd_scalar_mul(a, DEG_TO_RAD)
2736    }
2737
2738    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
2739        // radians to degrees: x * 180 / π
2740        const RAD_TO_DEG: f32 = 180.0 / std::f32::consts::PI;
2741        Self::simd_scalar_mul(a, RAD_TO_DEG)
2742    }
2743
2744    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
2745        // Digamma function ψ(x) = d/dx ln(Γ(x))
2746        // Uses reflection formula, recurrence relation, and asymptotic expansion
2747        a.mapv(digamma_f32)
2748    }
2749
2750    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
2751        // Trigamma function ψ'(x) = d²/dx² ln(Γ(x))
2752        // Critical for Fisher information in Bayesian inference
2753        a.mapv(trigamma_f32)
2754    }
2755
2756    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
2757        // Log-gamma function ln(Γ(x)) - more stable than gamma(x).ln()
2758        a.mapv(ln_gamma_f32)
2759    }
2760
2761    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
2762        // Error function erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
2763        // Critical for normal distribution CDF: Φ(x) = 0.5 * (1 + erf(x/√2))
2764        a.mapv(erf_f32)
2765    }
2766
2767    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
2768        // Complementary error function erfc(x) = 1 - erf(x)
2769        // More numerically stable than 1 - erf(x) for large x
2770        a.mapv(erfc_f32)
2771    }
2772
2773    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
2774        // Inverse error function: erfinv(y) = x such that erf(x) = y
2775        // Critical for inverse normal CDF (probit function)
2776        a.mapv(erfinv_f32)
2777    }
2778
2779    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
2780        // Inverse complementary error function: erfcinv(y) = x such that erfc(x) = y
2781        // More numerically stable than erfinv(1-y) for small y
2782        a.mapv(erfcinv_f32)
2783    }
2784
2785    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
2786        // Sigmoid (logistic) function: σ(x) = 1 / (1 + exp(-x))
2787        // Critical for neural networks, logistic regression
2788        // Uses numerically stable implementation
2789        a.mapv(sigmoid_f32)
2790    }
2791
2792    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
2793        // GELU (Gaussian Error Linear Unit): x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
2794        // Critical for Transformer models (BERT, GPT, etc.)
2795        a.mapv(gelu_f32)
2796    }
2797
2798    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
2799        // Swish (SiLU): x * sigmoid(x)
2800        // Self-gated activation for EfficientNet, GPT-NeoX
2801        a.mapv(swish_f32)
2802    }
2803
2804    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
2805        // Softplus: ln(1 + exp(x))
2806        // Smooth approximation of ReLU for probabilistic models
2807        a.mapv(softplus_f32)
2808    }
2809
2810    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
2811        // Mish: x * tanh(softplus(x))
2812        // Self-regularized activation for YOLOv4
2813        a.mapv(mish_f32)
2814    }
2815
2816    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
2817        // ELU: x if x >= 0, alpha * (exp(x) - 1) if x < 0
2818        // Helps with vanishing gradients
2819        a.mapv(|x| elu_f32(x, alpha))
2820    }
2821
2822    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
2823        // SELU: λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
2824        // Self-normalizing activation with fixed α and λ constants
2825        a.mapv(selu_f32)
2826    }
2827
2828    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
2829        // Hardsigmoid: clip((x + 3) / 6, 0, 1)
2830        // Piecewise linear approximation of sigmoid
2831        a.mapv(hardsigmoid_f32)
2832    }
2833
2834    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
2835        // Hardswish: x * hardsigmoid(x)
2836        // Piecewise linear approximation of Swish
2837        a.mapv(hardswish_f32)
2838    }
2839
2840    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
2841        // Sinc: sin(πx) / (πx), with sinc(0) = 1
2842        // Critical for signal processing and interpolation
2843        a.mapv(sinc_f32)
2844    }
2845
2846    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
2847        // Log-softmax: x_i - log(Σ_j exp(x_j))
2848        // Numerically stable for neural network loss computation
2849        // Leverages existing SIMD-accelerated log_sum_exp
2850        if a.is_empty() {
2851            return Array1::zeros(0);
2852        }
2853        let lse = Self::simd_log_sum_exp(a);
2854        a.mapv(|x| x - lse)
2855    }
2856
2857    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
2858        // Inverse hyperbolic sine: asinh(x) = ln(x + √(x² + 1))
2859        // Uses Rust's built-in asinh for accuracy
2860        a.mapv(|x| x.asinh())
2861    }
2862
2863    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
2864        // Inverse hyperbolic cosine: acosh(x) = ln(x + √(x² - 1))
2865        // Domain: [1, +∞), returns NaN for x < 1
2866        a.mapv(|x| x.acosh())
2867    }
2868
2869    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
2870        // Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1+x)/(1-x))
2871        // Domain: (-1, 1), returns ±∞ at ±1, NaN for |x| > 1
2872        a.mapv(|x| x.atanh())
2873    }
2874
2875    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2876        // Log-Beta: ln(B(a,b)) = ln(Γ(a)) + ln(Γ(b)) - ln(Γ(a+b))
2877        // Leverages SIMD-accelerated ln_gamma
2878        let ln_gamma_a = Self::simd_ln_gamma(a);
2879        let ln_gamma_b = Self::simd_ln_gamma(b);
2880        let a_plus_b = Self::simd_add(a, b);
2881        let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
2882        Self::simd_sub(
2883            &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
2884            &ln_gamma_ab.view(),
2885        )
2886    }
2887
2888    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2889        // Beta: B(a,b) = exp(ln(B(a,b)))
2890        // Uses log-beta for numerical stability
2891        let ln_beta = Self::simd_ln_beta(a, b);
2892        Self::simd_exp(&ln_beta.view())
2893    }
2894
2895    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
2896        // Linear interpolation: lerp(a, b, t) = a + t * (b - a)
2897        // This form is exact at t = 0 and needs one multiply, but rounding means it
2898        // may not return exactly b at t = 1; a*(1 - t) + b*t is exact at both endpoints
2899        if a.is_empty() || b.is_empty() {
2900            return Array1::zeros(0);
2901        }
2902        let diff = Self::simd_sub(b, a);
2903        let scaled = Self::simd_scalar_mul(&diff.view(), t);
2904        Self::simd_add(a, &scaled.view())
2905    }
2906
2907    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
2908        // Smoothstep: smooth Hermite interpolation
2909        // t = clamp((x - edge0) / (edge1 - edge0), 0, 1)
2910        // result = t * t * (3 - 2 * t)
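        // e.g. edge0 = 0, edge1 = 1, x = 0.5: t = 0.5, result = 0.25 * (3 - 1) = 0.5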
2911        if x.is_empty() {
2912            return Array1::zeros(0);
2913        }
2914        let range = edge1 - edge0;
2915        if range.abs() < Self::EPSILON {
2916            // Degenerate case: edge0 == edge1
2917            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
2918        }
2919        x.mapv(|xi| {
2920            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
2921            t * t * (3.0 - 2.0 * t)
2922        })
2923    }
2924
2925    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
2926        // Hypotenuse: sqrt(x² + y²) with overflow/underflow protection
2927        // Uses Rust's built-in hypot which handles extreme values correctly
2928        if x.is_empty() || y.is_empty() {
2929            return Array1::zeros(0);
2930        }
2931        let len = x.len().min(y.len());
2932        Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
2933    }
2934
2935    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
2936        // Returns magnitude of x with sign of y
2937        if x.is_empty() || y.is_empty() {
2938            return Array1::zeros(0);
2939        }
2940        let len = x.len().min(y.len());
2941        Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
2942    }
2943
2944    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
2945        // Ken Perlin's smootherstep: 6t⁵ - 15t⁴ + 10t³
2946        // Has zero first AND second derivatives at boundaries
2947        if x.is_empty() {
2948            return Array1::zeros(0);
2949        }
2950        let range = edge1 - edge0;
2951        if range.abs() < Self::EPSILON {
2952            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
2953        }
2954        x.mapv(|xi| {
2955            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
2956            // 6t⁵ - 15t⁴ + 10t³ = t³(10 - 15t + 6t²) = t³(6t² - 15t + 10)
2957            let t3 = t * t * t;
2958            t3 * (t * (t * 6.0 - 15.0) + 10.0)
2959        })
2960    }
2961
2962    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
2963        // Numerically stable log(exp(a) + exp(b))
2964        // Formula: max(a,b) + log(1 + exp(-|a-b|))
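        // e.g. logaddexp(1000, 1000) = 1000 + ln(2), where exp(1000) alone would overflow;
        // logaddexp(0, -100) ≈ 0 because the smaller term contributes only ~exp(-100)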
2965        if a.is_empty() || b.is_empty() {
2966            return Array1::zeros(0);
2967        }
2968        let len = a.len().min(b.len());
2969        Array1::from_iter((0..len).map(|i| {
2970            let ai = a[i];
2971            let bi = b[i];
2972            let max_val = ai.max(bi);
2973            let diff = (ai - bi).abs();
2974            // For very large differences, the smaller term is negligible
2975            if diff > 50.0 {
2976                max_val
2977            } else {
2978                max_val + (1.0 + (-diff).exp()).ln()
2979            }
2980        }))
2981    }
2982
2983    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
2984        // Logit: log(p / (1-p)) = log(p) - log(1-p)
2985        // Domain: (0, 1), Range: (-∞, +∞)
2986        if a.is_empty() {
2987            return Array1::zeros(0);
2988        }
2989        a.mapv(|p| {
2990            // Handle edge cases
2991            if p <= 0.0 {
2992                Self::NEG_INFINITY
2993            } else if p >= 1.0 {
2994                Self::INFINITY
2995            } else {
2996                (p / (1.0 - p)).ln()
2997            }
2998        })
2999    }
3000
3001    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
3002        // Element-wise x²
3003        if a.is_empty() {
3004            return Array1::zeros(0);
3005        }
3006        a.mapv(|x| x * x)
3007    }
3008
3009    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
3010        // Inverse square root: 1/sqrt(x)
3011        if a.is_empty() {
3012            return Array1::zeros(0);
3013        }
3014        a.mapv(|x| {
3015            if x <= 0.0 {
3016                if x == 0.0 {
3017                    Self::INFINITY
3018                } else {
3019                    Self::NAN
3020                }
3021            } else {
3022                1.0 / x.sqrt()
3023            }
3024        })
3025    }
3026
3027    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
3028        // Simultaneous sin and cos
3029        if a.is_empty() {
3030            return (Array1::zeros(0), Array1::zeros(0));
3031        }
3032        let sin_result = a.mapv(|x| x.sin());
3033        let cos_result = a.mapv(|x| x.cos());
3034        (sin_result, cos_result)
3035    }
3036
3037    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
3038        // Numerically stable exp(x) - 1
3039        // Uses std library's expm1 which handles small x accurately
3040        if a.is_empty() {
3041            return Array1::zeros(0);
3042        }
3043        a.mapv(|x| x.exp_m1())
3044    }
3045
3046    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
3047        // Numerically stable ln(1 + x)
3048        // Uses std library's ln_1p which handles small x accurately
3049        if a.is_empty() {
3050            return Array1::zeros(0);
3051        }
3052        a.mapv(|x| x.ln_1p())
3053    }
3054}
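
// A small usage sketch (test-only) exercising the unified trait through the f32
// impl above: with alpha = 1 and beta = 0, simd_gemm reduces to a plain matrix
// product, and simd_dot agrees with ndarray's own dot. Values are chosen so the
// results are exact in f32 under either the SIMD or the scalar path.
#[cfg(test)]
mod unified_ops_usage {
    use super::*;
    use ::ndarray::array;

    #[test]
    fn gemm_matches_hand_computed_product() {
        let a = array![[1.0_f32, 2.0], [3.0, 4.0]];
        let b = array![[5.0_f32, 6.0], [7.0, 8.0]];
        let mut c = Array2::<f32>::zeros((2, 2));
        f32::simd_gemm(1.0, &a.view(), &b.view(), 0.0, &mut c);
        assert_eq!(c, array![[19.0, 22.0], [43.0, 50.0]]);
    }

    #[test]
    fn dot_agrees_with_ndarray() {
        let x = array![1.0_f32, 2.0, 3.0];
        let y = array![4.0_f32, 5.0, 6.0];
        assert_eq!(f32::simd_dot(&x.view(), &y.view()), x.dot(&y));
    }
}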
3055
3056// Implementation for f64
3057impl SimdUnifiedOps for f64 {
3058    #[cfg(feature = "simd")]
3059    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3060        crate::simd::simd_add_f64(a, b)
3061    }
3062
3063    #[cfg(not(feature = "simd"))]
3064    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3065        a + b
3066    }
3067
3068    #[cfg(feature = "simd")]
3069    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3070        crate::simd::simd_sub_f64(a, b)
3071    }
3072
3073    #[cfg(not(feature = "simd"))]
3074    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3075        a - b
3076    }
3077
3078    #[cfg(feature = "simd")]
3079    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3080        crate::simd::simd_mul_f64(a, b)
3081    }
3082
3083    #[cfg(not(feature = "simd"))]
3084    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3085        a * b
3086    }
3087
3088    #[cfg(feature = "simd")]
3089    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3090        crate::simd::simd_div_f64(a, b)
3091    }
3092
3093    #[cfg(not(feature = "simd"))]
3094    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3095        a / b
3096    }
3097
3098    #[cfg(feature = "simd")]
3099    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3100        crate::simd::simd_dot_f64(a, b)
3101    }
3102
3103    #[cfg(not(feature = "simd"))]
3104    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3105        a.dot(b)
3106    }
3107
3108    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
3109        let m = a.nrows();
3110        let n = a.ncols();
3111
3112        assert_eq!(n, x.len());
3113        assert_eq!(m, y.len());
3114
3115        // Scale y by beta
3116        if beta == 0.0 {
3117            y.fill(0.0);
3118        } else if beta != 1.0 {
3119            y.mapv_inplace(|v| v * beta);
3120        }
3121
3122        // Compute matrix-vector product
3123        for i in 0..m {
3124            let row = a.row(i);
3125            y[i] += Self::simd_dot(&row, x);
3126        }
3127    }
3128
3129    fn simd_gemm(
3130        alpha: Self,
3131        a: &ArrayView2<Self>,
3132        b: &ArrayView2<Self>,
3133        beta: Self,
3134        c: &mut Array2<Self>,
3135    ) {
3136        let m = a.nrows();
3137        let k = a.ncols();
3138        let n = b.ncols();
3139
3140        assert_eq!(k, b.nrows());
3141        assert_eq!((m, n), c.dim());
3142
3143        // Scale C by beta
3144        if beta == 0.0 {
3145            c.fill(0.0);
3146        } else if beta != 1.0 {
3147            c.mapv_inplace(|v| v * beta);
3148        }
3149
3150        // Use blocked transpose for large matrices to improve cache efficiency
3151        // Threshold: n * k > 4096 (amortize transpose cost)
3152        const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
3153
3154        if n * k > GEMM_TRANSPOSE_THRESHOLD {
3155            // Pre-transpose B for cache-efficient row-wise access
3156            let b_t = Self::simd_transpose_blocked(b);
3157
3158            for i in 0..m {
3159                let a_row = a.row(i);
3160                for j in 0..n {
3161                    // Access b_t row-wise (contiguous memory)
3162                    let b_row = b_t.row(j);
3163                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
3164                }
3165            }
3166        } else {
3167            // Small matrices: use column access (overhead of transpose not worth it)
3168            for i in 0..m {
3169                let a_row = a.row(i);
3170                for j in 0..n {
3171                    let b_col = b.column(j);
3172                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
3173                }
3174            }
3175        }
3176    }
3177
3178    #[cfg(feature = "simd")]
3179    fn simd_norm(a: &ArrayView1<Self>) -> Self {
3180        crate::simd::norms::simd_norm_l2_f64(a)
3181    }
3182
3183    #[cfg(not(feature = "simd"))]
3184    fn simd_norm(a: &ArrayView1<Self>) -> Self {
3185        a.iter().map(|&x| x * x).sum::<f64>().sqrt()
3186    }
3187
3188    #[cfg(feature = "simd")]
3189    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3190        crate::simd::simd_maximum_f64(a, b)
3191    }
3192
3193    #[cfg(not(feature = "simd"))]
3194    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3195        let mut result = Array1::zeros(a.len());
3196        for i in 0..a.len() {
3197            result[i] = a[i].max(b[i]);
3198        }
3199        result
3200    }
3201
3202    #[cfg(feature = "simd")]
3203    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3204        crate::simd::simd_minimum_f64(a, b)
3205    }
3206
3207    #[cfg(not(feature = "simd"))]
3208    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3209        let mut result = Array1::zeros(a.len());
3210        for i in 0..a.len() {
3211            result[i] = a[i].min(b[i]);
3212        }
3213        result
3214    }
3215
3216    #[cfg(feature = "simd")]
3217    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
3218        crate::simd::simd_scalar_mul_f64(a, scalar)
3219    }
3220
3221    #[cfg(not(feature = "simd"))]
3222    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
3223        a.mapv(|x| x * scalar)
3224    }
3225
3226    #[cfg(feature = "simd")]
3227    fn simd_sum(a: &ArrayView1<Self>) -> Self {
3228        crate::simd::simd_sum_f64(a)
3229    }
3230
3231    #[cfg(not(feature = "simd"))]
3232    fn simd_sum(a: &ArrayView1<Self>) -> Self {
3233        a.sum()
3234    }
3235
3236    fn simd_mean(a: &ArrayView1<Self>) -> Self {
3237        if a.is_empty() {
3238            0.0
3239        } else {
3240            Self::simd_sum(a) / (a.len() as f64)
3241        }
3242    }
3243
3244    #[cfg(feature = "simd")]
3245    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
3246        crate::simd::simd_max_f64(a)
3247    }
3248
3249    #[cfg(not(feature = "simd"))]
3250    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
3251        a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
3252    }
3253
3254    #[cfg(feature = "simd")]
3255    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
3256        crate::simd::simd_min_f64(a)
3257    }
3258
3259    #[cfg(not(feature = "simd"))]
3260    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
3261        a.fold(f64::INFINITY, |acc, &x| acc.min(x))
3262    }
3263
3264    #[cfg(feature = "simd")]
3265    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
3266        crate::simd::simd_fused_multiply_add_f64(a, b, c)
3267    }
3268
3269    #[cfg(not(feature = "simd"))]
3270    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
3271        let mut result = Array1::zeros(a.len());
3272        for i in 0..a.len() {
3273            result[i] = a[i] * b[i] + c[i];
3274        }
3275        result
3276    }
3277
3278    #[cfg(feature = "simd")]
3279    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3280        crate::simd::simd_add_cache_optimized_f64(a, b)
3281    }
3282
3283    #[cfg(not(feature = "simd"))]
3284    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3285        a + b
3286    }
3287
3288    #[cfg(feature = "simd")]
3289    fn simd_fma_advanced_optimized(
3290        a: &ArrayView1<Self>,
3291        b: &ArrayView1<Self>,
3292        c: &ArrayView1<Self>,
3293    ) -> Array1<Self> {
3294        crate::simd::simd_fma_advanced_optimized_f64(a, b, c)
3295    }
3296
3297    #[cfg(not(feature = "simd"))]
3298    fn simd_fma_advanced_optimized(
3299        a: &ArrayView1<Self>,
3300        b: &ArrayView1<Self>,
3301        c: &ArrayView1<Self>,
3302    ) -> Array1<Self> {
3303        let mut result = Array1::zeros(a.len());
3304        for i in 0..a.len() {
3305            result[i] = a[i] * b[i] + c[i];
3306        }
3307        result
3308    }
3309
3310    #[cfg(feature = "simd")]
3311    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3312        crate::simd::simd_adaptive_add_f64(a, b)
3313    }
3314
3315    #[cfg(not(feature = "simd"))]
3316    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3317        a + b
3318    }
3319
3320    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
3321        a.t().to_owned()
3322    }
3323
3324    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
3325        #[cfg(feature = "simd")]
3326        {
3327            crate::simd::simd_transpose_blocked_f64(a)
3328        }
3329        #[cfg(not(feature = "simd"))]
3330        {
3331            a.t().to_owned()
3332        }
3333    }
3334
3335    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
3336        a.iter().map(|&x| x * x).sum()
3337    }
3338
3339    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
3340        Self::simd_mul(a, b)
3341    }
3342
3343    #[cfg(feature = "simd")]
3344    fn simd_available() -> bool {
3345        true
3346    }
3347
3348    #[cfg(not(feature = "simd"))]
3349    fn simd_available() -> bool {
3350        false
3351    }
3352
3353    fn simd_sub_f32_ultra(
3354        a: &ArrayView1<Self>,
3355        b: &ArrayView1<Self>,
3356        result: &mut ArrayViewMut1<Self>,
3357    ) {
3358        let sub_result = Self::simd_sub(a, b);
3359        result.assign(&sub_result);
3360    }
3361
3362    fn simd_mul_f32_ultra(
3363        a: &ArrayView1<Self>,
3364        b: &ArrayView1<Self>,
3365        result: &mut ArrayViewMut1<Self>,
3366    ) {
3367        let mul_result = Self::simd_mul(a, b);
3368        result.assign(&mul_result);
3369    }
3370
3371    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
3372        // Calculate sum of cubes: sum(x^3)
3373        a.iter().map(|&x| x * x * x).sum()
3374    }
3375
3376    fn simd_div_f32_ultra(
3377        a: &ArrayView1<Self>,
3378        b: &ArrayView1<Self>,
3379        result: &mut ArrayViewMut1<Self>,
3380    ) {
3381        let div_result = Self::simd_div(a, b);
3382        result.assign(&div_result);
3383    }
3384
3385    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3386        let sin_result = a.mapv(|x| x.sin());
3387        result.assign(&sin_result);
3388    }
3389
3390    fn simd_add_f32_ultra(
3391        a: &ArrayView1<Self>,
3392        b: &ArrayView1<Self>,
3393        result: &mut ArrayViewMut1<Self>,
3394    ) {
3395        let add_result = Self::simd_add(a, b);
3396        result.assign(&add_result);
3397    }
3398
3399    fn simd_fma_f32_ultra(
3400        a: &ArrayView1<Self>,
3401        b: &ArrayView1<Self>,
3402        c: &ArrayView1<Self>,
3403        result: &mut ArrayViewMut1<Self>,
3404    ) {
3405        let fma_result = Self::simd_fma(a, b, c);
3406        result.assign(&fma_result);
3407    }
3408
3409    fn simd_pow_f32_ultra(
3410        a: &ArrayView1<Self>,
3411        b: &ArrayView1<Self>,
3412        result: &mut ArrayViewMut1<Self>,
3413    ) {
3414        let pow_result = a
3415            .iter()
3416            .zip(b.iter())
3417            .map(|(&x, &y)| x.powf(y))
3418            .collect::<Vec<_>>();
3419        result.assign(&Array1::from_vec(pow_result));
3420    }
3421
3422    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3423        let exp_result = a.mapv(|x| x.exp());
3424        result.assign(&exp_result);
3425    }
3426
3427    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
3428        let cos_result = a.mapv(|x| x.cos());
3429        result.assign(&cos_result);
3430    }
3431
3432    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3433        Self::simd_dot(a, b)
3434    }
3435
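    /// Sample variance with Bessel's correction (n - 1 denominator).
    /// Illustrative check (an editorial sketch): var([1, 2, 3]) == 1.
    /// ```ignore
    /// use ndarray::array;
    /// let v = f64::simd_variance(&array![1.0_f64, 2.0, 3.0].view());
    /// assert!((v - 1.0).abs() < 1e-12);
    /// ```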
3436    #[cfg(feature = "simd")]
3437    fn simd_variance(a: &ArrayView1<Self>) -> Self {
3438        crate::simd::simd_variance_f64(a)
3439    }
3440
3441    #[cfg(not(feature = "simd"))]
3442    fn simd_variance(a: &ArrayView1<Self>) -> Self {
3443        let mean = Self::simd_mean(a);
3444        let n = a.len() as f64;
3445        if n < 2.0 {
3446            return f64::NAN;
3447        }
3448        // Sample variance with Bessel's correction (n-1)
3449        a.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)
3450    }
3451
3452    #[cfg(feature = "simd")]
3453    fn simd_std(a: &ArrayView1<Self>) -> Self {
3454        crate::simd::simd_std_f64(a)
3455    }
3456
3457    #[cfg(not(feature = "simd"))]
3458    fn simd_std(a: &ArrayView1<Self>) -> Self {
3459        Self::simd_variance(a).sqrt()
3460    }
3461
3462    #[cfg(feature = "simd")]
3463    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
3464        crate::simd::simd_norm_l1_f64(a)
3465    }
3466
3467    #[cfg(not(feature = "simd"))]
3468    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
3469        a.iter().map(|&x| x.abs()).sum()
3470    }
3471
3472    #[cfg(feature = "simd")]
3473    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
3474        crate::simd::simd_norm_linf_f64(a)
3475    }
3476
3477    #[cfg(not(feature = "simd"))]
3478    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
3479        a.iter().fold(0.0f64, |acc, &x| acc.max(x.abs()))
3480    }
3481
3482    #[cfg(feature = "simd")]
3483    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3484        crate::simd::simd_cosine_similarity_f64(a, b)
3485    }
3486
3487    #[cfg(not(feature = "simd"))]
3488    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3489        let dot = Self::simd_dot(a, b);
3490        let norm_a = Self::simd_norm(a);
3491        let norm_b = Self::simd_norm(b);
3492        dot / (norm_a * norm_b)
3493    }
3494
3495    #[cfg(feature = "simd")]
3496    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3497        crate::simd::simd_distance_euclidean_f64(a, b)
3498    }
3499
3500    #[cfg(not(feature = "simd"))]
3501    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3502        a.iter()
3503            .zip(b.iter())
3504            .map(|(&x, &y)| (x - y).powi(2))
3505            .sum::<f64>()
3506            .sqrt()
3507    }
3508
3509    #[cfg(feature = "simd")]
3510    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3511        crate::simd::simd_distance_manhattan_f64(a, b)
3512    }
3513
3514    #[cfg(not(feature = "simd"))]
3515    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3516        a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
3517    }
3518
3519    #[cfg(feature = "simd")]
3520    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3521        crate::simd::simd_distance_chebyshev_f64(a, b)
3522    }
3523
3524    #[cfg(not(feature = "simd"))]
3525    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3526        a.iter()
3527            .zip(b.iter())
3528            .fold(0.0f64, |acc, (&x, &y)| acc.max((x - y).abs()))
3529    }
3530
3531    #[cfg(feature = "simd")]
3532    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3533        crate::simd::simd_distance_cosine_f64(a, b)
3534    }
3535
3536    #[cfg(not(feature = "simd"))]
3537    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
3538        1.0 - Self::simd_cosine_similarity(a, b)
3539    }
3540
3541    #[cfg(feature = "simd")]
3542    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3543        crate::simd::simd_weighted_sum_f64(values, weights)
3544    }
3545
3546    #[cfg(not(feature = "simd"))]
3547    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3548        values
3549            .iter()
3550            .zip(weights.iter())
3551            .map(|(&v, &w)| v * w)
3552            .sum()
3553    }
3554
3555    #[cfg(feature = "simd")]
3556    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3557        crate::simd::simd_weighted_mean_f64(values, weights)
3558    }
3559
3560    #[cfg(not(feature = "simd"))]
3561    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
3562        let weighted_sum = Self::simd_weighted_sum(values, weights);
3563        let weight_sum: f64 = weights.iter().sum();
3564        weighted_sum / weight_sum
3565    }
3566
3567    #[cfg(feature = "simd")]
3568    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
3569        crate::simd::simd_argmin_f64(a)
3570    }
3571
3572    #[cfg(not(feature = "simd"))]
3573    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
3574        if a.is_empty() {
3575            return None;
3576        }
3577        let mut min_idx = 0;
3578        let mut min_val = a[0];
3579        for (i, &v) in a.iter().enumerate().skip(1) {
3580            if v < min_val {
3581                min_val = v;
3582                min_idx = i;
3583            }
3584        }
3585        Some(min_idx)
3586    }
3587
3588    #[cfg(feature = "simd")]
3589    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
3590        crate::simd::simd_argmax_f64(a)
3591    }
3592
3593    #[cfg(not(feature = "simd"))]
3594    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
3595        if a.is_empty() {
3596            return None;
3597        }
3598        let mut max_idx = 0;
3599        let mut max_val = a[0];
3600        for (i, &v) in a.iter().enumerate().skip(1) {
3601            if v > max_val {
3602                max_val = v;
3603                max_idx = i;
3604            }
3605        }
3606        Some(max_idx)
3607    }
3608
3609    #[cfg(feature = "simd")]
3610    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
3611        crate::simd::simd_clip_f64(a, min_val, max_val)
3612    }
3613
3614    #[cfg(not(feature = "simd"))]
3615    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
3616        a.mapv(|v| v.max(min_val).min(max_val))
3617    }
3618
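    /// Numerically stable `ln(sum(exp(x_i)))` via the max-shift trick.
    /// Illustrative check (an editorial sketch): the naive form overflows f64,
    /// while the shifted form stays finite.
    /// ```ignore
    /// use ndarray::array;
    /// let lse = f64::simd_log_sum_exp(&array![1000.0_f64, 1000.0].view());
    /// assert!((lse - (1000.0 + 2.0_f64.ln())).abs() < 1e-9);
    /// ```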
3619    #[cfg(feature = "simd")]
3620    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
3621        crate::simd::simd_log_sum_exp_f64(a)
3622    }
3623
3624    #[cfg(not(feature = "simd"))]
3625    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
3626        if a.is_empty() {
3627            return f64::NEG_INFINITY;
3628        }
3629        let max_val = a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));
3630        let sum_exp: f64 = a.iter().map(|&x| (x - max_val).exp()).sum();
3631        max_val + sum_exp.ln()
3632    }
3633
3634    #[cfg(feature = "simd")]
3635    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
3636        crate::simd::simd_softmax_f64(a)
3637    }
3638
3639    #[cfg(not(feature = "simd"))]
3640    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
3641        if a.is_empty() {
3642            return Array1::zeros(0);
3643        }
3644        let lse = Self::simd_log_sum_exp(a);
3645        a.mapv(|x| (x - lse).exp())
3646    }
3647
3648    #[cfg(feature = "simd")]
3649    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
3650        crate::simd::simd_cumsum_f64(a)
3651    }
3652
3653    #[cfg(not(feature = "simd"))]
3654    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
3655        if a.is_empty() {
3656            return Array1::zeros(0);
3657        }
3658        let mut cumsum = 0.0f64;
3659        a.mapv(|x| {
3660            cumsum += x;
3661            cumsum
3662        })
3663    }
3664
3665    #[cfg(feature = "simd")]
3666    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
3667        crate::simd::simd_cumprod_f64(a)
3668    }
3669
3670    #[cfg(not(feature = "simd"))]
3671    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
3672        if a.is_empty() {
3673            return Array1::zeros(0);
3674        }
3675        let mut cumprod = 1.0f64;
3676        a.mapv(|x| {
3677            cumprod *= x;
3678            cumprod
3679        })
3680    }
3681
3682    #[cfg(feature = "simd")]
3683    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
3684        crate::simd::simd_diff_f64(a)
3685    }
3686
3687    #[cfg(not(feature = "simd"))]
3688    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
3689        if a.len() <= 1 {
3690            return Array1::zeros(0);
3691        }
3692        Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
3693    }
3694
3695    #[cfg(feature = "simd")]
3696    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
3697        crate::simd::simd_sign_f64(a)
3698    }
3699
3700    #[cfg(not(feature = "simd"))]
3701    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
3702        a.mapv(|x| {
3703            if x > 0.0 {
3704                1.0
3705            } else if x < 0.0 {
3706                -1.0
3707            } else {
3708                0.0
3709            }
3710        })
3711    }
3712
3713    #[cfg(feature = "simd")]
3714    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
3715        crate::simd::simd_relu_f64(a)
3716    }
3717
3718    #[cfg(not(feature = "simd"))]
3719    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
3720        a.mapv(|x| x.max(0.0))
3721    }
3722
3723    #[cfg(feature = "simd")]
3724    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
3725        crate::simd::simd_leaky_relu_f64(a, alpha)
3726    }
3727
3728    #[cfg(not(feature = "simd"))]
3729    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
3730        a.mapv(|x| if x > 0.0 { x } else { alpha * x })
3731    }
3732
3733    #[cfg(feature = "simd")]
3734    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
3735        crate::simd::simd_normalize_f64(a)
3736    }
3737
3738    #[cfg(not(feature = "simd"))]
3739    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
3740        let norm: f64 = a.iter().map(|x| x * x).sum::<f64>().sqrt();
3741        if norm == 0.0 {
3742            return a.to_owned();
3743        }
3744        a.mapv(|x| x / norm)
3745    }
3746
3747    #[cfg(feature = "simd")]
3748    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
3749        crate::simd::simd_standardize_f64(a)
3750    }
3751
3752    #[cfg(not(feature = "simd"))]
3753    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
3754        if a.len() <= 1 {
3755            return Array1::zeros(a.len());
3756        }
3757        let mean: f64 = a.iter().sum::<f64>() / a.len() as f64;
3758        let variance: f64 =
3759            a.iter().map(|x| (x - mean) * (x - mean)).sum::<f64>() / (a.len() - 1) as f64;
3760        let std = variance.sqrt();
3761        if std == 0.0 {
3762            return Array1::zeros(a.len());
3763        }
3764        a.mapv(|x| (x - mean) / std)
3765    }
3766
3767    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
3768        a.mapv(|x| x.abs())
3769    }
3770
3771    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
3772        a.mapv(|x| x.sqrt())
3773    }
3774
3775    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
3776        // Note: SIMD polynomial approximation available via crate::simd::simd_exp_f64
3777        // for ~5-10x speedup with 10^-9 accuracy. Keeping scalar for trait compatibility.
3778        a.mapv(|x| x.exp())
3779    }
3780
3781    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
3782        // Note: SIMD logarithm available via crate::simd::simd_ln_f64
3783        // for speedup with moderate accuracy. Keeping scalar for trait compatibility.
3784        a.mapv(|x| x.ln())
3785    }
3786
3787    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
3788        // Note: SIMD polynomial approximation available via crate::simd::simd_sin_f64
3789        // for ~5x speedup with 10^-4 accuracy. Keeping scalar for trait compatibility.
3790        a.mapv(|x| x.sin())
3791    }
3792
3793    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
3794        // Note: SIMD cosine available via crate::simd::simd_cos_f64
3795        // for ~5x speedup with 10^-4 accuracy. Keeping scalar for trait compatibility.
3796        a.mapv(|x| x.cos())
3797    }
3798
3799    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
3800        // Note: Can use SIMD via sin/cos: crate::simd::simd_sin_f64 / crate::simd::simd_cos_f64
3801        a.mapv(|x| x.tan())
3802    }
3803
3804    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
3805        #[cfg(feature = "simd")]
3806        {
3807            simd_ops_polynomial::simd_sinh_f64_poly(a)
3808        }
3809        #[cfg(not(feature = "simd"))]
3810        {
3811            a.mapv(|x| x.sinh())
3812        }
3813    }
3814
3815    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
3816        #[cfg(feature = "simd")]
3817        {
3818            simd_ops_polynomial::simd_cosh_f64_poly(a)
3819        }
3820        #[cfg(not(feature = "simd"))]
3821        {
3822            a.mapv(|x| x.cosh())
3823        }
3824    }
3825
3826    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
3827        #[cfg(feature = "simd")]
3828        {
3829            // Use polynomial approximation (good accuracy, fast)
3830            simd_ops_polynomial::simd_tanh_f64_poly(a)
3831        }
3832        #[cfg(not(feature = "simd"))]
3833        {
3834            a.mapv(|x| x.tanh())
3835        }
3836    }
3837
3838    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
3839        #[cfg(feature = "simd")]
3840        {
3841            crate::simd::simd_floor_f64(a)
3842        }
3843        #[cfg(not(feature = "simd"))]
3844        {
3845            a.mapv(|x| x.floor())
3846        }
3847    }
3848
3849    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
3850        #[cfg(feature = "simd")]
3851        {
3852            crate::simd::simd_ceil_f64(a)
3853        }
3854        #[cfg(not(feature = "simd"))]
3855        {
3856            a.mapv(|x| x.ceil())
3857        }
3858    }
3859
3860    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
3861        #[cfg(feature = "simd")]
3862        {
3863            crate::simd::simd_round_f64(a)
3864        }
3865        #[cfg(not(feature = "simd"))]
3866        {
3867            a.mapv(|x| x.round())
3868        }
3869    }
3870
3871    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
3872        a.mapv(|x| x.atan())
3873    }
3874
3875    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
3876        a.mapv(|x| x.asin())
3877    }
3878
3879    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
3880        a.mapv(|x| x.acos())
3881    }
3882
3883    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
3884        y.iter()
3885            .zip(x.iter())
3886            .map(|(&y_val, &x_val)| y_val.atan2(x_val))
3887            .collect::<Vec<_>>()
3888            .into()
3889    }
3890
3891    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
3892        // log10(x) = ln(x) * (1/ln(10)) - uses SIMD ln
3893        const LOG10_E: f64 = std::f64::consts::LOG10_E; // 1/ln(10)
3894        let ln_a = Self::simd_ln(a);
3895        Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
3896    }
3897
3898    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
3899        // log2(x) = ln(x) * (1/ln(2)) - uses SIMD ln
3900        const LOG2_E: f64 = std::f64::consts::LOG2_E; // 1/ln(2)
3901        let ln_a = Self::simd_ln(a);
3902        Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
3903    }
3904
3905    #[cfg(feature = "simd")]
3906    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
3907        // Use SIMD-accelerated clip (AVX2/SSE/NEON)
3908        crate::simd::simd_clip_f64(a, min, max)
3909    }
3910
3911    #[cfg(not(feature = "simd"))]
3912    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
3913        a.mapv(|x| x.clamp(min, max))
3914    }
3915
3916    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
3917        #[cfg(feature = "simd")]
3918        {
3919            let truncated = crate::simd::simd_trunc_f64(a);
3920            Self::simd_sub(a, &truncated.view())
3921        }
3922        #[cfg(not(feature = "simd"))]
3923        {
3924            a.mapv(|x| x.fract())
3925        }
3926    }
3927
3928    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
3929        #[cfg(feature = "simd")]
3930        {
3931            crate::simd::simd_trunc_f64(a)
3932        }
3933        #[cfg(not(feature = "simd"))]
3934        {
3935            a.mapv(|x| x.trunc())
3936        }
3937    }
3938
3939    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
3940        // Optimized SIMD reciprocal: 1/x using SIMD division
3941        let ones = Array1::from_elem(a.len(), 1.0f64);
3942        Self::simd_div(&ones.view(), a)
3943    }
3944
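    /// Note (editorial): the `exp(exp * ln(base))` identity below is valid only
    /// for positive bases; negative bases yield NaN through `ln`, unlike
    /// `f64::powf`, which also handles negative bases with integer exponents.
    /// Illustrative sketch:
    /// ```ignore
    /// use ndarray::array;
    /// let r = f64::simd_powf(&array![2.0_f64, 4.0].view(), 0.5);
    /// assert!((r[0] - std::f64::consts::SQRT_2).abs() < 1e-9);
    /// assert!((r[1] - 2.0).abs() < 1e-9);
    /// ```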
3945    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
3946        // Optimized SIMD powf: base^exp = exp(exp * ln(base))
3947        // Uses SIMD-accelerated exp and ln operations
3948        let ln_base = Self::simd_ln(base);
3949        let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
3950        Self::simd_exp(&scaled.view())
3951    }
3952
3953    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
3954        // Optimized SIMD pow: base[i]^exp[i] = exp(exp[i] * ln(base[i]))
3955        // Uses SIMD-accelerated exp, ln, and mul operations
3956        let ln_base = Self::simd_ln(base);
3957        let scaled = Self::simd_mul(&ln_base.view(), exp);
3958        Self::simd_exp(&scaled.view())
3959    }
3960
3961    #[cfg(feature = "simd")]
3962    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
3963        crate::simd::unary_powi::simd_powi_f64(base, n)
3964    }
3965
3966    #[cfg(not(feature = "simd"))]
3967    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
3968        base.mapv(|x| x.powi(n))
3969    }
3970
3971    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
3972        x.mapv(lanczos_gamma_f64)
3973    }
3974
3975    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
3976        // 2^x = exp(x * ln(2))
3977        const LN2: f64 = std::f64::consts::LN_2;
3978        let scaled = Self::simd_scalar_mul(a, LN2);
3979        Self::simd_exp(&scaled.view())
3980    }
3981
3982    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
3983        // Cube root: x^(1/3)
3984        // Handle negative numbers: cbrt(-x) = -cbrt(x)
3985        a.mapv(|x| x.cbrt())
3986    }
3987
3988    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
3989        // ln(1+x) - numerically stable for small x
3990        a.mapv(|x| x.ln_1p())
3991    }
3992
3993    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
3994        // exp(x)-1 - numerically stable for small x
3995        a.mapv(|x| x.exp_m1())
3996    }
3997
3998    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
3999        // degrees to radians: x * π / 180
4000        const DEG_TO_RAD: f64 = std::f64::consts::PI / 180.0;
4001        Self::simd_scalar_mul(a, DEG_TO_RAD)
4002    }
4003
4004    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
4005        // radians to degrees: x * 180 / π
4006        const RAD_TO_DEG: f64 = 180.0 / std::f64::consts::PI;
4007        Self::simd_scalar_mul(a, RAD_TO_DEG)
4008    }
4009
4010    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
4011        // Digamma function ψ(x) = d/dx ln(Γ(x))
4012        // Uses reflection formula, recurrence relation, and asymptotic expansion
4013        a.mapv(digamma_f64)
4014    }
4015
4016    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
4017        // Trigamma function ψ'(x) = d²/dx² ln(Γ(x))
4018        // Critical for Fisher information in Bayesian inference
4019        a.mapv(trigamma_f64)
4020    }
4021
4022    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
4023        // Log-gamma function ln(Γ(x)) - more stable than gamma(x).ln()
4024        a.mapv(ln_gamma_f64)
4025    }
4026
4027    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
4028        // Error function erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
4029        // Critical for normal distribution CDF: Φ(x) = 0.5 * (1 + erf(x/√2))
4030        a.mapv(erf_f64)
4031    }
4032
4033    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
4034        // Complementary error function erfc(x) = 1 - erf(x)
4035        // More numerically stable than 1 - erf(x) for large x
4036        a.mapv(erfc_f64)
4037    }
4038
4039    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
4040        // Inverse error function: erfinv(y) = x such that erf(x) = y
4041        // Critical for inverse normal CDF (probit function)
4042        a.mapv(erfinv_f64)
4043    }
4044
4045    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
4046        // Inverse complementary error function: erfcinv(y) = x such that erfc(x) = y
4047        // More numerically stable than erfinv(1-y) for small y
4048        a.mapv(erfcinv_f64)
4049    }
4050
4051    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
4052        // Sigmoid (logistic) function: σ(x) = 1 / (1 + exp(-x))
4053        // Critical for neural networks, logistic regression
4054        // Uses numerically stable implementation
4055        a.mapv(sigmoid_f64)
4056    }
4057
4058    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
4059        // GELU (Gaussian Error Linear Unit): x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
4060        // Critical for Transformer models (BERT, GPT, etc.)
4061        a.mapv(gelu_f64)
4062    }
4063
4064    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
4065        // Swish (SiLU): x * sigmoid(x)
4066        // Self-gated activation for EfficientNet, GPT-NeoX
4067        a.mapv(swish_f64)
4068    }
4069
4070    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
4071        // Softplus: ln(1 + exp(x))
4072        // Smooth approximation of ReLU for probabilistic models
4073        a.mapv(softplus_f64)
4074    }
4075
4076    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
4077        // Mish: x * tanh(softplus(x))
4078        // Self-regularized activation for YOLOv4
4079        a.mapv(mish_f64)
4080    }
4081
4082    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
4083        // ELU: x if x >= 0, alpha * (exp(x) - 1) if x < 0
4084        // Helps with vanishing gradients
4085        a.mapv(|x| elu_f64(x, alpha))
4086    }
4087
4088    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
4089        // SELU: λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
4090        // Self-normalizing activation with fixed α and λ constants
4091        a.mapv(selu_f64)
4092    }
4093
4094    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
4095        // Hardsigmoid: clip((x + 3) / 6, 0, 1)
4096        // Piecewise linear approximation of sigmoid
4097        a.mapv(hardsigmoid_f64)
4098    }
4099
4100    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
4101        // Hardswish: x * hardsigmoid(x)
4102        // Piecewise linear approximation of Swish
4103        a.mapv(hardswish_f64)
4104    }
4105
4106    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
4107        // Sinc: sin(πx) / (πx), with sinc(0) = 1
4108        // Critical for signal processing and interpolation
4109        a.mapv(sinc_f64)
4110    }
4111
4112    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
4113        // Log-softmax: x_i - log(Σ_j exp(x_j))
4114        // Numerically stable for neural network loss computation
4115        // Leverages existing SIMD-accelerated log_sum_exp
4116        if a.is_empty() {
4117            return Array1::zeros(0);
4118        }
4119        let lse = Self::simd_log_sum_exp(a);
4120        a.mapv(|x| x - lse)
4121    }
4122
4123    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
4124        // Inverse hyperbolic sine: asinh(x) = ln(x + √(x² + 1))
4125        // Uses Rust's built-in asinh for accuracy
4126        a.mapv(|x| x.asinh())
4127    }
4128
4129    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
4130        // Inverse hyperbolic cosine: acosh(x) = ln(x + √(x² - 1))
4131        // Domain: [1, +∞), returns NaN for x < 1
4132        a.mapv(|x| x.acosh())
4133    }
4134
4135    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
4136        // Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1+x)/(1-x))
4137        // Domain: (-1, 1), returns ±∞ at ±1, NaN for |x| > 1
4138        a.mapv(|x| x.atanh())
4139    }
4140
4141    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4142        // Log-Beta: ln(B(a,b)) = ln(Γ(a)) + ln(Γ(b)) - ln(Γ(a+b))
4143        // Leverages SIMD-accelerated ln_gamma
4144        let ln_gamma_a = Self::simd_ln_gamma(a);
4145        let ln_gamma_b = Self::simd_ln_gamma(b);
4146        let a_plus_b = Self::simd_add(a, b);
4147        let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
4148        Self::simd_sub(
4149            &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
4150            &ln_gamma_ab.view(),
4151        )
4152    }
4153
4154    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4155        // Beta: B(a,b) = exp(ln(B(a,b)))
4156        // Uses log-beta for numerical stability
4157        let ln_beta = Self::simd_ln_beta(a, b);
4158        Self::simd_exp(&ln_beta.view())
4159    }
4160
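    /// Illustrative check (an editorial sketch) of `lerp(a, b, t) = a + t * (b - a)`:
    /// ```ignore
    /// use ndarray::array;
    /// let a = array![0.0_f64, 0.0];
    /// let b = array![10.0_f64, 20.0];
    /// assert_eq!(f64::simd_lerp(&a.view(), &b.view(), 0.25), array![2.5, 5.0]);
    /// ```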
4161    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
4162        // Linear interpolation: lerp(a, b, t) = a + t * (b - a)
4163        // This formula is more numerically stable when t is close to 0
4164        if a.is_empty() || b.is_empty() {
4165            return Array1::zeros(0);
4166        }
4167        let diff = Self::simd_sub(b, a);
4168        let scaled = Self::simd_scalar_mul(&diff.view(), t);
4169        Self::simd_add(a, &scaled.view())
4170    }
4171
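    /// Illustrative check (an editorial sketch): inputs at or below `edge0` map
    /// to 0, at or above `edge1` map to 1, and the midpoint maps to 0.5.
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![-1.0_f64, 0.5, 2.0];
    /// assert_eq!(f64::simd_smoothstep(0.0, 1.0, &x.view()), array![0.0, 0.5, 1.0]);
    /// ```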
4172    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
4173        // Smoothstep: smooth Hermite interpolation
4174        // t = clamp((x - edge0) / (edge1 - edge0), 0, 1)
4175        // result = t * t * (3 - 2 * t)
4176        if x.is_empty() {
4177            return Array1::zeros(0);
4178        }
4179        let range = edge1 - edge0;
4180        if range.abs() < Self::EPSILON {
4181            // Degenerate case: edge0 == edge1
4182            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
4183        }
4184        x.mapv(|xi| {
4185            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
4186            t * t * (3.0 - 2.0 * t)
4187        })
4188    }
4189
4190    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
4191        // Hypotenuse: sqrt(x² + y²) with overflow/underflow protection
4192        // Uses Rust's built-in hypot which handles extreme values correctly
4193        if x.is_empty() || y.is_empty() {
4194            return Array1::zeros(0);
4195        }
4196        let len = x.len().min(y.len());
4197        Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
4198    }
4199
4200    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
4201        // Returns magnitude of x with sign of y
4202        if x.is_empty() || y.is_empty() {
4203            return Array1::zeros(0);
4204        }
4205        let len = x.len().min(y.len());
4206        Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
4207    }
4208
4209    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
4210        // Ken Perlin's smootherstep: 6t⁵ - 15t⁴ + 10t³
4211        // Has zero first AND second derivatives at boundaries
4212        if x.is_empty() {
4213            return Array1::zeros(0);
4214        }
4215        let range = edge1 - edge0;
4216        if range.abs() < Self::EPSILON {
4217            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
4218        }
4219        x.mapv(|xi| {
4220            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
4221            // 6t⁵ - 15t⁴ + 10t³ = t³(6t² - 15t + 10)
4222            let t3 = t * t * t;
4223            t3 * (t * (t * 6.0 - 15.0) + 10.0)
4224        })
4225    }
4226
4227    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
4228        // Numerically stable log(exp(a) + exp(b))
4229        // Formula: max(a,b) + log(1 + exp(-|a-b|))
4230        if a.is_empty() || b.is_empty() {
4231            return Array1::zeros(0);
4232        }
4233        let len = a.len().min(b.len());
4234        Array1::from_iter((0..len).map(|i| {
4235            let ai = a[i];
4236            let bi = b[i];
4237            let max_val = ai.max(bi);
4238            let diff = (ai - bi).abs();
4239            // For very large differences, the smaller term is negligible
4240            if diff > 50.0 {
4241                max_val
4242            } else {
4243                max_val + (1.0 + (-diff).exp()).ln()
4244            }
4245        }))
4246    }
4247
4248    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
4249        // Logit: log(p / (1-p)) = log(p) - log(1-p)
4250        // Domain: (0, 1), Range: (-∞, +∞)
4251        if a.is_empty() {
4252            return Array1::zeros(0);
4253        }
4254        a.mapv(|p| {
4255            // Handle edge cases
4256            if p <= 0.0 {
4257                Self::NEG_INFINITY
4258            } else if p >= 1.0 {
4259                Self::INFINITY
4260            } else {
4261                (p / (1.0 - p)).ln()
4262            }
4263        })
4264    }
4265
4266    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
4267        // Element-wise x²
4268        if a.is_empty() {
4269            return Array1::zeros(0);
4270        }
4271        a.mapv(|x| x * x)
4272    }
4273
4274    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
4275        // Inverse square root: 1/sqrt(x)
4276        if a.is_empty() {
4277            return Array1::zeros(0);
4278        }
4279        a.mapv(|x| {
4280            if x <= 0.0 {
4281                if x == 0.0 {
4282                    Self::INFINITY
4283                } else {
4284                    Self::NAN
4285                }
4286            } else {
4287                1.0 / x.sqrt()
4288            }
4289        })
4290    }
4291
4292    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
4293        // Simultaneous sin and cos
4294        if a.is_empty() {
4295            return (Array1::zeros(0), Array1::zeros(0));
4296        }
4297        let sin_result = a.mapv(|x| x.sin());
4298        let cos_result = a.mapv(|x| x.cos());
4299        (sin_result, cos_result)
4300    }
4301
4302    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
4303        // Numerically stable exp(x) - 1
4304        // Uses std library's expm1 which handles small x accurately
4305        if a.is_empty() {
4306            return Array1::zeros(0);
4307        }
4308        a.mapv(|x| x.exp_m1())
4309    }
4310
4311    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
4312        // Numerically stable ln(1 + x)
4313        // Uses std library's ln_1p which handles small x accurately
4314        if a.is_empty() {
4315            return Array1::zeros(0);
4316        }
4317        a.mapv(|x| x.ln_1p())
4318    }
4319}
4320
4321/// Platform capability detection
4322#[derive(Debug, Clone, Copy)]
4323pub struct PlatformCapabilities {
4324    pub simd_available: bool,
4325    pub gpu_available: bool,
4326    pub cuda_available: bool,
4327    pub opencl_available: bool,
4328    pub metal_available: bool,
4329    pub avx2_available: bool,
4330    pub avx512_available: bool,
4331    pub neon_available: bool,
4332}
4333
4334impl PlatformCapabilities {
4335    /// Detect platform capabilities. Note: these flags come from compile-time `cfg!` checks (crate features and target flags), not runtime CPU probing.
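    ///
    /// Illustrative usage (an editorial sketch):
    /// ```ignore
    /// let caps = PlatformCapabilities::detect();
    /// println!("{}", caps.summary());
    /// ```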
4336    pub fn detect() -> Self {
4337        Self {
4338            simd_available: cfg!(feature = "simd"),
4339            gpu_available: cfg!(feature = "gpu"),
4340            cuda_available: cfg!(all(feature = "gpu", feature = "cuda")),
4341            opencl_available: cfg!(all(feature = "gpu", feature = "opencl")),
4342            metal_available: cfg!(all(feature = "gpu", feature = "metal", target_os = "macos")),
4343            avx2_available: cfg!(target_feature = "avx2"),
4344            avx512_available: cfg!(target_feature = "avx512f"),
4345            neon_available: cfg!(target_arch = "aarch64"),
4346        }
4347    }
4348
4349    /// Get a summary of available acceleration features
4350    pub fn summary(&self) -> String {
4351        let mut features = Vec::new();
4352
4353        if self.simd_available {
4354            features.push("SIMD");
4355        }
4356        if self.gpu_available {
4357            features.push("GPU");
4358        }
4359        if self.cuda_available {
4360            features.push("CUDA");
4361        }
4362        if self.opencl_available {
4363            features.push("OpenCL");
4364        }
4365        if self.metal_available {
4366            features.push("Metal");
4367        }
4368        if self.avx2_available {
4369            features.push("AVX2");
4370        }
4371        if self.avx512_available {
4372            features.push("AVX512");
4373        }
4374        if self.neon_available {
4375            features.push("NEON");
4376        }
4377
4378        if features.is_empty() {
4379            "No acceleration features available".to_string()
4380        } else {
4381            format!(
4382                "Available acceleration: {features}",
4383                features = features.join(", ")
4384            )
4385        }
4386    }
4387
4388    /// Check if AVX2 is available
4389    pub fn has_avx2(&self) -> bool {
4390        self.avx2_available
4391    }
4392
4393    /// Check if AVX512 is available
4394    pub fn has_avx512(&self) -> bool {
4395        self.avx512_available
4396    }
4397
4398    /// Check if SSE-level SIMD is available (approximation: any SIMD capability, including NEON and AVX2, counts)
4399    pub fn has_sse(&self) -> bool {
4400        self.simd_available || self.neon_available || self.avx2_available
4401    }
4402
4403    /// Get the number of CPU cores
4404    pub fn num_cores(&self) -> usize {
4405        std::thread::available_parallelism()
4406            .map(|n| n.get())
4407            .unwrap_or(1)
4408    }
4409
4410    /// Get the cache line size in bytes
4411    pub fn cache_line_size(&self) -> usize {
4412        // Common cache line size on most modern processors.
4413        // x86/x64: typically 64 bytes.
4414        // ARM: commonly 64 bytes, though Apple Silicon uses 128-byte lines.
4415        64
4416    }
4417}
4418
4419/// Automatic operation selection based on problem size and available features
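///
/// Illustrative usage (an editorial sketch):
/// ```ignore
/// let opt = AutoOptimizer::new();
/// // Returns a backend name such as "SIMD", "Metal", or "Scalar".
/// println!("gemm: {}", opt.select_gemm_impl(256, 256, 256));
/// println!("dot:  {}", opt.select_vector_impl(1 << 20));
/// ```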
4420pub struct AutoOptimizer {
4421    capabilities: PlatformCapabilities,
4422}
4423
4424impl AutoOptimizer {
4425    pub fn new() -> Self {
4426        Self {
4427            capabilities: PlatformCapabilities::detect(),
4428        }
4429    }
4430
4431    /// Determine if GPU should be used for a given problem size
4432    pub fn should_use_gpu(&self, size: usize) -> bool {
4433        // Use GPU for large problems when available
4434        self.capabilities.gpu_available && size > 10000
4435    }
4436
4437    /// Determine if Metal should be used on macOS
4438    pub fn should_use_metal(&self, size: usize) -> bool {
4439        // Use Metal for medium to large problems on macOS
4440        // Metal has lower overhead than CUDA/OpenCL, so we can use it for smaller problems
4441        self.capabilities.metal_available && size > 1024
4442    }
4443
4444    /// Determine if SIMD should be used
4445    pub fn should_use_simd(&self, size: usize) -> bool {
4446        // Use SIMD for medium to large problems
4447        self.capabilities.simd_available && size > 64
4448    }
4449
4450    /// Select the best implementation for matrix multiplication
4451    pub fn select_gemm_impl(&self, m: usize, n: usize, k: usize) -> &'static str {
4452        let total_ops = m * n * k;
4453
4454        // Metal-specific heuristics for macOS
4455        if self.capabilities.metal_available {
4456            // For Apple Silicon with unified memory, Metal is efficient even for smaller matrices
4457            if total_ops > 8192 {
4458                // 16x16x32 or larger
4459                return "Metal";
4460            }
4461        }
4462
4463        if self.should_use_gpu(total_ops) {
4464            if self.capabilities.cuda_available {
4465                "CUDA"
4466            } else if self.capabilities.metal_available {
4467                "Metal"
4468            } else if self.capabilities.opencl_available {
4469                "OpenCL"
4470            } else {
4471                "GPU"
4472            }
4473        } else if self.should_use_simd(total_ops) {
4474            "SIMD"
4475        } else {
4476            "Scalar"
4477        }
4478    }
4479
4480    /// Select the best implementation for vector operations
4481    pub fn select_vector_impl(&self, size: usize) -> &'static str {
4482        // Metal is efficient for vector operations on Apple Silicon
4483        if self.capabilities.metal_available && size > 1024 {
4484            return "Metal";
4485        }
4486
4487        if self.should_use_gpu(size) {
4488            if self.capabilities.cuda_available {
4489                "CUDA"
4490            } else if self.capabilities.metal_available {
4491                "Metal"
4492            } else if self.capabilities.opencl_available {
4493                "OpenCL"
4494            } else {
4495                "GPU"
4496            }
4497        } else if self.should_use_simd(size) {
4498            if self.capabilities.avx512_available {
4499                "AVX512"
4500            } else if self.capabilities.avx2_available {
4501                "AVX2"
4502            } else if self.capabilities.neon_available {
4503                "NEON"
4504            } else {
4505                "SIMD"
4506            }
4507        } else {
4508            "Scalar"
4509        }
4510    }
4511
4512    /// Select the best implementation for reduction operations
4513    pub fn select_reduction_impl(&self, size: usize) -> &'static str {
4514        // Reductions benefit from GPU parallelism at larger sizes
4515        // Metal has efficient reduction primitives
4516        if self.capabilities.metal_available && size > 4096 {
4517            return "Metal";
4518        }
4519
4520        if self.should_use_gpu(size * 2) {
4521            // Higher threshold for reductions
4522            if self.capabilities.cuda_available {
4523                "CUDA"
4524            } else if self.capabilities.metal_available {
4525                "Metal"
4526            } else {
4527                "GPU"
4528            }
4529        } else if self.should_use_simd(size) {
4530            "SIMD"
4531        } else {
4532            "Scalar"
4533        }
4534    }
4535
4536    /// Select the best implementation for FFT operations
4537    pub fn select_fft_impl(&self, size: usize) -> &'static str {
4538        // FFT benefits greatly from GPU acceleration
4539        // Metal Performance Shaders has optimized FFT
4540        if self.capabilities.metal_available && size > 512 {
4541            return "Metal-MPS";
4542        }
4543
4544        if self.capabilities.cuda_available && size > 1024 {
4545            "cuFFT"
4546        } else if self.should_use_simd(size) {
4547            "SIMD"
4548        } else {
4549            "Scalar"
4550        }
4551    }
4552
4553    /// Check if running on Apple Silicon with unified memory
4554    pub fn has_unified_memory(&self) -> bool {
4555        cfg!(all(target_os = "macos", target_arch = "aarch64"))
4556    }
4557
4558    /// Get optimization recommendation for a specific operation
4559    pub fn recommend(&self, operation: &str, size: usize) -> String {
4560        let recommendation = match operation {
4561            "gemm" | "matmul" => self.select_gemm_impl(size, size, size),
4562            "vector" | "axpy" | "dot" => self.select_vector_impl(size),
4563            "reduction" | "sum" | "mean" => self.select_reduction_impl(size),
4564            "fft" => self.select_fft_impl(size),
4565            _ => "Scalar",
4566        };
4567
4568        if self.has_unified_memory() && recommendation == "Metal" {
4569            format!("{recommendation} (Unified Memory)")
4570        } else {
4571            recommendation.to_string()
4572        }
4573    }
4574}
4575
4576impl Default for AutoOptimizer {
4577    fn default() -> Self {
4578        Self::new()
4579    }
4580}
4581
4582/// Standalone ultra-optimized dot product function for f32
4583pub fn simd_dot_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
4584    f32::simd_dot_f32_ultra(a, b)
4585}
4586
4587/// Standalone ultra-optimized FMA function for f32
4588pub fn simd_fma_f32_ultra(
4589    a: &ArrayView1<f32>,
4590    b: &ArrayView1<f32>,
4591    c: &ArrayView1<f32>,
4592    result: &mut ArrayViewMut1<f32>,
4593) {
4594    f32::simd_fma_f32_ultra(a, b, c, result)
4595}
4596
4597/// Additional standalone convenience wrappers over the trait implementations
4598pub fn simd_add_f32_adaptive(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
4599    f32::simd_add_adaptive(a, b)
4600}
4601
4602pub fn simd_mul_f32_hyperoptimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
4603    f32::simd_mul(a, b)
4604}
4605
4606// Helper functions for `Vec<T>` compatibility.
4607// These functions accept vectors or slices and convert to `Array1` values
4608// internally before delegating to the trait implementations.
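// Illustrative round trip (an editorial sketch):
//     let a = vec![1.0_f32, 2.0];
//     let b = vec![3.0_f32, 4.0];
//     let mut out = vec![0.0_f32; 2];
//     simd_mul_f32_ultra_vec(&a, &b, &mut out); // out == [3.0, 8.0]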
4609/// Helper function for Vec-based SIMD multiplication
4610pub fn simd_mul_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4611    let a_array = Array1::from_vec(a.to_owned());
4612    let b_array = Array1::from_vec(b.to_owned());
4613    let mut result_array = Array1::from_vec(result.clone());
4614
4615    f32::simd_mul_f32_ultra(
4616        &a_array.view(),
4617        &b_array.view(),
4618        &mut result_array.view_mut(),
4619    );
4620    *result = result_array.to_vec();
4621}
4622
4623/// Helper function for Vec-based SIMD addition
4624pub fn simd_add_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4625    let a_array = Array1::from_vec(a.to_owned());
4626    let b_array = Array1::from_vec(b.to_owned());
4627    let mut result_array = Array1::from_vec(result.clone());
4628
4629    f32::simd_add_f32_ultra(
4630        &a_array.view(),
4631        &b_array.view(),
4632        &mut result_array.view_mut(),
4633    );
4634    *result = result_array.to_vec();
4635}
4636
4637/// Helper function for Vec-based SIMD division
4638pub fn simd_div_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4639    let a_array = Array1::from_vec(a.to_owned());
4640    let b_array = Array1::from_vec(b.to_owned());
4641    let mut result_array = Array1::from_vec(result.clone());
4642
4643    f32::simd_div_f32_ultra(
4644        &a_array.view(),
4645        &b_array.view(),
4646        &mut result_array.view_mut(),
4647    );
4648    *result = result_array.to_vec();
4649}
4650
4651/// Helper function for Vec-based SIMD sine
4652pub fn simd_sin_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4653    let a_array = Array1::from_vec(a.to_owned());
4654    let mut result_array = Array1::from_vec(result.clone());
4655
4656    f32::simd_sin_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4657    *result = result_array.to_vec();
4658}
4659
4660/// Helper function for Vec-based SIMD subtraction
4661pub fn simd_sub_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4662    let a_array = Array1::from_vec(a.to_owned());
4663    let b_array = Array1::from_vec(b.to_owned());
4664    let mut result_array = Array1::from_vec(result.clone());
4665
4666    f32::simd_sub_f32_ultra(
4667        &a_array.view(),
4668        &b_array.view(),
4669        &mut result_array.view_mut(),
4670    );
4671    *result = result_array.to_vec();
4672}
4673
4674/// Helper function for Vec-based SIMD FMA
4675pub fn simd_fma_f32_ultra_vec(a: &[f32], b: &[f32], c: &[f32], result: &mut Vec<f32>) {
4676    let a_array = Array1::from_vec(a.to_owned());
4677    let b_array = Array1::from_vec(b.to_owned());
4678    let c_array = Array1::from_vec(c.to_owned());
4679    let mut result_array = Array1::from_vec(result.clone());
4680
4681    f32::simd_fma_f32_ultra(
4682        &a_array.view(),
4683        &b_array.view(),
4684        &c_array.view(),
4685        &mut result_array.view_mut(),
4686    );
4687    *result = result_array.to_vec();
4688}
4689
4690/// Helper function for Vec-based SIMD power
4691pub fn simd_pow_f32_ultra_vec(a: &[f32], b: &[f32], result: &mut Vec<f32>) {
4692    let a_array = Array1::from_vec(a.to_owned());
4693    let b_array = Array1::from_vec(b.to_owned());
4694    let mut result_array = Array1::from_vec(result.clone());
4695
4696    f32::simd_pow_f32_ultra(
4697        &a_array.view(),
4698        &b_array.view(),
4699        &mut result_array.view_mut(),
4700    );
4701    *result = result_array.to_vec();
4702}
4703
4704/// Helper function for Vec-based SIMD exp
4705pub fn simd_exp_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4706    let a_array = Array1::from_vec(a.to_owned());
4707    let mut result_array = Array1::from_vec(result.clone());
4708
4709    f32::simd_exp_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4710    *result = result_array.to_vec();
4711}
4712
4713/// Helper function for Vec-based SIMD cos
4714pub fn simd_cos_f32_ultra_vec(a: &[f32], result: &mut Vec<f32>) {
4715    let a_array = Array1::from_vec(a.to_owned());
4716    let mut result_array = Array1::from_vec(result.clone());
4717
4718    f32::simd_cos_f32_ultra(&a_array.view(), &mut result_array.view_mut());
4719    *result = result_array.to_vec();
4720}
4721
4722#[cfg(test)]
4723#[path = "simd_ops_tests.rs"]
4724mod tests;