
scirs2_core/simd_ops/functions.rs

//! Auto-generated module
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)

use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
use num_traits::Zero;

/// Unified SIMD operations trait
pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
    /// Element-wise addition
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise subtraction
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise multiplication
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise division
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Dot product
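    ///
    /// A minimal usage sketch (hypothetical: assumes an `f64` implementation
    /// of this trait is in scope):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let a = array![1.0_f64, 2.0, 3.0];
    /// let b = array![4.0_f64, 5.0, 6.0];
    /// assert_eq!(f64::simd_add(&a.view(), &b.view()), array![5.0, 7.0, 9.0]);
    /// assert_eq!(f64::simd_dot(&a.view(), &b.view()), 32.0); // 4 + 10 + 18
    /// ```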
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Matrix-vector multiplication (GEMV)
    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);
    /// Matrix-matrix multiplication (GEMM)
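    ///
    /// A sketch of the conventional BLAS-style update `c = alpha * a * b + beta * c`
    /// that this signature suggests (hypothetical `f64` implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::{array, Array2};
    /// let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
    /// let b = array![[5.0_f64, 6.0], [7.0, 8.0]];
    /// let mut c = Array2::<f64>::zeros((2, 2));
    /// f64::simd_gemm(1.0, &a.view(), &b.view(), 0.0, &mut c);
    /// assert_eq!(c, array![[19.0, 22.0], [43.0, 50.0]]);
    /// ```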
    fn simd_gemm(
        alpha: Self,
        a: &ArrayView2<Self>,
        b: &ArrayView2<Self>,
        beta: Self,
        c: &mut Array2<Self>,
    );
    /// Vector norm (L2)
    fn simd_norm(a: &ArrayView1<Self>) -> Self;
    /// Element-wise maximum
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise minimum
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Scalar multiplication
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;
    /// Sum reduction
    fn simd_sum(a: &ArrayView1<Self>) -> Self;
    /// Mean reduction
    fn simd_mean(a: &ArrayView1<Self>) -> Self;
    /// Find maximum element
    fn simd_max_element(a: &ArrayView1<Self>) -> Self;
    /// Find minimum element
    fn simd_min_element(a: &ArrayView1<Self>) -> Self;
    /// Fused multiply-add: a * b + c
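    ///
    /// A minimal sketch of the element-wise contract `a[i] * b[i] + c[i]`
    /// (hypothetical `f64` implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let a = array![1.0_f64, 2.0];
    /// let b = array![3.0_f64, 4.0];
    /// let c = array![5.0_f64, 6.0];
    /// assert_eq!(f64::simd_fma(&a.view(), &b.view(), &c.view()), array![8.0, 14.0]);
    /// ```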
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;
    /// Enhanced cache-optimized addition for large arrays
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Advanced-optimized fused multiply-add for maximum performance
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self>;
    /// Adaptive SIMD operation that selects the optimal implementation
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Matrix transpose
    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;
    /// Cache-optimized blocked matrix transpose
    /// Uses L1 cache-friendly block sizes for improved memory access patterns.
    /// Expected 3-5x speedup for large matrices (>512x512).
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self>;
    /// Element-wise absolute value
    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise square root
    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise exponential (e^x)
    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise natural logarithm (ln(x))
    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise sine (sin(x))
    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise cosine (cos(x))
    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise tangent (tan(x))
    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic sine (sinh(x))
    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic cosine (cosh(x))
    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic tangent (tanh(x))
    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise floor (largest integer <= x)
    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ceiling (smallest integer >= x)
    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise rounding to nearest integer
    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arctangent (atan(x))
    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arcsine (asin(x))
    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arccosine (acos(x))
    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise two-argument arctangent (atan2(y, x))
    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise base-10 logarithm (log10(x))
    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise base-2 logarithm (log2(x))
    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise clamp (constrain values to [min, max])
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self>;
    /// Element-wise fractional part (x - trunc(x))
    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise truncation (round toward zero)
    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise reciprocal (1/x)
    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise power with scalar exponent (base^exp)
    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self>;
    /// Element-wise power with array exponent (`base[i]^exp[i]`)
    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise power with integer exponent (base^n)
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self>;
    /// Element-wise gamma function Γ(x)
    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise 2^x (base-2 exponential)
    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise cube root (cbrt(x) = x^(1/3))
    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ln(1+x) (numerically stable for small x)
    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise exp(x)-1 (numerically stable for small x)
    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise conversion from degrees to radians (x * π / 180)
    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise conversion from radians to degrees (x * 180 / π)
    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise digamma function ψ(x) = d/dx ln(Γ(x))
    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise trigamma function ψ'(x) = d²/dx² ln(Γ(x))
    /// The second derivative of log-gamma, critical for Fisher information in Bayesian inference.
    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise log-gamma function ln(Γ(x))
    /// More numerically stable than computing gamma(x).ln(); used extensively in statistical distributions.
    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise error function erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
    /// Critical for the normal distribution CDF: Φ(x) = 0.5 * (1 + erf(x/√2))
    /// Properties: erf(0) = 0, erf(∞) = 1, erf(-x) = -erf(x)
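    ///
    /// A sketch of the CDF relation above (hypothetical `f64` implementation
    /// assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![0.0_f64];
    /// let scaled = f64::simd_scalar_mul(&x.view(), 1.0 / 2.0_f64.sqrt());
    /// let erf = f64::simd_erf(&scaled.view());
    /// let phi = 0.5 * (1.0 + erf[0]); // Φ(0) = 0.5
    /// assert!((phi - 0.5).abs() < 1e-9);
    /// ```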
    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise complementary error function erfc(x) = 1 - erf(x)
    /// More numerically stable than computing 1 - erf(x) directly for large x
    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise inverse error function erfinv(y) such that erf(erfinv(y)) = y
    /// Critical for the inverse normal CDF (probit function): Φ⁻¹(p) = √2 * erfinv(2p - 1)
    /// Domain: (-1, 1), Range: (-∞, ∞)
    /// Properties: erfinv(0) = 0, erfinv(-y) = -erfinv(y) (odd function)
    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise inverse complementary error function erfcinv(y) such that erfc(erfcinv(y)) = y
    /// More numerically stable than erfinv(1-y) for y close to 0
    /// Domain: (0, 2), Range: (-∞, ∞)
    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise sigmoid (logistic) function: σ(x) = 1 / (1 + exp(-x))
    /// Critical for neural networks, logistic regression, and probability modeling
    /// Range: (0, 1), σ(0) = 0.5, σ(-∞) = 0, σ(+∞) = 1
    /// Properties: σ(-x) = 1 - σ(x), derivative σ'(x) = σ(x)(1 - σ(x))
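    ///
    /// A minimal sketch of the properties above (hypothetical `f64`
    /// implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![0.0_f64, 2.0, -2.0];
    /// let s = f64::simd_sigmoid(&x.view());
    /// assert!((s[0] - 0.5).abs() < 1e-9);         // σ(0) = 0.5
    /// assert!((s[1] + s[2] - 1.0).abs() < 1e-9);  // σ(-x) = 1 - σ(x)
    /// ```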
    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise GELU (Gaussian Error Linear Unit) activation function
    /// GELU(x) = x * Φ(x) = x * 0.5 * (1 + erf(x / √2))
    /// where Φ(x) is the standard normal CDF
    /// Critical for Transformer models (BERT, GPT, etc.)
    /// Properties: GELU(0) = 0, smooth approximation of ReLU
    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Swish (SiLU, Sigmoid Linear Unit) activation function
    /// Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
    /// Self-gated activation discovered via neural architecture search
    /// Used in EfficientNet, GPT-NeoX, and many modern architectures
    /// Properties: smooth, non-monotonic, self-gating, unbounded above
    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Softplus activation function
    /// Softplus(x) = ln(1 + exp(x))
    /// Smooth approximation of ReLU
    /// Used in probabilistic models, Bayesian deep learning, and smooth counting
    /// Properties: softplus(0) = ln(2) ≈ 0.693, always positive, derivative = sigmoid(x)
    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Mish activation function
    /// Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
    /// Self-regularized non-monotonic activation function
    /// Used in YOLOv4, modern object detection, and neural architectures
    /// Properties: smooth, non-monotonic, Mish(0) = 0, unbounded above
    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ELU (Exponential Linear Unit) activation function
    /// ELU(x, α) = x if x >= 0, α * (exp(x) - 1) if x < 0
    /// Helps with vanishing gradients and faster learning
    /// Used in deep neural networks for smoother outputs
    /// Properties: smooth, continuous derivative, bounded below by -α
    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
    /// SELU activation function (Scaled Exponential Linear Unit)
    ///
    /// SELU(x) = λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
    /// where λ ≈ 1.0507 and α ≈ 1.6733 (fixed constants)
    /// Self-normalizing: preserves mean=0, variance=1 through layers
    /// Used in Self-Normalizing Neural Networks (SNNs)
    /// Eliminates the need for BatchNorm when using LeCun Normal initialization
    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Hardsigmoid activation function
    ///
    /// Hardsigmoid(x) = clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of sigmoid
    /// Used in MobileNetV3 for efficient inference
    /// Avoids expensive exp() computation
    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Hardswish activation function
    ///
    /// Hardswish(x) = x * hardsigmoid(x) = x * clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of Swish
    /// Used in MobileNetV3 for efficient inference
    /// Avoids expensive exp() computation while maintaining self-gating
    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sinc function (normalized)
    ///
    /// sinc(x) = sin(πx) / (πx) for x ≠ 0, sinc(0) = 1
    /// Critical for signal processing, windowing, and interpolation
    /// Properties: sinc(n) = 0 for all non-zero integers n
    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Log-softmax function for numerically stable probability computation
    ///
    /// log_softmax(x_i) = x_i - log(Σ_j exp(x_j))
    /// Critical for neural networks, especially cross-entropy loss
    /// More numerically stable than computing log(softmax(x))
    /// Used in Transformers, LLMs, and classification networks
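    ///
    /// A sketch of the stability claim (hypothetical `f64` implementation
    /// assumed): naive `log(softmax(x))` overflows in `exp` for large inputs,
    /// while log-softmax stays finite.
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![1000.0_f64, 1000.0];
    /// let ls = f64::simd_log_softmax(&x.view());
    /// assert!((ls[0] - (-2.0_f64.ln())).abs() < 1e-9); // both entries = -ln 2
    /// ```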
    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic sine: asinh(x) = ln(x + √(x² + 1))
    ///
    /// Domain: (-∞, +∞), Range: (-∞, +∞)
    /// Used in: hyperbolic geometry, conformal mapping, special relativity (rapidity)
    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic cosine: acosh(x) = ln(x + √(x² - 1))
    ///
    /// Domain: [1, +∞), Range: [0, +∞)
    /// Returns NaN for x < 1
    /// Used in: hyperbolic geometry, distance calculations, special relativity
    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1+x)/(1-x))
    ///
    /// Domain: (-1, 1), Range: (-∞, +∞)
    /// Returns ±∞ at x = ±1, NaN for |x| > 1
    /// Used in: statistical transformations (Fisher's z), probability
    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Beta function: B(a, b) = Γ(a)Γ(b)/Γ(a+b)
    ///
    /// The beta function is fundamental for:
    /// - Beta distribution (Bayesian priors)
    /// - Binomial coefficients: C(n,k) = 1 / ((n+1) * B(n-k+1, k+1))
    /// - Statistical hypothesis testing
    /// - Incomplete beta function (regularized)
    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Log-Beta function: ln(B(a, b)) = ln(Γ(a)) + ln(Γ(b)) - ln(Γ(a+b))
    ///
    /// More numerically stable than computing B(a,b) for large arguments.
    /// Returns ln(B(a,b)) for each pair of inputs.
    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Linear interpolation: lerp(a, b, t) = a + t * (b - a) = a * (1 - t) + b * t
    ///
    /// Computes element-wise linear interpolation between arrays `a` and `b`
    /// using interpolation parameter `t`. When t = 0, returns `a`; when t = 1, returns `b`.
    ///
    /// Critical for:
    /// - Animation blending (skeletal animation, morph targets)
    /// - Quaternion SLERP approximation (for small angles)
    /// - Gradient computation in neural networks
    /// - Smooth parameter transitions
    /// - Color blending and image processing
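    ///
    /// A minimal sketch of the endpoints and midpoint (hypothetical `f64`
    /// implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let a = array![0.0_f64, 10.0];
    /// let b = array![10.0_f64, 20.0];
    /// assert_eq!(f64::simd_lerp(&a.view(), &b.view(), 0.5), array![5.0, 15.0]);
    /// assert_eq!(f64::simd_lerp(&a.view(), &b.view(), 1.0), b);
    /// ```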
    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self>;
    /// Smoothstep interpolation: smoothstep(edge0, edge1, x)
    ///
    /// Returns smooth Hermite interpolation between 0 and 1 when edge0 < x < edge1.
    /// - Returns 0 if x <= edge0
    /// - Returns 1 if x >= edge1
    /// - Returns the smooth curve 3t² - 2t³ where t = (x - edge0) / (edge1 - edge0)
    ///
    /// Critical for:
    /// - Shader programming (lighting, transitions)
    /// - Activation function variants
    /// - Smooth threshold functions
    /// - Anti-aliasing and blending
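    ///
    /// A sketch of the polynomial above (hypothetical `f64` implementation
    /// assumed): at t = 0.5, 3t² - 2t³ = 0.75 - 0.25 = 0.5.
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![-1.0_f64, 0.5, 2.0];
    /// let s = f64::simd_smoothstep(0.0, 1.0, &x.view());
    /// assert_eq!(s, array![0.0, 0.5, 1.0]);
    /// ```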
    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Hypotenuse: hypot(x, y) = sqrt(x² + y²)
    ///
    /// Computes the element-wise hypotenuse without overflow/underflow issues.
    /// Uses the standard library implementation, which handles extreme values.
    ///
    /// Critical for:
    /// - Distance calculations in 2D/3D
    /// - Computing vector magnitudes
    /// - Graphics and physics simulations
    /// - Complex number modulus: |a+bi| = hypot(a, b)
    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
    /// Copysign: copysign(x, y) returns x with the sign of y
    ///
    /// For each element, returns the magnitude of x with the sign of y.
    /// - copysign(1.0, -2.0) = -1.0
    /// - copysign(-3.0, 4.0) = 3.0
    ///
    /// Critical for:
    /// - Sign manipulation in numerical algorithms
    /// - Implementing special functions (e.g., the reflection formula)
    /// - Gradient sign propagation
    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
    /// Smootherstep (Ken Perlin's improved smoothstep): 6t⁵ - 15t⁴ + 10t³
    ///
    /// An improved version of smoothstep with second-order continuous derivatives:
    /// both the first AND second derivatives are zero at the boundaries.
    ///
    /// Critical for:
    /// - Perlin noise and procedural generation
    /// - High-quality animation easing
    /// - Shader programming (better lighting transitions)
    /// - Gradient-based optimization (smoother loss landscapes)
    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Logaddexp: log(exp(a) + exp(b)) computed in a numerically stable way
    ///
    /// Uses the identity log(exp(a) + exp(b)) = max(a,b) + log(1 + exp(-|a-b|)),
    /// which avoids overflow/underflow for large positive or negative values.
    ///
    /// Critical for:
    /// - Log-probability computations (Bayesian inference)
    /// - Log-likelihood calculations in ML
    /// - Hidden Markov Model forward/backward algorithms
    /// - Neural network loss functions (cross-entropy)
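    ///
    /// A sketch of the stability claim (hypothetical `f64` implementation
    /// assumed): exp(1000) overflows in f64, yet the result is representable.
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let a = array![1000.0_f64];
    /// let b = array![1000.0_f64];
    /// let r = f64::simd_logaddexp(&a.view(), &b.view());
    /// assert!((r[0] - (1000.0 + 2.0_f64.ln())).abs() < 1e-9);
    /// ```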
    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Logit function: log(p / (1-p)), the inverse of the sigmoid
    ///
    /// Maps probabilities in (0, 1) to log-odds in (-∞, +∞).
    /// The logit function is the inverse of the sigmoid (logistic) function.
    ///
    /// Critical for:
    /// - Logistic regression (log-odds interpretation)
    /// - Probability calibration
    /// - Converting probabilities to unbounded space for optimization
    /// - Statistical modeling (link functions)
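    ///
    /// A minimal roundtrip sketch (hypothetical `f64` implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![-2.0_f64, 0.0, 2.0];
    /// let p = f64::simd_sigmoid(&x.view());
    /// let back = f64::simd_logit(&p.view());
    /// for (xi, bi) in x.iter().zip(back.iter()) {
    ///     assert!((xi - bi).abs() < 1e-9); // logit(sigmoid(x)) ≈ x
    /// }
    /// ```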
    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise square: x²
    ///
    /// More efficient than simd_pow(x, 2) or simd_mul(x, x), as it is a single multiplication.
    ///
    /// Critical for:
    /// - Variance computation: E\[X²\] - E\[X\]²
    /// - Squared distances: ||a - b||² = Σᵢ (aᵢ - bᵢ)²
    /// - Neural network loss functions (MSE)
    /// - Physics simulations (kinetic energy: ½mv²)
    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse square root: 1/sqrt(x)
    ///
    /// More efficient than simd_div(1, simd_sqrt(x)) for normalization operations.
    ///
    /// Critical for:
    /// - Vector normalization: v * rsqrt(dot(v,v))
    /// - Graphics (lighting, physics simulations)
    /// - Layer normalization in neural networks
    /// - Quaternion normalization
    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Simultaneous sin and cos: returns (sin(x), cos(x))
    ///
    /// More efficient than calling sin and cos separately when both are needed.
    /// Returns a tuple of two arrays.
    ///
    /// Critical for:
    /// - Rotation matrices (2D and 3D)
    /// - Fourier transforms
    /// - Wave simulations
    /// - Animation and physics
    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>);
    /// Numerically stable exp(x) - 1
    ///
    /// Returns exp(x) - 1 accurately for small x values where exp(x) ≈ 1.
    /// For small x, the direct calculation exp(x) - 1 suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Financial calculations (compound interest for small rates)
    /// - Numerical integration of differential equations
    /// - Statistical distributions (Poisson, exponential)
    /// - Machine learning (softplus, log-sum-exp)
    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Numerically stable ln(1 + x)
    ///
    /// Returns ln(1 + x) accurately for small x values where 1 + x ≈ 1.
    /// For small x, the direct calculation ln(1 + x) suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Log-probability calculations (log(1 - p) for small p)
    /// - Numerical integration
    /// - Statistical distributions
    /// - Machine learning (binary cross-entropy loss)
    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sum of squares
    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;
    /// Element-wise multiplication (alias for simd_mul)
    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Check if SIMD is available for this type
    fn simd_available() -> bool;
    /// Ultra-optimized sum reduction (forwards to simd_sum; kept for compatibility)
    fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
        Self::simd_sum(a)
    }
    /// Ultra-optimized subtraction (buffer-writing counterpart of simd_sub, kept for compatibility)
    fn simd_sub_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized multiplication (buffer-writing counterpart of simd_mul, kept for compatibility)
    fn simd_mul_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized sum of cubes (Σ xᵢ³)
    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;
    /// Ultra-optimized division (buffer-writing counterpart of simd_div, kept for compatibility)
    fn simd_div_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized sine function
    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized addition (buffer-writing counterpart of simd_add, kept for compatibility)
    fn simd_add_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized fused multiply-add
    fn simd_fma_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized power function
    fn simd_pow_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized exponential function
    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized cosine function
    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized dot product
    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Variance (population variance)
    fn simd_variance(a: &ArrayView1<Self>) -> Self;
    /// Standard deviation
    fn simd_std(a: &ArrayView1<Self>) -> Self;
    /// L1 norm (Manhattan norm)
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self;
    /// L∞ norm (Chebyshev norm / maximum absolute value)
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self;
    /// Cosine similarity between two vectors
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Euclidean distance between two vectors
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Manhattan distance between two vectors
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Chebyshev distance between two vectors
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Cosine distance (1 - cosine_similarity)
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Weighted sum
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
    /// Weighted mean
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
    /// Find index of minimum element (argmin)
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize>;
    /// Find index of maximum element (argmax)
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize>;
    /// Clip values to the [min_val, max_val] range
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self>;
    /// Log-sum-exp for numerically stable softmax computation
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self;
    /// Softmax for probability distributions: softmax(x) = exp(x - log_sum_exp(x))
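    ///
    /// A minimal sketch of the relation above (hypothetical `f64`
    /// implementation assumed):
    ///
    /// ```ignore
    /// use ndarray::array;
    /// let x = array![1.0_f64, 2.0, 3.0];
    /// let p = f64::simd_softmax(&x.view());
    /// assert!((f64::simd_sum(&p.view()) - 1.0).abs() < 1e-9); // sums to 1
    /// let lse = f64::simd_log_sum_exp(&x.view());
    /// assert!((p[2] - (3.0 - lse).exp()).abs() < 1e-9);
    /// ```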
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Cumulative sum
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Cumulative product
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self>;
    /// First-order difference (`a[i+1] - a[i]`)
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sign function: returns -1 for negative, 0 for zero, +1 for positive
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self>;
    /// ReLU activation: max(0, x)
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Leaky ReLU: x if x > 0, else alpha * x
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
    /// L2 normalization (unit vector)
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Standardization: (x - mean) / std
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self>;

    // ============================================================================
    // ZERO-ALLOCATION SIMD OPERATIONS (Phase 1: ToRSh SIMD Performance Fix)
    // ============================================================================
    // These methods write directly to pre-allocated output buffers, eliminating
    // all intermediate allocations. Critical for achieving SIMD speedup in ToRSh.
    //
    // Design rationale:
    // - Use raw slices (&[Self], &mut [Self]) instead of ArrayView for maximum efficiency
    // - No intermediate Array1 allocation; writes directly to the output
    // - Enables ToRSh to reduce from 4 allocations to 1 per operation
    // ============================================================================

    /// Zero-allocation element-wise addition: output = a + b
    ///
    /// Writes SIMD addition results directly to a pre-allocated output buffer.
    /// This is the core operation for achieving SIMD speedup without allocation overhead.
    ///
    /// # Panics
    /// Panics if `a`, `b`, and `output` do not all have the same length.
    ///
    /// # Performance
    /// - x86_64 AVX2: processes 8 elements per cycle
    /// - ARM64 NEON: processes 4 elements per cycle
    /// - Expected 2-4x speedup over scalar for large arrays (>1000 elements)
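    ///
    /// A minimal sketch with a caller-owned buffer (hypothetical `f64`
    /// implementation assumed):
    ///
    /// ```ignore
    /// let a = [1.0_f64, 2.0, 3.0];
    /// let b = [4.0_f64, 5.0, 6.0];
    /// let mut out = [0.0_f64; 3]; // allocated once, reusable across calls
    /// f64::simd_add_into(&a, &b, &mut out);
    /// assert_eq!(out, [5.0, 7.0, 9.0]);
    /// ```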
    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise subtraction: output = a - b
    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise multiplication: output = a * b
    ///
    /// Writes SIMD multiplication results directly to a pre-allocated output buffer.
    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise division: output = a / b
    fn simd_div_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// In-place element-wise addition: a += b
    ///
    /// Modifies `a` in place, adding the corresponding elements of `b`.
    /// Zero allocations, zero copies: a pure SIMD operation.
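    ///
    /// A minimal sketch (hypothetical `f64` implementation assumed):
    ///
    /// ```ignore
    /// let mut a = [1.0_f64, 2.0];
    /// let b = [10.0_f64, 20.0];
    /// f64::simd_add_inplace(&mut a, &b);
    /// assert_eq!(a, [11.0, 22.0]);
    /// ```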
    fn simd_add_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise subtraction: a -= b
    fn simd_sub_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise multiplication: a *= b
    fn simd_mul_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise division: a /= b
    fn simd_div_inplace(a: &mut [Self], b: &[Self]);

    /// In-place scalar addition: a += scalar
    fn simd_add_scalar_inplace(a: &mut [Self], scalar: Self);

    /// In-place scalar multiplication: a *= scalar
    fn simd_mul_scalar_inplace(a: &mut [Self], scalar: Self);

    /// Zero-allocation fused multiply-add: output = a * b + c
    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]);
}