scirs2_core/simd_ops/functions.rs
//! Auto-generated module
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)

use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
use num_traits::Zero;

/// Unified SIMD operations trait
pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
    /// Element-wise addition
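    ///
    /// # Example
    ///
    /// A minimal usage sketch, assuming this trait is in scope and implemented
    /// for `f32` (the concrete impls live outside this auto-generated module):
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let a = array![1.0f32, 2.0, 3.0];
    /// let b = array![4.0f32, 5.0, 6.0];
    /// // Element-wise sum: [5.0, 7.0, 9.0]
    /// let sum = f32::simd_add(&a.view(), &b.view());
    /// ```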
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise subtraction
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise multiplication
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise division
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Dot product
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Matrix-vector multiplication (GEMV): y = A·x + beta·y
    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);
    /// Matrix-matrix multiplication (GEMM): C = alpha·A·B + beta·C
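    ///
    /// # Example
    ///
    /// A sketch of the BLAS-style update, assuming an `f32` implementation of
    /// this trait is in scope:
    ///
    /// ```ignore
    /// use ndarray::{array, Array2};
    ///
    /// let a = array![[1.0f32, 2.0], [3.0, 4.0]];
    /// let b = array![[5.0f32, 6.0], [7.0, 8.0]];
    /// let mut c = Array2::<f32>::zeros((2, 2));
    /// // c = 1.0 * (a × b) + 0.0 * c = [[19, 22], [43, 50]]
    /// f32::simd_gemm(1.0, &a.view(), &b.view(), 0.0, &mut c);
    /// ```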
    fn simd_gemm(
        alpha: Self,
        a: &ArrayView2<Self>,
        b: &ArrayView2<Self>,
        beta: Self,
        c: &mut Array2<Self>,
    );
    /// Vector norm (L2)
    fn simd_norm(a: &ArrayView1<Self>) -> Self;
    /// Element-wise maximum
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise minimum
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Scalar multiplication
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;
    /// Sum reduction
    fn simd_sum(a: &ArrayView1<Self>) -> Self;
    /// Mean reduction
    fn simd_mean(a: &ArrayView1<Self>) -> Self;
    /// Find maximum element
    fn simd_max_element(a: &ArrayView1<Self>) -> Self;
    /// Find minimum element
    fn simd_min_element(a: &ArrayView1<Self>) -> Self;
    /// Fused multiply-add: a * b + c
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;
    /// Enhanced cache-optimized addition for large arrays
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Advanced-optimized fused multiply-add for maximum performance
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self>;
    /// Adaptive SIMD operation that selects the optimal implementation
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Matrix transpose
    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;
    /// Cache-optimized blocked matrix transpose
    ///
    /// Uses L1 cache-friendly block sizes for improved memory access patterns.
    /// Expected 3-5x speedup for large matrices (>512x512).
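    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::Array2;
    ///
    /// let a = Array2::<f32>::from_shape_fn((1024, 1024), |(i, j)| (i * 1024 + j) as f32);
    /// let at = f32::simd_transpose_blocked(&a.view());
    /// assert_eq!(at[(3, 7)], a[(7, 3)]);
    /// ```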
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self>;
    /// Element-wise absolute value
    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise square root
    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise exponential (e^x)
    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise natural logarithm (ln(x))
    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise sine (sin(x))
    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise cosine (cos(x))
    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise tangent (tan(x))
    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic sine (sinh(x))
    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic cosine (cosh(x))
    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise hyperbolic tangent (tanh(x))
    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise floor (largest integer <= x)
    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ceiling (smallest integer >= x)
    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise rounding to nearest integer
    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arctangent (atan(x))
    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arcsine (asin(x))
    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise arccosine (acos(x))
    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise two-argument arctangent (atan2(y, x))
    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise base-10 logarithm (log10(x))
    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise base-2 logarithm (log2(x))
    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise clamp (constrain values to [min, max])
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self>;
    /// Element-wise fractional part (x - trunc(x))
    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise truncation (round toward zero)
    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise reciprocal (1/x)
    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise power with scalar exponent (base^exp)
    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self>;
    /// Element-wise power with array exponent (`base[i]^exp[i]`)
    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise power with integer exponent (base^n)
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self>;
    /// Element-wise gamma function Γ(x)
    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise 2^x (base-2 exponential)
    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise cube root (cbrt(x) = x^(1/3))
    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ln(1+x) (numerically stable for small x)
    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise exp(x)-1 (numerically stable for small x)
    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise conversion from degrees to radians (x * π / 180)
    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise conversion from radians to degrees (x * 180 / π)
    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise digamma function ψ(x) = d/dx ln(Γ(x))
    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise trigamma function ψ'(x) = d²/dx² ln(Γ(x))
    ///
    /// The second derivative of log-gamma, critical for Fisher information in Bayesian inference.
    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise log-gamma function ln(Γ(x))
    ///
    /// More numerically stable than computing gamma(x).ln(); used extensively in statistical distributions.
    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise error function erf(x) = (2/√π) ∫₀ˣ e^(-t²) dt
    ///
    /// Critical for the normal distribution CDF: Φ(x) = 0.5 * (1 + erf(x/√2))
    /// Properties: erf(0)=0, erf(∞)=1, erf(-x)=-erf(x)
    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise complementary error function erfc(x) = 1 - erf(x)
    ///
    /// More numerically stable than computing 1 - erf(x) directly for large x
    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise inverse error function erfinv(y) such that erf(erfinv(y)) = y
    ///
    /// Critical for the inverse normal CDF (probit function): Φ⁻¹(p) = √2 * erfinv(2p - 1)
    /// Domain: (-1, 1), Range: (-∞, ∞)
    /// Properties: erfinv(0)=0, erfinv(-y)=-erfinv(y) (odd function)
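    ///
    /// # Example
    ///
    /// A probit sketch Φ⁻¹(p) = √2 · erfinv(2p - 1), assuming an `f64`
    /// implementation of this trait is in scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let p = array![0.025f64, 0.5, 0.975];
    /// // Map probabilities into erfinv's domain (-1, 1): 2p - 1
    /// let arg = f64::simd_scalar_mul(&p.view(), 2.0) - 1.0;
    /// let e = f64::simd_erfinv(&arg.view());
    /// // Standard normal quantiles: z ≈ [-1.96, 0.0, 1.96]
    /// let z = f64::simd_scalar_mul(&e.view(), std::f64::consts::SQRT_2);
    /// ```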
    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise inverse complementary error function erfcinv(y) such that erfc(erfcinv(y)) = y
    ///
    /// More numerically stable than erfinv(1-y) for y close to 0
    /// Domain: (0, 2), Range: (-∞, ∞)
    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise sigmoid (logistic) function: σ(x) = 1 / (1 + exp(-x))
    ///
    /// Critical for neural networks, logistic regression, and probability modeling
    /// Range: (0, 1), σ(0) = 0.5, σ(-∞) = 0, σ(+∞) = 1
    /// Properties: σ(-x) = 1 - σ(x), derivative σ'(x) = σ(x)(1 - σ(x))
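    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let x = array![-2.0f32, 0.0, 2.0];
    /// // σ(x) ≈ [0.119, 0.5, 0.881]
    /// let s = f32::simd_sigmoid(&x.view());
    /// ```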
    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise GELU (Gaussian Error Linear Unit) activation function
    ///
    /// GELU(x) = x * Φ(x) = x * 0.5 * (1 + erf(x / √2)),
    /// where Φ(x) is the standard normal CDF.
    /// Critical for Transformer models (BERT, GPT, etc.)
    /// Properties: GELU(0) = 0, smooth approximation of ReLU
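    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let x = array![-1.0f32, 0.0, 1.0];
    /// // GELU(x) = x · Φ(x) ≈ [-0.159, 0.0, 0.841]
    /// let g = f32::simd_gelu(&x.view());
    /// ```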
    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Swish (SiLU, Sigmoid Linear Unit) activation function
    ///
    /// Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
    /// Self-gated activation discovered via neural architecture search.
    /// Used in EfficientNet, GPT-NeoX, and many modern architectures.
    /// Properties: smooth, non-monotonic, self-gating, unbounded above
    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Softplus activation function
    ///
    /// Softplus(x) = ln(1 + exp(x))
    /// Smooth approximation of ReLU.
    /// Used in probabilistic models, Bayesian deep learning, and smooth counting.
    /// Properties: softplus(0) = ln(2) ≈ 0.693, always positive, derivative = sigmoid(x)
    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise Mish activation function
    ///
    /// Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
    /// Self-regularized, non-monotonic activation function.
    /// Used in YOLOv4, modern object detection, and neural architectures.
    /// Properties: smooth, non-monotonic, Mish(0) = 0, unbounded above
    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise ELU (Exponential Linear Unit) activation function
    ///
    /// ELU(x, α) = x if x >= 0, α * (exp(x) - 1) if x < 0
    /// Helps with vanishing gradients and faster learning.
    /// Used in deep neural networks for smoother outputs.
    /// Properties: smooth, continuous derivative, bounded below by -α
    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
    /// SELU activation function (Scaled Exponential Linear Unit)
    ///
    /// SELU(x) = λ * (x if x > 0, α * (exp(x) - 1) if x <= 0)
    /// where λ ≈ 1.0507 and α ≈ 1.6733 (fixed constants).
    /// Self-normalizing: preserves mean=0, variance=1 through layers.
    /// Used in Self-Normalizing Neural Networks (SNNs).
    /// Eliminates the need for BatchNorm when combined with LeCun Normal initialization.
    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Hardsigmoid activation function
    ///
    /// Hardsigmoid(x) = clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of sigmoid.
    /// Used in MobileNetV3 for efficient inference.
    /// Avoids the expensive exp() computation.
    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Hardswish activation function
    ///
    /// Hardswish(x) = x * hardsigmoid(x) = x * clip((x + 3) / 6, 0, 1)
    /// Piecewise linear approximation of Swish.
    /// Used in MobileNetV3 for efficient inference.
    /// Avoids the expensive exp() computation while maintaining self-gating.
    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sinc function (normalized)
    ///
    /// sinc(x) = sin(πx) / (πx) for x ≠ 0, sinc(0) = 1
    /// Critical for signal processing, windowing, and interpolation.
    /// Properties: sinc(n) = 0 for all non-zero integers n
    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Log-softmax function for numerically stable probability computation
    ///
    /// log_softmax(x_i) = x_i - log(Σ_j exp(x_j))
    /// Critical for neural networks, especially cross-entropy loss.
    /// More numerically stable than computing log(softmax(x)).
    /// Used in Transformers, LLMs, and classification networks.
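    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let logits = array![1.0f32, 2.0, 3.0];
    /// // x - log(Σ exp(x)) ≈ [-2.408, -1.408, -0.408]; exp of these sums to 1
    /// let lsm = f32::simd_log_softmax(&logits.view());
    /// ```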
    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic sine: asinh(x) = ln(x + √(x² + 1))
    ///
    /// Domain: (-∞, +∞), Range: (-∞, +∞)
    /// Used in: hyperbolic geometry, conformal mapping, special relativity (rapidity)
    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic cosine: acosh(x) = ln(x + √(x² - 1))
    ///
    /// Domain: [1, +∞), Range: [0, +∞)
    /// Returns NaN for x < 1
    /// Used in: hyperbolic geometry, distance calculations, special relativity
    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1+x)/(1-x))
    ///
    /// Domain: (-1, 1), Range: (-∞, +∞)
    /// Returns ±∞ at x = ±1, NaN for |x| > 1
    /// Used in: statistical transformations (Fisher's z), probability
    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Beta function: B(a, b) = Γ(a)Γ(b)/Γ(a+b)
    ///
    /// The beta function is fundamental for:
    /// - Beta distribution (Bayesian priors)
    /// - Binomial coefficients: C(n,k) = 1 / ((n+1) · B(n-k+1, k+1))
    /// - Statistical hypothesis testing
    /// - Incomplete beta function (regularized)
    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Log-Beta function: ln(B(a, b)) = ln(Γ(a)) + ln(Γ(b)) - ln(Γ(a+b))
    ///
    /// More numerically stable than computing B(a,b) for large arguments.
    /// Returns ln(B(a,b)) for each pair of inputs.
    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Linear interpolation: lerp(a, b, t) = a + t * (b - a) = a * (1 - t) + b * t
    ///
    /// Computes element-wise linear interpolation between arrays `a` and `b`
    /// using interpolation parameter `t`. When t=0, returns a; when t=1, returns b.
    ///
    /// Critical for:
    /// - Animation blending (skeletal animation, morph targets)
    /// - Quaternion SLERP approximation (for small angles)
    /// - Gradient computation in neural networks
    /// - Smooth parameter transitions
    /// - Color blending and image processing
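    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let a = array![0.0f32, 10.0];
    /// let b = array![1.0f32, 20.0];
    /// // Halfway between a and b: [0.5, 15.0]
    /// let mid = f32::simd_lerp(&a.view(), &b.view(), 0.5);
    /// ```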
    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self>;
    /// Smoothstep interpolation: smoothstep(edge0, edge1, x)
    ///
    /// Returns smooth Hermite interpolation between 0 and 1 when edge0 < x < edge1.
    /// - Returns 0 if x <= edge0
    /// - Returns 1 if x >= edge1
    /// - Returns smooth curve: 3t² - 2t³ where t = (x - edge0) / (edge1 - edge0)
    ///
    /// Critical for:
    /// - Shader programming (lighting, transitions)
    /// - Activation function variants
    /// - Smooth threshold functions
    /// - Anti-aliasing and blending
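    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let x = array![-1.0f32, 0.25, 0.5, 2.0];
    /// // Clamped outside [edge0, edge1]; 3t² - 2t³ in between:
    /// // [0.0, 0.15625, 0.5, 1.0]
    /// let y = f32::simd_smoothstep(0.0, 1.0, &x.view());
    /// ```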
    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Hypotenuse: hypot(x, y) = sqrt(x² + y²)
    ///
    /// Computes the element-wise hypotenuse without overflow/underflow issues.
    /// Uses the standard library implementation, which handles extreme values.
    ///
    /// Critical for:
    /// - Distance calculations in 2D/3D
    /// - Computing vector magnitudes
    /// - Graphics and physics simulations
    /// - Complex number modulus: |a+bi| = hypot(a, b)
    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
    /// Copysign: copysign(x, y) returns x with the sign of y
    ///
    /// For each element, returns the magnitude of x with the sign of y.
    /// - copysign(1.0, -2.0) = -1.0
    /// - copysign(-3.0, 4.0) = 3.0
    ///
    /// Critical for:
    /// - Sign manipulation in numerical algorithms
    /// - Implementing special functions (e.g., reflection formulas)
    /// - Gradient sign propagation
    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self>;
    /// Smootherstep (Ken Perlin's improved smoothstep): 6t⁵ - 15t⁴ + 10t³
    ///
    /// An improved version of smoothstep with second-order continuity:
    /// both the first and second derivatives are zero at the boundaries.
    ///
    /// Critical for:
    /// - Perlin noise and procedural generation
    /// - High-quality animation easing
    /// - Shader programming (better lighting transitions)
    /// - Gradient-based optimization (smoother loss landscapes)
    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self>;
    /// Logaddexp: log(exp(a) + exp(b)) computed in a numerically stable way
    ///
    /// Uses the identity log(exp(a) + exp(b)) = max(a,b) + log(1 + exp(-|a-b|)),
    /// which avoids overflow/underflow for large positive or negative values.
    ///
    /// Critical for:
    /// - Log-probability computations (Bayesian inference)
    /// - Log-likelihood calculations in ML
    /// - Hidden Markov Model forward/backward algorithms
    /// - Neural network loss functions (cross-entropy)
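    ///
    /// # Example
    ///
    /// A sketch of the overflow-safe behaviour, assuming an `f64`
    /// implementation of this trait is in scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let a = array![1000.0f64, 0.0];
    /// let b = array![1000.0f64, 0.0];
    /// // Naive log(exp(1000) + exp(1000)) overflows to +∞;
    /// // the stable form yields [1000 + ln(2), ln(2)]
    /// let r = f64::simd_logaddexp(&a.view(), &b.view());
    /// ```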
    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Logit function: log(p / (1-p)), the inverse of the sigmoid
    ///
    /// Maps probabilities in (0, 1) to log-odds in (-∞, +∞).
    /// The logit function is the inverse of the sigmoid (logistic) function.
    ///
    /// Critical for:
    /// - Logistic regression (log-odds interpretation)
    /// - Probability calibration
    /// - Converting probabilities to unbounded space for optimization
    /// - Statistical modeling (link functions)
    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Element-wise square: x²
    ///
    /// More efficient than simd_powi(a, 2) or simd_mul(a, a): a single
    /// multiplication per element in one pass over the input.
    ///
    /// Critical for:
    /// - Variance computation: E\[X²\] - E\[X\]²
    /// - Distance calculations: ||a - b||² = Σᵢ (aᵢ - bᵢ)²
    /// - Neural network loss functions (MSE)
    /// - Physics simulations (kinetic energy: ½mv²)
    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Inverse square root: 1/sqrt(x)
    ///
    /// More efficient than chaining simd_sqrt and simd_recip for normalization operations.
    ///
    /// Critical for:
    /// - Vector normalization: v * rsqrt(dot(v,v))
    /// - Graphics (lighting, physics simulations)
    /// - Layer normalization in neural networks
    /// - Quaternion normalization
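    ///
    /// # Example
    ///
    /// A vector-normalization sketch, assuming an `f32` implementation of
    /// this trait is in scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let v = array![3.0f32, 4.0];
    /// let sq = array![f32::simd_dot(&v.view(), &v.view())]; // [25.0]
    /// let inv = f32::simd_rsqrt(&sq.view());                // [0.2]
    /// // Unit vector: [0.6, 0.8]
    /// let unit = f32::simd_scalar_mul(&v.view(), inv[0]);
    /// ```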
    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Simultaneous sine and cosine: returns (sin(x), cos(x))
    ///
    /// More efficient than calling sin and cos separately when both are needed.
    /// Returns a tuple of two arrays.
    ///
    /// Critical for:
    /// - Rotation matrices (2D and 3D)
    /// - Fourier transforms
    /// - Wave simulations
    /// - Animation and physics
    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>);
    /// Numerically stable exp(x) - 1
    ///
    /// Returns exp(x) - 1 accurately for small x values where exp(x) ≈ 1.
    /// For small x, the direct calculation exp(x) - 1 suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Financial calculations (compound interest for small rates)
    /// - Numerical integration of differential equations
    /// - Statistical distributions (Poisson, exponential)
    /// - Machine learning (softplus, log-sum-exp)
    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Numerically stable ln(1 + x)
    ///
    /// Returns ln(1 + x) accurately for small x values where 1 + x ≈ 1.
    /// For small x, the direct calculation ln(1 + x) suffers from catastrophic cancellation.
    ///
    /// Critical for:
    /// - Log-probability calculations (log(1 - p) for small p)
    /// - Numerical integration
    /// - Statistical distributions
    /// - Machine learning (binary cross-entropy loss)
    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sum of squares
    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;
    /// Element-wise multiplication (alias for simd_mul)
    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
    /// Check if SIMD is available for this type
    fn simd_available() -> bool;
    /// Ultra-optimized sum reduction (alias for simd_sum for compatibility)
    fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
        Self::simd_sum(a)
    }
    /// Ultra-optimized subtraction (compatibility variant of simd_sub that writes into a pre-allocated buffer)
    fn simd_sub_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized multiplication (compatibility variant of simd_mul that writes into a pre-allocated buffer)
    fn simd_mul_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized sum of cubes (Σ xᵢ³)
    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;
    /// Ultra-optimized division (compatibility variant of simd_div that writes into a pre-allocated buffer)
    fn simd_div_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized sine function
    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized addition (compatibility variant of simd_add that writes into a pre-allocated buffer)
    fn simd_add_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized fused multiply-add
    fn simd_fma_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized power function
    fn simd_pow_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    );
    /// Ultra-optimized exponential function
    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized cosine function
    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
    /// Ultra-optimized dot product
    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Variance (population variance)
    fn simd_variance(a: &ArrayView1<Self>) -> Self;
    /// Standard deviation
    fn simd_std(a: &ArrayView1<Self>) -> Self;
    /// L1 norm (Manhattan norm)
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self;
    /// L∞ norm (Chebyshev norm / maximum absolute value)
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self;
    /// Cosine similarity between two vectors
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Euclidean distance between two vectors
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Manhattan distance between two vectors
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Chebyshev distance between two vectors
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Cosine distance (1 - cosine_similarity)
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
    /// Weighted sum
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
    /// Weighted mean
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self;
    /// Find index of minimum element (argmin)
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize>;
    /// Find index of maximum element (argmax)
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize>;
    /// Clip values to the [min_val, max_val] range
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self>;
    /// Log-sum-exp for numerically stable softmax computation
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self;
    /// Softmax for probability distributions (softmax = exp(x - log_sum_exp(x)))
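    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// use ndarray::array;
    ///
    /// let x = array![1.0f32, 2.0, 3.0];
    /// // ≈ [0.090, 0.245, 0.665]; the elements sum to 1
    /// let p = f32::simd_softmax(&x.view());
    /// ```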
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Cumulative sum
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Cumulative product
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self>;
    /// First-order difference (`a[i+1] - a[i]`)
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Sign function: returns -1 for negative, 0 for zero, +1 for positive
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self>;
    /// ReLU activation: max(0, x)
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Leaky ReLU: x if x > 0, else alpha * x
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self>;
    /// L2 normalization (unit vector)
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self>;
    /// Standardization: (x - mean) / std
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self>;

    // ============================================================================
    // ZERO-ALLOCATION SIMD OPERATIONS (Phase 1: ToRSh SIMD Performance Fix)
    // ============================================================================
    // These methods write directly to pre-allocated output buffers, eliminating
    // all intermediate allocations. Critical for achieving SIMD speedup in ToRSh.
    //
    // Design rationale:
    // - Use raw slices (&[Self], &mut [Self]) instead of ArrayView for maximum efficiency
    // - No intermediate Array1 allocation - writes directly to output
    // - Enables ToRSh to reduce from 4 allocations to 1 per operation
    // ============================================================================

    /// Zero-allocation element-wise addition: output = a + b
    ///
    /// Writes SIMD addition results directly to a pre-allocated output buffer.
    /// This is the core operation for achieving SIMD speedup without allocation overhead.
    ///
    /// # Panics
    /// Panics if `a`, `b`, and `output` do not all have the same length.
    ///
    /// # Performance
    /// - x86_64 AVX2: processes 8 `f32` lanes per vector instruction
    /// - ARM64 NEON: processes 4 `f32` lanes per vector instruction
    /// - Expected 2-4x speedup over scalar for large arrays (>1000 elements)
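    ///
    /// # Example
    ///
    /// A usage sketch, assuming an `f32` implementation of this trait is in
    /// scope:
    ///
    /// ```ignore
    /// let a = [1.0f32, 2.0, 3.0];
    /// let b = [4.0f32, 5.0, 6.0];
    /// let mut out = [0.0f32; 3];
    /// // No intermediate allocation: results land in `out` = [5.0, 7.0, 9.0]
    /// f32::simd_add_into(&a, &b, &mut out);
    /// ```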
    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise subtraction: output = a - b
    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise multiplication: output = a * b
    ///
    /// Writes SIMD multiplication results directly to a pre-allocated output buffer.
    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// Zero-allocation element-wise division: output = a / b
    fn simd_div_into(a: &[Self], b: &[Self], output: &mut [Self]);

    /// In-place element-wise addition: a += b
    ///
    /// Modifies `a` in place, adding corresponding elements from `b`.
    /// Zero allocations, zero copies: a pure SIMD operation.
    fn simd_add_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise subtraction: a -= b
    fn simd_sub_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise multiplication: a *= b
    fn simd_mul_inplace(a: &mut [Self], b: &[Self]);

    /// In-place element-wise division: a /= b
    fn simd_div_inplace(a: &mut [Self], b: &[Self]);

    /// In-place scalar addition: a += scalar
    fn simd_add_scalar_inplace(a: &mut [Self], scalar: Self);

    /// In-place scalar multiplication: a *= scalar
    fn simd_mul_scalar_inplace(a: &mut [Self], scalar: Self);

    /// Zero-allocation fused multiply-add: output = a * b + c
    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]);
}
553}