use ::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};

use super::functions::SimdUnifiedOps;
use super::functions_2::{
    digamma_f32, digamma_f64, erf_f32, erf_f64, erfc_f32, erfc_f64, erfinv_f32, lanczos_gamma_f32,
    lanczos_gamma_f64, ln_gamma_f32, ln_gamma_f64, trigamma_f32, trigamma_f64,
};
use super::functions_3::{
    elu_f32, elu_f64, erfcinv_f32, erfcinv_f64, erfinv_f64, gelu_f32, gelu_f64, hardsigmoid_f32,
    hardsigmoid_f64, hardswish_f32, hardswish_f64, mish_f32, mish_f64, selu_f32, selu_f64,
    sigmoid_f32, sigmoid_f64, sinc_f32, sinc_f64, softplus_f32, softplus_f64, swish_f32, swish_f64,
};
#[cfg(feature = "simd")]
use crate::simd_ops_polynomial;
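// `SimdUnifiedOps` for `f32`: every operation below comes in two flavors
// selected at compile time. With the `simd` feature enabled, calls forward to
// vectorized kernels in `crate::simd`; otherwise a portable scalar fallback
// with identical semantics is compiled instead.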
impl SimdUnifiedOps for f32 {
    #[cfg(feature = "simd")]
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_add_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
    #[cfg(feature = "simd")]
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_sub_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a - b
    }
    #[cfg(feature = "simd")]
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_mul_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a * b
    }
    #[cfg(feature = "simd")]
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_div_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a / b
    }
    #[cfg(feature = "simd")]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_dot_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.dot(b)
    }
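    // Matrix-vector product: y <- A*x + beta*y, computed as one dot product
    // per row so the SIMD dot kernel does the heavy lifting. beta == 0.0
    // clears y first so stale values never leak into the result.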
    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
        let m = a.nrows();
        let n = a.ncols();
        assert_eq!(n, x.len());
        assert_eq!(m, y.len());
        if beta == 0.0 {
            y.fill(0.0);
        } else if beta != 1.0 {
            y.mapv_inplace(|v| v * beta);
        }
        for i in 0..m {
            let row = a.row(i);
            y[i] += Self::simd_dot(&row, x);
        }
    }
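    // General matrix multiply: C <- alpha*A*B + beta*C. Once the inner
    // workload n*k crosses GEMM_TRANSPOSE_THRESHOLD, B is transposed in
    // cache-friendly blocks so the inner dot products read contiguous rows
    // instead of strided columns.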
    fn simd_gemm(
        alpha: Self,
        a: &ArrayView2<Self>,
        b: &ArrayView2<Self>,
        beta: Self,
        c: &mut Array2<Self>,
    ) {
        let m = a.nrows();
        let k = a.ncols();
        let n = b.ncols();
        assert_eq!(k, b.nrows());
        assert_eq!((m, n), c.dim());
        if beta == 0.0 {
            c.fill(0.0);
        } else if beta != 1.0 {
            c.mapv_inplace(|v| v * beta);
        }
        const GEMM_TRANSPOSE_THRESHOLD: usize = 4096;
        if n * k > GEMM_TRANSPOSE_THRESHOLD {
            let b_t = Self::simd_transpose_blocked(b);
            for i in 0..m {
                let a_row = a.row(i);
                for j in 0..n {
                    let b_row = b_t.row(j);
                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_row);
                }
            }
        } else {
            for i in 0..m {
                let a_row = a.row(i);
                for j in 0..n {
                    let b_col = b.column(j);
                    c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
                }
            }
        }
    }
    #[cfg(feature = "simd")]
    fn simd_norm(a: &ArrayView1<Self>) -> Self {
        crate::simd::norms::simd_norm_l2_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_norm(a: &ArrayView1<Self>) -> Self {
        a.iter().map(|&x| x * x).sum::<f32>().sqrt()
    }
    #[cfg(feature = "simd")]
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_maximum_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].max(b[i]);
        }
        result
    }
    #[cfg(feature = "simd")]
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_minimum_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].min(b[i]);
        }
        result
    }
    #[cfg(feature = "simd")]
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
        crate::simd::simd_scalar_mul_f32(a, scalar)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
        a.mapv(|x| x * scalar)
    }
    #[cfg(feature = "simd")]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_sum_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_sum(a: &ArrayView1<Self>) -> Self {
        a.sum()
    }
    fn simd_mean(a: &ArrayView1<Self>) -> Self {
        if a.is_empty() {
            0.0
        } else {
            Self::simd_sum(a) / (a.len() as f32)
        }
    }
    #[cfg(feature = "simd")]
    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_max_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
        a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
    }
    #[cfg(feature = "simd")]
    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_min_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
        a.fold(f32::INFINITY, |acc, &x| acc.min(x))
    }
    #[cfg(feature = "simd")]
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_fused_multiply_add_f32(a, b, c)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
    }
    #[cfg(feature = "simd")]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_add_cache_optimized_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
    #[cfg(feature = "simd")]
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self> {
        crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_fma_advanced_optimized(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
    ) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
    }
    #[cfg(feature = "simd")]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_adaptive_add_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        a + b
    }
    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
        a.t().to_owned()
    }
    fn simd_transpose_blocked(a: &ArrayView2<Self>) -> Array2<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_transpose_blocked_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.t().to_owned()
        }
    }
    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
        a.iter().map(|&x| x * x).sum()
    }
    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        Self::simd_mul(a, b)
    }
    #[cfg(feature = "simd")]
    fn simd_available() -> bool {
        true
    }
    #[cfg(not(feature = "simd"))]
    fn simd_available() -> bool {
        false
    }
    fn simd_sub_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let sub_result = Self::simd_sub(a, b);
        result.assign(&sub_result);
    }
    fn simd_mul_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let mul_result = Self::simd_mul(a, b);
        result.assign(&mul_result);
    }
    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
        a.iter().map(|&x| x * x * x).sum()
    }
    fn simd_div_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let div_result = Self::simd_div(a, b);
        result.assign(&div_result);
    }
    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
        let sin_result = a.mapv(|x| x.sin());
        result.assign(&sin_result);
    }
    fn simd_add_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let add_result = Self::simd_add(a, b);
        result.assign(&add_result);
    }
    fn simd_fma_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        c: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let fma_result = Self::simd_fma(a, b, c);
        result.assign(&fma_result);
    }
    fn simd_pow_f32_ultra(
        a: &ArrayView1<Self>,
        b: &ArrayView1<Self>,
        result: &mut ArrayViewMut1<Self>,
    ) {
        let pow_result = a
            .iter()
            .zip(b.iter())
            .map(|(&x, &y)| x.powf(y))
            .collect::<Vec<_>>();
        result.assign(&Array1::from_vec(pow_result));
    }
    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
        let exp_result = a.mapv(|x| x.exp());
        result.assign(&exp_result);
    }
    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
        let cos_result = a.mapv(|x| x.cos());
        result.assign(&cos_result);
    }
    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        Self::simd_dot(a, b)
    }
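    // Statistical reductions. The scalar fallback computes the unbiased
    // sample variance (n - 1 denominator) and returns NaN for fewer than two
    // elements, where sample variance is undefined.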
    #[cfg(feature = "simd")]
    fn simd_variance(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_variance_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_variance(a: &ArrayView1<Self>) -> Self {
        let mean = Self::simd_mean(a);
        let n = a.len() as f32;
        if n < 2.0 {
            return f32::NAN;
        }
        a.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / (n - 1.0)
    }
    #[cfg(feature = "simd")]
    fn simd_std(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_std_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_std(a: &ArrayView1<Self>) -> Self {
        Self::simd_variance(a).sqrt()
    }
    #[cfg(feature = "simd")]
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_norm_l1_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_norm_l1(a: &ArrayView1<Self>) -> Self {
        a.iter().map(|&x| x.abs()).sum()
    }
    #[cfg(feature = "simd")]
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_norm_linf_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_norm_linf(a: &ArrayView1<Self>) -> Self {
        a.iter().fold(0.0f32, |acc, &x| acc.max(x.abs()))
    }
    #[cfg(feature = "simd")]
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_cosine_similarity_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_cosine_similarity(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        let dot = Self::simd_dot(a, b);
        let norm_a = Self::simd_norm(a);
        let norm_b = Self::simd_norm(b);
        dot / (norm_a * norm_b)
    }
    #[cfg(feature = "simd")]
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_euclidean_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_distance_euclidean(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.iter()
            .zip(b.iter())
            .map(|(&x, &y)| (x - y).powi(2))
            .sum::<f32>()
            .sqrt()
    }
    #[cfg(feature = "simd")]
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_manhattan_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_distance_manhattan(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.iter().zip(b.iter()).map(|(&x, &y)| (x - y).abs()).sum()
    }
    #[cfg(feature = "simd")]
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_chebyshev_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_distance_chebyshev(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        a.iter()
            .zip(b.iter())
            .fold(0.0f32, |acc, (&x, &y)| acc.max((x - y).abs()))
    }
    #[cfg(feature = "simd")]
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        crate::simd::simd_distance_cosine_f32(a, b)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_distance_cosine(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
        1.0 - Self::simd_cosine_similarity(a, b)
    }
    #[cfg(feature = "simd")]
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        crate::simd::simd_weighted_sum_f32(values, weights)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_weighted_sum(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        values
            .iter()
            .zip(weights.iter())
            .map(|(&v, &w)| v * w)
            .sum()
    }
    #[cfg(feature = "simd")]
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        crate::simd::simd_weighted_mean_f32(values, weights)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_weighted_mean(values: &ArrayView1<Self>, weights: &ArrayView1<Self>) -> Self {
        let weighted_sum = Self::simd_weighted_sum(values, weights);
        let weight_sum: f32 = weights.iter().sum();
        weighted_sum / weight_sum
    }
    #[cfg(feature = "simd")]
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
        crate::simd::simd_argmin_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_argmin(a: &ArrayView1<Self>) -> Option<usize> {
        if a.is_empty() {
            return None;
        }
        let mut min_idx = 0;
        let mut min_val = a[0];
        for (i, &v) in a.iter().enumerate().skip(1) {
            if v < min_val {
                min_val = v;
                min_idx = i;
            }
        }
        Some(min_idx)
    }
    #[cfg(feature = "simd")]
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
        crate::simd::simd_argmax_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_argmax(a: &ArrayView1<Self>) -> Option<usize> {
        if a.is_empty() {
            return None;
        }
        let mut max_idx = 0;
        let mut max_val = a[0];
        for (i, &v) in a.iter().enumerate().skip(1) {
            if v > max_val {
                max_val = v;
                max_idx = i;
            }
        }
        Some(max_idx)
    }
    #[cfg(feature = "simd")]
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
        crate::simd::simd_clip_f32(a, min_val, max_val)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_clip(a: &ArrayView1<Self>, min_val: Self, max_val: Self) -> Array1<Self> {
        a.mapv(|v| v.max(min_val).min(max_val))
    }
    #[cfg(feature = "simd")]
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
        crate::simd::simd_log_sum_exp_f32(a)
    }
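    // The scalar fallback uses the standard max-shift trick: factoring the
    // maximum out of the sum keeps every exp() argument <= 0, preventing
    // overflow for large inputs.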
    #[cfg(not(feature = "simd"))]
    fn simd_log_sum_exp(a: &ArrayView1<Self>) -> Self {
        if a.is_empty() {
            return f32::NEG_INFINITY;
        }
        let max_val = a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x));
        let sum_exp: f32 = a.iter().map(|&x| (x - max_val).exp()).sum();
        max_val + sum_exp.ln()
    }
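    // Softmax is evaluated as exp(x - logsumexp(x)); subtracting in log space
    // gives the same numerical stability as the usual max-subtraction form.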
    #[cfg(feature = "simd")]
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_softmax_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        let lse = Self::simd_log_sum_exp(a);
        a.mapv(|x| (x - lse).exp())
    }
    #[cfg(feature = "simd")]
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_cumsum_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_cumsum(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        let mut cumsum = 0.0f32;
        a.mapv(|x| {
            cumsum += x;
            cumsum
        })
    }
    #[cfg(feature = "simd")]
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_cumprod_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_cumprod(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        let mut cumprod = 1.0f32;
        a.mapv(|x| {
            cumprod *= x;
            cumprod
        })
    }
    #[cfg(feature = "simd")]
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_diff_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_diff(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.len() <= 1 {
            return Array1::zeros(0);
        }
        Array1::from_iter((1..a.len()).map(|i| a[i] - a[i - 1]))
    }
    #[cfg(feature = "simd")]
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_sign_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_sign(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| {
            if x > 0.0 {
                1.0
            } else if x < 0.0 {
                -1.0
            } else {
                0.0
            }
        })
    }
    #[cfg(feature = "simd")]
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_relu_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_relu(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.max(0.0))
    }
    #[cfg(feature = "simd")]
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
        crate::simd::simd_leaky_relu_f32(a, alpha)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_leaky_relu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
        a.mapv(|x| if x > 0.0 { x } else { alpha * x })
    }
    #[cfg(feature = "simd")]
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_normalize_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_normalize(a: &ArrayView1<Self>) -> Array1<Self> {
        let norm: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm == 0.0 {
            return a.to_owned();
        }
        a.mapv(|x| x / norm)
    }
    #[cfg(feature = "simd")]
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
        crate::simd::simd_standardize_f32(a)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_standardize(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.len() <= 1 {
            return Array1::zeros(a.len());
        }
        let mean: f32 = a.iter().sum::<f32>() / a.len() as f32;
        let variance: f32 =
            a.iter().map(|x| (x - mean) * (x - mean)).sum::<f32>() / (a.len() - 1) as f32;
        let std = variance.sqrt();
        if std == 0.0 {
            return Array1::zeros(a.len());
        }
        a.mapv(|x| (x - mean) / std)
    }
    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.abs())
    }
    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.sqrt())
    }
    fn simd_exp(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.exp())
    }
    fn simd_ln(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.ln())
    }
    fn simd_sin(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.sin())
    }
    fn simd_cos(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.cos())
    }
    fn simd_tan(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.tan())
    }
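    // Hyperbolic functions composed from the vectorized exp kernel:
    // sinh x = (e^x - e^-x) / 2 and cosh x = (e^x + e^-x) / 2.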
    fn simd_sinh(a: &ArrayView1<Self>) -> Array1<Self> {
        let exp_a = Self::simd_exp(a);
        let neg_a = Self::simd_scalar_mul(a, -1.0);
        let exp_neg_a = Self::simd_exp(&neg_a.view());
        let diff = Self::simd_sub(&exp_a.view(), &exp_neg_a.view());
        Self::simd_scalar_mul(&diff.view(), 0.5)
    }
    fn simd_cosh(a: &ArrayView1<Self>) -> Array1<Self> {
        let exp_a = Self::simd_exp(a);
        let neg_a = Self::simd_scalar_mul(a, -1.0);
        let exp_neg_a = Self::simd_exp(&neg_a.view());
        let sum = Self::simd_add(&exp_a.view(), &exp_neg_a.view());
        Self::simd_scalar_mul(&sum.view(), 0.5)
    }
    fn simd_tanh(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            simd_ops_polynomial::simd_tanh_f32_poly(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.tanh())
        }
    }
    fn simd_floor(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_floor_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.floor())
        }
    }
    fn simd_ceil(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_ceil_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.ceil())
        }
    }
    fn simd_round(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_round_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.round())
        }
    }
    fn simd_atan(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.atan())
    }
    fn simd_asin(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.asin())
    }
    fn simd_acos(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.acos())
    }
    fn simd_atan2(y: &ArrayView1<Self>, x: &ArrayView1<Self>) -> Array1<Self> {
        y.iter()
            .zip(x.iter())
            .map(|(&y_val, &x_val)| y_val.atan2(x_val))
            .collect::<Vec<_>>()
            .into()
    }
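    // Derived logarithms via change of base: log10 x = ln x * log10(e) and
    // log2 x = ln x * log2(e).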
    fn simd_log10(a: &ArrayView1<Self>) -> Array1<Self> {
        const LOG10_E: f32 = std::f32::consts::LOG10_E;
        let ln_a = Self::simd_ln(a);
        Self::simd_scalar_mul(&ln_a.view(), LOG10_E)
    }
    fn simd_log2(a: &ArrayView1<Self>) -> Array1<Self> {
        const LOG2_E: f32 = std::f32::consts::LOG2_E;
        let ln_a = Self::simd_ln(a);
        Self::simd_scalar_mul(&ln_a.view(), LOG2_E)
    }
    #[cfg(feature = "simd")]
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
        crate::simd::simd_clip_f32(a, min, max)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_clamp(a: &ArrayView1<Self>, min: Self, max: Self) -> Array1<Self> {
        a.mapv(|x| x.clamp(min, max))
    }
    fn simd_fract(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            let truncated = crate::simd::simd_trunc_f32(a);
            Self::simd_sub(a, &truncated.view())
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.fract())
        }
    }
    fn simd_trunc(a: &ArrayView1<Self>) -> Array1<Self> {
        #[cfg(feature = "simd")]
        {
            crate::simd::simd_trunc_f32(a)
        }
        #[cfg(not(feature = "simd"))]
        {
            a.mapv(|x| x.trunc())
        }
    }
    fn simd_recip(a: &ArrayView1<Self>) -> Array1<Self> {
        let ones = Array1::from_elem(a.len(), 1.0f32);
        Self::simd_div(&ones.view(), a)
    }
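    // powf/pow are computed as exp(exp * ln(base)), which is only meaningful
    // for positive bases: zero or negative bases send ln() to -inf or NaN,
    // whereas `f32::powf` would still handle cases such as a negative base
    // with an integral exponent.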
    fn simd_powf(base: &ArrayView1<Self>, exp: Self) -> Array1<Self> {
        let ln_base = Self::simd_ln(base);
        let scaled = Self::simd_scalar_mul(&ln_base.view(), exp);
        Self::simd_exp(&scaled.view())
    }
    fn simd_pow(base: &ArrayView1<Self>, exp: &ArrayView1<Self>) -> Array1<Self> {
        let ln_base = Self::simd_ln(base);
        let scaled = Self::simd_mul(&ln_base.view(), exp);
        Self::simd_exp(&scaled.view())
    }
    #[cfg(feature = "simd")]
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
        crate::simd::unary_powi::simd_powi_f32(base, n)
    }
    #[cfg(not(feature = "simd"))]
    fn simd_powi(base: &ArrayView1<Self>, n: i32) -> Array1<Self> {
        base.mapv(|x| x.powi(n))
    }
    fn simd_gamma(x: &ArrayView1<Self>) -> Array1<Self> {
        x.mapv(lanczos_gamma_f32)
    }
    fn simd_exp2(a: &ArrayView1<Self>) -> Array1<Self> {
        const LN2: f32 = std::f32::consts::LN_2;
        let scaled = Self::simd_scalar_mul(a, LN2);
        Self::simd_exp(&scaled.view())
    }
    fn simd_cbrt(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.cbrt())
    }
    fn simd_ln_1p(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.ln_1p())
    }
    fn simd_exp_m1(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.exp_m1())
    }
    fn simd_to_radians(a: &ArrayView1<Self>) -> Array1<Self> {
        const DEG_TO_RAD: f32 = std::f32::consts::PI / 180.0;
        Self::simd_scalar_mul(a, DEG_TO_RAD)
    }
    fn simd_to_degrees(a: &ArrayView1<Self>) -> Array1<Self> {
        const RAD_TO_DEG: f32 = 180.0 / std::f32::consts::PI;
        Self::simd_scalar_mul(a, RAD_TO_DEG)
    }
    fn simd_digamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(digamma_f32)
    }
    fn simd_trigamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(trigamma_f32)
    }
    fn simd_ln_gamma(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(ln_gamma_f32)
    }
    fn simd_erf(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erf_f32)
    }
    fn simd_erfc(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfc_f32)
    }
    fn simd_erfinv(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfinv_f32)
    }
    fn simd_erfcinv(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(erfcinv_f32)
    }
    fn simd_sigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(sigmoid_f32)
    }
    fn simd_gelu(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(gelu_f32)
    }
    fn simd_swish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(swish_f32)
    }
    fn simd_softplus(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(softplus_f32)
    }
    fn simd_mish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(mish_f32)
    }
    fn simd_elu(a: &ArrayView1<Self>, alpha: Self) -> Array1<Self> {
        a.mapv(|x| elu_f32(x, alpha))
    }
    fn simd_selu(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(selu_f32)
    }
    fn simd_hardsigmoid(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(hardsigmoid_f32)
    }
    fn simd_hardswish(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(hardswish_f32)
    }
    fn simd_sinc(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(sinc_f32)
    }
    fn simd_log_softmax(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        let lse = Self::simd_log_sum_exp(a);
        a.mapv(|x| x - lse)
    }
    fn simd_asinh(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.asinh())
    }
    fn simd_acosh(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.acosh())
    }
    fn simd_atanh(a: &ArrayView1<Self>) -> Array1<Self> {
        a.mapv(|x| x.atanh())
    }
    fn simd_ln_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let ln_gamma_a = Self::simd_ln_gamma(a);
        let ln_gamma_b = Self::simd_ln_gamma(b);
        let a_plus_b = Self::simd_add(a, b);
        let ln_gamma_ab = Self::simd_ln_gamma(&a_plus_b.view());
        Self::simd_sub(
            &Self::simd_add(&ln_gamma_a.view(), &ln_gamma_b.view()).view(),
            &ln_gamma_ab.view(),
        )
    }
    fn simd_beta(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let ln_beta = Self::simd_ln_beta(a, b);
        Self::simd_exp(&ln_beta.view())
    }
    fn simd_lerp(a: &ArrayView1<Self>, b: &ArrayView1<Self>, t: Self) -> Array1<Self> {
        if a.is_empty() || b.is_empty() {
            return Array1::zeros(0);
        }
        let diff = Self::simd_sub(b, a);
        let scaled = Self::simd_scalar_mul(&diff.view(), t);
        Self::simd_add(a, &scaled.view())
    }
    fn simd_smoothstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
        if x.is_empty() {
            return Array1::zeros(0);
        }
        let range = edge1 - edge0;
        if range.abs() < Self::EPSILON {
            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
        }
        x.mapv(|xi| {
            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
            t * t * (3.0 - 2.0 * t)
        })
    }
    fn simd_hypot(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
        if x.is_empty() || y.is_empty() {
            return Array1::zeros(0);
        }
        let len = x.len().min(y.len());
        Array1::from_iter((0..len).map(|i| x[i].hypot(y[i])))
    }
    fn simd_copysign(x: &ArrayView1<Self>, y: &ArrayView1<Self>) -> Array1<Self> {
        if x.is_empty() || y.is_empty() {
            return Array1::zeros(0);
        }
        let len = x.len().min(y.len());
        Array1::from_iter((0..len).map(|i| x[i].copysign(y[i])))
    }
    fn simd_smootherstep(edge0: Self, edge1: Self, x: &ArrayView1<Self>) -> Array1<Self> {
        if x.is_empty() {
            return Array1::zeros(0);
        }
        let range = edge1 - edge0;
        if range.abs() < Self::EPSILON {
            return x.mapv(|xi| if xi < edge0 { 0.0 } else { 1.0 });
        }
        x.mapv(|xi| {
            let t = ((xi - edge0) / range).clamp(0.0, 1.0);
            let t3 = t * t * t;
            t3 * (t * (t * 6.0 - 15.0) + 10.0)
        })
    }
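    // Elementwise log(e^a + e^b) with the max factored out. Once |a - b|
    // exceeds 50, 1 + e^(-diff) rounds to 1 in f32, so the max alone is
    // returned.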
    fn simd_logaddexp(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() || b.is_empty() {
            return Array1::zeros(0);
        }
        let len = a.len().min(b.len());
        Array1::from_iter((0..len).map(|i| {
            let ai = a[i];
            let bi = b[i];
            let max_val = ai.max(bi);
            let diff = (ai - bi).abs();
            if diff > 50.0 {
                max_val
            } else {
                max_val + (1.0 + (-diff).exp()).ln()
            }
        }))
    }
    fn simd_logit(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        a.mapv(|p| {
            if p <= 0.0 {
                Self::NEG_INFINITY
            } else if p >= 1.0 {
                Self::INFINITY
            } else {
                (p / (1.0 - p)).ln()
            }
        })
    }
    fn simd_square(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        a.mapv(|x| x * x)
    }
    fn simd_rsqrt(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        a.mapv(|x| {
            if x <= 0.0 {
                if x == 0.0 {
                    Self::INFINITY
                } else {
                    Self::NAN
                }
            } else {
                1.0 / x.sqrt()
            }
        })
    }
    fn simd_sincos(a: &ArrayView1<Self>) -> (Array1<Self>, Array1<Self>) {
        if a.is_empty() {
            return (Array1::zeros(0), Array1::zeros(0));
        }
        let sin_result = a.mapv(|x| x.sin());
        let cos_result = a.mapv(|x| x.cos());
        (sin_result, cos_result)
    }
    fn simd_expm1(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        a.mapv(|x| x.exp_m1())
    }
    fn simd_log1p(a: &ArrayView1<Self>) -> Array1<Self> {
        if a.is_empty() {
            return Array1::zeros(0);
        }
        a.mapv(|x| x.ln_1p())
    }
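    // Slice-based `*_into` / `*_inplace` kernels. These bypass ndarray and
    // dispatch on runtime CPU feature detection: AVX2 (8 f32 lanes) on
    // x86_64, NEON (4 f32 lanes) on aarch64, with a scalar loop covering the
    // remainder elements and targets without either feature.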
    #[cfg(feature = "simd")]
    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let result_vec = _mm256_add_ps(a_vec, b_vec);
                        _mm256_storeu_ps(output.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let result_vec = vaddq_f32(a_vec, b_vec);
                        vst1q_f32(output.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) + *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            output[i] = a[i] + b[i];
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_add_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );
        for i in 0..a.len() {
            output[i] = a[i] + b[i];
        }
    }

    #[cfg(feature = "simd")]
    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let result_vec = _mm256_sub_ps(a_vec, b_vec);
                        _mm256_storeu_ps(output.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) - *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let result_vec = vsubq_f32(a_vec, b_vec);
                        vst1q_f32(output.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) - *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            output[i] = a[i] - b[i];
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_sub_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );
        for i in 0..a.len() {
            output[i] = a[i] - b[i];
        }
    }

    #[cfg(feature = "simd")]
    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let result_vec = _mm256_mul_ps(a_vec, b_vec);
                        _mm256_storeu_ps(output.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let result_vec = vmulq_f32(a_vec, b_vec);
                        vst1q_f32(output.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = *a.get_unchecked(i) * *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            output[i] = a[i] * b[i];
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_mul_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );
        for i in 0..a.len() {
            output[i] = a[i] * b[i];
        }
    }

    fn simd_div_into(a: &[Self], b: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );
        for i in 0..a.len() {
            output[i] = a[i] / b[i];
        }
    }
    #[cfg(feature = "simd")]
    fn simd_add_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let result_vec = _mm256_add_ps(a_vec, b_vec);
                        _mm256_storeu_ps(a.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *a.get_unchecked_mut(i) += *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let result_vec = vaddq_f32(a_vec, b_vec);
                        vst1q_f32(a.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *a.get_unchecked_mut(i) += *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            a[i] += b[i];
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_add_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");
        for i in 0..a.len() {
            a[i] += b[i];
        }
    }

    fn simd_sub_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");
        for i in 0..a.len() {
            a[i] -= b[i];
        }
    }

    #[cfg(feature = "simd")]
    fn simd_mul_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let result_vec = _mm256_mul_ps(a_vec, b_vec);
                        _mm256_storeu_ps(a.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *a.get_unchecked_mut(i) *= *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let result_vec = vmulq_f32(a_vec, b_vec);
                        vst1q_f32(a.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *a.get_unchecked_mut(i) *= *b.get_unchecked(i);
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            a[i] *= b[i];
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_mul_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");
        for i in 0..a.len() {
            a[i] *= b[i];
        }
    }

    fn simd_div_inplace(a: &mut [Self], b: &[Self]) {
        assert_eq!(a.len(), b.len(), "Arrays must have same length");
        for i in 0..a.len() {
            a[i] /= b[i];
        }
    }

    fn simd_add_scalar_inplace(a: &mut [Self], scalar: Self) {
        for x in a.iter_mut() {
            *x += scalar;
        }
    }

    fn simd_mul_scalar_inplace(a: &mut [Self], scalar: Self) {
        for x in a.iter_mut() {
            *x *= scalar;
        }
    }

    #[cfg(feature = "simd")]
    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(a.len(), c.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );

        let len = a.len();

        #[cfg(target_arch = "x86_64")]
        {
            use std::arch::x86_64::*;

            if is_x86_feature_detected!("fma") && is_x86_feature_detected!("avx2") {
                unsafe {
                    let mut i = 0;
                    while i + 8 <= len {
                        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
                        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
                        let c_vec = _mm256_loadu_ps(c.as_ptr().add(i));
                        let result_vec = _mm256_fmadd_ps(a_vec, b_vec, c_vec);
                        _mm256_storeu_ps(output.as_mut_ptr().add(i), result_vec);
                        i += 8;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = a
                            .get_unchecked(i)
                            .mul_add(*b.get_unchecked(i), *c.get_unchecked(i));
                        i += 1;
                    }
                }
                return;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            use std::arch::aarch64::*;

            if std::arch::is_aarch64_feature_detected!("neon") {
                unsafe {
                    let mut i = 0;
                    while i + 4 <= len {
                        let a_vec = vld1q_f32(a.as_ptr().add(i));
                        let b_vec = vld1q_f32(b.as_ptr().add(i));
                        let c_vec = vld1q_f32(c.as_ptr().add(i));
                        let result_vec = vfmaq_f32(c_vec, a_vec, b_vec);
                        vst1q_f32(output.as_mut_ptr().add(i), result_vec);
                        i += 4;
                    }
                    while i < len {
                        *output.get_unchecked_mut(i) = a
                            .get_unchecked(i)
                            .mul_add(*b.get_unchecked(i), *c.get_unchecked(i));
                        i += 1;
                    }
                }
                return;
            }
        }

        for i in 0..len {
            output[i] = a[i].mul_add(b[i], c[i]);
        }
    }

    #[cfg(not(feature = "simd"))]
    fn simd_fma_into(a: &[Self], b: &[Self], c: &[Self], output: &mut [Self]) {
        assert_eq!(a.len(), b.len(), "Input arrays must have same length");
        assert_eq!(a.len(), c.len(), "Input arrays must have same length");
        assert_eq!(
            a.len(),
            output.len(),
            "Output buffer must match input length"
        );
        for i in 0..a.len() {
            output[i] = a[i].mul_add(b[i], c[i]);
        }
    }
}
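// A minimal usage sketch: smoke tests exercising a few of the ops above
// through the trait, so either cfg branch can be checked with `cargo test`.
// Test names and values are illustrative assumptions, not part of the API.
#[cfg(test)]
mod f32_simd_ops_smoke_tests {
    use super::*;
    use ::ndarray::array;

    #[test]
    fn add_and_dot_match_scalar_arithmetic() {
        let a = array![1.0_f32, 2.0, 3.0];
        let b = array![4.0_f32, 5.0, 6.0];
        assert_eq!(
            f32::simd_add(&a.view(), &b.view()),
            array![5.0_f32, 7.0, 9.0]
        );
        assert_eq!(f32::simd_dot(&a.view(), &b.view()), 32.0);
    }

    #[test]
    fn softmax_is_a_probability_vector() {
        let x = array![0.5_f32, 1.5, -0.5];
        let s = f32::simd_softmax(&x.view());
        assert!((f32::simd_sum(&s.view()) - 1.0).abs() < 1e-5);
        assert!(s.iter().all(|&p| p > 0.0));
    }
}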