1use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
8use num_traits::Zero;
9
10pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
12 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
14
15 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
17
18 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
20
21 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
23
24 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
26
27 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);
29
30 fn simd_gemm(
32 alpha: Self,
33 a: &ArrayView2<Self>,
34 b: &ArrayView2<Self>,
35 beta: Self,
36 c: &mut Array2<Self>,
37 );
38
39 fn simd_norm(a: &ArrayView1<Self>) -> Self;
41
42 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
44
45 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
47
48 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;
50
51 fn simd_sum(a: &ArrayView1<Self>) -> Self;
53
54 fn simd_mean(a: &ArrayView1<Self>) -> Self;
56
57 fn simd_max_element(a: &ArrayView1<Self>) -> Self;
59
60 fn simd_min_element(a: &ArrayView1<Self>) -> Self;
62
63 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;
65
66 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
68
69 fn simd_fma_advanced_optimized(
71 a: &ArrayView1<Self>,
72 b: &ArrayView1<Self>,
73 c: &ArrayView1<Self>,
74 ) -> Array1<Self>;
75
76 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
78
79 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;
81
82 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;
84
85 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;
87
88 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;
90
91 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
93
94 fn simd_available() -> bool;
96
97 fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
99 Self::simd_sum(a)
100 }
101
102 fn simd_sub_f32_ultra(
104 a: &ArrayView1<Self>,
105 b: &ArrayView1<Self>,
106 result: &mut ArrayViewMut1<Self>,
107 );
108
109 fn simd_mul_f32_ultra(
111 a: &ArrayView1<Self>,
112 b: &ArrayView1<Self>,
113 result: &mut ArrayViewMut1<Self>,
114 );
115
116 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;
118
119 fn simd_div_f32_ultra(
121 a: &ArrayView1<Self>,
122 b: &ArrayView1<Self>,
123 result: &mut ArrayViewMut1<Self>,
124 );
125
126 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
128
129 fn simd_add_f32_ultra(
131 a: &ArrayView1<Self>,
132 b: &ArrayView1<Self>,
133 result: &mut ArrayViewMut1<Self>,
134 );
135
136 fn simd_fma_f32_ultra(
138 a: &ArrayView1<Self>,
139 b: &ArrayView1<Self>,
140 c: &ArrayView1<Self>,
141 result: &mut ArrayViewMut1<Self>,
142 );
143
144 fn simd_pow_f32_ultra(
146 a: &ArrayView1<Self>,
147 b: &ArrayView1<Self>,
148 result: &mut ArrayViewMut1<Self>,
149 );
150
151 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
153
154 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
156
157 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
159}
160
161impl SimdUnifiedOps for f32 {
163 #[cfg(feature = "simd")]
164 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
165 crate::simd::simd_add_f32(a, b)
166 }
167
168 #[cfg(not(feature = "simd"))]
169 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
170 (a + b).to_owned()
171 }
172
173 #[cfg(feature = "simd")]
174 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
175 crate::simd::simd_sub_f32(a, b)
176 }
177
178 #[cfg(not(feature = "simd"))]
179 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
180 (a - b).to_owned()
181 }
182
183 #[cfg(feature = "simd")]
184 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
185 crate::simd::simd_mul_f32(a, b)
186 }
187
188 #[cfg(not(feature = "simd"))]
189 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
190 (a * b).to_owned()
191 }
192
193 #[cfg(feature = "simd")]
194 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
195 crate::simd::simd_div_f32(a, b)
196 }
197
198 #[cfg(not(feature = "simd"))]
199 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
200 (a / b).to_owned()
201 }
202
203 #[cfg(feature = "simd")]
204 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
205 crate::simd::simd_dot_f32(a, b)
206 }
207
208 #[cfg(not(feature = "simd"))]
209 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
210 a.dot(b)
211 }
212
213 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
214 let m = a.nrows();
215 let n = a.ncols();
216
217 assert_eq!(n, x.len());
218 assert_eq!(m, y.len());
219
220 if beta == 0.0 {
222 y.fill(0.0);
223 } else if beta != 1.0 {
224 y.mapv_inplace(|v| v * beta);
225 }
226
227 for i in 0..m {
229 let row = a.row(i);
230 y[i] += Self::simd_dot(&row, x);
231 }
232 }
233
234 fn simd_gemm(
235 alpha: Self,
236 a: &ArrayView2<Self>,
237 b: &ArrayView2<Self>,
238 beta: Self,
239 c: &mut Array2<Self>,
240 ) {
241 let m = a.nrows();
242 let k = a.ncols();
243 let n = b.ncols();
244
245 assert_eq!(k, b.nrows());
246 assert_eq!((m, n), c.dim());
247
248 if beta == 0.0 {
250 c.fill(0.0);
251 } else if beta != 1.0 {
252 c.mapv_inplace(|v| v * beta);
253 }
254
255 for i in 0..m {
257 let a_row = a.row(i);
258 for j in 0..n {
259 let b_col = b.column(j);
260 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
261 }
262 }
263 }
264
265 fn simd_norm(a: &ArrayView1<Self>) -> Self {
266 Self::simd_dot(a, a).sqrt()
267 }
268
269 #[cfg(feature = "simd")]
270 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
271 crate::simd::simd_maximum_f32(a, b)
272 }
273
274 #[cfg(not(feature = "simd"))]
275 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
276 let mut result = Array1::zeros(a.len());
277 for _i in 0..a.len() {
278 result[0] = a[0].max(b[0]);
279 }
280 result
281 }
282
283 #[cfg(feature = "simd")]
284 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
285 crate::simd::simd_minimum_f32(a, b)
286 }
287
288 #[cfg(not(feature = "simd"))]
289 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
290 let mut result = Array1::zeros(a.len());
291 for _i in 0..a.len() {
292 result[0] = a[0].min(b[0]);
293 }
294 result
295 }
296
297 #[cfg(feature = "simd")]
298 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
299 crate::simd::simd_scalar_mul_f32(a, scalar)
300 }
301
302 #[cfg(not(feature = "simd"))]
303 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
304 a.mapv(|x| x * scalar)
305 }
306
307 #[cfg(feature = "simd")]
308 fn simd_sum(a: &ArrayView1<Self>) -> Self {
309 crate::simd::simd_sum_f32(a)
310 }
311
312 #[cfg(not(feature = "simd"))]
313 fn simd_sum(a: &ArrayView1<Self>) -> Self {
314 a.sum()
315 }
316
317 fn simd_mean(a: &ArrayView1<Self>) -> Self {
318 if a.is_empty() {
319 0.0
320 } else {
321 Self::simd_sum(a) / (a.len() as f32)
322 }
323 }
324
325 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
326 a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
327 }
328
329 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
330 a.fold(f32::INFINITY, |acc, &x| acc.min(x))
331 }
332
333 #[cfg(feature = "simd")]
334 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
335 crate::simd::simd_fused_multiply_add_f32(a, b, c)
336 }
337
338 #[cfg(not(feature = "simd"))]
339 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
340 let mut result = Array1::zeros(a.len());
341 for _i in 0..a.len() {
342 result[0] = a[0] * b[0] + c[0];
343 }
344 result
345 }
346
347 #[cfg(feature = "simd")]
348 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
349 crate::simd::simd_add_cache_optimized_f32(a, b)
350 }
351
352 #[cfg(not(feature = "simd"))]
353 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
354 a + b
355 }
356
357 #[cfg(feature = "simd")]
358 fn simd_fma_advanced_optimized(
359 a: &ArrayView1<Self>,
360 b: &ArrayView1<Self>,
361 c: &ArrayView1<Self>,
362 ) -> Array1<Self> {
363 crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
364 }
365
366 #[cfg(not(feature = "simd"))]
367 fn simd_fma_advanced_optimized(
368 a: &ArrayView1<Self>,
369 b: &ArrayView1<Self>,
370 c: &ArrayView1<Self>,
371 ) -> Array1<Self> {
372 let mut result = Array1::zeros(a.len());
373 for _i in 0..a.len() {
374 result[0] = a[0] * b[0] + c[0];
375 }
376 result
377 }
378
379 #[cfg(feature = "simd")]
380 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
381 crate::simd::simd_adaptive_add_f32(a, b)
382 }
383
384 #[cfg(not(feature = "simd"))]
385 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
386 a + b
387 }
388
389 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
390 a.t().to_owned()
391 }
392
393 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
394 a.mapv(|x| x.abs())
395 }
396
397 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
398 a.mapv(|x| x.sqrt())
399 }
400
401 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
402 a.iter().map(|&x| x * x).sum()
403 }
404
405 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
406 Self::simd_mul(a, b)
407 }
408
409 #[cfg(feature = "simd")]
410 fn simd_available() -> bool {
411 true
412 }
413
414 #[cfg(not(feature = "simd"))]
415 fn simd_available() -> bool {
416 false
417 }
418
419 fn simd_sub_f32_ultra(
420 a: &ArrayView1<Self>,
421 b: &ArrayView1<Self>,
422 result: &mut ArrayViewMut1<Self>,
423 ) {
424 let sub_result = Self::simd_sub(a, b);
425 result.assign(&sub_result);
426 }
427
428 fn simd_mul_f32_ultra(
429 a: &ArrayView1<Self>,
430 b: &ArrayView1<Self>,
431 result: &mut ArrayViewMut1<Self>,
432 ) {
433 let mul_result = Self::simd_mul(a, b);
434 result.assign(&mul_result);
435 }
436
437 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
438 a.iter().map(|&x| x * x * x).sum()
440 }
441
442 fn simd_div_f32_ultra(
443 a: &ArrayView1<Self>,
444 b: &ArrayView1<Self>,
445 result: &mut ArrayViewMut1<Self>,
446 ) {
447 let div_result = Self::simd_div(a, b);
448 result.assign(&div_result);
449 }
450
451 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
452 let sin_result = a.mapv(|x| x.sin());
453 result.assign(&sin_result);
454 }
455
456 fn simd_add_f32_ultra(
457 a: &ArrayView1<Self>,
458 b: &ArrayView1<Self>,
459 result: &mut ArrayViewMut1<Self>,
460 ) {
461 let add_result = Self::simd_add(a, b);
462 result.assign(&add_result);
463 }
464
465 fn simd_fma_f32_ultra(
466 a: &ArrayView1<Self>,
467 b: &ArrayView1<Self>,
468 c: &ArrayView1<Self>,
469 result: &mut ArrayViewMut1<Self>,
470 ) {
471 let fma_result = Self::simd_fma(a, b, c);
472 result.assign(&fma_result);
473 }
474
475 fn simd_pow_f32_ultra(
476 a: &ArrayView1<Self>,
477 b: &ArrayView1<Self>,
478 result: &mut ArrayViewMut1<Self>,
479 ) {
480 let pow_result = a
481 .iter()
482 .zip(b.iter())
483 .map(|(&x, &y)| x.powf(y))
484 .collect::<Vec<_>>();
485 result.assign(&Array1::from_vec(pow_result));
486 }
487
488 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
489 let exp_result = a.mapv(|x| x.exp());
490 result.assign(&exp_result);
491 }
492
493 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
494 let cos_result = a.mapv(|x| x.cos());
495 result.assign(&cos_result);
496 }
497
498 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
499 Self::simd_dot(a, b)
500 }
501}
502
503impl SimdUnifiedOps for f64 {
505 #[cfg(feature = "simd")]
506 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
507 crate::simd::simd_add_f64(a, b)
508 }
509
510 #[cfg(not(feature = "simd"))]
511 fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
512 (a + b).to_owned()
513 }
514
515 #[cfg(feature = "simd")]
516 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
517 crate::simd::simd_sub_f64(a, b)
518 }
519
520 #[cfg(not(feature = "simd"))]
521 fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
522 (a - b).to_owned()
523 }
524
525 #[cfg(feature = "simd")]
526 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
527 crate::simd::simd_mul_f64(a, b)
528 }
529
530 #[cfg(not(feature = "simd"))]
531 fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
532 (a * b).to_owned()
533 }
534
535 #[cfg(feature = "simd")]
536 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
537 crate::simd::simd_div_f64(a, b)
538 }
539
540 #[cfg(not(feature = "simd"))]
541 fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
542 (a / b).to_owned()
543 }
544
545 #[cfg(feature = "simd")]
546 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
547 crate::simd::simd_dot_f64(a, b)
548 }
549
550 #[cfg(not(feature = "simd"))]
551 fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
552 a.dot(b)
553 }
554
555 fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
556 let m = a.nrows();
557 let n = a.ncols();
558
559 assert_eq!(n, x.len());
560 assert_eq!(m, y.len());
561
562 if beta == 0.0 {
564 y.fill(0.0);
565 } else if beta != 1.0 {
566 y.mapv_inplace(|v| v * beta);
567 }
568
569 for i in 0..m {
571 let row = a.row(i);
572 y[i] += Self::simd_dot(&row, x);
573 }
574 }
575
576 fn simd_gemm(
577 alpha: Self,
578 a: &ArrayView2<Self>,
579 b: &ArrayView2<Self>,
580 beta: Self,
581 c: &mut Array2<Self>,
582 ) {
583 let m = a.nrows();
584 let k = a.ncols();
585 let n = b.ncols();
586
587 assert_eq!(k, b.nrows());
588 assert_eq!((m, n), c.dim());
589
590 if beta == 0.0 {
592 c.fill(0.0);
593 } else if beta != 1.0 {
594 c.mapv_inplace(|v| v * beta);
595 }
596
597 for i in 0..m {
599 let a_row = a.row(i);
600 for j in 0..n {
601 let b_col = b.column(j);
602 c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
603 }
604 }
605 }
606
607 fn simd_norm(a: &ArrayView1<Self>) -> Self {
608 Self::simd_dot(a, a).sqrt()
609 }
610
611 #[cfg(feature = "simd")]
612 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
613 crate::simd::simd_maximum_f64(a, b)
614 }
615
616 #[cfg(not(feature = "simd"))]
617 fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
618 let mut result = Array1::zeros(a.len());
619 for _i in 0..a.len() {
620 result[0] = a[0].max(b[0]);
621 }
622 result
623 }
624
625 #[cfg(feature = "simd")]
626 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
627 crate::simd::simd_minimum_f64(a, b)
628 }
629
630 #[cfg(not(feature = "simd"))]
631 fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
632 let mut result = Array1::zeros(a.len());
633 for _i in 0..a.len() {
634 result[0] = a[0].min(b[0]);
635 }
636 result
637 }
638
639 #[cfg(feature = "simd")]
640 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
641 crate::simd::simd_scalar_mul_f64(a, scalar)
642 }
643
644 #[cfg(not(feature = "simd"))]
645 fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
646 a.mapv(|x| x * scalar)
647 }
648
649 #[cfg(feature = "simd")]
650 fn simd_sum(a: &ArrayView1<Self>) -> Self {
651 crate::simd::simd_sum_f64(a)
652 }
653
654 #[cfg(not(feature = "simd"))]
655 fn simd_sum(a: &ArrayView1<Self>) -> Self {
656 a.sum()
657 }
658
659 fn simd_mean(a: &ArrayView1<Self>) -> Self {
660 if a.is_empty() {
661 0.0
662 } else {
663 Self::simd_sum(a) / (a.len() as f64)
664 }
665 }
666
667 fn simd_max_element(a: &ArrayView1<Self>) -> Self {
668 a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
669 }
670
671 fn simd_min_element(a: &ArrayView1<Self>) -> Self {
672 a.fold(f64::INFINITY, |acc, &x| acc.min(x))
673 }
674
675 #[cfg(feature = "simd")]
676 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
677 crate::simd::simd_fused_multiply_add_f64(a, b, c)
678 }
679
680 #[cfg(not(feature = "simd"))]
681 fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
682 let mut result = Array1::zeros(a.len());
683 for _i in 0..a.len() {
684 result[0] = a[0] * b[0] + c[0];
685 }
686 result
687 }
688
689 #[cfg(feature = "simd")]
690 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
691 crate::simd::simd_add_cache_optimized_f64(a, b)
692 }
693
694 #[cfg(not(feature = "simd"))]
695 fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
696 a + b
697 }
698
699 #[cfg(feature = "simd")]
700 fn simd_fma_advanced_optimized(
701 a: &ArrayView1<Self>,
702 b: &ArrayView1<Self>,
703 c: &ArrayView1<Self>,
704 ) -> Array1<Self> {
705 crate::simd::simd_fma_advanced_optimized_f64(a, b, c)
706 }
707
708 #[cfg(not(feature = "simd"))]
709 fn simd_fma_advanced_optimized(
710 a: &ArrayView1<Self>,
711 b: &ArrayView1<Self>,
712 c: &ArrayView1<Self>,
713 ) -> Array1<Self> {
714 let mut result = Array1::zeros(a.len());
715 for _i in 0..a.len() {
716 result[0] = a[0] * b[0] + c[0];
717 }
718 result
719 }
720
721 #[cfg(feature = "simd")]
722 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
723 crate::simd::simd_adaptive_add_f64(a, b)
724 }
725
726 #[cfg(not(feature = "simd"))]
727 fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
728 a + b
729 }
730
731 fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
732 a.t().to_owned()
733 }
734
735 fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
736 a.mapv(|x| x.abs())
737 }
738
739 fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
740 a.mapv(|x| x.sqrt())
741 }
742
743 fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
744 a.iter().map(|&x| x * x).sum()
745 }
746
747 fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
748 Self::simd_mul(a, b)
749 }
750
751 #[cfg(feature = "simd")]
752 fn simd_available() -> bool {
753 true
754 }
755
756 #[cfg(not(feature = "simd"))]
757 fn simd_available() -> bool {
758 false
759 }
760
761 fn simd_sub_f32_ultra(
762 a: &ArrayView1<Self>,
763 b: &ArrayView1<Self>,
764 result: &mut ArrayViewMut1<Self>,
765 ) {
766 let sub_result = Self::simd_sub(a, b);
767 result.assign(&sub_result);
768 }
769
770 fn simd_mul_f32_ultra(
771 a: &ArrayView1<Self>,
772 b: &ArrayView1<Self>,
773 result: &mut ArrayViewMut1<Self>,
774 ) {
775 let mul_result = Self::simd_mul(a, b);
776 result.assign(&mul_result);
777 }
778
779 fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
780 a.iter().map(|&x| x * x * x).sum()
782 }
783
784 fn simd_div_f32_ultra(
785 a: &ArrayView1<Self>,
786 b: &ArrayView1<Self>,
787 result: &mut ArrayViewMut1<Self>,
788 ) {
789 let div_result = Self::simd_div(a, b);
790 result.assign(&div_result);
791 }
792
793 fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
794 let sin_result = a.mapv(|x| x.sin());
795 result.assign(&sin_result);
796 }
797
798 fn simd_add_f32_ultra(
799 a: &ArrayView1<Self>,
800 b: &ArrayView1<Self>,
801 result: &mut ArrayViewMut1<Self>,
802 ) {
803 let add_result = Self::simd_add(a, b);
804 result.assign(&add_result);
805 }
806
807 fn simd_fma_f32_ultra(
808 a: &ArrayView1<Self>,
809 b: &ArrayView1<Self>,
810 c: &ArrayView1<Self>,
811 result: &mut ArrayViewMut1<Self>,
812 ) {
813 let fma_result = Self::simd_fma(a, b, c);
814 result.assign(&fma_result);
815 }
816
817 fn simd_pow_f32_ultra(
818 a: &ArrayView1<Self>,
819 b: &ArrayView1<Self>,
820 result: &mut ArrayViewMut1<Self>,
821 ) {
822 let pow_result = a
823 .iter()
824 .zip(b.iter())
825 .map(|(&x, &y)| x.powf(y))
826 .collect::<Vec<_>>();
827 result.assign(&Array1::from_vec(pow_result));
828 }
829
830 fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
831 let exp_result = a.mapv(|x| x.exp());
832 result.assign(&exp_result);
833 }
834
835 fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
836 let cos_result = a.mapv(|x| x.cos());
837 result.assign(&cos_result);
838 }
839
840 fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
841 Self::simd_dot(a, b)
842 }
843}
844
845#[derive(Debug, Clone)]
847pub struct PlatformCapabilities {
848 pub simd_available: bool,
849 pub gpu_available: bool,
850 pub cuda_available: bool,
851 pub opencl_available: bool,
852 pub metal_available: bool,
853 pub avx2_available: bool,
854 pub avx512_available: bool,
855 pub neon_available: bool,
856}
857
858impl PlatformCapabilities {
859 pub fn detect() -> Self {
861 Self {
862 simd_available: cfg!(feature = "simd"),
863 gpu_available: cfg!(feature = "gpu"),
864 cuda_available: cfg!(all(feature = "gpu", feature = "cuda")),
865 opencl_available: cfg!(all(feature = "gpu", feature = "opencl")),
866 metal_available: cfg!(all(feature = "gpu", feature = "metal", target_os = "macos")),
867 avx2_available: cfg!(target_feature = "avx2"),
868 avx512_available: cfg!(target_feature = "avx512f"),
869 neon_available: cfg!(target_arch = "aarch64"),
870 }
871 }
872
873 pub fn summary(&self) -> String {
875 let mut features = Vec::new();
876
877 if self.simd_available {
878 features.push("SIMD");
879 }
880 if self.gpu_available {
881 features.push("GPU");
882 }
883 if self.cuda_available {
884 features.push("CUDA");
885 }
886 if self.opencl_available {
887 features.push("OpenCL");
888 }
889 if self.metal_available {
890 features.push("Metal");
891 }
892 if self.avx2_available {
893 features.push("AVX2");
894 }
895 if self.avx512_available {
896 features.push("AVX512");
897 }
898 if self.neon_available {
899 features.push("NEON");
900 }
901
902 if features.is_empty() {
903 "No acceleration features available".to_string()
904 } else {
905 format!(
906 "Available acceleration: {features}",
907 features = features.join(", ")
908 )
909 }
910 }
911
912 pub fn has_avx2(&self) -> bool {
914 self.avx2_available
915 }
916
917 pub fn has_avx512(&self) -> bool {
919 self.avx512_available
920 }
921
922 pub fn has_sse(&self) -> bool {
924 self.simd_available || self.neon_available || self.avx2_available
925 }
926
927 pub fn num_cores(&self) -> usize {
929 std::thread::available_parallelism()
930 .map(|n| n.get())
931 .unwrap_or(1)
932 }
933
934 pub fn cache_line_size(&self) -> usize {
936 64
940 }
941}
942
943pub struct AutoOptimizer {
945 capabilities: PlatformCapabilities,
946}
947
948impl AutoOptimizer {
949 pub fn new() -> Self {
950 Self {
951 capabilities: PlatformCapabilities::detect(),
952 }
953 }
954
955 pub fn should_use_gpu(&self, size: usize) -> bool {
957 self.capabilities.gpu_available && size > 10000
959 }
960
961 pub fn should_use_metal(&self, size: usize) -> bool {
963 self.capabilities.metal_available && size > 1024
966 }
967
968 pub fn should_use_simd(&self, size: usize) -> bool {
970 self.capabilities.simd_available && size > 64
972 }
973
974 pub fn select_gemm_impl(&self, m: usize, n: usize, k: usize) -> &'static str {
976 let total_ops = m * n * k;
977
978 if self.capabilities.metal_available {
980 if total_ops > 8192 {
982 return "Metal";
984 }
985 }
986
987 if self.should_use_gpu(total_ops) {
988 if self.capabilities.cuda_available {
989 "CUDA"
990 } else if self.capabilities.metal_available {
991 "Metal"
992 } else if self.capabilities.opencl_available {
993 "OpenCL"
994 } else {
995 "GPU"
996 }
997 } else if self.should_use_simd(total_ops) {
998 "SIMD"
999 } else {
1000 "Scalar"
1001 }
1002 }
1003
1004 pub fn select_vector_impl(&self, size: usize) -> &'static str {
1006 if self.capabilities.metal_available && size > 1024 {
1008 return "Metal";
1009 }
1010
1011 if self.should_use_gpu(size) {
1012 if self.capabilities.cuda_available {
1013 "CUDA"
1014 } else if self.capabilities.metal_available {
1015 "Metal"
1016 } else if self.capabilities.opencl_available {
1017 "OpenCL"
1018 } else {
1019 "GPU"
1020 }
1021 } else if self.should_use_simd(size) {
1022 if self.capabilities.avx512_available {
1023 "AVX512"
1024 } else if self.capabilities.avx2_available {
1025 "AVX2"
1026 } else if self.capabilities.neon_available {
1027 "NEON"
1028 } else {
1029 "SIMD"
1030 }
1031 } else {
1032 "Scalar"
1033 }
1034 }
1035
1036 pub fn select_reduction_impl(&self, size: usize) -> &'static str {
1038 if self.capabilities.metal_available && size > 4096 {
1041 return "Metal";
1042 }
1043
1044 if self.should_use_gpu(size * 2) {
1045 if self.capabilities.cuda_available {
1047 "CUDA"
1048 } else if self.capabilities.metal_available {
1049 "Metal"
1050 } else {
1051 "GPU"
1052 }
1053 } else if self.should_use_simd(size) {
1054 "SIMD"
1055 } else {
1056 "Scalar"
1057 }
1058 }
1059
1060 pub fn select_fft_impl(&self, size: usize) -> &'static str {
1062 if self.capabilities.metal_available && size > 512 {
1065 return "Metal-MPS";
1066 }
1067
1068 if self.capabilities.cuda_available && size > 1024 {
1069 "cuFFT"
1070 } else if self.should_use_simd(size) {
1071 "SIMD"
1072 } else {
1073 "Scalar"
1074 }
1075 }
1076
1077 pub fn has_unified_memory(&self) -> bool {
1079 cfg!(all(target_os = "macos", target_arch = "aarch64"))
1080 }
1081
1082 pub fn recommend(&self, operation: &str, size: usize) -> String {
1084 let recommendation = match operation {
1085 "gemm" | "matmul" => self.select_gemm_impl(size, size, size),
1086 "vector" | "axpy" | "dot" => self.select_vector_impl(size),
1087 "reduction" | "sum" | "mean" => self.select_reduction_impl(size),
1088 "fft" => self.select_fft_impl(size),
1089 _ => "Scalar",
1090 };
1091
1092 if self.has_unified_memory() && recommendation == "Metal" {
1093 format!("{recommendation} (Unified Memory)")
1094 } else {
1095 recommendation.to_string()
1096 }
1097 }
1098}
1099
1100impl Default for AutoOptimizer {
1101 fn default() -> Self {
1102 Self::new()
1103 }
1104}
1105
1106pub fn simd_dot_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
1108 f32::simd_dot_f32_ultra(a, b)
1109}
1110
1111pub fn simd_fma_f32_ultra(
1113 a: &ArrayView1<f32>,
1114 b: &ArrayView1<f32>,
1115 c: &ArrayView1<f32>,
1116 result: &mut ArrayViewMut1<f32>,
1117) {
1118 f32::simd_fma_f32_ultra(a, b, c, result)
1119}
1120
1121pub fn simd_add_f32_adaptive(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
1123 f32::simd_add_adaptive(a, b)
1124}
1125
1126pub fn simd_mul_f32_hyperoptimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
1127 f32::simd_mul(a, b)
1128}
1129
1130pub fn simd_mul_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1135 let a_array = Array1::from_vec(a.clone());
1136 let b_array = Array1::from_vec(b.clone());
1137 let mut result_array = Array1::from_vec(result.clone());
1138
1139 f32::simd_mul_f32_ultra(
1140 &a_array.view(),
1141 &b_array.view(),
1142 &mut result_array.view_mut(),
1143 );
1144 *result = result_array.to_vec();
1145}
1146
1147pub fn simd_add_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1149 let a_array = Array1::from_vec(a.clone());
1150 let b_array = Array1::from_vec(b.clone());
1151 let mut result_array = Array1::from_vec(result.clone());
1152
1153 f32::simd_add_f32_ultra(
1154 &a_array.view(),
1155 &b_array.view(),
1156 &mut result_array.view_mut(),
1157 );
1158 *result = result_array.to_vec();
1159}
1160
1161pub fn simd_div_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1163 let a_array = Array1::from_vec(a.clone());
1164 let b_array = Array1::from_vec(b.clone());
1165 let mut result_array = Array1::from_vec(result.clone());
1166
1167 f32::simd_div_f32_ultra(
1168 &a_array.view(),
1169 &b_array.view(),
1170 &mut result_array.view_mut(),
1171 );
1172 *result = result_array.to_vec();
1173}
1174
1175pub fn simd_sin_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1177 let a_array = Array1::from_vec(a.clone());
1178 let mut result_array = Array1::from_vec(result.clone());
1179
1180 f32::simd_sin_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1181 *result = result_array.to_vec();
1182}
1183
1184pub fn simd_sub_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1186 let a_array = Array1::from_vec(a.clone());
1187 let b_array = Array1::from_vec(b.clone());
1188 let mut result_array = Array1::from_vec(result.clone());
1189
1190 f32::simd_sub_f32_ultra(
1191 &a_array.view(),
1192 &b_array.view(),
1193 &mut result_array.view_mut(),
1194 );
1195 *result = result_array.to_vec();
1196}
1197
1198pub fn simd_fma_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, c: &Vec<f32>, result: &mut Vec<f32>) {
1200 let a_array = Array1::from_vec(a.clone());
1201 let b_array = Array1::from_vec(b.clone());
1202 let c_array = Array1::from_vec(c.clone());
1203 let mut result_array = Array1::from_vec(result.clone());
1204
1205 f32::simd_fma_f32_ultra(
1206 &a_array.view(),
1207 &b_array.view(),
1208 &c_array.view(),
1209 &mut result_array.view_mut(),
1210 );
1211 *result = result_array.to_vec();
1212}
1213
1214pub fn simd_pow_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1216 let a_array = Array1::from_vec(a.clone());
1217 let b_array = Array1::from_vec(b.clone());
1218 let mut result_array = Array1::from_vec(result.clone());
1219
1220 f32::simd_pow_f32_ultra(
1221 &a_array.view(),
1222 &b_array.view(),
1223 &mut result_array.view_mut(),
1224 );
1225 *result = result_array.to_vec();
1226}
1227
1228pub fn simd_exp_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1230 let a_array = Array1::from_vec(a.clone());
1231 let mut result_array = Array1::from_vec(result.clone());
1232
1233 f32::simd_exp_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1234 *result = result_array.to_vec();
1235}
1236
1237pub fn simd_cos_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1239 let a_array = Array1::from_vec(a.clone());
1240 let mut result_array = Array1::from_vec(result.clone());
1241
1242 f32::simd_cos_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1243 *result = result_array.to_vec();
1244}
1245
1246#[cfg(test)]
1247mod tests {
1248 use super::*;
1249 use ndarray::arr1;
1250
1251 #[test]
1252 fn test_simd_unified_ops_f32() {
1253 let a = arr1(&[1.0f32, 2.0, 3.0, 4.0]);
1254 let b = arr1(&[5.0f32, 6.0, 7.0, 8.0]);
1255
1256 let sum = f32::simd_add(&a.view(), &b.view());
1257 assert_eq!(sum, arr1(&[6.0f32, 8.0, 10.0, 12.0]));
1258
1259 let product = f32::simd_mul(&a.view(), &b.view());
1260 assert_eq!(product, arr1(&[5.0f32, 12.0, 21.0, 32.0]));
1261
1262 let dot = f32::simd_dot(&a.view(), &b.view());
1263 assert_eq!(dot, 70.0);
1264 }
1265
1266 #[test]
1267 fn test_platform_capabilities() {
1268 let caps = PlatformCapabilities::detect();
1269 println!("{}", caps.summary());
1270 }
1271
1272 #[test]
1273 fn test_auto_optimizer() {
1274 let optimizer = AutoOptimizer::new();
1275
1276 assert!(!optimizer.should_use_gpu(100));
1278
1279 let large_size = 100000;
1281 if optimizer.capabilities.gpu_available {
1282 assert!(optimizer.should_use_gpu(large_size));
1283 }
1284 }
1285}