scirs2_core/simd_ops.rs

1//! Unified SIMD operations abstraction layer
2//!
3//! This module provides a comprehensive abstraction layer for all SIMD operations
4//! used across the scirs2 ecosystem. All modules should use these operations
5//! instead of implementing their own SIMD code.
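//!
//! Each operation has a SIMD-accelerated path (behind the `simd` feature) and a
//! portable scalar fallback, so callers can rely on a single API regardless of how
//! the crate was built. A minimal usage sketch (the import path is illustrative and
//! assumes this module is exposed as `scirs2_core::simd_ops`):
//!
//! ```ignore
//! use ndarray::arr1;
//! use scirs2_core::simd_ops::SimdUnifiedOps;
//!
//! let a = arr1(&[1.0f32, 2.0, 3.0]);
//! let b = arr1(&[4.0f32, 5.0, 6.0]);
//!
//! // Same call whether or not the `simd` feature is enabled.
//! let dot = f32::simd_dot(&a.view(), &b.view());
//! assert_eq!(dot, 32.0);
//! ```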
6
7use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
8use num_traits::Zero;
9
10/// Unified SIMD operations trait
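///
/// Every method is an associated function on the element type, so generic code can
/// stay agnostic over `f32`/`f64`. A small sketch (`cosine_similarity` is a
/// hypothetical helper, not part of this module):
///
/// ```ignore
/// use ndarray::ArrayView1;
/// use num_traits::Float;
///
/// /// Cosine similarity of two vectors, generic over the element type.
/// fn cosine_similarity<T: SimdUnifiedOps + Float>(a: &ArrayView1<T>, b: &ArrayView1<T>) -> T {
///     let denom = T::simd_norm(a) * T::simd_norm(b);
///     T::simd_dot(a, b) / denom
/// }
/// ```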
11pub trait SimdUnifiedOps: Sized + Copy + PartialOrd + Zero {
12    /// Element-wise addition
13    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
14
15    /// Element-wise subtraction
16    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
17
18    /// Element-wise multiplication
19    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
20
21    /// Element-wise division
22    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
23
24    /// Dot product
25    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
26
    /// Matrix-vector multiplication (GEMV): y = A * x + beta * y
28    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>);
29
    /// Matrix-matrix multiplication (GEMM): C = alpha * A * B + beta * C
31    fn simd_gemm(
32        alpha: Self,
33        a: &ArrayView2<Self>,
34        b: &ArrayView2<Self>,
35        beta: Self,
36        c: &mut Array2<Self>,
37    );
38
39    /// Vector norm (L2)
40    fn simd_norm(a: &ArrayView1<Self>) -> Self;
41
42    /// Element-wise maximum
43    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
44
45    /// Element-wise minimum
46    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
47
48    /// Scalar multiplication
49    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self>;
50
51    /// Sum reduction
52    fn simd_sum(a: &ArrayView1<Self>) -> Self;
53
54    /// Mean reduction
55    fn simd_mean(a: &ArrayView1<Self>) -> Self;
56
57    /// Find maximum element
58    fn simd_max_element(a: &ArrayView1<Self>) -> Self;
59
60    /// Find minimum element
61    fn simd_min_element(a: &ArrayView1<Self>) -> Self;
62
63    /// Fused multiply-add: a * b + c
64    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self>;
65
66    /// Enhanced cache-optimized addition for large arrays
67    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
68
69    /// Advanced-optimized fused multiply-add for maximum performance
70    fn simd_fma_advanced_optimized(
71        a: &ArrayView1<Self>,
72        b: &ArrayView1<Self>,
73        c: &ArrayView1<Self>,
74    ) -> Array1<Self>;
75
76    /// Adaptive SIMD operation that selects optimal implementation
77    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
78
79    /// Matrix transpose
80    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self>;
81
82    /// Element-wise absolute value
83    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self>;
84
85    /// Element-wise square root
86    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self>;
87
88    /// Sum of squares
89    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self;
90
91    /// Element-wise multiplication (alias for simd_mul)
92    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self>;
93
94    /// Check if SIMD is available for this type
95    fn simd_available() -> bool;
96
97    /// Ultra-optimized sum reduction (alias for simd_sum for compatibility)
98    fn simd_sum_f32_ultra(a: &ArrayView1<Self>) -> Self {
99        Self::simd_sum(a)
100    }
101
    /// In-place subtraction into a caller-provided output buffer (compatibility wrapper around simd_sub)
103    fn simd_sub_f32_ultra(
104        a: &ArrayView1<Self>,
105        b: &ArrayView1<Self>,
106        result: &mut ArrayViewMut1<Self>,
107    );
108
    /// In-place multiplication into a caller-provided output buffer (compatibility wrapper around simd_mul)
110    fn simd_mul_f32_ultra(
111        a: &ArrayView1<Self>,
112        b: &ArrayView1<Self>,
113        result: &mut ArrayViewMut1<Self>,
114    );
115
    /// Sum of cubes: sum(x^3)
117    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self;
118
    /// In-place division into a caller-provided output buffer (compatibility wrapper around simd_div)
120    fn simd_div_f32_ultra(
121        a: &ArrayView1<Self>,
122        b: &ArrayView1<Self>,
123        result: &mut ArrayViewMut1<Self>,
124    );
125
126    /// Ultra-optimized sine function
127    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
128
    /// In-place addition into a caller-provided output buffer (compatibility wrapper around simd_add)
130    fn simd_add_f32_ultra(
131        a: &ArrayView1<Self>,
132        b: &ArrayView1<Self>,
133        result: &mut ArrayViewMut1<Self>,
134    );
135
136    /// Ultra-optimized fused multiply-add
137    fn simd_fma_f32_ultra(
138        a: &ArrayView1<Self>,
139        b: &ArrayView1<Self>,
140        c: &ArrayView1<Self>,
141        result: &mut ArrayViewMut1<Self>,
142    );
143
144    /// Ultra-optimized power function
145    fn simd_pow_f32_ultra(
146        a: &ArrayView1<Self>,
147        b: &ArrayView1<Self>,
148        result: &mut ArrayViewMut1<Self>,
149    );
150
151    /// Ultra-optimized exponential function
152    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
153
154    /// Ultra-optimized cosine function
155    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>);
156
157    /// Ultra-optimized dot product
158    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self;
159}
160
161// Implementation for f32
162impl SimdUnifiedOps for f32 {
163    #[cfg(feature = "simd")]
164    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
165        crate::simd::simd_add_f32(a, b)
166    }
167
168    #[cfg(not(feature = "simd"))]
169    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
170        (a + b).to_owned()
171    }
172
173    #[cfg(feature = "simd")]
174    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
175        crate::simd::simd_sub_f32(a, b)
176    }
177
178    #[cfg(not(feature = "simd"))]
179    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
180        (a - b).to_owned()
181    }
182
183    #[cfg(feature = "simd")]
184    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
185        crate::simd::simd_mul_f32(a, b)
186    }
187
188    #[cfg(not(feature = "simd"))]
189    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
190        (a * b).to_owned()
191    }
192
193    #[cfg(feature = "simd")]
194    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
195        crate::simd::simd_div_f32(a, b)
196    }
197
198    #[cfg(not(feature = "simd"))]
199    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
200        (a / b).to_owned()
201    }
202
203    #[cfg(feature = "simd")]
204    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
205        crate::simd::simd_dot_f32(a, b)
206    }
207
208    #[cfg(not(feature = "simd"))]
209    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
210        a.dot(b)
211    }
212
213    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
214        let m = a.nrows();
215        let n = a.ncols();
216
217        assert_eq!(n, x.len());
218        assert_eq!(m, y.len());
219
220        // Scale y by beta
221        if beta == 0.0 {
222            y.fill(0.0);
223        } else if beta != 1.0 {
224            y.mapv_inplace(|v| v * beta);
225        }
226
227        // Compute matrix-vector product
228        for i in 0..m {
229            let row = a.row(i);
230            y[i] += Self::simd_dot(&row, x);
231        }
232    }
233
234    fn simd_gemm(
235        alpha: Self,
236        a: &ArrayView2<Self>,
237        b: &ArrayView2<Self>,
238        beta: Self,
239        c: &mut Array2<Self>,
240    ) {
241        let m = a.nrows();
242        let k = a.ncols();
243        let n = b.ncols();
244
245        assert_eq!(k, b.nrows());
246        assert_eq!((m, n), c.dim());
247
248        // Scale C by beta
249        if beta == 0.0 {
250            c.fill(0.0);
251        } else if beta != 1.0 {
252            c.mapv_inplace(|v| v * beta);
253        }
254
255        // Compute matrix multiplication
256        for i in 0..m {
257            let a_row = a.row(i);
258            for j in 0..n {
259                let b_col = b.column(j);
260                c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
261            }
262        }
263    }
264
265    fn simd_norm(a: &ArrayView1<Self>) -> Self {
266        Self::simd_dot(a, a).sqrt()
267    }
268
269    #[cfg(feature = "simd")]
270    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
271        crate::simd::simd_maximum_f32(a, b)
272    }
273
274    #[cfg(not(feature = "simd"))]
275    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].max(b[i]);
        }
        result
281    }
282
283    #[cfg(feature = "simd")]
284    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
285        crate::simd::simd_minimum_f32(a, b)
286    }
287
288    #[cfg(not(feature = "simd"))]
289    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].min(b[i]);
        }
        result
295    }
296
297    #[cfg(feature = "simd")]
298    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
299        crate::simd::simd_scalar_mul_f32(a, scalar)
300    }
301
302    #[cfg(not(feature = "simd"))]
303    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
304        a.mapv(|x| x * scalar)
305    }
306
307    #[cfg(feature = "simd")]
308    fn simd_sum(a: &ArrayView1<Self>) -> Self {
309        crate::simd::simd_sum_f32(a)
310    }
311
312    #[cfg(not(feature = "simd"))]
313    fn simd_sum(a: &ArrayView1<Self>) -> Self {
314        a.sum()
315    }
316
317    fn simd_mean(a: &ArrayView1<Self>) -> Self {
318        if a.is_empty() {
319            0.0
320        } else {
321            Self::simd_sum(a) / (a.len() as f32)
322        }
323    }
324
325    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
326        a.fold(f32::NEG_INFINITY, |acc, &x| acc.max(x))
327    }
328
329    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
330        a.fold(f32::INFINITY, |acc, &x| acc.min(x))
331    }
332
333    #[cfg(feature = "simd")]
334    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
335        crate::simd::simd_fused_multiply_add_f32(a, b, c)
336    }
337
338    #[cfg(not(feature = "simd"))]
339    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
345    }
346
347    #[cfg(feature = "simd")]
348    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
349        crate::simd::simd_add_cache_optimized_f32(a, b)
350    }
351
352    #[cfg(not(feature = "simd"))]
353    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
354        a + b
355    }
356
357    #[cfg(feature = "simd")]
358    fn simd_fma_advanced_optimized(
359        a: &ArrayView1<Self>,
360        b: &ArrayView1<Self>,
361        c: &ArrayView1<Self>,
362    ) -> Array1<Self> {
363        crate::simd::simd_fma_advanced_optimized_f32(a, b, c)
364    }
365
366    #[cfg(not(feature = "simd"))]
367    fn simd_fma_advanced_optimized(
368        a: &ArrayView1<Self>,
369        b: &ArrayView1<Self>,
370        c: &ArrayView1<Self>,
371    ) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
377    }
378
379    #[cfg(feature = "simd")]
380    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
381        crate::simd::simd_adaptive_add_f32(a, b)
382    }
383
384    #[cfg(not(feature = "simd"))]
385    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
386        a + b
387    }
388
389    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
390        a.t().to_owned()
391    }
392
393    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
394        a.mapv(|x| x.abs())
395    }
396
397    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
398        a.mapv(|x| x.sqrt())
399    }
400
401    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
402        a.iter().map(|&x| x * x).sum()
403    }
404
405    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
406        Self::simd_mul(a, b)
407    }
408
409    #[cfg(feature = "simd")]
410    fn simd_available() -> bool {
411        true
412    }
413
414    #[cfg(not(feature = "simd"))]
415    fn simd_available() -> bool {
416        false
417    }
418
419    fn simd_sub_f32_ultra(
420        a: &ArrayView1<Self>,
421        b: &ArrayView1<Self>,
422        result: &mut ArrayViewMut1<Self>,
423    ) {
424        let sub_result = Self::simd_sub(a, b);
425        result.assign(&sub_result);
426    }
427
428    fn simd_mul_f32_ultra(
429        a: &ArrayView1<Self>,
430        b: &ArrayView1<Self>,
431        result: &mut ArrayViewMut1<Self>,
432    ) {
433        let mul_result = Self::simd_mul(a, b);
434        result.assign(&mul_result);
435    }
436
437    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
438        // Calculate sum of cubes: sum(x^3)
439        a.iter().map(|&x| x * x * x).sum()
440    }
441
442    fn simd_div_f32_ultra(
443        a: &ArrayView1<Self>,
444        b: &ArrayView1<Self>,
445        result: &mut ArrayViewMut1<Self>,
446    ) {
447        let div_result = Self::simd_div(a, b);
448        result.assign(&div_result);
449    }
450
451    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
452        let sin_result = a.mapv(|x| x.sin());
453        result.assign(&sin_result);
454    }
455
456    fn simd_add_f32_ultra(
457        a: &ArrayView1<Self>,
458        b: &ArrayView1<Self>,
459        result: &mut ArrayViewMut1<Self>,
460    ) {
461        let add_result = Self::simd_add(a, b);
462        result.assign(&add_result);
463    }
464
465    fn simd_fma_f32_ultra(
466        a: &ArrayView1<Self>,
467        b: &ArrayView1<Self>,
468        c: &ArrayView1<Self>,
469        result: &mut ArrayViewMut1<Self>,
470    ) {
471        let fma_result = Self::simd_fma(a, b, c);
472        result.assign(&fma_result);
473    }
474
475    fn simd_pow_f32_ultra(
476        a: &ArrayView1<Self>,
477        b: &ArrayView1<Self>,
478        result: &mut ArrayViewMut1<Self>,
479    ) {
480        let pow_result = a
481            .iter()
482            .zip(b.iter())
483            .map(|(&x, &y)| x.powf(y))
484            .collect::<Vec<_>>();
485        result.assign(&Array1::from_vec(pow_result));
486    }
487
488    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
489        let exp_result = a.mapv(|x| x.exp());
490        result.assign(&exp_result);
491    }
492
493    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
494        let cos_result = a.mapv(|x| x.cos());
495        result.assign(&cos_result);
496    }
497
498    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
499        Self::simd_dot(a, b)
500    }
501}
502
503// Implementation for f64
504impl SimdUnifiedOps for f64 {
505    #[cfg(feature = "simd")]
506    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
507        crate::simd::simd_add_f64(a, b)
508    }
509
510    #[cfg(not(feature = "simd"))]
511    fn simd_add(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
512        (a + b).to_owned()
513    }
514
515    #[cfg(feature = "simd")]
516    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
517        crate::simd::simd_sub_f64(a, b)
518    }
519
520    #[cfg(not(feature = "simd"))]
521    fn simd_sub(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
522        (a - b).to_owned()
523    }
524
525    #[cfg(feature = "simd")]
526    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
527        crate::simd::simd_mul_f64(a, b)
528    }
529
530    #[cfg(not(feature = "simd"))]
531    fn simd_mul(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
532        (a * b).to_owned()
533    }
534
535    #[cfg(feature = "simd")]
536    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
537        crate::simd::simd_div_f64(a, b)
538    }
539
540    #[cfg(not(feature = "simd"))]
541    fn simd_div(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
542        (a / b).to_owned()
543    }
544
545    #[cfg(feature = "simd")]
546    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
547        crate::simd::simd_dot_f64(a, b)
548    }
549
550    #[cfg(not(feature = "simd"))]
551    fn simd_dot(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
552        a.dot(b)
553    }
554
555    fn simd_gemv(a: &ArrayView2<Self>, x: &ArrayView1<Self>, beta: Self, y: &mut Array1<Self>) {
556        let m = a.nrows();
557        let n = a.ncols();
558
559        assert_eq!(n, x.len());
560        assert_eq!(m, y.len());
561
562        // Scale y by beta
563        if beta == 0.0 {
564            y.fill(0.0);
565        } else if beta != 1.0 {
566            y.mapv_inplace(|v| v * beta);
567        }
568
569        // Compute matrix-vector product
570        for i in 0..m {
571            let row = a.row(i);
572            y[i] += Self::simd_dot(&row, x);
573        }
574    }
575
576    fn simd_gemm(
577        alpha: Self,
578        a: &ArrayView2<Self>,
579        b: &ArrayView2<Self>,
580        beta: Self,
581        c: &mut Array2<Self>,
582    ) {
583        let m = a.nrows();
584        let k = a.ncols();
585        let n = b.ncols();
586
587        assert_eq!(k, b.nrows());
588        assert_eq!((m, n), c.dim());
589
590        // Scale C by beta
591        if beta == 0.0 {
592            c.fill(0.0);
593        } else if beta != 1.0 {
594            c.mapv_inplace(|v| v * beta);
595        }
596
597        // Compute matrix multiplication
598        for i in 0..m {
599            let a_row = a.row(i);
600            for j in 0..n {
601                let b_col = b.column(j);
602                c[[i, j]] += alpha * Self::simd_dot(&a_row, &b_col);
603            }
604        }
605    }
606
607    fn simd_norm(a: &ArrayView1<Self>) -> Self {
608        Self::simd_dot(a, a).sqrt()
609    }
610
611    #[cfg(feature = "simd")]
612    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
613        crate::simd::simd_maximum_f64(a, b)
614    }
615
616    #[cfg(not(feature = "simd"))]
617    fn simd_max(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].max(b[i]);
        }
        result
623    }
624
625    #[cfg(feature = "simd")]
626    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
627        crate::simd::simd_minimum_f64(a, b)
628    }
629
630    #[cfg(not(feature = "simd"))]
631    fn simd_min(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i].min(b[i]);
        }
        result
637    }
638
639    #[cfg(feature = "simd")]
640    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
641        crate::simd::simd_scalar_mul_f64(a, scalar)
642    }
643
644    #[cfg(not(feature = "simd"))]
645    fn simd_scalar_mul(a: &ArrayView1<Self>, scalar: Self) -> Array1<Self> {
646        a.mapv(|x| x * scalar)
647    }
648
649    #[cfg(feature = "simd")]
650    fn simd_sum(a: &ArrayView1<Self>) -> Self {
651        crate::simd::simd_sum_f64(a)
652    }
653
654    #[cfg(not(feature = "simd"))]
655    fn simd_sum(a: &ArrayView1<Self>) -> Self {
656        a.sum()
657    }
658
659    fn simd_mean(a: &ArrayView1<Self>) -> Self {
660        if a.is_empty() {
661            0.0
662        } else {
663            Self::simd_sum(a) / (a.len() as f64)
664        }
665    }
666
667    fn simd_max_element(a: &ArrayView1<Self>) -> Self {
668        a.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
669    }
670
671    fn simd_min_element(a: &ArrayView1<Self>) -> Self {
672        a.fold(f64::INFINITY, |acc, &x| acc.min(x))
673    }
674
675    #[cfg(feature = "simd")]
676    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
677        crate::simd::simd_fused_multiply_add_f64(a, b, c)
678    }
679
680    #[cfg(not(feature = "simd"))]
681    fn simd_fma(a: &ArrayView1<Self>, b: &ArrayView1<Self>, c: &ArrayView1<Self>) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
687    }
688
689    #[cfg(feature = "simd")]
690    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
691        crate::simd::simd_add_cache_optimized_f64(a, b)
692    }
693
694    #[cfg(not(feature = "simd"))]
695    fn simd_add_cache_optimized(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
696        a + b
697    }
698
699    #[cfg(feature = "simd")]
700    fn simd_fma_advanced_optimized(
701        a: &ArrayView1<Self>,
702        b: &ArrayView1<Self>,
703        c: &ArrayView1<Self>,
704    ) -> Array1<Self> {
705        crate::simd::simd_fma_advanced_optimized_f64(a, b, c)
706    }
707
708    #[cfg(not(feature = "simd"))]
709    fn simd_fma_advanced_optimized(
710        a: &ArrayView1<Self>,
711        b: &ArrayView1<Self>,
712        c: &ArrayView1<Self>,
713    ) -> Array1<Self> {
        let mut result = Array1::zeros(a.len());
        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        result
719    }
720
721    #[cfg(feature = "simd")]
722    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
723        crate::simd::simd_adaptive_add_f64(a, b)
724    }
725
726    #[cfg(not(feature = "simd"))]
727    fn simd_add_adaptive(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
728        a + b
729    }
730
731    fn simd_transpose(a: &ArrayView2<Self>) -> Array2<Self> {
732        a.t().to_owned()
733    }
734
735    fn simd_abs(a: &ArrayView1<Self>) -> Array1<Self> {
736        a.mapv(|x| x.abs())
737    }
738
739    fn simd_sqrt(a: &ArrayView1<Self>) -> Array1<Self> {
740        a.mapv(|x| x.sqrt())
741    }
742
743    fn simd_sum_squares(a: &ArrayView1<Self>) -> Self {
744        a.iter().map(|&x| x * x).sum()
745    }
746
747    fn simd_multiply(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Array1<Self> {
748        Self::simd_mul(a, b)
749    }
750
751    #[cfg(feature = "simd")]
752    fn simd_available() -> bool {
753        true
754    }
755
756    #[cfg(not(feature = "simd"))]
757    fn simd_available() -> bool {
758        false
759    }
760
761    fn simd_sub_f32_ultra(
762        a: &ArrayView1<Self>,
763        b: &ArrayView1<Self>,
764        result: &mut ArrayViewMut1<Self>,
765    ) {
766        let sub_result = Self::simd_sub(a, b);
767        result.assign(&sub_result);
768    }
769
770    fn simd_mul_f32_ultra(
771        a: &ArrayView1<Self>,
772        b: &ArrayView1<Self>,
773        result: &mut ArrayViewMut1<Self>,
774    ) {
775        let mul_result = Self::simd_mul(a, b);
776        result.assign(&mul_result);
777    }
778
779    fn simd_sum_cubes(a: &ArrayView1<Self>) -> Self {
780        // Calculate sum of cubes: sum(x^3)
781        a.iter().map(|&x| x * x * x).sum()
782    }
783
784    fn simd_div_f32_ultra(
785        a: &ArrayView1<Self>,
786        b: &ArrayView1<Self>,
787        result: &mut ArrayViewMut1<Self>,
788    ) {
789        let div_result = Self::simd_div(a, b);
790        result.assign(&div_result);
791    }
792
793    fn simd_sin_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
794        let sin_result = a.mapv(|x| x.sin());
795        result.assign(&sin_result);
796    }
797
798    fn simd_add_f32_ultra(
799        a: &ArrayView1<Self>,
800        b: &ArrayView1<Self>,
801        result: &mut ArrayViewMut1<Self>,
802    ) {
803        let add_result = Self::simd_add(a, b);
804        result.assign(&add_result);
805    }
806
807    fn simd_fma_f32_ultra(
808        a: &ArrayView1<Self>,
809        b: &ArrayView1<Self>,
810        c: &ArrayView1<Self>,
811        result: &mut ArrayViewMut1<Self>,
812    ) {
813        let fma_result = Self::simd_fma(a, b, c);
814        result.assign(&fma_result);
815    }
816
817    fn simd_pow_f32_ultra(
818        a: &ArrayView1<Self>,
819        b: &ArrayView1<Self>,
820        result: &mut ArrayViewMut1<Self>,
821    ) {
822        let pow_result = a
823            .iter()
824            .zip(b.iter())
825            .map(|(&x, &y)| x.powf(y))
826            .collect::<Vec<_>>();
827        result.assign(&Array1::from_vec(pow_result));
828    }
829
830    fn simd_exp_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
831        let exp_result = a.mapv(|x| x.exp());
832        result.assign(&exp_result);
833    }
834
835    fn simd_cos_f32_ultra(a: &ArrayView1<Self>, result: &mut ArrayViewMut1<Self>) {
836        let cos_result = a.mapv(|x| x.cos());
837        result.assign(&cos_result);
838    }
839
840    fn simd_dot_f32_ultra(a: &ArrayView1<Self>, b: &ArrayView1<Self>) -> Self {
841        Self::simd_dot(a, b)
842    }
843}
844
845/// Platform capability detection
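///
/// The flags are resolved from Cargo features and compile-time target settings
/// rather than runtime CPU probing. A usage sketch:
///
/// ```ignore
/// let caps = PlatformCapabilities::detect();
/// if caps.simd_available {
///     println!("{}", caps.summary());
/// }
/// ```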
846#[derive(Debug, Clone)]
847pub struct PlatformCapabilities {
848    pub simd_available: bool,
849    pub gpu_available: bool,
850    pub cuda_available: bool,
851    pub opencl_available: bool,
852    pub metal_available: bool,
853    pub avx2_available: bool,
854    pub avx512_available: bool,
855    pub neon_available: bool,
856}
857
858impl PlatformCapabilities {
    /// Detect platform capabilities (resolved at compile time from Cargo features and target settings)
860    pub fn detect() -> Self {
861        Self {
862            simd_available: cfg!(feature = "simd"),
863            gpu_available: cfg!(feature = "gpu"),
864            cuda_available: cfg!(all(feature = "gpu", feature = "cuda")),
865            opencl_available: cfg!(all(feature = "gpu", feature = "opencl")),
866            metal_available: cfg!(all(feature = "gpu", feature = "metal", target_os = "macos")),
867            avx2_available: cfg!(target_feature = "avx2"),
868            avx512_available: cfg!(target_feature = "avx512f"),
869            neon_available: cfg!(target_arch = "aarch64"),
870        }
871    }
872
873    /// Get a summary of available acceleration features
874    pub fn summary(&self) -> String {
875        let mut features = Vec::new();
876
877        if self.simd_available {
878            features.push("SIMD");
879        }
880        if self.gpu_available {
881            features.push("GPU");
882        }
883        if self.cuda_available {
884            features.push("CUDA");
885        }
886        if self.opencl_available {
887            features.push("OpenCL");
888        }
889        if self.metal_available {
890            features.push("Metal");
891        }
892        if self.avx2_available {
893            features.push("AVX2");
894        }
895        if self.avx512_available {
896            features.push("AVX512");
897        }
898        if self.neon_available {
899            features.push("NEON");
900        }
901
902        if features.is_empty() {
903            "No acceleration features available".to_string()
904        } else {
905            format!(
906                "Available acceleration: {features}",
907                features = features.join(", ")
908            )
909        }
910    }
911
912    /// Check if AVX2 is available
913    pub fn has_avx2(&self) -> bool {
914        self.avx2_available
915    }
916
917    /// Check if AVX512 is available
918    pub fn has_avx512(&self) -> bool {
919        self.avx512_available
920    }
921
    /// Check if an SSE-class baseline is available (approximated by general SIMD, NEON, or AVX2 availability)
923    pub fn has_sse(&self) -> bool {
924        self.simd_available || self.neon_available || self.avx2_available
925    }
926
927    /// Get the number of CPU cores
928    pub fn num_cores(&self) -> usize {
929        std::thread::available_parallelism()
930            .map(|n| n.get())
931            .unwrap_or(1)
932    }
933
934    /// Get the cache line size in bytes
935    pub fn cache_line_size(&self) -> usize {
936        // Standard cache line size on most modern processors
937        // x86/x64: typically 64 bytes
938        // ARM: typically 64 bytes (Apple Silicon, newer ARM)
939        64
940    }
941}
942
943/// Automatic operation selection based on problem size and available features
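///
/// The heuristics below are simple size thresholds over the detected capabilities.
/// A usage sketch:
///
/// ```ignore
/// let optimizer = AutoOptimizer::new();
/// // Pick a backend label for a 512 x 512 x 512 GEMM.
/// let backend = optimizer.select_gemm_impl(512, 512, 512);
/// println!("GEMM backend: {backend}");
/// println!("{}", optimizer.recommend("fft", 4096));
/// ```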
944pub struct AutoOptimizer {
945    capabilities: PlatformCapabilities,
946}
947
948impl AutoOptimizer {
949    pub fn new() -> Self {
950        Self {
951            capabilities: PlatformCapabilities::detect(),
952        }
953    }
954
955    /// Determine if GPU should be used for a given problem size
956    pub fn should_use_gpu(&self, size: usize) -> bool {
957        // Use GPU for large problems when available
958        self.capabilities.gpu_available && size > 10000
959    }
960
961    /// Determine if Metal should be used on macOS
962    pub fn should_use_metal(&self, size: usize) -> bool {
963        // Use Metal for medium to large problems on macOS
964        // Metal has lower overhead than CUDA/OpenCL, so we can use it for smaller problems
965        self.capabilities.metal_available && size > 1024
966    }
967
968    /// Determine if SIMD should be used
969    pub fn should_use_simd(&self, size: usize) -> bool {
970        // Use SIMD for medium to large problems
971        self.capabilities.simd_available && size > 64
972    }
973
974    /// Select the best implementation for matrix multiplication
975    pub fn select_gemm_impl(&self, m: usize, n: usize, k: usize) -> &'static str {
976        let total_ops = m * n * k;
977
978        // Metal-specific heuristics for macOS
979        if self.capabilities.metal_available {
980            // For Apple Silicon with unified memory, Metal is efficient even for smaller matrices
981            if total_ops > 8192 {
982                // 16x16x32 or larger
983                return "Metal";
984            }
985        }
986
987        if self.should_use_gpu(total_ops) {
988            if self.capabilities.cuda_available {
989                "CUDA"
990            } else if self.capabilities.metal_available {
991                "Metal"
992            } else if self.capabilities.opencl_available {
993                "OpenCL"
994            } else {
995                "GPU"
996            }
997        } else if self.should_use_simd(total_ops) {
998            "SIMD"
999        } else {
1000            "Scalar"
1001        }
1002    }
1003
1004    /// Select the best implementation for vector operations
1005    pub fn select_vector_impl(&self, size: usize) -> &'static str {
1006        // Metal is efficient for vector operations on Apple Silicon
1007        if self.capabilities.metal_available && size > 1024 {
1008            return "Metal";
1009        }
1010
1011        if self.should_use_gpu(size) {
1012            if self.capabilities.cuda_available {
1013                "CUDA"
1014            } else if self.capabilities.metal_available {
1015                "Metal"
1016            } else if self.capabilities.opencl_available {
1017                "OpenCL"
1018            } else {
1019                "GPU"
1020            }
1021        } else if self.should_use_simd(size) {
1022            if self.capabilities.avx512_available {
1023                "AVX512"
1024            } else if self.capabilities.avx2_available {
1025                "AVX2"
1026            } else if self.capabilities.neon_available {
1027                "NEON"
1028            } else {
1029                "SIMD"
1030            }
1031        } else {
1032            "Scalar"
1033        }
1034    }
1035
1036    /// Select the best implementation for reduction operations
1037    pub fn select_reduction_impl(&self, size: usize) -> &'static str {
1038        // Reductions benefit from GPU parallelism at larger sizes
1039        // Metal has efficient reduction primitives
1040        if self.capabilities.metal_available && size > 4096 {
1041            return "Metal";
1042        }
1043
1044        if self.should_use_gpu(size * 2) {
1045            // Higher threshold for reductions
1046            if self.capabilities.cuda_available {
1047                "CUDA"
1048            } else if self.capabilities.metal_available {
1049                "Metal"
1050            } else {
1051                "GPU"
1052            }
1053        } else if self.should_use_simd(size) {
1054            "SIMD"
1055        } else {
1056            "Scalar"
1057        }
1058    }
1059
1060    /// Select the best implementation for FFT operations
1061    pub fn select_fft_impl(&self, size: usize) -> &'static str {
1062        // FFT benefits greatly from GPU acceleration
1063        // Metal Performance Shaders has optimized FFT
1064        if self.capabilities.metal_available && size > 512 {
1065            return "Metal-MPS";
1066        }
1067
1068        if self.capabilities.cuda_available && size > 1024 {
1069            "cuFFT"
1070        } else if self.should_use_simd(size) {
1071            "SIMD"
1072        } else {
1073            "Scalar"
1074        }
1075    }
1076
1077    /// Check if running on Apple Silicon with unified memory
1078    pub fn has_unified_memory(&self) -> bool {
1079        cfg!(all(target_os = "macos", target_arch = "aarch64"))
1080    }
1081
1082    /// Get optimization recommendation for a specific operation
1083    pub fn recommend(&self, operation: &str, size: usize) -> String {
1084        let recommendation = match operation {
1085            "gemm" | "matmul" => self.select_gemm_impl(size, size, size),
1086            "vector" | "axpy" | "dot" => self.select_vector_impl(size),
1087            "reduction" | "sum" | "mean" => self.select_reduction_impl(size),
1088            "fft" => self.select_fft_impl(size),
1089            _ => "Scalar",
1090        };
1091
1092        if self.has_unified_memory() && recommendation == "Metal" {
1093            format!("{recommendation} (Unified Memory)")
1094        } else {
1095            recommendation.to_string()
1096        }
1097    }
1098}
1099
1100impl Default for AutoOptimizer {
1101    fn default() -> Self {
1102        Self::new()
1103    }
1104}
1105
1106/// Standalone ultra-optimized dot product function for f32
1107pub fn simd_dot_f32_ultra(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
1108    f32::simd_dot_f32_ultra(a, b)
1109}
1110
1111/// Standalone ultra-optimized FMA function for f32
1112pub fn simd_fma_f32_ultra(
1113    a: &ArrayView1<f32>,
1114    b: &ArrayView1<f32>,
1115    c: &ArrayView1<f32>,
1116    result: &mut ArrayViewMut1<f32>,
1117) {
1118    f32::simd_fma_f32_ultra(a, b, c, result)
1119}
1120
// Additional standalone convenience wrappers.

/// Adaptive SIMD addition for f32 (standalone wrapper around `simd_add_adaptive`)
1122pub fn simd_add_f32_adaptive(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
1123    f32::simd_add_adaptive(a, b)
1124}
1125
1126pub fn simd_mul_f32_hyperoptimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
1127    f32::simd_mul(a, b)
1128}
1129
// Helper functions for Vec<T> compatibility.
// These functions accept Vec<T> and internally convert to Array types.
1132
1133/// Helper function for Vec-based SIMD multiplication
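///
/// All inputs must have the same length; `result` is overwritten. A usage sketch:
///
/// ```ignore
/// let a = vec![1.0f32, 2.0, 3.0];
/// let b = vec![4.0f32, 5.0, 6.0];
/// let mut out = vec![0.0f32; 3];
/// simd_mul_f32_ultra_vec(&a, &b, &mut out);
/// assert_eq!(out, vec![4.0, 10.0, 18.0]);
/// ```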
1134pub fn simd_mul_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1135    let a_array = Array1::from_vec(a.clone());
1136    let b_array = Array1::from_vec(b.clone());
1137    let mut result_array = Array1::from_vec(result.clone());
1138
1139    f32::simd_mul_f32_ultra(
1140        &a_array.view(),
1141        &b_array.view(),
1142        &mut result_array.view_mut(),
1143    );
1144    *result = result_array.to_vec();
1145}
1146
1147/// Helper function for Vec-based SIMD addition
1148pub fn simd_add_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1149    let a_array = Array1::from_vec(a.clone());
1150    let b_array = Array1::from_vec(b.clone());
1151    let mut result_array = Array1::from_vec(result.clone());
1152
1153    f32::simd_add_f32_ultra(
1154        &a_array.view(),
1155        &b_array.view(),
1156        &mut result_array.view_mut(),
1157    );
1158    *result = result_array.to_vec();
1159}
1160
1161/// Helper function for Vec-based SIMD division
1162pub fn simd_div_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1163    let a_array = Array1::from_vec(a.clone());
1164    let b_array = Array1::from_vec(b.clone());
1165    let mut result_array = Array1::from_vec(result.clone());
1166
1167    f32::simd_div_f32_ultra(
1168        &a_array.view(),
1169        &b_array.view(),
1170        &mut result_array.view_mut(),
1171    );
1172    *result = result_array.to_vec();
1173}
1174
1175/// Helper function for Vec-based SIMD sine
1176pub fn simd_sin_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1177    let a_array = Array1::from_vec(a.clone());
1178    let mut result_array = Array1::from_vec(result.clone());
1179
1180    f32::simd_sin_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1181    *result = result_array.to_vec();
1182}
1183
1184/// Helper function for Vec-based SIMD subtraction
1185pub fn simd_sub_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1186    let a_array = Array1::from_vec(a.clone());
1187    let b_array = Array1::from_vec(b.clone());
1188    let mut result_array = Array1::from_vec(result.clone());
1189
1190    f32::simd_sub_f32_ultra(
1191        &a_array.view(),
1192        &b_array.view(),
1193        &mut result_array.view_mut(),
1194    );
1195    *result = result_array.to_vec();
1196}
1197
1198/// Helper function for Vec-based SIMD FMA
1199pub fn simd_fma_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, c: &Vec<f32>, result: &mut Vec<f32>) {
1200    let a_array = Array1::from_vec(a.clone());
1201    let b_array = Array1::from_vec(b.clone());
1202    let c_array = Array1::from_vec(c.clone());
1203    let mut result_array = Array1::from_vec(result.clone());
1204
1205    f32::simd_fma_f32_ultra(
1206        &a_array.view(),
1207        &b_array.view(),
1208        &c_array.view(),
1209        &mut result_array.view_mut(),
1210    );
1211    *result = result_array.to_vec();
1212}
1213
1214/// Helper function for Vec-based SIMD power
1215pub fn simd_pow_f32_ultra_vec(a: &Vec<f32>, b: &Vec<f32>, result: &mut Vec<f32>) {
1216    let a_array = Array1::from_vec(a.clone());
1217    let b_array = Array1::from_vec(b.clone());
1218    let mut result_array = Array1::from_vec(result.clone());
1219
1220    f32::simd_pow_f32_ultra(
1221        &a_array.view(),
1222        &b_array.view(),
1223        &mut result_array.view_mut(),
1224    );
1225    *result = result_array.to_vec();
1226}
1227
1228/// Helper function for Vec-based SIMD exp
1229pub fn simd_exp_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1230    let a_array = Array1::from_vec(a.clone());
1231    let mut result_array = Array1::from_vec(result.clone());
1232
1233    f32::simd_exp_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1234    *result = result_array.to_vec();
1235}
1236
1237/// Helper function for Vec-based SIMD cos
1238pub fn simd_cos_f32_ultra_vec(a: &Vec<f32>, result: &mut Vec<f32>) {
1239    let a_array = Array1::from_vec(a.clone());
1240    let mut result_array = Array1::from_vec(result.clone());
1241
1242    f32::simd_cos_f32_ultra(&a_array.view(), &mut result_array.view_mut());
1243    *result = result_array.to_vec();
1244}
1245
1246#[cfg(test)]
1247mod tests {
1248    use super::*;
1249    use ndarray::arr1;
1250
1251    #[test]
1252    fn test_simd_unified_ops_f32() {
1253        let a = arr1(&[1.0f32, 2.0, 3.0, 4.0]);
1254        let b = arr1(&[5.0f32, 6.0, 7.0, 8.0]);
1255
1256        let sum = f32::simd_add(&a.view(), &b.view());
1257        assert_eq!(sum, arr1(&[6.0f32, 8.0, 10.0, 12.0]));
1258
1259        let product = f32::simd_mul(&a.view(), &b.view());
1260        assert_eq!(product, arr1(&[5.0f32, 12.0, 21.0, 32.0]));
1261
1262        let dot = f32::simd_dot(&a.view(), &b.view());
1263        assert_eq!(dot, 70.0);
1264    }
1265
1266    #[test]
1267    fn test_platform_capabilities() {
1268        let caps = PlatformCapabilities::detect();
1269        println!("{}", caps.summary());
1270    }
1271
1272    #[test]
1273    fn test_auto_optimizer() {
1274        let optimizer = AutoOptimizer::new();
1275
1276        // Small problem - should use scalar
1277        assert!(!optimizer.should_use_gpu(100));
1278
1279        // Large problem - depends on GPU availability
1280        let large_size = 100000;
1281        if optimizer.capabilities.gpu_available {
1282            assert!(optimizer.should_use_gpu(large_size));
1283        }
1284    }
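
    // Element-wise max/min and FMA should agree with a plain scalar computation
    // whether or not the `simd` feature is enabled.
    #[test]
    fn test_simd_max_min_fma_f32() {
        let a = arr1(&[1.0f32, 5.0, 3.0]);
        let b = arr1(&[4.0f32, 2.0, 3.0]);
        let c = arr1(&[1.0f32, 1.0, 1.0]);

        assert_eq!(f32::simd_max(&a.view(), &b.view()), arr1(&[4.0f32, 5.0, 3.0]));
        assert_eq!(f32::simd_min(&a.view(), &b.view()), arr1(&[1.0f32, 2.0, 3.0]));
        assert_eq!(
            f32::simd_fma(&a.view(), &b.view(), &c.view()),
            arr1(&[5.0f32, 11.0, 10.0])
        );
    }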
1285}