// sklears_decomposition/hardware_acceleration.rs

//! Hardware Acceleration for Decomposition Algorithms
//!
//! This module provides hardware-specific optimizations including:
//! - SIMD optimizations for matrix operations
//! - Multi-threaded parallel processing
//! - Mixed-precision arithmetic support
//! - Memory-aligned operations
//! - Vectorized mathematical functions

10#[cfg(feature = "parallel")]
11use rayon::prelude::*;
12use scirs2_core::ndarray::{Array1, Array2};
13use sklears_core::{
14    error::{Result, SklearsError},
15    types::Float,
16};
17use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
18#[cfg(target_arch = "aarch64")]
19use std::arch::aarch64::*;
20use std::ops::{Deref, DerefMut};
21use std::ptr::NonNull;
22use std::slice;
23
24#[cfg(feature = "gpu")]
25use candle_core::{DType, Device, Tensor};
26// cudarc is only available on non-macOS platforms (no CUDA on macOS)
27#[cfg(all(feature = "gpu", not(target_os = "macos")))]
28use cudarc::driver::safe::CudaContext;
29
/// Configuration for hardware acceleration features
///
/// Shared by [`SimdMatrixOps`], [`ParallelDecomposition`],
/// [`MixedPrecisionOps`] and the GPU helpers in this module.
#[derive(Debug, Clone)]
pub struct AccelerationConfig {
    /// Enable SIMD optimizations (NEON on aarch64; other targets fall back
    /// to scalar code)
    pub enable_simd: bool,
    /// Enable parallel processing (rayon-based when the `parallel` feature
    /// is compiled in)
    pub enable_parallel: bool,
    /// Enable mixed precision arithmetic (compute in f32, widen results
    /// back to f64)
    pub enable_mixed_precision: bool,
    /// Enable GPU acceleration (only used when the `gpu` feature is active)
    pub enable_gpu: bool,
    /// GPU device ID to use (0 = primary GPU)
    pub gpu_device_id: i32,
    /// GPU memory limit in bytes; `None` means no limit
    pub gpu_memory_limit: Option<usize>,
    /// Number of threads for parallel operations; `None` uses the rayon
    /// default
    pub num_threads: Option<usize>,
    /// Memory alignment for SIMD operations, in bytes; non-power-of-two
    /// values are rounded up by the aligned-memory helpers
    pub memory_alignment: usize,
}
50
51impl Default for AccelerationConfig {
52    fn default() -> Self {
53        Self {
54            enable_simd: true,
55            enable_parallel: true,
56            enable_mixed_precision: false,
57            enable_gpu: false,      // GPU disabled by default
58            gpu_device_id: 0,       // Primary GPU
59            gpu_memory_limit: None, // No memory limit
60            num_threads: None,      // Use rayon default
61            memory_alignment: 32,   // 256-bit alignment for AVX2/NEON
62        }
63    }
64}
65
/// SIMD-optimized matrix operations
///
/// On aarch64 the hot paths use NEON intrinsics; on other architectures
/// (or when `enable_simd` is off) scalar/ndarray fallbacks are used.
pub struct SimdMatrixOps {
    /// Controls SIMD/parallel dispatch in the methods of this type.
    config: AccelerationConfig,
}
70
71impl SimdMatrixOps {
72    /// Create a new SIMD matrix operations instance
73    pub fn new() -> Self {
74        Self {
75            config: AccelerationConfig::default(),
76        }
77    }
78
79    /// Set configuration
80    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
81        self.config = config;
82        self
83    }
84
85    /// SIMD-optimized vector dot product
86    pub fn dot_product_simd(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Float> {
87        if a.len() != b.len() {
88            return Err(SklearsError::InvalidInput(
89                "Vector dimensions must match for dot product".to_string(),
90            ));
91        }
92
93        if !self.config.enable_simd {
94            return Ok(self.dot_product_fallback(a, b));
95        }
96
97        #[cfg(target_arch = "aarch64")]
98        {
99            self.dot_product_neon(a, b)
100        }
101        #[cfg(not(target_arch = "aarch64"))]
102        {
103            Ok(self.dot_product_fallback(a, b))
104        }
105    }
106
107    #[cfg(target_arch = "aarch64")]
108    fn dot_product_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Float> {
109        let n = a.len();
110        let mut sum;
111
112        if std::mem::size_of::<Float>() == 8 {
113            // Double precision NEON operations
114            let chunks = n / 2;
115            let _remainder = n % 2;
116
117            unsafe {
118                let mut acc = vdupq_n_f64(0.0);
119
120                for i in 0..chunks {
121                    let idx = i * 2;
122                    let va = vld1q_f64(a.as_ptr().add(idx));
123                    let vb = vld1q_f64(b.as_ptr().add(idx));
124                    acc = vfmaq_f64(acc, va, vb);
125                }
126
127                // Sum the accumulator
128                sum = vgetq_lane_f64(acc, 0) + vgetq_lane_f64(acc, 1);
129
130                // Handle remainder
131                for i in (chunks * 2)..n {
132                    sum += a[i] * b[i];
133                }
134            }
135        } else {
136            // Single precision NEON operations
137            let chunks = n / 4;
138            let _remainder = n % 4;
139
140            unsafe {
141                let mut acc = vdupq_n_f32(0.0);
142                let a_ptr = a.as_ptr() as *const f32;
143                let b_ptr = b.as_ptr() as *const f32;
144
145                for i in 0..chunks {
146                    let idx = i * 4;
147                    let va = vld1q_f32(a_ptr.add(idx));
148                    let vb = vld1q_f32(b_ptr.add(idx));
149                    acc = vfmaq_f32(acc, va, vb);
150                }
151
152                // Sum the accumulator
153                let sum_vec = vpaddq_f32(acc, acc);
154                let sum_vec2 = vpaddq_f32(sum_vec, sum_vec);
155                sum = vgetq_lane_f32(sum_vec2, 0) as Float;
156
157                // Handle remainder
158                for i in (chunks * 4)..n {
159                    sum += a[i] * b[i];
160                }
161            }
162        }
163
164        Ok(sum)
165    }
166
167    fn dot_product_fallback(&self, a: &Array1<Float>, b: &Array1<Float>) -> Float {
168        a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum()
169    }
170
171    /// SIMD-optimized matrix-vector multiplication
172    pub fn matrix_vector_mul_simd(
173        &self,
174        matrix: &Array2<Float>,
175        vector: &Array1<Float>,
176    ) -> Result<Array1<Float>> {
177        let (m, n) = matrix.dim();
178        if n != vector.len() {
179            return Err(SklearsError::InvalidInput(
180                "Matrix columns must match vector length".to_string(),
181            ));
182        }
183
184        if !self.config.enable_simd || !self.config.enable_parallel {
185            return Ok(self.matrix_vector_mul_fallback(matrix, vector));
186        }
187
188        // Parallel SIMD matrix-vector multiplication
189        #[cfg(feature = "parallel")]
190        let result: Vec<Float> = (0..m)
191            .into_par_iter()
192            .map(|i| {
193                let row = matrix.row(i);
194                self.dot_product_simd(&row.to_owned(), vector)
195                    .unwrap_or(0.0)
196            })
197            .collect();
198
199        #[cfg(not(feature = "parallel"))]
200        let result: Vec<Float> = (0..m)
201            .map(|i| {
202                let row = matrix.row(i);
203                self.dot_product_simd(&row.to_owned(), vector)
204                    .unwrap_or(0.0)
205            })
206            .collect();
207
208        Ok(Array1::from_vec(result))
209    }
210
211    fn matrix_vector_mul_fallback(
212        &self,
213        matrix: &Array2<Float>,
214        vector: &Array1<Float>,
215    ) -> Array1<Float> {
216        matrix.dot(vector)
217    }
218
219    /// SIMD-optimized element-wise operations
220    pub fn elementwise_add_simd(
221        &self,
222        a: &Array1<Float>,
223        b: &Array1<Float>,
224    ) -> Result<Array1<Float>> {
225        if a.len() != b.len() {
226            return Err(SklearsError::InvalidInput(
227                "Array dimensions must match".to_string(),
228            ));
229        }
230
231        if !self.config.enable_simd {
232            return Ok(a + b);
233        }
234
235        #[cfg(target_arch = "aarch64")]
236        {
237            self.elementwise_add_neon(a, b)
238        }
239        #[cfg(not(target_arch = "aarch64"))]
240        {
241            Ok(a + b)
242        }
243    }
244
245    #[cfg(target_arch = "aarch64")]
246    fn elementwise_add_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Array1<Float>> {
247        let n = a.len();
248        let mut result = Array1::<Float>::zeros(n);
249
250        if std::mem::size_of::<Float>() == 8 {
251            // Double precision
252            let chunks = n / 2;
253
254            unsafe {
255                for i in 0..chunks {
256                    let idx = i * 2;
257                    let va = vld1q_f64(a.as_ptr().add(idx));
258                    let vb = vld1q_f64(b.as_ptr().add(idx));
259                    let vr = vaddq_f64(va, vb);
260                    vst1q_f64(result.as_mut_ptr().add(idx), vr);
261                }
262
263                // Handle remainder
264                for i in (chunks * 2)..n {
265                    result[i] = a[i] + b[i];
266                }
267            }
268        } else {
269            // Single precision
270            let chunks = n / 4;
271            let a_ptr = a.as_ptr() as *const f32;
272            let b_ptr = b.as_ptr() as *const f32;
273            let result_ptr = result.as_mut_ptr() as *mut f32;
274
275            unsafe {
276                for i in 0..chunks {
277                    let idx = i * 4;
278                    let va = vld1q_f32(a_ptr.add(idx));
279                    let vb = vld1q_f32(b_ptr.add(idx));
280                    let vr = vaddq_f32(va, vb);
281                    vst1q_f32(result_ptr.add(idx), vr);
282                }
283
284                // Handle remainder
285                for i in (chunks * 4)..n {
286                    result[i] = a[i] + b[i];
287                }
288            }
289        }
290
291        Ok(result)
292    }
293
294    /// SIMD-optimized element-wise multiplication
295    pub fn elementwise_mul_simd(
296        &self,
297        a: &Array1<Float>,
298        b: &Array1<Float>,
299    ) -> Result<Array1<Float>> {
300        if a.len() != b.len() {
301            return Err(SklearsError::InvalidInput(
302                "Array dimensions must match".to_string(),
303            ));
304        }
305
306        if !self.config.enable_simd {
307            return Ok(a * b);
308        }
309
310        #[cfg(target_arch = "aarch64")]
311        {
312            self.elementwise_mul_neon(a, b)
313        }
314        #[cfg(not(target_arch = "aarch64"))]
315        {
316            Ok(a * b)
317        }
318    }
319
320    #[cfg(target_arch = "aarch64")]
321    fn elementwise_mul_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Array1<Float>> {
322        let n = a.len();
323        let mut result = Array1::<Float>::zeros(n);
324
325        if std::mem::size_of::<Float>() == 8 {
326            // Double precision
327            let chunks = n / 2;
328
329            unsafe {
330                for i in 0..chunks {
331                    let idx = i * 2;
332                    let va = vld1q_f64(a.as_ptr().add(idx));
333                    let vb = vld1q_f64(b.as_ptr().add(idx));
334                    let vr = vmulq_f64(va, vb);
335                    vst1q_f64(result.as_mut_ptr().add(idx), vr);
336                }
337
338                // Handle remainder
339                for i in (chunks * 2)..n {
340                    result[i] = a[i] * b[i];
341                }
342            }
343        } else {
344            // Single precision
345            let chunks = n / 4;
346            let a_ptr = a.as_ptr() as *const f32;
347            let b_ptr = b.as_ptr() as *const f32;
348            let result_ptr = result.as_mut_ptr() as *mut f32;
349
350            unsafe {
351                for i in 0..chunks {
352                    let idx = i * 4;
353                    let va = vld1q_f32(a_ptr.add(idx));
354                    let vb = vld1q_f32(b_ptr.add(idx));
355                    let vr = vmulq_f32(va, vb);
356                    vst1q_f32(result_ptr.add(idx), vr);
357                }
358
359                // Handle remainder
360                for i in (chunks * 4)..n {
361                    result[i] = a[i] * b[i];
362                }
363            }
364        }
365
366        Ok(result)
367    }
368
369    /// Vectorized mathematical functions
370    pub fn vector_exp_simd(&self, input: &Array1<Float>) -> Array1<Float> {
371        if !self.config.enable_simd || !self.config.enable_parallel {
372            return input.mapv(|x| x.exp());
373        }
374
375        // Parallel vectorized exponential
376        #[cfg(feature = "parallel")]
377        let result: Vec<Float> = input.par_iter().map(|&x| x.exp()).collect();
378
379        #[cfg(not(feature = "parallel"))]
380        let result: Vec<Float> = input.iter().map(|&x| x.exp()).collect();
381        Array1::from_vec(result)
382    }
383
384    pub fn vector_sqrt_simd(&self, input: &Array1<Float>) -> Array1<Float> {
385        if !self.config.enable_simd || !self.config.enable_parallel {
386            return input.mapv(|x| x.sqrt());
387        }
388
389        // Parallel vectorized square root
390        #[cfg(feature = "parallel")]
391        let result: Vec<Float> = input.par_iter().map(|&x| x.sqrt()).collect();
392
393        #[cfg(not(feature = "parallel"))]
394        let result: Vec<Float> = input.iter().map(|&x| x.sqrt()).collect();
395        Array1::from_vec(result)
396    }
397
398    pub fn vector_sin_simd(&self, input: &Array1<Float>) -> Array1<Float> {
399        if !self.config.enable_simd || !self.config.enable_parallel {
400            return input.mapv(|x| x.sin());
401        }
402
403        // Parallel vectorized sine
404        #[cfg(feature = "parallel")]
405        let result: Vec<Float> = input.par_iter().map(|&x| x.sin()).collect();
406
407        #[cfg(not(feature = "parallel"))]
408        let result: Vec<Float> = input.iter().map(|&x| x.sin()).collect();
409        Array1::from_vec(result)
410    }
411}
412
413impl Default for SimdMatrixOps {
414    fn default() -> Self {
415        Self::new()
416    }
417}
418
/// Parallel decomposition algorithms
///
/// NOTE(review): the SVD and eigendecomposition kernels in this type are
/// placeholders that return identity factors; only the matrix-multiply
/// path produces real numerical results.
pub struct ParallelDecomposition {
    /// Controls parallel dispatch and size thresholds.
    config: AccelerationConfig,
}
423
424impl ParallelDecomposition {
425    /// Create a new parallel decomposition instance
426    pub fn new() -> Self {
427        Self {
428            config: AccelerationConfig::default(),
429        }
430    }
431
432    /// Set configuration
433    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
434        self.config = config;
435        self
436    }
437
438    /// Parallel SVD computation for large matrices
439    pub fn parallel_svd(
440        &self,
441        matrix: &Array2<Float>,
442    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
443        let (m, n) = matrix.dim();
444
445        if !self.config.enable_parallel {
446            return self.sequential_svd(matrix);
447        }
448
449        // For large matrices, use block-based parallel SVD
450        if m > 1000 && n > 1000 {
451            self.block_parallel_svd(matrix)
452        } else {
453            self.sequential_svd(matrix)
454        }
455    }
456
457    fn block_parallel_svd(
458        &self,
459        matrix: &Array2<Float>,
460    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
461        // Simplified block-based SVD - in practice would use more sophisticated algorithms
462        let (_m, _n) = matrix.dim();
463        let _block_size = 256;
464
465        // For now, fall back to sequential SVD
466        // A full implementation would use randomized SVD or hierarchical SVD
467        self.sequential_svd(matrix)
468    }
469
470    fn sequential_svd(
471        &self,
472        matrix: &Array2<Float>,
473    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
474        // Use ndarray_linalg for SVD computation
475        // This is a placeholder - actual implementation would depend on available LAPACK
476        let (m, n) = matrix.dim();
477        let min_dim = m.min(n);
478
479        // Simplified SVD placeholder
480        let u = Array2::eye(m);
481        let s = Array1::ones(min_dim);
482        let vt = Array2::eye(n);
483
484        Ok((u, s, vt))
485    }
486
487    /// Parallel eigendecomposition
488    pub fn parallel_eigendecomposition(
489        &self,
490        matrix: &Array2<Float>,
491    ) -> Result<(Array1<Float>, Array2<Float>)> {
492        let n = matrix.nrows();
493        if n != matrix.ncols() {
494            return Err(SklearsError::InvalidInput(
495                "Matrix must be square for eigendecomposition".to_string(),
496            ));
497        }
498
499        if !self.config.enable_parallel || n < 500 {
500            return self.sequential_eigendecomposition(matrix);
501        }
502
503        // For large matrices, use parallel algorithms
504        self.block_parallel_eigendecomposition(matrix)
505    }
506
507    fn block_parallel_eigendecomposition(
508        &self,
509        matrix: &Array2<Float>,
510    ) -> Result<(Array1<Float>, Array2<Float>)> {
511        // Simplified parallel eigendecomposition
512        // A full implementation would use divide-and-conquer algorithms
513        self.sequential_eigendecomposition(matrix)
514    }
515
516    fn sequential_eigendecomposition(
517        &self,
518        matrix: &Array2<Float>,
519    ) -> Result<(Array1<Float>, Array2<Float>)> {
520        let n = matrix.nrows();
521
522        // Simplified eigendecomposition placeholder
523        let eigenvalues = Array1::ones(n);
524        let eigenvectors = Array2::eye(n);
525
526        Ok((eigenvalues, eigenvectors))
527    }
528
529    /// Parallel matrix multiplication with tiling
530    pub fn parallel_matrix_multiply(
531        &self,
532        a: &Array2<Float>,
533        b: &Array2<Float>,
534    ) -> Result<Array2<Float>> {
535        let (m, k1) = a.dim();
536        let (k2, n) = b.dim();
537
538        if k1 != k2 {
539            return Err(SklearsError::InvalidInput(
540                "Matrix dimensions incompatible for multiplication".to_string(),
541            ));
542        }
543
544        if !self.config.enable_parallel {
545            return Ok(a.dot(b));
546        }
547
548        // Use tiled parallel multiplication for large matrices
549        if m > 100 && n > 100 && k1 > 100 {
550            self.tiled_parallel_multiply(a, b)
551        } else {
552            Ok(a.dot(b))
553        }
554    }
555
556    fn tiled_parallel_multiply(
557        &self,
558        a: &Array2<Float>,
559        b: &Array2<Float>,
560    ) -> Result<Array2<Float>> {
561        let (m, k) = a.dim();
562        let (_, n) = b.dim();
563        let tile_size = 64; // Cache-friendly tile size
564
565        let _result = Array2::<Float>::zeros((m, n));
566
567        // Parallel tiled multiplication
568        let m_tiles = (m + tile_size - 1) / tile_size;
569        let n_tiles = (n + tile_size - 1) / tile_size;
570        let k_tiles = (k + tile_size - 1) / tile_size;
571
572        #[cfg(feature = "parallel")]
573        {
574            (0..m_tiles).into_par_iter().for_each(|i_tile| {
575                for j_tile in 0..n_tiles {
576                    let mut tile_result = Array2::zeros((
577                        (tile_size).min(m - i_tile * tile_size),
578                        (tile_size).min(n - j_tile * tile_size),
579                    ));
580
581                    for k_tile in 0..k_tiles {
582                        let i_start = i_tile * tile_size;
583                        let i_end = (i_start + tile_size).min(m);
584                        let j_start = j_tile * tile_size;
585                        let j_end = (j_start + tile_size).min(n);
586                        let k_start = k_tile * tile_size;
587                        let k_end = (k_start + tile_size).min(k);
588
589                        let a_tile =
590                            a.slice(scirs2_core::ndarray::s![i_start..i_end, k_start..k_end]);
591                        let b_tile =
592                            b.slice(scirs2_core::ndarray::s![k_start..k_end, j_start..j_end]);
593
594                        tile_result += &a_tile.dot(&b_tile);
595                    }
596
597                    // This requires synchronization - simplified for demonstration
598                    // In practice, would need proper synchronization mechanisms
599                }
600            });
601        }
602
603        #[cfg(not(feature = "parallel"))]
604        {
605            (0..m_tiles).for_each(|i_tile| {
606                for j_tile in 0..n_tiles {
607                    let mut tile_result = Array2::zeros((
608                        (tile_size).min(m - i_tile * tile_size),
609                        (tile_size).min(n - j_tile * tile_size),
610                    ));
611
612                    for k_tile in 0..k_tiles {
613                        let i_start = i_tile * tile_size;
614                        let i_end = (i_start + tile_size).min(m);
615                        let j_start = j_tile * tile_size;
616                        let j_end = (j_start + tile_size).min(n);
617                        let k_start = k_tile * tile_size;
618                        let k_end = (k_start + tile_size).min(k);
619
620                        let a_tile =
621                            a.slice(scirs2_core::ndarray::s![i_start..i_end, k_start..k_end]);
622                        let b_tile =
623                            b.slice(scirs2_core::ndarray::s![k_start..k_end, j_start..j_end]);
624
625                        tile_result += &a_tile.dot(&b_tile);
626                    }
627
628                    // Write tile result back (this would need proper synchronization)
629                    // For now, skip the actual writing since we fall back anyway
630                }
631            });
632        }
633
634        // For now, fall back to standard multiplication
635        Ok(a.dot(b))
636    }
637}
638
639impl Default for ParallelDecomposition {
640    fn default() -> Self {
641        Self::new()
642    }
643}
644
/// Mixed-precision arithmetic support
///
/// Provides f32 <-> f64 array conversions and an f32-compute matrix
/// multiply, gated by `enable_mixed_precision` / `enable_parallel`.
pub struct MixedPrecisionOps {
    /// Controls precision mode and parallel dispatch.
    config: AccelerationConfig,
}
649
650impl MixedPrecisionOps {
651    /// Create a new mixed-precision operations instance
652    pub fn new() -> Self {
653        Self {
654            config: AccelerationConfig::default(),
655        }
656    }
657
658    /// Set configuration
659    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
660        self.config = config;
661        self
662    }
663
664    /// Convert double precision array to single precision
665    pub fn to_single_precision(&self, input: &Array1<f64>) -> Array1<f32> {
666        if self.config.enable_parallel {
667            #[cfg(feature = "parallel")]
668            let result: Vec<f32> = input.par_iter().map(|&x| x as f32).collect();
669
670            #[cfg(not(feature = "parallel"))]
671            let result: Vec<f32> = input.iter().map(|&x| x as f32).collect();
672            Array1::from_vec(result)
673        } else {
674            input.mapv(|x| x as f32)
675        }
676    }
677
678    /// Convert single precision array to double precision
679    pub fn to_double_precision(&self, input: &Array1<f32>) -> Array1<f64> {
680        if self.config.enable_parallel {
681            #[cfg(feature = "parallel")]
682            let result: Vec<f64> = input.par_iter().map(|&x| x as f64).collect();
683
684            #[cfg(not(feature = "parallel"))]
685            let result: Vec<f64> = input.iter().map(|&x| x as f64).collect();
686            Array1::from_vec(result)
687        } else {
688            input.mapv(|x| x as f64)
689        }
690    }
691
692    /// Mixed-precision matrix multiplication (compute in f32, accumulate in f64)
693    pub fn mixed_precision_multiply(
694        &self,
695        a: &Array2<f64>,
696        b: &Array2<f64>,
697    ) -> Result<Array2<f64>> {
698        if !self.config.enable_mixed_precision {
699            return Ok(a.dot(b));
700        }
701
702        let (_m, k1) = a.dim();
703        let (k2, _n) = b.dim();
704
705        if k1 != k2 {
706            return Err(SklearsError::InvalidInput(
707                "Matrix dimensions incompatible for multiplication".to_string(),
708            ));
709        }
710
711        // Convert to single precision for computation
712        let a_f32 = a.mapv(|x| x as f32);
713        let b_f32 = b.mapv(|x| x as f32);
714
715        // Compute in single precision
716        let result_f32 = a_f32.dot(&b_f32);
717
718        // Convert back to double precision
719        let result = result_f32.mapv(|x| x as f64);
720
721        Ok(result)
722    }
723}
724
725impl Default for MixedPrecisionOps {
726    fn default() -> Self {
727        Self::new()
728    }
729}
730
/// Memory-aligned operations for optimal performance
///
/// Holds a sanitized alignment (a power of two at least
/// `align_of::<Float>()`, see `sanitize_alignment`) and hands out
/// [`AlignedBuffer`]s allocated with it.
pub struct AlignedMemoryOps {
    /// Alignment in bytes, sanitized in `new`.
    alignment: usize,
}
735
/// Owned buffer with guaranteed memory alignment
///
/// Invariants: when `len > 0`, `ptr` points to a zero-initialized heap
/// allocation of `len` `Float`s made with alignment `alignment`; when
/// `len == 0`, `ptr` is dangling and no allocation exists.
pub struct AlignedBuffer {
    /// Base pointer of the allocation (dangling when `len == 0`).
    ptr: NonNull<Float>,
    /// Number of `Float` elements.
    len: usize,
    /// Alignment used to build the `Layout` at allocation time; reused by
    /// `Drop` to deallocate with a matching layout.
    alignment: usize,
}
742
743impl AlignedMemoryOps {
744    /// Create new aligned memory operations with specified alignment
745    pub fn new(alignment: usize) -> Self {
746        Self {
747            alignment: Self::sanitize_alignment(alignment),
748        }
749    }
750
751    fn sanitize_alignment(requested: usize) -> usize {
752        let base = requested.max(std::mem::align_of::<Float>()).max(1);
753        if base.is_power_of_two() {
754            base
755        } else {
756            base.next_power_of_two()
757        }
758    }
759
760    /// Create aligned array with specified size
761    pub fn create_aligned_array(&self, size: usize) -> AlignedBuffer {
762        AlignedBuffer::new(size, self.alignment)
763    }
764
765    /// Check if array is properly aligned
766    pub fn is_aligned(&self, data: &[Float]) -> bool {
767        let ptr = data.as_ptr() as usize;
768        ptr % self.alignment == 0
769    }
770
771    /// Copy data to aligned buffer if necessary
772    pub fn ensure_aligned(&self, data: &Array1<Float>) -> AlignedBuffer {
773        let slice = data
774            .as_slice()
775            .expect("Array1 should have a contiguous slice representation");
776
777        let mut buffer = self.create_aligned_array(slice.len());
778        buffer.as_mut_slice().copy_from_slice(slice);
779        buffer
780    }
781}
782
783impl Default for AlignedMemoryOps {
784    fn default() -> Self {
785        Self::new(32) // 256-bit alignment
786    }
787}
788
789impl AlignedBuffer {
790    pub fn new(size: usize, alignment: usize) -> Self {
791        if size == 0 {
792            return Self {
793                ptr: NonNull::dangling(),
794                len: 0,
795                alignment,
796            };
797        }
798
799        let elem_size = std::mem::size_of::<Float>();
800        let total_size = elem_size
801            .checked_mul(size)
802            .expect("Requested buffer size exceeds addressable memory");
803        let layout = Layout::from_size_align(total_size, alignment)
804            .expect("Invalid layout for aligned allocation");
805
806        unsafe {
807            let ptr = alloc_zeroed(layout);
808            if ptr.is_null() {
809                handle_alloc_error(layout);
810            }
811
812            Self {
813                ptr: NonNull::new_unchecked(ptr as *mut Float),
814                len: size,
815                alignment,
816            }
817        }
818    }
819
820    pub fn len(&self) -> usize {
821        self.len
822    }
823
824    pub fn is_empty(&self) -> bool {
825        self.len == 0
826    }
827
828    pub fn as_slice(&self) -> &[Float] {
829        unsafe { slice::from_raw_parts(self.ptr.as_ptr(), self.len) }
830    }
831
832    pub fn as_mut_slice(&mut self) -> &mut [Float] {
833        unsafe { slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len) }
834    }
835}
836
837impl Deref for AlignedBuffer {
838    type Target = [Float];
839
840    fn deref(&self) -> &Self::Target {
841        self.as_slice()
842    }
843}
844
845impl DerefMut for AlignedBuffer {
846    fn deref_mut(&mut self) -> &mut Self::Target {
847        self.as_mut_slice()
848    }
849}
850
impl Drop for AlignedBuffer {
    fn drop(&mut self) {
        // Empty buffers used a dangling pointer and never allocated.
        if self.len == 0 {
            return;
        }

        // Rebuild the layout used in `new` (same size and stored alignment).
        // `from_size_align` succeeded at allocation time, so it succeeds
        // again here; the `if let` is purely defensive — a failure would
        // leak the allocation rather than deallocate with a wrong layout.
        let elem_size = std::mem::size_of::<Float>();
        let total_size = elem_size * self.len;
        if let Ok(layout) = Layout::from_size_align(total_size, self.alignment) {
            // SAFETY: `ptr` was returned by `alloc_zeroed` with this exact
            // layout and has not been deallocated before.
            unsafe {
                dealloc(self.ptr.as_ptr() as *mut u8, layout);
            }
        }
    }
}
866
/// GPU acceleration for matrix operations using CUDA
///
/// Holds a candle `Device` (CUDA when initialization succeeds, CPU as the
/// fallback) and, on non-macOS platforms, an optional cudarc
/// `CudaContext` handle.
#[cfg(feature = "gpu")]
pub struct GpuAcceleration {
    /// Controls whether GPU usage is attempted and which device ID is used.
    config: AccelerationConfig,
    /// Candle compute device; `Device::Cpu` when GPU is disabled or
    /// unavailable.
    device: Device,
    /// Raw CUDA context; absent on macOS (no CUDA support there).
    #[cfg(not(target_os = "macos"))]
    cuda_device: Option<std::sync::Arc<CudaContext>>,
}
875
876#[cfg(feature = "gpu")]
877impl GpuAcceleration {
878    /// Create a new GPU acceleration instance
879    pub fn new() -> Result<Self> {
880        Self::with_config(AccelerationConfig::default())
881    }
882
883    /// Create GPU acceleration with specific configuration
884    pub fn with_config(config: AccelerationConfig) -> Result<Self> {
885        if !config.enable_gpu {
886            return Ok(Self {
887                config,
888                device: Device::Cpu,
889                #[cfg(not(target_os = "macos"))]
890                cuda_device: None,
891            });
892        }
893
894        // Initialize CUDA device (only on non-macOS platforms)
895        #[cfg(not(target_os = "macos"))]
896        let cuda_device = match CudaContext::new(config.gpu_device_id as usize) {
897            Ok(device) => Some(device),
898            Err(_) => {
899                return Err(SklearsError::InvalidInput(
900                    "Failed to initialize CUDA device".to_string(),
901                ));
902            }
903        };
904
905        // Initialize Candle device
906        let device = match Device::new_cuda(config.gpu_device_id as usize) {
907            Ok(dev) => dev,
908            Err(_) => Device::Cpu, // Fallback to CPU
909        };
910
911        Ok(Self {
912            config,
913            device,
914            #[cfg(not(target_os = "macos"))]
915            cuda_device,
916        })
917    }
918
919    /// Check if GPU is available and properly initialized
920    pub fn is_gpu_available(&self) -> bool {
921        #[cfg(not(target_os = "macos"))]
922        return self.cuda_device.is_some() && matches!(self.device, Device::Cuda(_));
923
924        #[cfg(target_os = "macos")]
925        return matches!(self.device, Device::Cuda(_));
926    }
927
928    /// Get GPU memory information
929    pub fn gpu_memory_info(&self) -> Result<(usize, usize)> {
930        #[cfg(not(target_os = "macos"))]
931        {
932            if let Some(ref _cuda_device) = self.cuda_device {
933                // CudaContext in cudarc 0.17 uses attribute() for device queries
934                // For now, return default values (actual implementation would query CUDA)
935                let total = 8 * 1024 * 1024 * 1024; // Default 8GB
936                let free = total / 2; // Conservative estimate
937                Ok((free, total))
938            } else {
939                Err(SklearsError::InvalidInput("GPU not available".to_string()))
940            }
941        }
942
943        #[cfg(target_os = "macos")]
944        Err(SklearsError::InvalidInput(
945            "CUDA not available on macOS".to_string(),
946        ))
947    }
948
949    /// Convert ndarray to GPU tensor
950    pub fn array_to_tensor(&self, array: &Array2<Float>) -> Result<Tensor> {
951        let shape = array.shape();
952        let data: Vec<f32> = if std::mem::size_of::<Float>() == 8 {
953            array.iter().map(|&x| x as f32).collect()
954        } else {
955            array.iter().map(|&x| x as f32).collect()
956        };
957
958        match Tensor::from_vec(data, (shape[0], shape[1]), &self.device) {
959            Ok(tensor) => Ok(tensor),
960            Err(_) => Err(SklearsError::InvalidInput(
961                "Failed to create GPU tensor".to_string(),
962            )),
963        }
964    }
965
966    /// Convert GPU tensor back to ndarray
967    pub fn tensor_to_array(&self, tensor: &Tensor) -> Result<Array2<Float>> {
968        let shape = tensor.shape();
969        let dims = shape.dims();
970        if dims.len() != 2 {
971            return Err(SklearsError::InvalidInput("Tensor must be 2D".to_string()));
972        }
973
974        match tensor.to_vec2::<f32>() {
975            Ok(data) => {
976                let flat_data: Vec<Float> = data
977                    .into_iter()
978                    .flatten()
979                    .map(|x| {
980                        if std::mem::size_of::<Float>() == 8 {
981                            x as f64
982                        } else {
983                            x as f64
984                        }
985                    })
986                    .collect();
987
988                match Array2::from_shape_vec((dims[0], dims[1]), flat_data) {
989                    Ok(array) => Ok(array),
990                    Err(_) => Err(SklearsError::InvalidInput(
991                        "Failed to create array from tensor".to_string(),
992                    )),
993                }
994            }
995            Err(_) => Err(SklearsError::InvalidInput(
996                "Failed to convert tensor to array".to_string(),
997            )),
998        }
999    }
1000
1001    /// GPU-accelerated matrix multiplication
1002    pub fn gpu_matrix_multiply(
1003        &self,
1004        a: &Array2<Float>,
1005        b: &Array2<Float>,
1006    ) -> Result<Array2<Float>> {
1007        if !self.is_gpu_available() {
1008            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
1009        }
1010
1011        let (_m, k1) = a.dim();
1012        let (k2, _n) = b.dim();
1013
1014        if k1 != k2 {
1015            return Err(SklearsError::InvalidInput(
1016                "Matrix dimensions incompatible for multiplication".to_string(),
1017            ));
1018        }
1019
1020        // Convert to GPU tensors
1021        let tensor_a = self.array_to_tensor(a)?;
1022        let tensor_b = self.array_to_tensor(b)?;
1023
1024        // Perform GPU matrix multiplication
1025        let result_tensor = match tensor_a.matmul(&tensor_b) {
1026            Ok(tensor) => tensor,
1027            Err(_) => {
1028                return Err(SklearsError::InvalidInput(
1029                    "GPU matrix multiplication failed".to_string(),
1030                ))
1031            }
1032        };
1033
1034        // Convert back to array
1035        self.tensor_to_array(&result_tensor)
1036    }
1037
1038    /// GPU-accelerated SVD decomposition
1039    pub fn gpu_svd(
1040        &self,
1041        matrix: &Array2<Float>,
1042    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
1043        if !self.is_gpu_available() {
1044            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
1045        }
1046
1047        let _tensor = self.array_to_tensor(matrix)?;
1048
1049        // For now, use a simplified GPU-based SVD implementation
1050        // In practice, this would use cuSOLVER or similar libraries
1051        let (m, n) = matrix.dim();
1052        let min_dim = m.min(n);
1053
1054        // This is a placeholder - real implementation would use cuSOLVER
1055        let u_tensor = match Tensor::eye(m, DType::F32, &self.device) {
1056            Ok(t) => t,
1057            Err(_) => {
1058                return Err(SklearsError::InvalidInput(
1059                    "Failed to create identity matrix".to_string(),
1060                ))
1061            }
1062        };
1063
1064        let s_data = vec![1.0f32; min_dim];
1065        let s_tensor = match Tensor::from_vec(s_data, min_dim, &self.device) {
1066            Ok(t) => t,
1067            Err(_) => {
1068                return Err(SklearsError::InvalidInput(
1069                    "Failed to create singular values".to_string(),
1070                ))
1071            }
1072        };
1073
1074        let vt_tensor = match Tensor::eye(n, DType::F32, &self.device) {
1075            Ok(t) => t,
1076            Err(_) => {
1077                return Err(SklearsError::InvalidInput(
1078                    "Failed to create identity matrix".to_string(),
1079                ))
1080            }
1081        };
1082
1083        // Convert tensors back to arrays
1084        let u = self.tensor_to_array(&u_tensor)?;
1085        let vt = self.tensor_to_array(&vt_tensor)?;
1086
1087        // Convert singular values
1088        let s_vec = match s_tensor.to_vec1::<f32>() {
1089            Ok(vec) => vec.into_iter().map(|x| x as Float).collect(),
1090            Err(_) => {
1091                return Err(SklearsError::InvalidInput(
1092                    "Failed to convert singular values".to_string(),
1093                ))
1094            }
1095        };
1096        let s = Array1::from_vec(s_vec);
1097
1098        Ok((u, s, vt))
1099    }
1100
1101    /// GPU-accelerated eigendecomposition
1102    pub fn gpu_eigendecomposition(
1103        &self,
1104        matrix: &Array2<Float>,
1105    ) -> Result<(Array1<Float>, Array2<Float>)> {
1106        if !self.is_gpu_available() {
1107            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
1108        }
1109
1110        let n = matrix.nrows();
1111        if n != matrix.ncols() {
1112            return Err(SklearsError::InvalidInput(
1113                "Matrix must be square for eigendecomposition".to_string(),
1114            ));
1115        }
1116
1117        let _tensor = self.array_to_tensor(matrix)?;
1118
1119        // Simplified GPU eigendecomposition - real implementation would use cuSOLVER
1120        let eigenvals_data = vec![1.0f32; n];
1121        let eigenvals_tensor = match Tensor::from_vec(eigenvals_data, n, &self.device) {
1122            Ok(t) => t,
1123            Err(_) => {
1124                return Err(SklearsError::InvalidInput(
1125                    "Failed to create eigenvalues".to_string(),
1126                ))
1127            }
1128        };
1129
1130        let eigenvecs_tensor = match Tensor::eye(n, DType::F32, &self.device) {
1131            Ok(t) => t,
1132            Err(_) => {
1133                return Err(SklearsError::InvalidInput(
1134                    "Failed to create eigenvectors".to_string(),
1135                ))
1136            }
1137        };
1138
1139        // Convert back to arrays
1140        let eigenvecs = self.tensor_to_array(&eigenvecs_tensor)?;
1141        let eigenvals_vec = match eigenvals_tensor.to_vec1::<f32>() {
1142            Ok(vec) => vec.into_iter().map(|x| x as Float).collect(),
1143            Err(_) => {
1144                return Err(SklearsError::InvalidInput(
1145                    "Failed to convert eigenvalues".to_string(),
1146                ))
1147            }
1148        };
1149        let eigenvals = Array1::from_vec(eigenvals_vec);
1150
1151        Ok((eigenvals, eigenvecs))
1152    }
1153
1154    /// Batch GPU operations for multiple matrices
1155    pub fn batch_gpu_multiply(&self, matrices: &[Array2<Float>]) -> Result<Vec<Array2<Float>>> {
1156        if !self.is_gpu_available() {
1157            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
1158        }
1159
1160        let mut results = Vec::with_capacity(matrices.len() / 2);
1161
1162        for chunk in matrices.chunks_exact(2) {
1163            let result = self.gpu_matrix_multiply(&chunk[0], &chunk[1])?;
1164            results.push(result);
1165        }
1166
1167        Ok(results)
1168    }
1169
1170    /// GPU memory management
1171    pub fn free_gpu_memory(&self) -> Result<()> {
1172        #[cfg(not(target_os = "macos"))]
1173        {
1174            if let Some(ref cuda_device) = self.cuda_device {
1175                match cuda_device.synchronize() {
1176                    Ok(()) => Ok(()),
1177                    Err(_) => Err(SklearsError::InvalidInput(
1178                        "Failed to synchronize GPU device".to_string(),
1179                    )),
1180                }
1181            } else {
1182                Ok(())
1183            }
1184        }
1185
1186        #[cfg(target_os = "macos")]
1187        Ok(())
1188    }
1189
1190    /// Profile GPU operation performance
1191    pub fn profile_gpu_operation<F, T>(&self, operation: F) -> Result<(T, std::time::Duration)>
1192    where
1193        F: FnOnce() -> Result<T>,
1194    {
1195        let start = std::time::Instant::now();
1196        let result = operation()?;
1197
1198        // Synchronize GPU to ensure timing accuracy
1199        self.free_gpu_memory()?;
1200
1201        let duration = start.elapsed();
1202        Ok((result, duration))
1203    }
1204}
1205
#[cfg(feature = "gpu")]
impl Default for GpuAcceleration {
    /// Build an accelerator, degrading to a CPU-only instance when GPU
    /// initialization fails instead of panicking.
    fn default() -> Self {
        match Self::new() {
            Ok(acc) => acc,
            Err(_) => Self {
                // Fallback: same defaults but with the GPU explicitly off.
                config: AccelerationConfig {
                    enable_gpu: false,
                    ..AccelerationConfig::default()
                },
                device: Device::Cpu,
                #[cfg(not(target_os = "macos"))]
                cuda_device: None,
            },
        }
    }
}
1224
/// GPU-accelerated decomposition algorithms
#[cfg(feature = "gpu")]
pub struct GpuDecomposition {
    /// Device handle plus host<->device conversion helpers used by all
    /// decomposition routines.
    gpu_acceleration: GpuAcceleration,
}
1230
#[cfg(feature = "gpu")]
impl GpuDecomposition {
    /// Create a new GPU decomposition instance with the default configuration.
    ///
    /// # Errors
    /// Propagates any GPU initialization failure.
    pub fn new() -> Result<Self> {
        Ok(Self {
            gpu_acceleration: GpuAcceleration::new()?,
        })
    }

    /// Create an instance with a caller-supplied acceleration configuration.
    ///
    /// # Errors
    /// Propagates any GPU initialization failure.
    pub fn with_config(config: AccelerationConfig) -> Result<Self> {
        Ok(Self {
            gpu_acceleration: GpuAcceleration::with_config(config)?,
        })
    }

    /// GPU-accelerated PCA using SVD.
    ///
    /// Centers the columns of `data`, runs the (GPU) SVD on the centered
    /// matrix, and returns the first `n_components` columns/rows of
    /// `(U, S, Vt)`.
    ///
    /// # Errors
    /// Fails when `n_components` exceeds `min(rows, cols)`, when `data` has
    /// no rows (column means are undefined), or when the SVD fails.
    pub fn gpu_pca(
        &self,
        data: &Array2<Float>,
        n_components: usize,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        let (m, n) = data.dim();
        if n_components > m.min(n) {
            return Err(SklearsError::InvalidInput(
                "Number of components cannot exceed matrix dimensions".to_string(),
            ));
        }

        // Center the data. `mean_axis` returns `None` for a zero-length
        // axis, so an empty input becomes an explicit error instead of the
        // panic the previous `unwrap()` produced.
        let col_means = data
            .mean_axis(scirs2_core::ndarray::Axis(0))
            .ok_or_else(|| {
                SklearsError::InvalidInput("Input data must have at least one row".to_string())
            })?;
        let centered_data = data - &col_means.insert_axis(scirs2_core::ndarray::Axis(0));

        // Perform GPU SVD on the centered matrix.
        let (u, s, vt) = self.gpu_acceleration.gpu_svd(&centered_data)?;

        // Truncate each factor to the requested number of components.
        let u_truncated = u
            .slice(scirs2_core::ndarray::s![.., ..n_components])
            .to_owned();
        let s_truncated = s.slice(scirs2_core::ndarray::s![..n_components]).to_owned();
        let vt_truncated = vt
            .slice(scirs2_core::ndarray::s![..n_components, ..])
            .to_owned();

        Ok((u_truncated, s_truncated, vt_truncated))
    }

    /// GPU-accelerated matrix factorization.
    ///
    /// Factors `matrix` (m x n) into `A` (m x k) and `B` (k x n) with
    /// `k = min(m, n)` by splitting the SVD's singular values evenly:
    /// `A = U * sqrt(S)` and `B = sqrt(S) * Vt`, so `A * B ≈ matrix`.
    ///
    /// # Errors
    /// Propagates any failure from the underlying GPU SVD.
    pub fn gpu_factorize(&self, matrix: &Array2<Float>) -> Result<(Array2<Float>, Array2<Float>)> {
        let (m, n) = matrix.dim();
        let k = m.min(n);

        // Use GPU SVD for factorization.
        let (u, s, vt) = self.gpu_acceleration.gpu_svd(matrix)?;

        // Distribute the singular values across both factors via sqrt(S);
        // the broadcasts below multiply row-wise / column-wise respectively.
        let s_sqrt = s.mapv(|x| x.sqrt());
        let factor_a = &u.slice(scirs2_core::ndarray::s![.., ..k])
            * &s_sqrt.view().insert_axis(scirs2_core::ndarray::Axis(0));
        let factor_b = &s_sqrt.view().insert_axis(scirs2_core::ndarray::Axis(1))
            * &vt.slice(scirs2_core::ndarray::s![..k, ..]);

        Ok((factor_a, factor_b))
    }
}
1297
#[cfg(feature = "gpu")]
impl Default for GpuDecomposition {
    /// Build a decomposition helper, falling back to a CPU-backed instance
    /// when GPU initialization fails.
    ///
    /// The previous fallback called `GpuAcceleration::with_config(..).unwrap()`,
    /// which could panic inside the supposedly infallible fallback path.
    /// Delegating to `GpuAcceleration::default()` performs the same
    /// try-GPU-then-CPU logic with no panic path at all.
    fn default() -> Self {
        Self {
            gpu_acceleration: GpuAcceleration::default(),
        }
    }
}
1313
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    // SIMD dot product against a hand-computed scalar result.
    #[test]
    fn test_simd_dot_product() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.dot_product_simd(&a, &b).unwrap();
        let expected = 1.0 * 5.0 + 2.0 * 6.0 + 3.0 * 7.0 + 4.0 * 8.0;

        assert!((result - expected).abs() < 1e-10);
    }

    // Element-wise addition matches scalar addition per component.
    #[test]
    fn test_simd_elementwise_add() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.elementwise_add_simd(&a, &b).unwrap();
        let expected = Array1::from_vec(vec![6.0, 8.0, 10.0, 12.0]);

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    // Element-wise multiplication matches scalar multiplication per component.
    #[test]
    fn test_simd_elementwise_mul() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.elementwise_mul_simd(&a, &b).unwrap();
        let expected = Array1::from_vec(vec![5.0, 12.0, 21.0, 32.0]);

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    // Matrix-vector product against hand-computed row dot products.
    #[test]
    fn test_matrix_vector_mul_simd() {
        let simd_ops = SimdMatrixOps::new();
        let matrix = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let vector = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let result = simd_ops.matrix_vector_mul_simd(&matrix, &vector).unwrap();
        let expected = Array1::from_vec(vec![14.0, 32.0]); // [1*1+2*2+3*3, 4*1+5*2+6*3]

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    // Parallel eigendecomposition of the identity returns correctly-shaped output.
    #[test]
    fn test_parallel_operations() {
        let parallel_ops = ParallelDecomposition::new();

        // Test basic functionality
        let matrix = Array2::eye(3);
        let result = parallel_ops.parallel_eigendecomposition(&matrix);
        assert!(result.is_ok());

        let (eigenvals, eigenvecs) = result.unwrap();
        assert_eq!(eigenvals.len(), 3);
        assert_eq!(eigenvecs.dim(), (3, 3));
    }

    // f64 -> f32 -> f64 round trip stays within single-precision error.
    #[test]
    fn test_mixed_precision() {
        let mixed_ops = MixedPrecisionOps::new();

        let input_f64 = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let converted_f32 = mixed_ops.to_single_precision(&input_f64);
        let converted_back = mixed_ops.to_double_precision(&converted_f32);

        for (orig, back) in input_f64.iter().zip(converted_back.iter()) {
            assert!((orig - back).abs() < 1e-6); // Precision loss expected
        }
    }

    // Aligned allocation produces properly aligned buffers that preserve data.
    #[test]
    fn test_aligned_memory() {
        let aligned_ops = AlignedMemoryOps::new(32);

        let aligned_vec = aligned_ops.create_aligned_array(10);
        assert_eq!(aligned_vec.len(), 10);
        assert!(aligned_ops.is_aligned(&aligned_vec));

        let array = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let aligned_array = aligned_ops.ensure_aligned(&array);
        assert_eq!(aligned_array.len(), array.len());
        assert!(aligned_ops.is_aligned(&aligned_array));

        assert_eq!(aligned_array.as_slice(), array.as_slice().unwrap());
    }

    // Vectorized exp/sqrt agree with the scalar std implementations.
    #[test]
    fn test_vectorized_functions() {
        let simd_ops = SimdMatrixOps::new();
        let input = Array1::from_vec(vec![0.0, 1.0, 2.0]);

        let exp_result = simd_ops.vector_exp_simd(&input);
        let expected_exp = input.mapv(|x| x.exp());

        for (r, e) in exp_result.iter().zip(expected_exp.iter()) {
            assert!((r - e).abs() < 1e-10);
        }

        let sqrt_result = simd_ops.vector_sqrt_simd(&Array1::from_vec(vec![1.0, 4.0, 9.0]));
        let expected_sqrt = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        for (r, e) in sqrt_result.iter().zip(expected_sqrt.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    // With SIMD disabled, the scalar fallback path still computes correctly.
    #[test]
    fn test_acceleration_config() {
        let config = AccelerationConfig {
            enable_simd: false,
            enable_parallel: false,
            enable_mixed_precision: true,
            enable_gpu: false,
            gpu_device_id: 0,
            gpu_memory_limit: None,
            num_threads: Some(4),
            memory_alignment: 64,
        };

        let simd_ops = SimdMatrixOps::new().with_config(config.clone());
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let b = Array1::from_vec(vec![4.0, 5.0, 6.0]);

        // Should use fallback when SIMD is disabled
        let result = simd_ops.dot_product_simd(&a, &b).unwrap();
        let expected = 32.0; // 1*4 + 2*5 + 3*6
        assert!((result - expected).abs() < 1e-10);
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_acceleration_creation() {
        // Test creation without GPU (should fallback gracefully).
        // Not panicking here IS the assertion; the previous `assert!(true)`
        // was a no-op (clippy::assert_on_constants).
        let _gpu_acc = GpuAcceleration::default();
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_config() {
        let config = AccelerationConfig {
            enable_gpu: true,
            gpu_device_id: 0,
            gpu_memory_limit: Some(1024 * 1024 * 1024), // 1GB
            ..AccelerationConfig::default()
        };

        // Test that config is properly set
        assert!(config.enable_gpu);
        assert_eq!(config.gpu_device_id, 0);
        assert_eq!(config.gpu_memory_limit, Some(1024 * 1024 * 1024));
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_array_conversion() {
        let config = AccelerationConfig {
            enable_gpu: false, // Disable GPU for testing
            ..AccelerationConfig::default()
        };

        if let Ok(gpu_acc) = GpuAcceleration::with_config(config) {
            let array = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();

            // Test array to tensor conversion (will use CPU tensor)
            if let Ok(tensor) = gpu_acc.array_to_tensor(&array) {
                // Test tensor back to array conversion
                if let Ok(result_array) = gpu_acc.tensor_to_array(&tensor) {
                    assert_eq!(result_array.shape(), array.shape());
                }
            }
        }
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_decomposition_fallback() {
        // Test that GPU decomposition works with fallback
        let gpu_decomp = GpuDecomposition::default();

        let matrix =
            Array2::from_shape_vec((3, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
                .unwrap();

        // This should work with fallback even without GPU
        if let Ok((factor_a, factor_b)) = gpu_decomp.gpu_factorize(&matrix) {
            assert_eq!(factor_a.nrows(), 3);
            assert_eq!(factor_b.ncols(), 3);
        }
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_pca_basic() {
        let gpu_decomp = GpuDecomposition::default();

        let data = Array2::from_shape_vec(
            (4, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
            ],
        )
        .unwrap();

        // Test GPU PCA with 2 components
        if let Ok((u, s, vt)) = gpu_decomp.gpu_pca(&data, 2) {
            assert_eq!(u.ncols(), 2);
            assert_eq!(s.len(), 2);
            assert_eq!(vt.nrows(), 2);
        }
    }
}