#[cfg(feature = "parallel")]
use rayon::prelude::*;
use scirs2_core::ndarray::{Array1, Array2};
use sklears_core::{
    error::{Result, SklearsError},
    types::Float,
};
use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
use std::ops::{Deref, DerefMut};
use std::ptr::NonNull;
use std::slice;

#[cfg(feature = "gpu")]
use candle_core::{DType, Device, Tensor};
#[cfg(all(feature = "gpu", not(target_os = "macos")))]
use cudarc::driver::safe::CudaContext;

/// Configuration for the hardware-acceleration helpers in this module.
#[derive(Debug, Clone)]
pub struct AccelerationConfig {
    /// Use SIMD kernels where the target architecture provides them.
    pub enable_simd: bool,
    /// Use data-parallel execution (requires the `parallel` feature).
    pub enable_parallel: bool,
    /// Run large multiplies in `f32` and widen the result back to `f64`.
    pub enable_mixed_precision: bool,
    /// Attempt GPU execution (requires the `gpu` feature).
    pub enable_gpu: bool,
    /// CUDA device ordinal used when GPU execution is enabled.
    pub gpu_device_id: i32,
    /// Optional cap on GPU memory usage, in bytes.
    pub gpu_memory_limit: Option<usize>,
    /// Optional thread-pool size; `None` lets the runtime decide.
    pub num_threads: Option<usize>,
    /// Byte alignment for aligned-buffer allocations.
    pub memory_alignment: usize,
}

impl Default for AccelerationConfig {
    fn default() -> Self {
        Self {
            enable_simd: true,
            enable_parallel: true,
            enable_mixed_precision: false,
            enable_gpu: false,
            gpu_device_id: 0,
            gpu_memory_limit: None,
            num_threads: None,
            memory_alignment: 32,
        }
    }
}

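// Sketch (illustrative helper, not part of the original API): overriding a
// few fields of the default configuration and handing it to the SIMD helpers
// defined below. The specific field values here are arbitrary examples.
#[allow(dead_code)]
fn example_custom_acceleration() -> SimdMatrixOps {
    let config = AccelerationConfig {
        enable_mixed_precision: true,
        num_threads: Some(8),
        ..AccelerationConfig::default()
    };
    SimdMatrixOps::new().with_config(config)
}
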
/// SIMD-accelerated vector and matrix primitives.
pub struct SimdMatrixOps {
    config: AccelerationConfig,
}

impl SimdMatrixOps {
    pub fn new() -> Self {
        Self {
            config: AccelerationConfig::default(),
        }
    }

    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
        self.config = config;
        self
    }

    /// Dot product that dispatches to NEON on aarch64 and to a scalar
    /// fallback elsewhere (or when SIMD is disabled in the config).
    pub fn dot_product_simd(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Float> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Vector dimensions must match for dot product".to_string(),
            ));
        }

        if !self.config.enable_simd {
            return Ok(self.dot_product_fallback(a, b));
        }

        #[cfg(target_arch = "aarch64")]
        {
            self.dot_product_neon(a, b)
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            Ok(self.dot_product_fallback(a, b))
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn dot_product_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Float> {
        let n = a.len();
        let mut sum;

        if std::mem::size_of::<Float>() == 8 {
            // f64 path: two lanes per 128-bit NEON register.
            let chunks = n / 2;

            unsafe {
                let mut acc = vdupq_n_f64(0.0);

                for i in 0..chunks {
                    let idx = i * 2;
                    let va = vld1q_f64(a.as_ptr().add(idx));
                    let vb = vld1q_f64(b.as_ptr().add(idx));
                    // Fused multiply-add: acc += va * vb, lane-wise.
                    acc = vfmaq_f64(acc, va, vb);
                }

                // Horizontal reduction of the two accumulator lanes.
                sum = vgetq_lane_f64(acc, 0) + vgetq_lane_f64(acc, 1);

                // Scalar tail for elements left over after the vector loop.
                for i in (chunks * 2)..n {
                    sum += a[i] * b[i];
                }
            }
        } else {
            // f32 path: four lanes per register. The pointer casts are sound
            // because this branch only runs when `Float` is `f32`.
            let chunks = n / 4;

            unsafe {
                let mut acc = vdupq_n_f32(0.0);
                let a_ptr = a.as_ptr() as *const f32;
                let b_ptr = b.as_ptr() as *const f32;

                for i in 0..chunks {
                    let idx = i * 4;
                    let va = vld1q_f32(a_ptr.add(idx));
                    let vb = vld1q_f32(b_ptr.add(idx));
                    acc = vfmaq_f32(acc, va, vb);
                }

                // Two pairwise adds reduce the four lanes to a single sum.
                let sum_vec = vpaddq_f32(acc, acc);
                let sum_vec2 = vpaddq_f32(sum_vec, sum_vec);
                sum = vgetq_lane_f32(sum_vec2, 0) as Float;

                for i in (chunks * 4)..n {
                    sum += a[i] * b[i];
                }
            }
        }

        Ok(sum)
    }

    fn dot_product_fallback(&self, a: &Array1<Float>, b: &Array1<Float>) -> Float {
        a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum()
    }

    /// Matrix-vector product: one SIMD dot product per row, parallelized
    /// over rows when the `parallel` feature is enabled.
    pub fn matrix_vector_mul_simd(
        &self,
        matrix: &Array2<Float>,
        vector: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        let (m, n) = matrix.dim();
        if n != vector.len() {
            return Err(SklearsError::InvalidInput(
                "Matrix columns must match vector length".to_string(),
            ));
        }

        if !self.config.enable_simd || !self.config.enable_parallel {
            return Ok(self.matrix_vector_mul_fallback(matrix, vector));
        }

        #[cfg(feature = "parallel")]
        let result: Vec<Float> = (0..m)
            .into_par_iter()
            .map(|i| {
                // `to_owned` copies the row so the dot product sees a
                // contiguous buffer. Dimensions were already validated, so
                // `unwrap_or(0.0)` is a defensive default.
                let row = matrix.row(i);
                self.dot_product_simd(&row.to_owned(), vector)
                    .unwrap_or(0.0)
            })
            .collect();

        #[cfg(not(feature = "parallel"))]
        let result: Vec<Float> = (0..m)
            .map(|i| {
                let row = matrix.row(i);
                self.dot_product_simd(&row.to_owned(), vector)
                    .unwrap_or(0.0)
            })
            .collect();

        Ok(Array1::from_vec(result))
    }

    fn matrix_vector_mul_fallback(
        &self,
        matrix: &Array2<Float>,
        vector: &Array1<Float>,
    ) -> Array1<Float> {
        matrix.dot(vector)
    }

    pub fn elementwise_add_simd(
        &self,
        a: &Array1<Float>,
        b: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Array dimensions must match".to_string(),
            ));
        }

        if !self.config.enable_simd {
            return Ok(a + b);
        }

        #[cfg(target_arch = "aarch64")]
        {
            self.elementwise_add_neon(a, b)
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            Ok(a + b)
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn elementwise_add_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Array1<Float>> {
        let n = a.len();
        let mut result = Array1::<Float>::zeros(n);

        if std::mem::size_of::<Float>() == 8 {
            let chunks = n / 2;

            unsafe {
                for i in 0..chunks {
                    let idx = i * 2;
                    let va = vld1q_f64(a.as_ptr().add(idx));
                    let vb = vld1q_f64(b.as_ptr().add(idx));
                    let vr = vaddq_f64(va, vb);
                    vst1q_f64(result.as_mut_ptr().add(idx), vr);
                }

                for i in (chunks * 2)..n {
                    result[i] = a[i] + b[i];
                }
            }
        } else {
            let chunks = n / 4;
            let a_ptr = a.as_ptr() as *const f32;
            let b_ptr = b.as_ptr() as *const f32;
            let result_ptr = result.as_mut_ptr() as *mut f32;

            unsafe {
                for i in 0..chunks {
                    let idx = i * 4;
                    let va = vld1q_f32(a_ptr.add(idx));
                    let vb = vld1q_f32(b_ptr.add(idx));
                    let vr = vaddq_f32(va, vb);
                    vst1q_f32(result_ptr.add(idx), vr);
                }

                for i in (chunks * 4)..n {
                    result[i] = a[i] + b[i];
                }
            }
        }

        Ok(result)
    }

    pub fn elementwise_mul_simd(
        &self,
        a: &Array1<Float>,
        b: &Array1<Float>,
    ) -> Result<Array1<Float>> {
        if a.len() != b.len() {
            return Err(SklearsError::InvalidInput(
                "Array dimensions must match".to_string(),
            ));
        }

        if !self.config.enable_simd {
            return Ok(a * b);
        }

        #[cfg(target_arch = "aarch64")]
        {
            self.elementwise_mul_neon(a, b)
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            Ok(a * b)
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn elementwise_mul_neon(&self, a: &Array1<Float>, b: &Array1<Float>) -> Result<Array1<Float>> {
        let n = a.len();
        let mut result = Array1::<Float>::zeros(n);

        if std::mem::size_of::<Float>() == 8 {
            let chunks = n / 2;

            unsafe {
                for i in 0..chunks {
                    let idx = i * 2;
                    let va = vld1q_f64(a.as_ptr().add(idx));
                    let vb = vld1q_f64(b.as_ptr().add(idx));
                    let vr = vmulq_f64(va, vb);
                    vst1q_f64(result.as_mut_ptr().add(idx), vr);
                }

                for i in (chunks * 2)..n {
                    result[i] = a[i] * b[i];
                }
            }
        } else {
            let chunks = n / 4;
            let a_ptr = a.as_ptr() as *const f32;
            let b_ptr = b.as_ptr() as *const f32;
            let result_ptr = result.as_mut_ptr() as *mut f32;

            unsafe {
                for i in 0..chunks {
                    let idx = i * 4;
                    let va = vld1q_f32(a_ptr.add(idx));
                    let vb = vld1q_f32(b_ptr.add(idx));
                    let vr = vmulq_f32(va, vb);
                    vst1q_f32(result_ptr.add(idx), vr);
                }

                for i in (chunks * 4)..n {
                    result[i] = a[i] * b[i];
                }
            }
        }

        Ok(result)
    }

    /// Element-wise exponential. Despite the `_simd` suffix, this and the
    /// two functions below currently use data parallelism (or a plain map),
    /// not SIMD intrinsics.
    pub fn vector_exp_simd(&self, input: &Array1<Float>) -> Array1<Float> {
        if !self.config.enable_simd || !self.config.enable_parallel {
            return input.mapv(|x| x.exp());
        }

        #[cfg(feature = "parallel")]
        let result: Vec<Float> = input.par_iter().map(|&x| x.exp()).collect();

        #[cfg(not(feature = "parallel"))]
        let result: Vec<Float> = input.iter().map(|&x| x.exp()).collect();
        Array1::from_vec(result)
    }

    /// Element-wise square root.
    pub fn vector_sqrt_simd(&self, input: &Array1<Float>) -> Array1<Float> {
        if !self.config.enable_simd || !self.config.enable_parallel {
            return input.mapv(|x| x.sqrt());
        }

        #[cfg(feature = "parallel")]
        let result: Vec<Float> = input.par_iter().map(|&x| x.sqrt()).collect();

        #[cfg(not(feature = "parallel"))]
        let result: Vec<Float> = input.iter().map(|&x| x.sqrt()).collect();
        Array1::from_vec(result)
    }

    /// Element-wise sine.
    pub fn vector_sin_simd(&self, input: &Array1<Float>) -> Array1<Float> {
        if !self.config.enable_simd || !self.config.enable_parallel {
            return input.mapv(|x| x.sin());
        }

        #[cfg(feature = "parallel")]
        let result: Vec<Float> = input.par_iter().map(|&x| x.sin()).collect();

        #[cfg(not(feature = "parallel"))]
        let result: Vec<Float> = input.iter().map(|&x| x.sin()).collect();
        Array1::from_vec(result)
    }
}

impl Default for SimdMatrixOps {
    fn default() -> Self {
        Self::new()
    }
}

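// Sketch: exercising the SIMD helpers end to end. This mirrors the unit
// tests at the bottom of the module and is illustrative only.
#[allow(dead_code)]
fn example_simd_pipeline() -> Result<Float> {
    let ops = SimdMatrixOps::new();
    let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
    let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);
    // An element-wise product followed by a horizontal sum is equivalent to
    // a dot product; both paths should agree to within rounding error.
    let prod = ops.elementwise_mul_simd(&a, &b)?;
    let direct = ops.dot_product_simd(&a, &b)?;
    debug_assert!((prod.sum() - direct).abs() < 1e-10);
    Ok(direct)
}
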
/// Parallel (and currently placeholder) matrix decompositions.
pub struct ParallelDecomposition {
    config: AccelerationConfig,
}

impl ParallelDecomposition {
    pub fn new() -> Self {
        Self {
            config: AccelerationConfig::default(),
        }
    }

    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
        self.config = config;
        self
    }

    /// SVD entry point. Large matrices are routed to the blocked variant;
    /// everything else uses the sequential path.
    pub fn parallel_svd(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        let (m, n) = matrix.dim();

        if !self.config.enable_parallel {
            return self.sequential_svd(matrix);
        }

        if m > 1000 && n > 1000 {
            self.block_parallel_svd(matrix)
        } else {
            self.sequential_svd(matrix)
        }
    }

    fn block_parallel_svd(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        // Blocked SVD is not implemented yet; delegate to the sequential path.
        self.sequential_svd(matrix)
    }

    fn sequential_svd(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        // Placeholder: returns identity factors and unit singular values
        // rather than a real SVD.
        let (m, n) = matrix.dim();
        let min_dim = m.min(n);

        let u = Array2::eye(m);
        let s = Array1::ones(min_dim);
        let vt = Array2::eye(n);

        Ok((u, s, vt))
    }

    /// Eigendecomposition entry point: square-matrix check plus size-based
    /// dispatch. Both paths currently share the same placeholder backend.
    pub fn parallel_eigendecomposition(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array1<Float>, Array2<Float>)> {
        let n = matrix.nrows();
        if n != matrix.ncols() {
            return Err(SklearsError::InvalidInput(
                "Matrix must be square for eigendecomposition".to_string(),
            ));
        }

        if !self.config.enable_parallel || n < 500 {
            return self.sequential_eigendecomposition(matrix);
        }

        self.block_parallel_eigendecomposition(matrix)
    }

    fn block_parallel_eigendecomposition(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array1<Float>, Array2<Float>)> {
        // Blocked variant is not implemented yet; delegate to the sequential path.
        self.sequential_eigendecomposition(matrix)
    }

    fn sequential_eigendecomposition(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array1<Float>, Array2<Float>)> {
        // Placeholder: unit eigenvalues and identity eigenvectors.
        let n = matrix.nrows();

        let eigenvalues = Array1::ones(n);
        let eigenvectors = Array2::eye(n);

        Ok((eigenvalues, eigenvectors))
    }

    /// Parallel matrix multiply. Small products go straight to `ndarray`'s
    /// `dot`; larger ones are routed through the tiled path below.
    pub fn parallel_matrix_multiply(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (m, k1) = a.dim();
        let (k2, n) = b.dim();

        if k1 != k2 {
            return Err(SklearsError::InvalidInput(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        if !self.config.enable_parallel {
            return Ok(a.dot(b));
        }

        if m > 100 && n > 100 && k1 > 100 {
            self.tiled_parallel_multiply(a, b)
        } else {
            Ok(a.dot(b))
        }
    }

    fn tiled_parallel_multiply(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        // NOTE: the loops below compute per-tile products but do not yet
        // assemble them into an output matrix; the value actually returned
        // comes from `a.dot(b)` at the end. The tiling is kept as
        // scaffolding for a future write-back implementation.
        let (m, k) = a.dim();
        let (_, n) = b.dim();
        let tile_size = 64;

        let m_tiles = (m + tile_size - 1) / tile_size;
        let n_tiles = (n + tile_size - 1) / tile_size;
        let k_tiles = (k + tile_size - 1) / tile_size;

        #[cfg(feature = "parallel")]
        {
            (0..m_tiles).into_par_iter().for_each(|i_tile| {
                for j_tile in 0..n_tiles {
                    let mut tile_result = Array2::zeros((
                        tile_size.min(m - i_tile * tile_size),
                        tile_size.min(n - j_tile * tile_size),
                    ));

                    for k_tile in 0..k_tiles {
                        let i_start = i_tile * tile_size;
                        let i_end = (i_start + tile_size).min(m);
                        let j_start = j_tile * tile_size;
                        let j_end = (j_start + tile_size).min(n);
                        let k_start = k_tile * tile_size;
                        let k_end = (k_start + tile_size).min(k);

                        let a_tile =
                            a.slice(scirs2_core::ndarray::s![i_start..i_end, k_start..k_end]);
                        let b_tile =
                            b.slice(scirs2_core::ndarray::s![k_start..k_end, j_start..j_end]);

                        tile_result += &a_tile.dot(&b_tile);
                    }
                }
            });
        }

        #[cfg(not(feature = "parallel"))]
        {
            // Same tiling as above, executed sequentially.
            (0..m_tiles).for_each(|i_tile| {
                for j_tile in 0..n_tiles {
                    let mut tile_result = Array2::zeros((
                        tile_size.min(m - i_tile * tile_size),
                        tile_size.min(n - j_tile * tile_size),
                    ));

                    for k_tile in 0..k_tiles {
                        let i_start = i_tile * tile_size;
                        let i_end = (i_start + tile_size).min(m);
                        let j_start = j_tile * tile_size;
                        let j_end = (j_start + tile_size).min(n);
                        let k_start = k_tile * tile_size;
                        let k_end = (k_start + tile_size).min(k);

                        let a_tile =
                            a.slice(scirs2_core::ndarray::s![i_start..i_end, k_start..k_end]);
                        let b_tile =
                            b.slice(scirs2_core::ndarray::s![k_start..k_end, j_start..j_end]);

                        tile_result += &a_tile.dot(&b_tile);
                    }
                }
            });
        }

        Ok(a.dot(b))
    }
}

impl Default for ParallelDecomposition {
    fn default() -> Self {
        Self::new()
    }
}

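// Sketch: the decomposition entry points validate shapes and dispatch on
// size. Keep in mind the SVD/eigen backends above are placeholders, so their
// factors are identity matrices; the multiply path, however, is real.
#[allow(dead_code)]
fn example_parallel_multiply() -> Result<Array2<Float>> {
    let decomp = ParallelDecomposition::new();
    let a = Array2::<Float>::eye(8);
    let b = Array2::<Float>::eye(8);
    // 8x8 is below the tiling threshold, so this resolves to `a.dot(&b)`.
    decomp.parallel_matrix_multiply(&a, &b)
}
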
/// Conversions between `f32` and `f64`, plus a mixed-precision multiply.
pub struct MixedPrecisionOps {
    config: AccelerationConfig,
}

impl MixedPrecisionOps {
    pub fn new() -> Self {
        Self {
            config: AccelerationConfig::default(),
        }
    }

    pub fn with_config(mut self, config: AccelerationConfig) -> Self {
        self.config = config;
        self
    }

    pub fn to_single_precision(&self, input: &Array1<f64>) -> Array1<f32> {
        if self.config.enable_parallel {
            #[cfg(feature = "parallel")]
            let result: Vec<f32> = input.par_iter().map(|&x| x as f32).collect();

            #[cfg(not(feature = "parallel"))]
            let result: Vec<f32> = input.iter().map(|&x| x as f32).collect();
            Array1::from_vec(result)
        } else {
            input.mapv(|x| x as f32)
        }
    }

    pub fn to_double_precision(&self, input: &Array1<f32>) -> Array1<f64> {
        if self.config.enable_parallel {
            #[cfg(feature = "parallel")]
            let result: Vec<f64> = input.par_iter().map(|&x| x as f64).collect();

            #[cfg(not(feature = "parallel"))]
            let result: Vec<f64> = input.iter().map(|&x| x as f64).collect();
            Array1::from_vec(result)
        } else {
            input.mapv(|x| x as f64)
        }
    }

    /// Multiply in `f32` and widen the product back to `f64`, trading
    /// precision for speed. Callers opt in via `enable_mixed_precision`.
    pub fn mixed_precision_multiply(
        &self,
        a: &Array2<f64>,
        b: &Array2<f64>,
    ) -> Result<Array2<f64>> {
        if !self.config.enable_mixed_precision {
            return Ok(a.dot(b));
        }

        let (_m, k1) = a.dim();
        let (k2, _n) = b.dim();

        if k1 != k2 {
            return Err(SklearsError::InvalidInput(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        // Narrow, multiply, widen.
        let a_f32 = a.mapv(|x| x as f32);
        let b_f32 = b.mapv(|x| x as f32);
        let result_f32 = a_f32.dot(&b_f32);

        Ok(result_f32.mapv(|x| x as f64))
    }
}

impl Default for MixedPrecisionOps {
    fn default() -> Self {
        Self::new()
    }
}

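// Sketch: round-tripping through f32 keeps values close but not exact, so
// comparisons should use an epsilon on the order of f32 precision.
#[allow(dead_code)]
fn example_mixed_precision() -> Result<Array2<f64>> {
    let ops = MixedPrecisionOps::new().with_config(AccelerationConfig {
        enable_mixed_precision: true,
        ..AccelerationConfig::default()
    });
    let a = Array2::<f64>::eye(4);
    let b = Array2::<f64>::eye(4);
    // The product is computed in f32 and widened back to f64.
    ops.mixed_precision_multiply(&a, &b)
}
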
/// Helpers for allocating and checking SIMD-friendly aligned buffers.
pub struct AlignedMemoryOps {
    alignment: usize,
}

/// Owned, heap-allocated buffer of `Float`s with a guaranteed alignment.
pub struct AlignedBuffer {
    ptr: NonNull<Float>,
    len: usize,
    alignment: usize,
}

impl AlignedMemoryOps {
    pub fn new(alignment: usize) -> Self {
        Self {
            alignment: Self::sanitize_alignment(alignment),
        }
    }

    /// Clamp the requested alignment to a power of two that is at least the
    /// natural alignment of `Float`, as required by `Layout`.
    fn sanitize_alignment(requested: usize) -> usize {
        let base = requested.max(std::mem::align_of::<Float>()).max(1);
        if base.is_power_of_two() {
            base
        } else {
            base.next_power_of_two()
        }
    }

    pub fn create_aligned_array(&self, size: usize) -> AlignedBuffer {
        AlignedBuffer::new(size, self.alignment)
    }

    pub fn is_aligned(&self, data: &[Float]) -> bool {
        let ptr = data.as_ptr() as usize;
        ptr % self.alignment == 0
    }

    /// Copy `data` into a freshly allocated aligned buffer.
    pub fn ensure_aligned(&self, data: &Array1<Float>) -> AlignedBuffer {
        let slice = data
            .as_slice()
            .expect("Array1 should have a contiguous slice representation");

        let mut buffer = self.create_aligned_array(slice.len());
        buffer.as_mut_slice().copy_from_slice(slice);
        buffer
    }
}

impl Default for AlignedMemoryOps {
    fn default() -> Self {
        Self::new(32)
    }
}

impl AlignedBuffer {
    pub fn new(size: usize, alignment: usize) -> Self {
        if size == 0 {
            // Zero-length buffers hold a dangling (but well-aligned) pointer
            // and never allocate or deallocate.
            return Self {
                ptr: NonNull::dangling(),
                len: 0,
                alignment,
            };
        }

        let elem_size = std::mem::size_of::<Float>();
        let total_size = elem_size
            .checked_mul(size)
            .expect("Requested buffer size exceeds addressable memory");
        let layout = Layout::from_size_align(total_size, alignment)
            .expect("Invalid layout for aligned allocation");

        unsafe {
            let ptr = alloc_zeroed(layout);
            if ptr.is_null() {
                handle_alloc_error(layout);
            }

            Self {
                ptr: NonNull::new_unchecked(ptr as *mut Float),
                len: size,
                alignment,
            }
        }
    }

    pub fn len(&self) -> usize {
        self.len
    }

    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    pub fn as_slice(&self) -> &[Float] {
        unsafe { slice::from_raw_parts(self.ptr.as_ptr(), self.len) }
    }

    pub fn as_mut_slice(&mut self) -> &mut [Float] {
        unsafe { slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len) }
    }
}

impl Deref for AlignedBuffer {
    type Target = [Float];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl DerefMut for AlignedBuffer {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.as_mut_slice()
    }
}

impl Drop for AlignedBuffer {
    fn drop(&mut self) {
        if self.len == 0 {
            return;
        }

        // Rebuild the same layout used at allocation time; deallocating with
        // a mismatched layout would be undefined behavior.
        let elem_size = std::mem::size_of::<Float>();
        let total_size = elem_size * self.len;
        if let Ok(layout) = Layout::from_size_align(total_size, self.alignment) {
            unsafe {
                dealloc(self.ptr.as_ptr() as *mut u8, layout);
            }
        }
    }
}

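// Sketch: allocating an aligned scratch buffer and checking the alignment
// invariant. `AlignedBuffer` derefs to `[Float]`, so slice APIs apply.
#[allow(dead_code)]
fn example_aligned_buffer() {
    let ops = AlignedMemoryOps::new(64);
    let mut buf = ops.create_aligned_array(16);
    buf.as_mut_slice().fill(1.0);
    debug_assert!(ops.is_aligned(&buf));
    debug_assert!((buf.iter().sum::<Float>() - 16.0).abs() < 1e-10);
}
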
#[cfg(feature = "gpu")]
pub struct GpuAcceleration {
    config: AccelerationConfig,
    device: Device,
    #[cfg(not(target_os = "macos"))]
    cuda_device: Option<std::sync::Arc<CudaContext>>,
}

#[cfg(feature = "gpu")]
impl GpuAcceleration {
    pub fn new() -> Result<Self> {
        Self::with_config(AccelerationConfig::default())
    }

    pub fn with_config(config: AccelerationConfig) -> Result<Self> {
        if !config.enable_gpu {
            return Ok(Self {
                config,
                device: Device::Cpu,
                #[cfg(not(target_os = "macos"))]
                cuda_device: None,
            });
        }

        #[cfg(not(target_os = "macos"))]
        let cuda_device = match CudaContext::new(config.gpu_device_id as usize) {
            Ok(device) => Some(device),
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to initialize CUDA device".to_string(),
                ));
            }
        };

        // If candle cannot open the CUDA device, fall back to CPU execution
        // rather than failing outright.
        let device = match Device::new_cuda(config.gpu_device_id as usize) {
            Ok(dev) => dev,
            Err(_) => Device::Cpu,
        };

        Ok(Self {
            config,
            device,
            #[cfg(not(target_os = "macos"))]
            cuda_device,
        })
    }

    pub fn is_gpu_available(&self) -> bool {
        #[cfg(not(target_os = "macos"))]
        return self.cuda_device.is_some() && matches!(self.device, Device::Cuda(_));

        // CUDA is unavailable on macOS, so this match never succeeds there
        // in practice.
        #[cfg(target_os = "macos")]
        return matches!(self.device, Device::Cuda(_));
    }

    /// Report (free, total) GPU memory in bytes. The values are currently
    /// hard-coded placeholders (8 GiB total, half free), not queried from
    /// the driver.
    pub fn gpu_memory_info(&self) -> Result<(usize, usize)> {
        #[cfg(not(target_os = "macos"))]
        {
            if let Some(ref _cuda_device) = self.cuda_device {
                let total = 8 * 1024 * 1024 * 1024;
                let free = total / 2;
                Ok((free, total))
            } else {
                Err(SklearsError::InvalidInput("GPU not available".to_string()))
            }
        }

        #[cfg(target_os = "macos")]
        Err(SklearsError::InvalidInput(
            "CUDA not available on macOS".to_string(),
        ))
    }

    /// Copy an `Array2<Float>` into a candle tensor on the configured
    /// device, narrowing to `f32`.
    pub fn array_to_tensor(&self, array: &Array2<Float>) -> Result<Tensor> {
        let shape = array.shape();
        // `Float` may be f32 or f64; either way the tensor stores f32.
        let data: Vec<f32> = array.iter().map(|&x| x as f32).collect();

        match Tensor::from_vec(data, (shape[0], shape[1]), &self.device) {
            Ok(tensor) => Ok(tensor),
            Err(_) => Err(SklearsError::InvalidInput(
                "Failed to create GPU tensor".to_string(),
            )),
        }
    }

    /// Copy a 2-D candle tensor back into an `Array2<Float>`.
    pub fn tensor_to_array(&self, tensor: &Tensor) -> Result<Array2<Float>> {
        let shape = tensor.shape();
        let dims = shape.dims();
        if dims.len() != 2 {
            return Err(SklearsError::InvalidInput("Tensor must be 2D".to_string()));
        }

        match tensor.to_vec2::<f32>() {
            Ok(data) => {
                let flat_data: Vec<Float> =
                    data.into_iter().flatten().map(|x| x as Float).collect();

                match Array2::from_shape_vec((dims[0], dims[1]), flat_data) {
                    Ok(array) => Ok(array),
                    Err(_) => Err(SklearsError::InvalidInput(
                        "Failed to create array from tensor".to_string(),
                    )),
                }
            }
            Err(_) => Err(SklearsError::InvalidInput(
                "Failed to convert tensor to array".to_string(),
            )),
        }
    }

    pub fn gpu_matrix_multiply(
        &self,
        a: &Array2<Float>,
        b: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        if !self.is_gpu_available() {
            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
        }

        let (_m, k1) = a.dim();
        let (k2, _n) = b.dim();

        if k1 != k2 {
            return Err(SklearsError::InvalidInput(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let tensor_a = self.array_to_tensor(a)?;
        let tensor_b = self.array_to_tensor(b)?;

        let result_tensor = match tensor_a.matmul(&tensor_b) {
            Ok(tensor) => tensor,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "GPU matrix multiplication failed".to_string(),
                ))
            }
        };

        self.tensor_to_array(&result_tensor)
    }

    /// SVD on the GPU. Currently a placeholder: it validates inputs and
    /// round-trips tensors through the device, but returns identity factors
    /// and unit singular values instead of a real decomposition.
    pub fn gpu_svd(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        if !self.is_gpu_available() {
            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
        }

        let _tensor = self.array_to_tensor(matrix)?;

        let (m, n) = matrix.dim();
        let min_dim = m.min(n);

        let u_tensor = match Tensor::eye(m, DType::F32, &self.device) {
            Ok(t) => t,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to create identity matrix".to_string(),
                ))
            }
        };

        let s_data = vec![1.0f32; min_dim];
        let s_tensor = match Tensor::from_vec(s_data, min_dim, &self.device) {
            Ok(t) => t,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to create singular values".to_string(),
                ))
            }
        };

        let vt_tensor = match Tensor::eye(n, DType::F32, &self.device) {
            Ok(t) => t,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to create identity matrix".to_string(),
                ))
            }
        };

        let u = self.tensor_to_array(&u_tensor)?;
        let vt = self.tensor_to_array(&vt_tensor)?;

        let s_vec = match s_tensor.to_vec1::<f32>() {
            Ok(vec) => vec.into_iter().map(|x| x as Float).collect(),
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to convert singular values".to_string(),
                ))
            }
        };
        let s = Array1::from_vec(s_vec);

        Ok((u, s, vt))
    }

    /// Eigendecomposition on the GPU. Currently a placeholder: unit
    /// eigenvalues and identity eigenvectors, materialized through candle.
    pub fn gpu_eigendecomposition(
        &self,
        matrix: &Array2<Float>,
    ) -> Result<(Array1<Float>, Array2<Float>)> {
        if !self.is_gpu_available() {
            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
        }

        let n = matrix.nrows();
        if n != matrix.ncols() {
            return Err(SklearsError::InvalidInput(
                "Matrix must be square for eigendecomposition".to_string(),
            ));
        }

        let _tensor = self.array_to_tensor(matrix)?;

        let eigenvals_data = vec![1.0f32; n];
        let eigenvals_tensor = match Tensor::from_vec(eigenvals_data, n, &self.device) {
            Ok(t) => t,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to create eigenvalues".to_string(),
                ))
            }
        };

        let eigenvecs_tensor = match Tensor::eye(n, DType::F32, &self.device) {
            Ok(t) => t,
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to create eigenvectors".to_string(),
                ))
            }
        };

        let eigenvecs = self.tensor_to_array(&eigenvecs_tensor)?;
        let eigenvals_vec = match eigenvals_tensor.to_vec1::<f32>() {
            Ok(vec) => vec.into_iter().map(|x| x as Float).collect(),
            Err(_) => {
                return Err(SklearsError::InvalidInput(
                    "Failed to convert eigenvalues".to_string(),
                ))
            }
        };
        let eigenvals = Array1::from_vec(eigenvals_vec);

        Ok((eigenvals, eigenvecs))
    }

    /// Multiply consecutive pairs of matrices: `[a, b, c, d]` yields
    /// `[a*b, c*d]`. A trailing unpaired matrix is silently ignored, since
    /// `chunks_exact(2)` skips the remainder.
    pub fn batch_gpu_multiply(&self, matrices: &[Array2<Float>]) -> Result<Vec<Array2<Float>>> {
        if !self.is_gpu_available() {
            return Err(SklearsError::InvalidInput("GPU not available".to_string()));
        }

        let mut results = Vec::with_capacity(matrices.len() / 2);

        for chunk in matrices.chunks_exact(2) {
            let result = self.gpu_matrix_multiply(&chunk[0], &chunk[1])?;
            results.push(result);
        }

        Ok(results)
    }

    /// Synchronize the CUDA device. Despite the name, this does not free
    /// memory directly; it drains pending work so buffers can be reclaimed.
    pub fn free_gpu_memory(&self) -> Result<()> {
        #[cfg(not(target_os = "macos"))]
        {
            if let Some(ref cuda_device) = self.cuda_device {
                match cuda_device.synchronize() {
                    Ok(()) => Ok(()),
                    Err(_) => Err(SklearsError::InvalidInput(
                        "Failed to synchronize GPU device".to_string(),
                    )),
                }
            } else {
                Ok(())
            }
        }

        #[cfg(target_os = "macos")]
        Ok(())
    }

    /// Time a GPU operation, synchronizing before reading the clock so the
    /// measurement includes queued device work.
    pub fn profile_gpu_operation<F, T>(&self, operation: F) -> Result<(T, std::time::Duration)>
    where
        F: FnOnce() -> Result<T>,
    {
        let start = std::time::Instant::now();
        let result = operation()?;

        self.free_gpu_memory()?;

        let duration = start.elapsed();
        Ok((result, duration))
    }
}

#[cfg(feature = "gpu")]
impl Default for GpuAcceleration {
    fn default() -> Self {
        // Fall back to a CPU-only instance if GPU initialization fails.
        Self::new().unwrap_or_else(|_| {
            let config = AccelerationConfig {
                enable_gpu: false,
                ..AccelerationConfig::default()
            };
            Self {
                config,
                device: Device::Cpu,
                #[cfg(not(target_os = "macos"))]
                cuda_device: None,
            }
        })
    }
}

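// Sketch: a guarded GPU multiply with a host fallback, using only the
// methods defined above. Compiled only when the `gpu` feature is enabled;
// dimensions are assumed compatible.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
fn example_gpu_or_cpu_multiply(a: &Array2<Float>, b: &Array2<Float>) -> Result<Array2<Float>> {
    let gpu = GpuAcceleration::default();
    if gpu.is_gpu_available() {
        gpu.gpu_matrix_multiply(a, b)
    } else {
        // Host fallback via ndarray.
        Ok(a.dot(b))
    }
}
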
#[cfg(feature = "gpu")]
pub struct GpuDecomposition {
    gpu_acceleration: GpuAcceleration,
}

#[cfg(feature = "gpu")]
impl GpuDecomposition {
    pub fn new() -> Result<Self> {
        Ok(Self {
            gpu_acceleration: GpuAcceleration::new()?,
        })
    }

    pub fn with_config(config: AccelerationConfig) -> Result<Self> {
        Ok(Self {
            gpu_acceleration: GpuAcceleration::with_config(config)?,
        })
    }

    /// PCA via SVD of the column-centered data. Inherits the placeholder
    /// behavior of `gpu_svd`, so the returned factors are not yet a real
    /// decomposition.
    pub fn gpu_pca(
        &self,
        data: &Array2<Float>,
        n_components: usize,
    ) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
        let (m, n) = data.dim();
        if n_components > m.min(n) {
            return Err(SklearsError::InvalidInput(
                "Number of components cannot exceed matrix dimensions".to_string(),
            ));
        }

        // Center each column by subtracting its mean.
        let col_means = data.mean_axis(scirs2_core::ndarray::Axis(0)).unwrap();
        let centered_data = data - &col_means.insert_axis(scirs2_core::ndarray::Axis(0));

        let (u, s, vt) = self.gpu_acceleration.gpu_svd(&centered_data)?;

        // Keep only the leading `n_components` factors.
        let u_truncated = u
            .slice(scirs2_core::ndarray::s![.., ..n_components])
            .to_owned();
        let s_truncated = s.slice(scirs2_core::ndarray::s![..n_components]).to_owned();
        let vt_truncated = vt
            .slice(scirs2_core::ndarray::s![..n_components, ..])
            .to_owned();

        Ok((u_truncated, s_truncated, vt_truncated))
    }

    /// Split `matrix` into two factors `A = U * sqrt(S)` and `B = sqrt(S) * Vt`.
    pub fn gpu_factorize(&self, matrix: &Array2<Float>) -> Result<(Array2<Float>, Array2<Float>)> {
        let (m, n) = matrix.dim();
        let k = m.min(n);

        let (u, s, vt) = self.gpu_acceleration.gpu_svd(matrix)?;

        let s_sqrt = s.mapv(|x| x.sqrt());
        let factor_a = &u.slice(scirs2_core::ndarray::s![.., ..k])
            * &s_sqrt.view().insert_axis(scirs2_core::ndarray::Axis(0));
        let factor_b = &s_sqrt.view().insert_axis(scirs2_core::ndarray::Axis(1))
            * &vt.slice(scirs2_core::ndarray::s![..k, ..]);

        Ok((factor_a, factor_b))
    }
}

#[cfg(feature = "gpu")]
impl Default for GpuDecomposition {
    fn default() -> Self {
        // With `enable_gpu: false`, `with_config` cannot fail, so the inner
        // `unwrap` is safe.
        Self::new().unwrap_or_else(|_| {
            let config = AccelerationConfig {
                enable_gpu: false,
                ..AccelerationConfig::default()
            };
            Self {
                gpu_acceleration: GpuAcceleration::with_config(config).unwrap(),
            }
        })
    }
}

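// Sketch: truncated PCA through the GPU path. Shapes follow the contract of
// `gpu_pca`: U is (m, k), S is (k,), Vt is (k, n) for k requested components.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
fn example_gpu_pca(
    data: &Array2<Float>,
) -> Result<(Array2<Float>, Array1<Float>, Array2<Float>)> {
    let decomp = GpuDecomposition::default();
    decomp.gpu_pca(data, 2)
}
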
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    #[test]
    fn test_simd_dot_product() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.dot_product_simd(&a, &b).unwrap();
        let expected = 1.0 * 5.0 + 2.0 * 6.0 + 3.0 * 7.0 + 4.0 * 8.0;

        assert!((result - expected).abs() < 1e-10);
    }

    #[test]
    fn test_simd_elementwise_add() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.elementwise_add_simd(&a, &b).unwrap();
        let expected = Array1::from_vec(vec![6.0, 8.0, 10.0, 12.0]);

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    #[test]
    fn test_simd_elementwise_mul() {
        let simd_ops = SimdMatrixOps::new();
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);

        let result = simd_ops.elementwise_mul_simd(&a, &b).unwrap();
        let expected = Array1::from_vec(vec![5.0, 12.0, 21.0, 32.0]);

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    #[test]
    fn test_matrix_vector_mul_simd() {
        let simd_ops = SimdMatrixOps::new();
        let matrix = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let vector = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let result = simd_ops.matrix_vector_mul_simd(&matrix, &vector).unwrap();
        let expected = Array1::from_vec(vec![14.0, 32.0]);

        for (r, e) in result.iter().zip(expected.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    #[test]
    fn test_parallel_operations() {
        let parallel_ops = ParallelDecomposition::new();

        let matrix = Array2::eye(3);
        let result = parallel_ops.parallel_eigendecomposition(&matrix);
        assert!(result.is_ok());

        let (eigenvals, eigenvecs) = result.unwrap();
        assert_eq!(eigenvals.len(), 3);
        assert_eq!(eigenvecs.dim(), (3, 3));
    }

    #[test]
    fn test_mixed_precision() {
        let mixed_ops = MixedPrecisionOps::new();

        let input_f64 = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let converted_f32 = mixed_ops.to_single_precision(&input_f64);
        let converted_back = mixed_ops.to_double_precision(&converted_f32);

        for (orig, back) in input_f64.iter().zip(converted_back.iter()) {
            assert!((orig - back).abs() < 1e-6);
        }
    }

    #[test]
    fn test_aligned_memory() {
        let aligned_ops = AlignedMemoryOps::new(32);

        let aligned_vec = aligned_ops.create_aligned_array(10);
        assert_eq!(aligned_vec.len(), 10);
        assert!(aligned_ops.is_aligned(&aligned_vec));

        let array = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let aligned_array = aligned_ops.ensure_aligned(&array);
        assert_eq!(aligned_array.len(), array.len());
        assert!(aligned_ops.is_aligned(&aligned_array));

        assert_eq!(aligned_array.as_slice(), array.as_slice().unwrap());
    }

    #[test]
    fn test_vectorized_functions() {
        let simd_ops = SimdMatrixOps::new();
        let input = Array1::from_vec(vec![0.0, 1.0, 2.0]);

        let exp_result = simd_ops.vector_exp_simd(&input);
        let expected_exp = input.mapv(|x| x.exp());

        for (r, e) in exp_result.iter().zip(expected_exp.iter()) {
            assert!((r - e).abs() < 1e-10);
        }

        let sqrt_result = simd_ops.vector_sqrt_simd(&Array1::from_vec(vec![1.0, 4.0, 9.0]));
        let expected_sqrt = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        for (r, e) in sqrt_result.iter().zip(expected_sqrt.iter()) {
            assert!((r - e).abs() < 1e-10);
        }
    }

    #[test]
    fn test_acceleration_config() {
        let config = AccelerationConfig {
            enable_simd: false,
            enable_parallel: false,
            enable_mixed_precision: true,
            enable_gpu: false,
            gpu_device_id: 0,
            gpu_memory_limit: None,
            num_threads: Some(4),
            memory_alignment: 64,
        };

        let simd_ops = SimdMatrixOps::new().with_config(config.clone());
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let b = Array1::from_vec(vec![4.0, 5.0, 6.0]);

        let result = simd_ops.dot_product_simd(&a, &b).unwrap();
        // 1*4 + 2*5 + 3*6 = 32, computed via the scalar fallback since SIMD
        // is disabled in this config.
        let expected = 32.0;
        assert!((result - expected).abs() < 1e-10);
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_acceleration_creation() {
        // Constructing the default accelerator must not panic, even without
        // a physical GPU.
        let _gpu_acc = GpuAcceleration::default();
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_config() {
        let config = AccelerationConfig {
            enable_gpu: true,
            gpu_device_id: 0,
            gpu_memory_limit: Some(1024 * 1024 * 1024), // 1 GiB
            ..AccelerationConfig::default()
        };

        assert!(config.enable_gpu);
        assert_eq!(config.gpu_device_id, 0);
        assert_eq!(config.gpu_memory_limit, Some(1024 * 1024 * 1024));
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_array_conversion() {
        // Keep the device on CPU so this test is hardware-independent.
        let config = AccelerationConfig {
            enable_gpu: false,
            ..AccelerationConfig::default()
        };

        if let Ok(gpu_acc) = GpuAcceleration::with_config(config) {
            let array = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();

            if let Ok(tensor) = gpu_acc.array_to_tensor(&array) {
                if let Ok(result_array) = gpu_acc.tensor_to_array(&tensor) {
                    assert_eq!(result_array.shape(), array.shape());
                }
            }
        }
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_decomposition_fallback() {
        let gpu_decomp = GpuDecomposition::default();

        let matrix =
            Array2::from_shape_vec((3, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
                .unwrap();

        if let Ok((factor_a, factor_b)) = gpu_decomp.gpu_factorize(&matrix) {
            assert_eq!(factor_a.nrows(), 3);
            assert_eq!(factor_b.ncols(), 3);
        }
    }

    #[cfg(feature = "gpu")]
    #[test]
    fn test_gpu_pca_basic() {
        let gpu_decomp = GpuDecomposition::default();

        let data = Array2::from_shape_vec(
            (4, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
            ],
        )
        .unwrap();

        if let Ok((u, s, vt)) = gpu_decomp.gpu_pca(&data, 2) {
            assert_eq!(u.ncols(), 2);
            assert_eq!(s.len(), 2);
            assert_eq!(vt.nrows(), 2);
        }
    }
}