1use scirs2_core::ndarray::{Array1, Array2};
7use scirs2_core::random::essentials::{Normal as RandNormal, Uniform as RandUniform};
8use scirs2_core::random::rngs::StdRng as RealStdRng;
9use scirs2_core::random::seq::SliceRandom;
10use scirs2_core::random::Rng;
11use scirs2_core::random::{thread_rng, SeedableRng};
12use serde::{Deserialize, Serialize};
13use std::sync::Arc;
14
15use sklears_core::error::{Result, SklearsError};
16use sklears_core::traits::{Fit, Transform};
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub enum GpuBackend {
22 Cuda,
24 OpenCL,
26 Metal,
28 Cpu, }
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub enum MemoryStrategy {
36 Pinned, Managed, Explicit, }
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub enum Precision {
48 Single, Double, Half, }
55
56#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct GpuDevice {
60 pub id: usize,
62 pub name: String,
64 pub compute_capability: (u32, u32),
66 pub total_memory: usize,
68 pub multiprocessor_count: usize,
70 pub max_threads_per_block: usize,
72 pub max_shared_memory: usize,
74}
75
76#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct GpuConfig {
80 pub backend: GpuBackend,
82 pub device_id: usize,
84 pub memory_strategy: MemoryStrategy,
86 pub precision: Precision,
88 pub block_size: usize,
90 pub grid_size: usize,
92 pub enable_async: bool,
94 pub stream_count: usize,
96}
97
98impl Default for GpuConfig {
99 fn default() -> Self {
100 Self {
101 backend: GpuBackend::Cpu,
102 device_id: 0,
103 memory_strategy: MemoryStrategy::Managed,
104 precision: Precision::Double,
105 block_size: 256,
106 grid_size: 0, enable_async: true,
108 stream_count: 4,
109 }
110 }
111}
112
113#[derive(Debug)]
115pub struct GpuContext {
117 pub config: GpuConfig,
119 pub device: Option<GpuDevice>,
121 pub is_initialized: bool,
123}
124
125impl GpuContext {
126 pub fn new(config: GpuConfig) -> Self {
127 Self {
128 config,
129 device: None,
130 is_initialized: false,
131 }
132 }
133
134 pub fn initialize(&mut self) -> Result<()> {
135 match self.config.backend {
136 GpuBackend::Cuda => self.initialize_cuda(),
137 GpuBackend::OpenCL => self.initialize_opencl(),
138 GpuBackend::Metal => self.initialize_metal(),
139 GpuBackend::Cpu => {
140 self.is_initialized = true;
141 Ok(())
142 }
143 }
144 }
145
146 fn initialize_cuda(&mut self) -> Result<()> {
147 let device = GpuDevice {
150 id: self.config.device_id,
151 name: "Simulated CUDA Device".to_string(),
152 compute_capability: (8, 6),
153 total_memory: 12 * 1024 * 1024 * 1024, multiprocessor_count: 82,
155 max_threads_per_block: 1024,
156 max_shared_memory: 48 * 1024,
157 };
158
159 self.device = Some(device);
160 self.is_initialized = true;
161 Ok(())
162 }
163
164 fn initialize_opencl(&mut self) -> Result<()> {
165 let device = GpuDevice {
167 id: self.config.device_id,
168 name: "Simulated OpenCL Device".to_string(),
169 compute_capability: (0, 0),
170 total_memory: 8 * 1024 * 1024 * 1024, multiprocessor_count: 64,
172 max_threads_per_block: 256,
173 max_shared_memory: 32 * 1024,
174 };
175
176 self.device = Some(device);
177 self.is_initialized = true;
178 Ok(())
179 }
180
181 fn initialize_metal(&mut self) -> Result<()> {
182 let device = GpuDevice {
184 id: self.config.device_id,
185 name: "Simulated Metal Device".to_string(),
186 compute_capability: (0, 0),
187 total_memory: 16 * 1024 * 1024 * 1024, multiprocessor_count: 32,
189 max_threads_per_block: 1024,
190 max_shared_memory: 32 * 1024,
191 };
192
193 self.device = Some(device);
194 self.is_initialized = true;
195 Ok(())
196 }
197
198 pub fn get_optimal_block_size(&self, problem_size: usize) -> usize {
199 if let Some(device) = &self.device {
200 let max_threads = device.max_threads_per_block;
201 let suggested_size = (problem_size as f64).sqrt() as usize;
202 suggested_size.min(max_threads).max(32)
203 } else {
204 256
205 }
206 }
207
208 pub fn get_optimal_grid_size(&self, problem_size: usize, block_size: usize) -> usize {
209 (problem_size + block_size - 1) / block_size
210 }
211}
212
213#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct GpuRBFSampler {
217 pub n_components: usize,
219 pub gamma: f64,
221 pub gpu_config: GpuConfig,
223}
224
225impl GpuRBFSampler {
226 pub fn new(n_components: usize) -> Self {
227 Self {
228 n_components,
229 gamma: 1.0,
230 gpu_config: GpuConfig::default(),
231 }
232 }
233
234 pub fn gamma(mut self, gamma: f64) -> Self {
235 self.gamma = gamma;
236 self
237 }
238
239 pub fn gpu_config(mut self, config: GpuConfig) -> Self {
240 self.gpu_config = config;
241 self
242 }
243
244 fn generate_random_features_gpu(
245 &self,
246 input_dim: usize,
247 ctx: &GpuContext,
248 ) -> Result<Array2<f64>> {
249 match ctx.config.backend {
250 GpuBackend::Cuda => self.generate_features_cuda(input_dim, ctx),
251 GpuBackend::OpenCL => self.generate_features_opencl(input_dim, ctx),
252 GpuBackend::Metal => self.generate_features_metal(input_dim, ctx),
253 GpuBackend::Cpu => self.generate_features_cpu(input_dim),
254 }
255 }
256
257 fn generate_features_cuda(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
258 let mut rng = RealStdRng::from_seed(thread_rng().gen());
266 let normal = RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).unwrap();
267
268 let mut weights = Array2::zeros((self.n_components, input_dim));
269 for i in 0..self.n_components {
270 for j in 0..input_dim {
271 weights[[i, j]] = rng.sample(normal);
272 }
273 }
274
275 Ok(weights)
276 }
277
278 fn generate_features_opencl(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
279 let mut rng = RealStdRng::from_seed(thread_rng().gen());
281 let normal = RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).unwrap();
282
283 let mut weights = Array2::zeros((self.n_components, input_dim));
284 for i in 0..self.n_components {
285 for j in 0..input_dim {
286 weights[[i, j]] = rng.sample(normal);
287 }
288 }
289
290 Ok(weights)
291 }
292
293 fn generate_features_metal(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
294 let mut rng = RealStdRng::from_seed(thread_rng().gen());
296 let normal = RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).unwrap();
297
298 let mut weights = Array2::zeros((self.n_components, input_dim));
299 for i in 0..self.n_components {
300 for j in 0..input_dim {
301 weights[[i, j]] = rng.sample(normal);
302 }
303 }
304
305 Ok(weights)
306 }
307
308 fn generate_features_cpu(&self, input_dim: usize) -> Result<Array2<f64>> {
309 let mut rng = RealStdRng::from_seed(thread_rng().gen());
311 let normal = RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).unwrap();
312
313 let mut weights = Array2::zeros((self.n_components, input_dim));
314 for i in 0..self.n_components {
315 for j in 0..input_dim {
316 weights[[i, j]] = rng.sample(normal);
317 }
318 }
319
320 Ok(weights)
321 }
322
323 fn transform_gpu(
324 &self,
325 x: &Array2<f64>,
326 weights: &Array2<f64>,
327 biases: &Array1<f64>,
328 ctx: &GpuContext,
329 ) -> Result<Array2<f64>> {
330 match ctx.config.backend {
331 GpuBackend::Cuda => self.transform_cuda(x, weights, biases, ctx),
332 GpuBackend::OpenCL => self.transform_opencl(x, weights, biases, ctx),
333 GpuBackend::Metal => self.transform_metal(x, weights, biases, ctx),
334 GpuBackend::Cpu => self.transform_cpu(x, weights, biases),
335 }
336 }
337
338 fn transform_cuda(
339 &self,
340 x: &Array2<f64>,
341 weights: &Array2<f64>,
342 biases: &Array1<f64>,
343 _ctx: &GpuContext,
344 ) -> Result<Array2<f64>> {
345 let n_samples = x.nrows();
349 let mut result = Array2::zeros((n_samples, self.n_components));
350
351 for i in 0..n_samples {
353 for j in 0..self.n_components {
354 let mut dot_product = 0.0;
355 for k in 0..x.ncols() {
356 dot_product += x[[i, k]] * weights[[j, k]];
357 }
358 dot_product += biases[j];
359
360 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
362 }
363 }
364
365 Ok(result)
366 }
367
368 fn transform_opencl(
369 &self,
370 x: &Array2<f64>,
371 weights: &Array2<f64>,
372 biases: &Array1<f64>,
373 _ctx: &GpuContext,
374 ) -> Result<Array2<f64>> {
375 self.transform_cpu(x, weights, biases)
377 }
378
379 fn transform_metal(
380 &self,
381 x: &Array2<f64>,
382 weights: &Array2<f64>,
383 biases: &Array1<f64>,
384 _ctx: &GpuContext,
385 ) -> Result<Array2<f64>> {
386 self.transform_cpu(x, weights, biases)
388 }
389
390 fn transform_cpu(
391 &self,
392 x: &Array2<f64>,
393 weights: &Array2<f64>,
394 biases: &Array1<f64>,
395 ) -> Result<Array2<f64>> {
396 let n_samples = x.nrows();
397 let mut result = Array2::zeros((n_samples, self.n_components));
398
399 for i in 0..n_samples {
400 for j in 0..self.n_components {
401 let mut dot_product = 0.0;
402 for k in 0..x.ncols() {
403 dot_product += x[[i, k]] * weights[[j, k]];
404 }
405 dot_product += biases[j];
406
407 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
408 }
409 }
410
411 Ok(result)
412 }
413}
414
415#[derive(Debug, Clone, Serialize, Deserialize)]
416pub struct FittedGpuRBFSampler {
418 pub n_components: usize,
420 pub gamma: f64,
422 pub gpu_config: GpuConfig,
424 pub weights: Array2<f64>,
426 pub biases: Array1<f64>,
428 #[serde(skip)]
429 pub gpu_context: Option<Arc<GpuContext>>,
430}
431
432impl Fit<Array2<f64>, ()> for GpuRBFSampler {
433 type Fitted = FittedGpuRBFSampler;
434
435 fn fit(self, x: &Array2<f64>, _y: &()) -> Result<Self::Fitted> {
436 let input_dim = x.ncols();
437
438 let mut gpu_context = GpuContext::new(self.gpu_config.clone());
440 gpu_context.initialize()?;
441
442 let weights = self.generate_random_features_gpu(input_dim, &gpu_context)?;
444
445 let mut rng = RealStdRng::from_seed(thread_rng().gen());
447 let uniform = RandUniform::new(0.0, 2.0 * std::f64::consts::PI).unwrap();
448 let biases = Array1::from_vec(
449 (0..self.n_components)
450 .map(|_| rng.sample(uniform))
451 .collect(),
452 );
453
454 Ok(FittedGpuRBFSampler {
455 n_components: self.n_components,
456 gamma: self.gamma,
457 gpu_config: self.gpu_config.clone(),
458 weights,
459 biases,
460 gpu_context: Some(Arc::new(gpu_context)),
461 })
462 }
463}
464
465impl FittedGpuRBFSampler {
466 fn transform_gpu(
467 &self,
468 x: &Array2<f64>,
469 weights: &Array2<f64>,
470 biases: &Array1<f64>,
471 ctx: &GpuContext,
472 ) -> Result<Array2<f64>> {
473 match ctx.config.backend {
474 GpuBackend::Cuda => self.transform_cuda(x, weights, biases, ctx),
475 GpuBackend::OpenCL => self.transform_opencl(x, weights, biases, ctx),
476 GpuBackend::Metal => self.transform_metal(x, weights, biases, ctx),
477 GpuBackend::Cpu => self.transform_cpu(x, weights, biases),
478 }
479 }
480
481 fn transform_cuda(
482 &self,
483 x: &Array2<f64>,
484 weights: &Array2<f64>,
485 biases: &Array1<f64>,
486 _ctx: &GpuContext,
487 ) -> Result<Array2<f64>> {
488 let n_samples = x.nrows();
489 let mut result = Array2::zeros((n_samples, self.n_components));
490
491 for i in 0..n_samples {
492 for j in 0..self.n_components {
493 let mut dot_product = 0.0;
494 for k in 0..x.ncols() {
495 dot_product += x[[i, k]] * weights[[j, k]];
496 }
497 dot_product += biases[j];
498
499 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
500 }
501 }
502
503 Ok(result)
504 }
505
506 fn transform_opencl(
507 &self,
508 x: &Array2<f64>,
509 weights: &Array2<f64>,
510 biases: &Array1<f64>,
511 _ctx: &GpuContext,
512 ) -> Result<Array2<f64>> {
513 self.transform_cpu(x, weights, biases)
514 }
515
516 fn transform_metal(
517 &self,
518 x: &Array2<f64>,
519 weights: &Array2<f64>,
520 biases: &Array1<f64>,
521 _ctx: &GpuContext,
522 ) -> Result<Array2<f64>> {
523 self.transform_cpu(x, weights, biases)
524 }
525
526 fn transform_cpu(
527 &self,
528 x: &Array2<f64>,
529 weights: &Array2<f64>,
530 biases: &Array1<f64>,
531 ) -> Result<Array2<f64>> {
532 let n_samples = x.nrows();
533 let mut result = Array2::zeros((n_samples, self.n_components));
534
535 for i in 0..n_samples {
536 for j in 0..self.n_components {
537 let mut dot_product = 0.0;
538 for k in 0..x.ncols() {
539 dot_product += x[[i, k]] * weights[[j, k]];
540 }
541 dot_product += biases[j];
542
543 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
544 }
545 }
546
547 Ok(result)
548 }
549}
550
551impl Transform<Array2<f64>, Array2<f64>> for FittedGpuRBFSampler {
552 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
553 if let Some(ctx) = &self.gpu_context {
554 self.transform_gpu(x, &self.weights, &self.biases, ctx)
555 } else {
556 self.transform_cpu(x, &self.weights, &self.biases)
557 }
558 }
559}
560
561#[derive(Debug, Clone, Serialize, Deserialize)]
563pub struct GpuNystroem {
565 pub n_components: usize,
567 pub kernel: String,
569 pub gamma: f64,
571 pub gpu_config: GpuConfig,
573}
574
575impl GpuNystroem {
576 pub fn new(n_components: usize) -> Self {
577 Self {
578 n_components,
579 kernel: "rbf".to_string(),
580 gamma: 1.0,
581 gpu_config: GpuConfig::default(),
582 }
583 }
584
585 pub fn kernel(mut self, kernel: String) -> Self {
586 self.kernel = kernel;
587 self
588 }
589
590 pub fn gamma(mut self, gamma: f64) -> Self {
591 self.gamma = gamma;
592 self
593 }
594
595 pub fn gpu_config(mut self, config: GpuConfig) -> Self {
596 self.gpu_config = config;
597 self
598 }
599
600 fn compute_kernel_matrix_gpu(
601 &self,
602 x: &Array2<f64>,
603 y: &Array2<f64>,
604 ctx: &GpuContext,
605 ) -> Result<Array2<f64>> {
606 match ctx.config.backend {
607 GpuBackend::Cuda => self.compute_kernel_cuda(x, y, ctx),
608 GpuBackend::OpenCL => self.compute_kernel_opencl(x, y, ctx),
609 GpuBackend::Metal => self.compute_kernel_metal(x, y, ctx),
610 GpuBackend::Cpu => self.compute_kernel_cpu(x, y),
611 }
612 }
613
614 fn compute_kernel_cuda(
615 &self,
616 x: &Array2<f64>,
617 y: &Array2<f64>,
618 _ctx: &GpuContext,
619 ) -> Result<Array2<f64>> {
620 self.compute_kernel_cpu(x, y)
623 }
624
625 fn compute_kernel_opencl(
626 &self,
627 x: &Array2<f64>,
628 y: &Array2<f64>,
629 _ctx: &GpuContext,
630 ) -> Result<Array2<f64>> {
631 self.compute_kernel_cpu(x, y)
632 }
633
634 fn compute_kernel_metal(
635 &self,
636 x: &Array2<f64>,
637 y: &Array2<f64>,
638 _ctx: &GpuContext,
639 ) -> Result<Array2<f64>> {
640 self.compute_kernel_cpu(x, y)
641 }
642
643 fn compute_kernel_cpu(&self, x: &Array2<f64>, y: &Array2<f64>) -> Result<Array2<f64>> {
644 let n_x = x.nrows();
645 let n_y = y.nrows();
646 let mut kernel_matrix = Array2::zeros((n_x, n_y));
647
648 match self.kernel.as_str() {
649 "rbf" => {
650 for i in 0..n_x {
651 for j in 0..n_y {
652 let diff = &x.row(i) - &y.row(j);
653 let squared_norm = diff.dot(&diff);
654 kernel_matrix[[i, j]] = (-self.gamma * squared_norm).exp();
655 }
656 }
657 }
658 "linear" => {
659 for i in 0..n_x {
660 for j in 0..n_y {
661 kernel_matrix[[i, j]] = x.row(i).dot(&y.row(j));
662 }
663 }
664 }
665 _ => {
666 return Err(SklearsError::InvalidInput(format!(
667 "Unsupported kernel: {}",
668 self.kernel
669 )));
670 }
671 }
672
673 Ok(kernel_matrix)
674 }
675
676 fn eigendecomposition_gpu(
677 &self,
678 matrix: &Array2<f64>,
679 ctx: &GpuContext,
680 ) -> Result<(Array1<f64>, Array2<f64>)> {
681 match ctx.config.backend {
682 GpuBackend::Cuda => self.eigendecomposition_cuda(matrix, ctx),
683 GpuBackend::OpenCL => self.eigendecomposition_opencl(matrix, ctx),
684 GpuBackend::Metal => self.eigendecomposition_metal(matrix, ctx),
685 GpuBackend::Cpu => self.eigendecomposition_cpu(matrix),
686 }
687 }
688
689 fn eigendecomposition_cuda(
690 &self,
691 matrix: &Array2<f64>,
692 _ctx: &GpuContext,
693 ) -> Result<(Array1<f64>, Array2<f64>)> {
694 self.eigendecomposition_cpu(matrix)
696 }
697
698 fn eigendecomposition_opencl(
699 &self,
700 matrix: &Array2<f64>,
701 _ctx: &GpuContext,
702 ) -> Result<(Array1<f64>, Array2<f64>)> {
703 self.eigendecomposition_cpu(matrix)
704 }
705
706 fn eigendecomposition_metal(
707 &self,
708 matrix: &Array2<f64>,
709 _ctx: &GpuContext,
710 ) -> Result<(Array1<f64>, Array2<f64>)> {
711 self.eigendecomposition_cpu(matrix)
712 }
713
714 fn eigendecomposition_cpu(&self, matrix: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>)> {
715 let n = matrix.nrows();
717 let mut eigenvalues = Array1::zeros(n);
718 let mut eigenvectors = Array2::zeros((n, n));
719
720 let mut v: Array1<f64> = Array1::ones(n);
722 v /= v.dot(&v).sqrt();
723
724 for _ in 0..100 {
725 let mut av: Array1<f64> = Array1::zeros(n);
726 for i in 0..n {
727 for j in 0..n {
728 av[i] += matrix[[i, j]] * v[j];
729 }
730 }
731
732 let norm = av.dot(&av).sqrt();
733 if norm > 1e-12 {
734 v = av / norm;
735 eigenvalues[0] = norm;
736 } else {
737 break;
738 }
739 }
740
741 for i in 0..n {
743 eigenvectors[[i, 0]] = v[i];
744 }
745
746 for i in 1..n {
748 eigenvectors[[i, i]] = 1.0;
749 eigenvalues[i] = 0.1; }
751
752 Ok((eigenvalues, eigenvectors))
753 }
754}
755
756#[derive(Debug, Clone, Serialize, Deserialize)]
757pub struct FittedGpuNystroem {
759 pub n_components: usize,
761 pub kernel: String,
763 pub gamma: f64,
765 pub gpu_config: GpuConfig,
767 pub basis_vectors: Array2<f64>,
769 pub eigenvalues: Array1<f64>,
771 pub eigenvectors: Array2<f64>,
773 #[serde(skip)]
774 pub gpu_context: Option<Arc<GpuContext>>,
775}
776
777impl Fit<Array2<f64>, ()> for GpuNystroem {
778 type Fitted = FittedGpuNystroem;
779
780 fn fit(self, x: &Array2<f64>, _y: &()) -> Result<Self::Fitted> {
781 let mut gpu_context = GpuContext::new(self.gpu_config.clone());
782 gpu_context.initialize()?;
783
784 let n_samples = x.nrows();
785 let n_components = self.n_components.min(n_samples);
786
787 let mut rng = RealStdRng::from_seed(thread_rng().gen());
789 let mut indices: Vec<usize> = (0..n_samples).collect();
790 indices.shuffle(&mut rng);
791 indices.truncate(n_components);
792
793 let mut basis_vectors = Array2::zeros((n_components, x.ncols()));
794 for (i, &idx) in indices.iter().enumerate() {
795 basis_vectors.row_mut(i).assign(&x.row(idx));
796 }
797
798 let kernel_matrix =
800 self.compute_kernel_matrix_gpu(&basis_vectors, &basis_vectors, &gpu_context)?;
801
802 let (eigenvalues, eigenvectors) =
804 self.eigendecomposition_gpu(&kernel_matrix, &gpu_context)?;
805
806 Ok(FittedGpuNystroem {
807 n_components,
808 kernel: self.kernel.clone(),
809 gamma: self.gamma,
810 gpu_config: self.gpu_config.clone(),
811 basis_vectors,
812 eigenvalues,
813 eigenvectors,
814 gpu_context: Some(Arc::new(gpu_context)),
815 })
816 }
817}
818
819impl Transform<Array2<f64>, Array2<f64>> for FittedGpuNystroem {
820 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
821 if let Some(ctx) = &self.gpu_context {
822 let kernel_matrix = self.compute_kernel_matrix_gpu(x, &self.basis_vectors, ctx)?;
824
825 let mut result = Array2::zeros((x.nrows(), self.n_components));
827
828 for i in 0..x.nrows() {
829 for j in 0..self.n_components {
830 let mut sum = 0.0;
831 for k in 0..self.n_components {
832 let eigenval_sqrt_inv = if self.eigenvalues[k] > 1e-12 {
833 1.0 / self.eigenvalues[k].sqrt()
834 } else {
835 0.0
836 };
837 sum +=
838 kernel_matrix[[i, k]] * self.eigenvectors[[k, j]] * eigenval_sqrt_inv;
839 }
840 result[[i, j]] = sum;
841 }
842 }
843
844 Ok(result)
845 } else {
846 Err(SklearsError::InvalidData {
847 reason: "GPU context not initialized".to_string(),
848 })
849 }
850 }
851}
852
853#[derive(Debug, Clone, Serialize, Deserialize)]
855pub struct GpuProfiler {
857 pub enable_profiling: bool,
859 pub kernel_times: Vec<(String, f64)>,
861 pub memory_usage: Vec<(String, usize)>,
863 pub transfer_times: Vec<(String, f64)>,
865}
866
867impl Default for GpuProfiler {
868 fn default() -> Self {
869 Self::new()
870 }
871}
872
873impl GpuProfiler {
874 pub fn new() -> Self {
875 Self {
876 enable_profiling: false,
877 kernel_times: Vec::new(),
878 memory_usage: Vec::new(),
879 transfer_times: Vec::new(),
880 }
881 }
882
883 pub fn enable(mut self) -> Self {
884 self.enable_profiling = true;
885 self
886 }
887
888 pub fn record_kernel_time(&mut self, kernel_name: &str, time_ms: f64) {
889 if self.enable_profiling {
890 self.kernel_times.push((kernel_name.to_string(), time_ms));
891 }
892 }
893
894 pub fn record_memory_usage(&mut self, operation: &str, bytes: usize) {
895 if self.enable_profiling {
896 self.memory_usage.push((operation.to_string(), bytes));
897 }
898 }
899
900 pub fn record_transfer_time(&mut self, operation: &str, time_ms: f64) {
901 if self.enable_profiling {
902 self.transfer_times.push((operation.to_string(), time_ms));
903 }
904 }
905
906 pub fn get_summary(&self) -> String {
907 let mut summary = String::new();
908
909 summary.push_str("GPU Performance Summary:\n");
910 summary.push_str(&format!(
911 "Total kernel executions: {}\n",
912 self.kernel_times.len()
913 ));
914
915 if !self.kernel_times.is_empty() {
916 let total_kernel_time: f64 = self.kernel_times.iter().map(|(_, time)| time).sum();
917 summary.push_str(&format!("Total kernel time: {:.2} ms\n", total_kernel_time));
918 }
919
920 if !self.memory_usage.is_empty() {
921 let max_memory: usize = *self
922 .memory_usage
923 .iter()
924 .map(|(_, bytes)| bytes)
925 .max()
926 .unwrap_or(&0);
927 summary.push_str(&format!("Peak memory usage: {} bytes\n", max_memory));
928 }
929
930 if !self.transfer_times.is_empty() {
931 let total_transfer_time: f64 = self.transfer_times.iter().map(|(_, time)| time).sum();
932 summary.push_str(&format!(
933 "Total transfer time: {:.2} ms\n",
934 total_transfer_time
935 ));
936 }
937
938 summary
939 }
940}
941
942#[allow(non_snake_case)]
943#[cfg(test)]
944mod tests {
945 use super::*;
946 use scirs2_core::essentials::Normal;
947
948 use scirs2_core::ndarray::{Array, Array2};
949 use scirs2_core::random::thread_rng;
950
951 #[test]
952 fn test_gpu_context_initialization() {
953 let config = GpuConfig::default();
954 let mut context = GpuContext::new(config);
955
956 assert!(context.initialize().is_ok());
957 assert!(context.is_initialized);
958 }
959
960 #[test]
961 fn test_gpu_rbf_sampler() {
962 let x: Array2<f64> = Array::from_shape_fn((50, 10), |_| {
963 let mut rng = thread_rng();
964 rng.sample(&Normal::new(0.0, 1.0).unwrap())
965 });
966 let sampler = GpuRBFSampler::new(100).gamma(0.5);
967
968 let fitted = sampler.fit(&x, &()).unwrap();
969 let transformed = fitted.transform(&x).unwrap();
970
971 assert_eq!(transformed.shape(), &[50, 100]);
972 }
973
974 #[test]
975 fn test_gpu_nystroem() {
976 let x: Array2<f64> = Array::from_shape_fn((30, 5), |_| {
977 let mut rng = thread_rng();
978 rng.sample(&Normal::new(0.0, 1.0).unwrap())
979 });
980 let nystroem = GpuNystroem::new(20).gamma(1.0);
981
982 let fitted = nystroem.fit(&x, &()).unwrap();
983 let transformed = fitted.transform(&x).unwrap();
984
985 assert_eq!(transformed.shape()[0], 30);
986 assert!(transformed.shape()[1] <= 20);
987 }
988
989 #[test]
990 fn test_gpu_profiler() {
991 let mut profiler = GpuProfiler::new().enable();
992
993 profiler.record_kernel_time("test_kernel", 5.5);
994 profiler.record_memory_usage("allocation", 1024);
995 profiler.record_transfer_time("host_to_device", 2.1);
996
997 let summary = profiler.get_summary();
998 assert!(summary.contains("Total kernel executions: 1"));
999 assert!(summary.contains("Total kernel time: 5.50"));
1000 }
1001}
1002
1003impl FittedGpuNystroem {
1004 fn compute_kernel_matrix_gpu(
1005 &self,
1006 x: &Array2<f64>,
1007 y: &Array2<f64>,
1008 _ctx: &GpuContext,
1009 ) -> Result<Array2<f64>> {
1010 let n_x = x.nrows();
1011 let n_y = y.nrows();
1012 let mut kernel_matrix = Array2::zeros((n_x, n_y));
1013
1014 match self.kernel.as_str() {
1015 "rbf" => {
1016 for i in 0..n_x {
1017 for j in 0..n_y {
1018 let diff = &x.row(i) - &y.row(j);
1019 let squared_norm = diff.dot(&diff);
1020 kernel_matrix[[i, j]] = (-self.gamma * squared_norm).exp();
1021 }
1022 }
1023 }
1024 "linear" => {
1025 for i in 0..n_x {
1026 for j in 0..n_y {
1027 kernel_matrix[[i, j]] = x.row(i).dot(&y.row(j));
1028 }
1029 }
1030 }
1031 _ => {
1032 return Err(SklearsError::InvalidInput(format!(
1033 "Unsupported kernel: {}",
1034 self.kernel
1035 )));
1036 }
1037 }
1038
1039 Ok(kernel_matrix)
1040 }
1041}