1use scirs2_core::ndarray::{Array1, Array2};
7use scirs2_core::random::essentials::{Normal as RandNormal, Uniform as RandUniform};
8use scirs2_core::random::rngs::StdRng as RealStdRng;
9use scirs2_core::random::seq::SliceRandom;
10use scirs2_core::random::RngExt;
11use scirs2_core::random::{thread_rng, SeedableRng};
12use serde::{Deserialize, Serialize};
13use std::sync::Arc;
14
15use sklears_core::error::{Result, SklearsError};
16use sklears_core::traits::{Fit, Transform};
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub enum GpuBackend {
22 Cuda,
24 OpenCL,
26 Metal,
28 Cpu, }
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub enum MemoryStrategy {
36 Pinned, Managed, Explicit, }
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub enum Precision {
48 Single, Double, Half, }
55
56#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct GpuDevice {
60 pub id: usize,
62 pub name: String,
64 pub compute_capability: (u32, u32),
66 pub total_memory: usize,
68 pub multiprocessor_count: usize,
70 pub max_threads_per_block: usize,
72 pub max_shared_memory: usize,
74}
75
76#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct GpuConfig {
80 pub backend: GpuBackend,
82 pub device_id: usize,
84 pub memory_strategy: MemoryStrategy,
86 pub precision: Precision,
88 pub block_size: usize,
90 pub grid_size: usize,
92 pub enable_async: bool,
94 pub stream_count: usize,
96}
97
98impl Default for GpuConfig {
99 fn default() -> Self {
100 Self {
101 backend: GpuBackend::Cpu,
102 device_id: 0,
103 memory_strategy: MemoryStrategy::Managed,
104 precision: Precision::Double,
105 block_size: 256,
106 grid_size: 0, enable_async: true,
108 stream_count: 4,
109 }
110 }
111}
112
113#[derive(Debug)]
115pub struct GpuContext {
117 pub config: GpuConfig,
119 pub device: Option<GpuDevice>,
121 pub is_initialized: bool,
123}
124
125impl GpuContext {
126 pub fn new(config: GpuConfig) -> Self {
127 Self {
128 config,
129 device: None,
130 is_initialized: false,
131 }
132 }
133
134 pub fn initialize(&mut self) -> Result<()> {
135 match self.config.backend {
136 GpuBackend::Cuda => self.initialize_cuda(),
137 GpuBackend::OpenCL => self.initialize_opencl(),
138 GpuBackend::Metal => self.initialize_metal(),
139 GpuBackend::Cpu => {
140 self.is_initialized = true;
141 Ok(())
142 }
143 }
144 }
145
146 fn initialize_cuda(&mut self) -> Result<()> {
147 let device = GpuDevice {
150 id: self.config.device_id,
151 name: "Simulated CUDA Device".to_string(),
152 compute_capability: (8, 6),
153 total_memory: 12 * 1024 * 1024 * 1024, multiprocessor_count: 82,
155 max_threads_per_block: 1024,
156 max_shared_memory: 48 * 1024,
157 };
158
159 self.device = Some(device);
160 self.is_initialized = true;
161 Ok(())
162 }
163
164 fn initialize_opencl(&mut self) -> Result<()> {
165 let device = GpuDevice {
167 id: self.config.device_id,
168 name: "Simulated OpenCL Device".to_string(),
169 compute_capability: (0, 0),
170 total_memory: 8 * 1024 * 1024 * 1024, multiprocessor_count: 64,
172 max_threads_per_block: 256,
173 max_shared_memory: 32 * 1024,
174 };
175
176 self.device = Some(device);
177 self.is_initialized = true;
178 Ok(())
179 }
180
181 fn initialize_metal(&mut self) -> Result<()> {
182 let device = GpuDevice {
184 id: self.config.device_id,
185 name: "Simulated Metal Device".to_string(),
186 compute_capability: (0, 0),
187 total_memory: 16 * 1024 * 1024 * 1024, multiprocessor_count: 32,
189 max_threads_per_block: 1024,
190 max_shared_memory: 32 * 1024,
191 };
192
193 self.device = Some(device);
194 self.is_initialized = true;
195 Ok(())
196 }
197
198 pub fn get_optimal_block_size(&self, problem_size: usize) -> usize {
199 if let Some(device) = &self.device {
200 let max_threads = device.max_threads_per_block;
201 let suggested_size = (problem_size as f64).sqrt() as usize;
202 suggested_size.min(max_threads).max(32)
203 } else {
204 256
205 }
206 }
207
208 pub fn get_optimal_grid_size(&self, problem_size: usize, block_size: usize) -> usize {
209 (problem_size + block_size - 1) / block_size
210 }
211}
212
213#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct GpuRBFSampler {
217 pub n_components: usize,
219 pub gamma: f64,
221 pub gpu_config: GpuConfig,
223}
224
225impl GpuRBFSampler {
226 pub fn new(n_components: usize) -> Self {
227 Self {
228 n_components,
229 gamma: 1.0,
230 gpu_config: GpuConfig::default(),
231 }
232 }
233
234 pub fn gamma(mut self, gamma: f64) -> Self {
235 self.gamma = gamma;
236 self
237 }
238
239 pub fn gpu_config(mut self, config: GpuConfig) -> Self {
240 self.gpu_config = config;
241 self
242 }
243
244 fn generate_random_features_gpu(
245 &self,
246 input_dim: usize,
247 ctx: &GpuContext,
248 ) -> Result<Array2<f64>> {
249 match ctx.config.backend {
250 GpuBackend::Cuda => self.generate_features_cuda(input_dim, ctx),
251 GpuBackend::OpenCL => self.generate_features_opencl(input_dim, ctx),
252 GpuBackend::Metal => self.generate_features_metal(input_dim, ctx),
253 GpuBackend::Cpu => self.generate_features_cpu(input_dim),
254 }
255 }
256
257 fn generate_features_cuda(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
258 let mut rng = RealStdRng::from_seed(thread_rng().random());
266 let normal =
267 RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).expect("operation should succeed");
268
269 let mut weights = Array2::zeros((self.n_components, input_dim));
270 for i in 0..self.n_components {
271 for j in 0..input_dim {
272 weights[[i, j]] = rng.sample(normal);
273 }
274 }
275
276 Ok(weights)
277 }
278
279 fn generate_features_opencl(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
280 let mut rng = RealStdRng::from_seed(thread_rng().random());
282 let normal =
283 RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).expect("operation should succeed");
284
285 let mut weights = Array2::zeros((self.n_components, input_dim));
286 for i in 0..self.n_components {
287 for j in 0..input_dim {
288 weights[[i, j]] = rng.sample(normal);
289 }
290 }
291
292 Ok(weights)
293 }
294
295 fn generate_features_metal(&self, input_dim: usize, _ctx: &GpuContext) -> Result<Array2<f64>> {
296 let mut rng = RealStdRng::from_seed(thread_rng().random());
298 let normal =
299 RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).expect("operation should succeed");
300
301 let mut weights = Array2::zeros((self.n_components, input_dim));
302 for i in 0..self.n_components {
303 for j in 0..input_dim {
304 weights[[i, j]] = rng.sample(normal);
305 }
306 }
307
308 Ok(weights)
309 }
310
311 fn generate_features_cpu(&self, input_dim: usize) -> Result<Array2<f64>> {
312 let mut rng = RealStdRng::from_seed(thread_rng().random());
314 let normal =
315 RandNormal::new(0.0, (2.0 * self.gamma).sqrt()).expect("operation should succeed");
316
317 let mut weights = Array2::zeros((self.n_components, input_dim));
318 for i in 0..self.n_components {
319 for j in 0..input_dim {
320 weights[[i, j]] = rng.sample(normal);
321 }
322 }
323
324 Ok(weights)
325 }
326
327 fn transform_gpu(
328 &self,
329 x: &Array2<f64>,
330 weights: &Array2<f64>,
331 biases: &Array1<f64>,
332 ctx: &GpuContext,
333 ) -> Result<Array2<f64>> {
334 match ctx.config.backend {
335 GpuBackend::Cuda => self.transform_cuda(x, weights, biases, ctx),
336 GpuBackend::OpenCL => self.transform_opencl(x, weights, biases, ctx),
337 GpuBackend::Metal => self.transform_metal(x, weights, biases, ctx),
338 GpuBackend::Cpu => self.transform_cpu(x, weights, biases),
339 }
340 }
341
342 fn transform_cuda(
343 &self,
344 x: &Array2<f64>,
345 weights: &Array2<f64>,
346 biases: &Array1<f64>,
347 _ctx: &GpuContext,
348 ) -> Result<Array2<f64>> {
349 let n_samples = x.nrows();
353 let mut result = Array2::zeros((n_samples, self.n_components));
354
355 for i in 0..n_samples {
357 for j in 0..self.n_components {
358 let mut dot_product = 0.0;
359 for k in 0..x.ncols() {
360 dot_product += x[[i, k]] * weights[[j, k]];
361 }
362 dot_product += biases[j];
363
364 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
366 }
367 }
368
369 Ok(result)
370 }
371
372 fn transform_opencl(
373 &self,
374 x: &Array2<f64>,
375 weights: &Array2<f64>,
376 biases: &Array1<f64>,
377 _ctx: &GpuContext,
378 ) -> Result<Array2<f64>> {
379 self.transform_cpu(x, weights, biases)
381 }
382
383 fn transform_metal(
384 &self,
385 x: &Array2<f64>,
386 weights: &Array2<f64>,
387 biases: &Array1<f64>,
388 _ctx: &GpuContext,
389 ) -> Result<Array2<f64>> {
390 self.transform_cpu(x, weights, biases)
392 }
393
394 fn transform_cpu(
395 &self,
396 x: &Array2<f64>,
397 weights: &Array2<f64>,
398 biases: &Array1<f64>,
399 ) -> Result<Array2<f64>> {
400 let n_samples = x.nrows();
401 let mut result = Array2::zeros((n_samples, self.n_components));
402
403 for i in 0..n_samples {
404 for j in 0..self.n_components {
405 let mut dot_product = 0.0;
406 for k in 0..x.ncols() {
407 dot_product += x[[i, k]] * weights[[j, k]];
408 }
409 dot_product += biases[j];
410
411 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
412 }
413 }
414
415 Ok(result)
416 }
417}
418
419#[derive(Debug, Clone, Serialize, Deserialize)]
420pub struct FittedGpuRBFSampler {
422 pub n_components: usize,
424 pub gamma: f64,
426 pub gpu_config: GpuConfig,
428 pub weights: Array2<f64>,
430 pub biases: Array1<f64>,
432 #[serde(skip)]
433 pub gpu_context: Option<Arc<GpuContext>>,
434}
435
436impl Fit<Array2<f64>, ()> for GpuRBFSampler {
437 type Fitted = FittedGpuRBFSampler;
438
439 fn fit(self, x: &Array2<f64>, _y: &()) -> Result<Self::Fitted> {
440 let input_dim = x.ncols();
441
442 let mut gpu_context = GpuContext::new(self.gpu_config.clone());
444 gpu_context.initialize()?;
445
446 let weights = self.generate_random_features_gpu(input_dim, &gpu_context)?;
448
449 let mut rng = RealStdRng::from_seed(thread_rng().random());
451 let uniform =
452 RandUniform::new(0.0, 2.0 * std::f64::consts::PI).expect("operation should succeed");
453 let biases = Array1::from_vec(
454 (0..self.n_components)
455 .map(|_| rng.sample(uniform))
456 .collect(),
457 );
458
459 Ok(FittedGpuRBFSampler {
460 n_components: self.n_components,
461 gamma: self.gamma,
462 gpu_config: self.gpu_config.clone(),
463 weights,
464 biases,
465 gpu_context: Some(Arc::new(gpu_context)),
466 })
467 }
468}
469
470impl FittedGpuRBFSampler {
471 fn transform_gpu(
472 &self,
473 x: &Array2<f64>,
474 weights: &Array2<f64>,
475 biases: &Array1<f64>,
476 ctx: &GpuContext,
477 ) -> Result<Array2<f64>> {
478 match ctx.config.backend {
479 GpuBackend::Cuda => self.transform_cuda(x, weights, biases, ctx),
480 GpuBackend::OpenCL => self.transform_opencl(x, weights, biases, ctx),
481 GpuBackend::Metal => self.transform_metal(x, weights, biases, ctx),
482 GpuBackend::Cpu => self.transform_cpu(x, weights, biases),
483 }
484 }
485
486 fn transform_cuda(
487 &self,
488 x: &Array2<f64>,
489 weights: &Array2<f64>,
490 biases: &Array1<f64>,
491 _ctx: &GpuContext,
492 ) -> Result<Array2<f64>> {
493 let n_samples = x.nrows();
494 let mut result = Array2::zeros((n_samples, self.n_components));
495
496 for i in 0..n_samples {
497 for j in 0..self.n_components {
498 let mut dot_product = 0.0;
499 for k in 0..x.ncols() {
500 dot_product += x[[i, k]] * weights[[j, k]];
501 }
502 dot_product += biases[j];
503
504 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
505 }
506 }
507
508 Ok(result)
509 }
510
511 fn transform_opencl(
512 &self,
513 x: &Array2<f64>,
514 weights: &Array2<f64>,
515 biases: &Array1<f64>,
516 _ctx: &GpuContext,
517 ) -> Result<Array2<f64>> {
518 self.transform_cpu(x, weights, biases)
519 }
520
521 fn transform_metal(
522 &self,
523 x: &Array2<f64>,
524 weights: &Array2<f64>,
525 biases: &Array1<f64>,
526 _ctx: &GpuContext,
527 ) -> Result<Array2<f64>> {
528 self.transform_cpu(x, weights, biases)
529 }
530
531 fn transform_cpu(
532 &self,
533 x: &Array2<f64>,
534 weights: &Array2<f64>,
535 biases: &Array1<f64>,
536 ) -> Result<Array2<f64>> {
537 let n_samples = x.nrows();
538 let mut result = Array2::zeros((n_samples, self.n_components));
539
540 for i in 0..n_samples {
541 for j in 0..self.n_components {
542 let mut dot_product = 0.0;
543 for k in 0..x.ncols() {
544 dot_product += x[[i, k]] * weights[[j, k]];
545 }
546 dot_product += biases[j];
547
548 result[[i, j]] = (2.0 / self.n_components as f64).sqrt() * dot_product.cos();
549 }
550 }
551
552 Ok(result)
553 }
554}
555
556impl Transform<Array2<f64>, Array2<f64>> for FittedGpuRBFSampler {
557 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
558 if let Some(ctx) = &self.gpu_context {
559 self.transform_gpu(x, &self.weights, &self.biases, ctx)
560 } else {
561 self.transform_cpu(x, &self.weights, &self.biases)
562 }
563 }
564}
565
566#[derive(Debug, Clone, Serialize, Deserialize)]
568pub struct GpuNystroem {
570 pub n_components: usize,
572 pub kernel: String,
574 pub gamma: f64,
576 pub gpu_config: GpuConfig,
578}
579
580impl GpuNystroem {
581 pub fn new(n_components: usize) -> Self {
582 Self {
583 n_components,
584 kernel: "rbf".to_string(),
585 gamma: 1.0,
586 gpu_config: GpuConfig::default(),
587 }
588 }
589
590 pub fn kernel(mut self, kernel: String) -> Self {
591 self.kernel = kernel;
592 self
593 }
594
595 pub fn gamma(mut self, gamma: f64) -> Self {
596 self.gamma = gamma;
597 self
598 }
599
600 pub fn gpu_config(mut self, config: GpuConfig) -> Self {
601 self.gpu_config = config;
602 self
603 }
604
605 fn compute_kernel_matrix_gpu(
606 &self,
607 x: &Array2<f64>,
608 y: &Array2<f64>,
609 ctx: &GpuContext,
610 ) -> Result<Array2<f64>> {
611 match ctx.config.backend {
612 GpuBackend::Cuda => self.compute_kernel_cuda(x, y, ctx),
613 GpuBackend::OpenCL => self.compute_kernel_opencl(x, y, ctx),
614 GpuBackend::Metal => self.compute_kernel_metal(x, y, ctx),
615 GpuBackend::Cpu => self.compute_kernel_cpu(x, y),
616 }
617 }
618
619 fn compute_kernel_cuda(
620 &self,
621 x: &Array2<f64>,
622 y: &Array2<f64>,
623 _ctx: &GpuContext,
624 ) -> Result<Array2<f64>> {
625 self.compute_kernel_cpu(x, y)
628 }
629
630 fn compute_kernel_opencl(
631 &self,
632 x: &Array2<f64>,
633 y: &Array2<f64>,
634 _ctx: &GpuContext,
635 ) -> Result<Array2<f64>> {
636 self.compute_kernel_cpu(x, y)
637 }
638
639 fn compute_kernel_metal(
640 &self,
641 x: &Array2<f64>,
642 y: &Array2<f64>,
643 _ctx: &GpuContext,
644 ) -> Result<Array2<f64>> {
645 self.compute_kernel_cpu(x, y)
646 }
647
648 fn compute_kernel_cpu(&self, x: &Array2<f64>, y: &Array2<f64>) -> Result<Array2<f64>> {
649 let n_x = x.nrows();
650 let n_y = y.nrows();
651 let mut kernel_matrix = Array2::zeros((n_x, n_y));
652
653 match self.kernel.as_str() {
654 "rbf" => {
655 for i in 0..n_x {
656 for j in 0..n_y {
657 let diff = &x.row(i) - &y.row(j);
658 let squared_norm = diff.dot(&diff);
659 kernel_matrix[[i, j]] = (-self.gamma * squared_norm).exp();
660 }
661 }
662 }
663 "linear" => {
664 for i in 0..n_x {
665 for j in 0..n_y {
666 kernel_matrix[[i, j]] = x.row(i).dot(&y.row(j));
667 }
668 }
669 }
670 _ => {
671 return Err(SklearsError::InvalidInput(format!(
672 "Unsupported kernel: {}",
673 self.kernel
674 )));
675 }
676 }
677
678 Ok(kernel_matrix)
679 }
680
681 fn eigendecomposition_gpu(
682 &self,
683 matrix: &Array2<f64>,
684 ctx: &GpuContext,
685 ) -> Result<(Array1<f64>, Array2<f64>)> {
686 match ctx.config.backend {
687 GpuBackend::Cuda => self.eigendecomposition_cuda(matrix, ctx),
688 GpuBackend::OpenCL => self.eigendecomposition_opencl(matrix, ctx),
689 GpuBackend::Metal => self.eigendecomposition_metal(matrix, ctx),
690 GpuBackend::Cpu => self.eigendecomposition_cpu(matrix),
691 }
692 }
693
694 fn eigendecomposition_cuda(
695 &self,
696 matrix: &Array2<f64>,
697 _ctx: &GpuContext,
698 ) -> Result<(Array1<f64>, Array2<f64>)> {
699 self.eigendecomposition_cpu(matrix)
701 }
702
703 fn eigendecomposition_opencl(
704 &self,
705 matrix: &Array2<f64>,
706 _ctx: &GpuContext,
707 ) -> Result<(Array1<f64>, Array2<f64>)> {
708 self.eigendecomposition_cpu(matrix)
709 }
710
711 fn eigendecomposition_metal(
712 &self,
713 matrix: &Array2<f64>,
714 _ctx: &GpuContext,
715 ) -> Result<(Array1<f64>, Array2<f64>)> {
716 self.eigendecomposition_cpu(matrix)
717 }
718
719 fn eigendecomposition_cpu(&self, matrix: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>)> {
720 let n = matrix.nrows();
722 let mut eigenvalues = Array1::zeros(n);
723 let mut eigenvectors = Array2::zeros((n, n));
724
725 let mut v: Array1<f64> = Array1::ones(n);
727 v /= v.dot(&v).sqrt();
728
729 for _ in 0..100 {
730 let mut av: Array1<f64> = Array1::zeros(n);
731 for i in 0..n {
732 for j in 0..n {
733 av[i] += matrix[[i, j]] * v[j];
734 }
735 }
736
737 let norm = av.dot(&av).sqrt();
738 if norm > 1e-12 {
739 v = av / norm;
740 eigenvalues[0] = norm;
741 } else {
742 break;
743 }
744 }
745
746 for i in 0..n {
748 eigenvectors[[i, 0]] = v[i];
749 }
750
751 for i in 1..n {
753 eigenvectors[[i, i]] = 1.0;
754 eigenvalues[i] = 0.1; }
756
757 Ok((eigenvalues, eigenvectors))
758 }
759}
760
761#[derive(Debug, Clone, Serialize, Deserialize)]
762pub struct FittedGpuNystroem {
764 pub n_components: usize,
766 pub kernel: String,
768 pub gamma: f64,
770 pub gpu_config: GpuConfig,
772 pub basis_vectors: Array2<f64>,
774 pub eigenvalues: Array1<f64>,
776 pub eigenvectors: Array2<f64>,
778 #[serde(skip)]
779 pub gpu_context: Option<Arc<GpuContext>>,
780}
781
782impl Fit<Array2<f64>, ()> for GpuNystroem {
783 type Fitted = FittedGpuNystroem;
784
785 fn fit(self, x: &Array2<f64>, _y: &()) -> Result<Self::Fitted> {
786 let mut gpu_context = GpuContext::new(self.gpu_config.clone());
787 gpu_context.initialize()?;
788
789 let n_samples = x.nrows();
790 let n_components = self.n_components.min(n_samples);
791
792 let mut rng = RealStdRng::from_seed(thread_rng().random());
794 let mut indices: Vec<usize> = (0..n_samples).collect();
795 indices.shuffle(&mut rng);
796 indices.truncate(n_components);
797
798 let mut basis_vectors = Array2::zeros((n_components, x.ncols()));
799 for (i, &idx) in indices.iter().enumerate() {
800 basis_vectors.row_mut(i).assign(&x.row(idx));
801 }
802
803 let kernel_matrix =
805 self.compute_kernel_matrix_gpu(&basis_vectors, &basis_vectors, &gpu_context)?;
806
807 let (eigenvalues, eigenvectors) =
809 self.eigendecomposition_gpu(&kernel_matrix, &gpu_context)?;
810
811 Ok(FittedGpuNystroem {
812 n_components,
813 kernel: self.kernel.clone(),
814 gamma: self.gamma,
815 gpu_config: self.gpu_config.clone(),
816 basis_vectors,
817 eigenvalues,
818 eigenvectors,
819 gpu_context: Some(Arc::new(gpu_context)),
820 })
821 }
822}
823
824impl Transform<Array2<f64>, Array2<f64>> for FittedGpuNystroem {
825 fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
826 if let Some(ctx) = &self.gpu_context {
827 let kernel_matrix = self.compute_kernel_matrix_gpu(x, &self.basis_vectors, ctx)?;
829
830 let mut result = Array2::zeros((x.nrows(), self.n_components));
832
833 for i in 0..x.nrows() {
834 for j in 0..self.n_components {
835 let mut sum = 0.0;
836 for k in 0..self.n_components {
837 let eigenval_sqrt_inv = if self.eigenvalues[k] > 1e-12 {
838 1.0 / self.eigenvalues[k].sqrt()
839 } else {
840 0.0
841 };
842 sum +=
843 kernel_matrix[[i, k]] * self.eigenvectors[[k, j]] * eigenval_sqrt_inv;
844 }
845 result[[i, j]] = sum;
846 }
847 }
848
849 Ok(result)
850 } else {
851 Err(SklearsError::InvalidData {
852 reason: "GPU context not initialized".to_string(),
853 })
854 }
855 }
856}
857
858#[derive(Debug, Clone, Serialize, Deserialize)]
860pub struct GpuProfiler {
862 pub enable_profiling: bool,
864 pub kernel_times: Vec<(String, f64)>,
866 pub memory_usage: Vec<(String, usize)>,
868 pub transfer_times: Vec<(String, f64)>,
870}
871
872impl Default for GpuProfiler {
873 fn default() -> Self {
874 Self::new()
875 }
876}
877
878impl GpuProfiler {
879 pub fn new() -> Self {
880 Self {
881 enable_profiling: false,
882 kernel_times: Vec::new(),
883 memory_usage: Vec::new(),
884 transfer_times: Vec::new(),
885 }
886 }
887
888 pub fn enable(mut self) -> Self {
889 self.enable_profiling = true;
890 self
891 }
892
893 pub fn record_kernel_time(&mut self, kernel_name: &str, time_ms: f64) {
894 if self.enable_profiling {
895 self.kernel_times.push((kernel_name.to_string(), time_ms));
896 }
897 }
898
899 pub fn record_memory_usage(&mut self, operation: &str, bytes: usize) {
900 if self.enable_profiling {
901 self.memory_usage.push((operation.to_string(), bytes));
902 }
903 }
904
905 pub fn record_transfer_time(&mut self, operation: &str, time_ms: f64) {
906 if self.enable_profiling {
907 self.transfer_times.push((operation.to_string(), time_ms));
908 }
909 }
910
911 pub fn get_summary(&self) -> String {
912 let mut summary = String::new();
913
914 summary.push_str("GPU Performance Summary:\n");
915 summary.push_str(&format!(
916 "Total kernel executions: {}\n",
917 self.kernel_times.len()
918 ));
919
920 if !self.kernel_times.is_empty() {
921 let total_kernel_time: f64 = self.kernel_times.iter().map(|(_, time)| time).sum();
922 summary.push_str(&format!("Total kernel time: {:.2} ms\n", total_kernel_time));
923 }
924
925 if !self.memory_usage.is_empty() {
926 let max_memory: usize = *self
927 .memory_usage
928 .iter()
929 .map(|(_, bytes)| bytes)
930 .max()
931 .unwrap_or(&0);
932 summary.push_str(&format!("Peak memory usage: {} bytes\n", max_memory));
933 }
934
935 if !self.transfer_times.is_empty() {
936 let total_transfer_time: f64 = self.transfer_times.iter().map(|(_, time)| time).sum();
937 summary.push_str(&format!(
938 "Total transfer time: {:.2} ms\n",
939 total_transfer_time
940 ));
941 }
942
943 summary
944 }
945}
946
947#[allow(non_snake_case)]
948#[cfg(test)]
949mod tests {
950 use super::*;
951 use scirs2_core::essentials::Normal;
952
953 use scirs2_core::ndarray::{Array, Array2};
954 use scirs2_core::random::thread_rng;
955
956 #[test]
957 fn test_gpu_context_initialization() {
958 let config = GpuConfig::default();
959 let mut context = GpuContext::new(config);
960
961 assert!(context.initialize().is_ok());
962 assert!(context.is_initialized);
963 }
964
965 #[test]
966 fn test_gpu_rbf_sampler() {
967 let x: Array2<f64> = Array::from_shape_fn((50, 10), |_| {
968 let mut rng = thread_rng();
969 rng.sample(&Normal::new(0.0, 1.0).expect("operation should succeed"))
970 });
971 let sampler = GpuRBFSampler::new(100).gamma(0.5);
972
973 let fitted = sampler.fit(&x, &()).expect("operation should succeed");
974 let transformed = fitted.transform(&x).expect("operation should succeed");
975
976 assert_eq!(transformed.shape(), &[50, 100]);
977 }
978
979 #[test]
980 fn test_gpu_nystroem() {
981 let x: Array2<f64> = Array::from_shape_fn((30, 5), |_| {
982 let mut rng = thread_rng();
983 rng.sample(&Normal::new(0.0, 1.0).expect("operation should succeed"))
984 });
985 let nystroem = GpuNystroem::new(20).gamma(1.0);
986
987 let fitted = nystroem.fit(&x, &()).expect("operation should succeed");
988 let transformed = fitted.transform(&x).expect("operation should succeed");
989
990 assert_eq!(transformed.shape()[0], 30);
991 assert!(transformed.shape()[1] <= 20);
992 }
993
994 #[test]
995 fn test_gpu_profiler() {
996 let mut profiler = GpuProfiler::new().enable();
997
998 profiler.record_kernel_time("test_kernel", 5.5);
999 profiler.record_memory_usage("allocation", 1024);
1000 profiler.record_transfer_time("host_to_device", 2.1);
1001
1002 let summary = profiler.get_summary();
1003 assert!(summary.contains("Total kernel executions: 1"));
1004 assert!(summary.contains("Total kernel time: 5.50"));
1005 }
1006}
1007
1008impl FittedGpuNystroem {
1009 fn compute_kernel_matrix_gpu(
1010 &self,
1011 x: &Array2<f64>,
1012 y: &Array2<f64>,
1013 _ctx: &GpuContext,
1014 ) -> Result<Array2<f64>> {
1015 let n_x = x.nrows();
1016 let n_y = y.nrows();
1017 let mut kernel_matrix = Array2::zeros((n_x, n_y));
1018
1019 match self.kernel.as_str() {
1020 "rbf" => {
1021 for i in 0..n_x {
1022 for j in 0..n_y {
1023 let diff = &x.row(i) - &y.row(j);
1024 let squared_norm = diff.dot(&diff);
1025 kernel_matrix[[i, j]] = (-self.gamma * squared_norm).exp();
1026 }
1027 }
1028 }
1029 "linear" => {
1030 for i in 0..n_x {
1031 for j in 0..n_y {
1032 kernel_matrix[[i, j]] = x.row(i).dot(&y.row(j));
1033 }
1034 }
1035 }
1036 _ => {
1037 return Err(SklearsError::InvalidInput(format!(
1038 "Unsupported kernel: {}",
1039 self.kernel
1040 )));
1041 }
1042 }
1043
1044 Ok(kernel_matrix)
1045 }
1046}