//! GPU-accelerated dataset generation with automatic CPU fallback.
//!
//! This module provides a [`GpuContext`] abstraction over CUDA, OpenCL, and a
//! CPU fallback backend, GPU-aware wrappers around the generators in
//! [`crate::generators`], and simple benchmarking helpers. The CUDA and OpenCL
//! code paths currently delegate to the CPU generators and only simulate GPU
//! timing.

use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::Array2;
use std::sync::{Arc, Mutex};

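/// Compute backend used for GPU-accelerated data generation.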
#[derive(Debug, Clone, PartialEq)]
pub enum GpuBackend {
    Cuda {
        device_id: u32,
    },
    OpenCl {
        platform_id: u32,
        device_id: u32,
    },
    Cpu,
}

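/// Memory-management settings for a GPU context.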
#[derive(Debug, Clone)]
pub struct GpuMemoryConfig {
    pub max_memory_mb: Option<usize>,
    pub pool_size_mb: usize,
    pub enable_coalescing: bool,
    pub use_unified_memory: bool,
}

impl Default for GpuMemoryConfig {
    fn default() -> Self {
        Self {
            max_memory_mb: None,
            pool_size_mb: 512,
            enable_coalescing: true,
            use_unified_memory: false,
        }
    }
}

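/// Configuration for creating a [`GpuContext`].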
#[derive(Debug, Clone)]
pub struct GpuConfig {
    pub backend: GpuBackend,
    pub memory: GpuMemoryConfig,
    pub threads_per_block: u32,
    pub enable_double_precision: bool,
    pub use_fast_math: bool,
    pub random_seed: Option<u64>,
}

impl Default for GpuConfig {
    fn default() -> Self {
        Self {
            backend: GpuBackend::Cuda { device_id: 0 },
            memory: GpuMemoryConfig::default(),
            threads_per_block: 256,
            enable_double_precision: true,
            use_fast_math: false,
            random_seed: None,
        }
    }
}

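/// Properties reported for a compute device (real or simulated).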
#[derive(Debug, Clone)]
pub struct GpuDeviceInfo {
    pub name: String,
    pub total_memory_mb: usize,
    pub available_memory_mb: usize,
    pub compute_units: u32,
    pub max_work_group_size: u32,
    pub compute_capability: String,
    pub supports_double_precision: bool,
}

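/// Handle to a configured compute device, used to run GPU-aware data
/// generation and matrix operations with automatic CPU fallback.
///
/// ```ignore
/// // Illustrative sketch; import paths depend on the crate layout.
/// let context = GpuContext::new(GpuConfig::default())?;
/// println!("Using device: {}", context.device_info().name);
/// ```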
pub struct GpuContext {
    config: GpuConfig,
    device_info: GpuDeviceInfo,
    #[allow(dead_code)]
    memory_pool: Arc<Mutex<GpuMemoryPool>>,
}

impl GpuContext {
    pub fn new(config: GpuConfig) -> Result<Self> {
        // Query device information for the selected backend.
        let device_info = Self::query_device_info(&config.backend)?;

        // Reject configurations the device cannot satisfy.
        Self::validate_config(&config, &device_info)?;

        let memory_pool = Arc::new(Mutex::new(GpuMemoryPool::new(&config.memory)?));

        Ok(Self {
            config,
            device_info,
            memory_pool,
        })
    }

    pub fn device_info(&self) -> &GpuDeviceInfo {
        &self.device_info
    }

    pub fn backend(&self) -> &GpuBackend {
        &self.config.backend
    }

    pub fn is_available(&self) -> bool {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.is_cuda_available(),
            GpuBackend::OpenCl { .. } => self.is_opencl_available(),
            GpuBackend::Cpu => true,
        }
    }

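    /// Generates a classification dataset, dispatching to the configured backend
    /// and falling back to the CPU generator when no GPU backend is selected.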
    pub fn make_classification_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.make_classification_cuda(
                n_samples,
                n_features,
                n_classes,
                n_clusters_per_class,
                n_informative,
                random_state,
            ),
            GpuBackend::OpenCl { .. } => self.make_classification_opencl(
                n_samples,
                n_features,
                n_classes,
                n_clusters_per_class,
                n_informative,
                random_state,
            ),
            GpuBackend::Cpu => {
                // Fall back to the CPU implementation.
                crate::generators::make_classification(
                    n_samples,
                    n_features,
                    n_classes,
                    n_clusters_per_class,
                    n_informative,
                    random_state,
                )
            }
        }
    }

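    /// Generates a regression dataset on the configured backend, with CPU fallback.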
    pub fn make_regression_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => {
                self.make_regression_cuda(n_samples, n_features, n_informative, noise, random_state)
            }
            GpuBackend::OpenCl { .. } => self.make_regression_opencl(
                n_samples,
                n_features,
                n_informative,
                noise,
                random_state,
            ),
            GpuBackend::Cpu => {
                // Fall back to the CPU implementation.
                crate::generators::make_regression(
                    n_samples,
                    n_features,
                    n_informative,
                    noise,
                    random_state,
                )
            }
        }
    }

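    /// Generates isotropic Gaussian blobs on the configured backend, with CPU fallback.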
    pub fn make_blobs_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => {
                self.make_blobs_cuda(n_samples, n_features, n_centers, cluster_std, random_state)
            }
            GpuBackend::OpenCl { .. } => {
                self.make_blobs_opencl(n_samples, n_features, n_centers, cluster_std, random_state)
            }
            GpuBackend::Cpu => {
                // Fall back to the CPU implementation.
                crate::generators::make_blobs(
                    n_samples,
                    n_features,
                    n_centers,
                    cluster_std,
                    random_state,
                )
            }
        }
    }

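    /// Multiplies two matrices, dispatching on the configured backend; the CPU
    /// backend computes `a.dot(b)` directly.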
    pub fn gpu_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.cuda_matrix_multiply(a, b),
            GpuBackend::OpenCl { .. } => self.opencl_matrix_multiply(a, b),
            GpuBackend::Cpu => Ok(a.dot(b)),
        }
    }

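    /// Applies an element-wise operation to a matrix, dispatching on the
    /// configured backend; the CPU backend uses `mapv`.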
    pub fn gpu_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64 + Send + Sync,
    {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.cuda_elementwise_op(data, op),
            GpuBackend::OpenCl { .. } => self.opencl_elementwise_op(data, op),
            GpuBackend::Cpu => Ok(data.mapv(op)),
        }
    }

    fn query_device_info(backend: &GpuBackend) -> Result<GpuDeviceInfo> {
        match backend {
            GpuBackend::Cuda { device_id } => Self::query_cuda_device_info(*device_id),
            GpuBackend::OpenCl {
                platform_id,
                device_id,
            } => Self::query_opencl_device_info(*platform_id, *device_id),
            GpuBackend::Cpu => Ok(GpuDeviceInfo {
                name: "CPU Fallback".to_string(),
                total_memory_mb: 8192,
                available_memory_mb: 4096,
                compute_units: num_cpus::get() as u32,
                max_work_group_size: 1,
                compute_capability: "N/A".to_string(),
                supports_double_precision: true,
            }),
        }
    }

    fn validate_config(config: &GpuConfig, device_info: &GpuDeviceInfo) -> Result<()> {
        if let Some(max_memory) = config.memory.max_memory_mb {
            if max_memory > device_info.available_memory_mb {
                return Err(DatasetsError::GpuError(format!(
                    "Requested memory ({} MB) exceeds available memory ({} MB)",
                    max_memory, device_info.available_memory_mb
                )));
            }
        }

        if config.enable_double_precision && !device_info.supports_double_precision {
            return Err(DatasetsError::GpuError(
                "Double precision requested but not supported by device".to_string(),
            ));
        }

        if config.threads_per_block > device_info.max_work_group_size {
            return Err(DatasetsError::GpuError(format!(
                "Threads per block ({}) exceeds device limit ({})",
                config.threads_per_block, device_info.max_work_group_size
            )));
        }

        Ok(())
    }

    fn is_cuda_available(&self) -> bool {
        // Heuristic: the device name must look like an NVIDIA GPU.
        let has_nvidia_device = self.device_info.name.contains("NVIDIA")
            || self.device_info.name.contains("Tesla")
            || self.device_info.name.contains("GeForce")
            || self.device_info.name.contains("Quadro");

        if !has_nvidia_device {
            return false;
        }

        // CUDA-related environment variables.
        let cuda_env_available = std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
            || std::env::var("CUDA_PATH").is_ok()
            || std::env::var("CUDA_HOME").is_ok();

        // Common CUDA installation and driver library paths.
        let cuda_paths = [
            "/usr/local/cuda",
            "/opt/cuda",
            "/usr/lib/x86_64-linux-gnu/libcuda.so",
            "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
            "/usr/lib64/libcuda.so",
            "/usr/lib64/libcuda.so.1",
            "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA",
            "C:\\Windows\\System32\\nvcuda.dll",
        ];

        let cuda_path_available = cuda_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // A successful `nvidia-smi` call is a strong signal that a driver is present.
        let nvidia_smi_available = std::process::Command::new("nvidia-smi")
            .arg("--list-gpus")
            .output()
            .map(|output| output.status.success())
            .unwrap_or(false);

        cuda_env_available || cuda_path_available || nvidia_smi_available
    }

    fn is_opencl_available(&self) -> bool {
        // A plain CPU device without OpenCL in its name is not an OpenCL device.
        if self.device_info.name.contains("CPU") && !self.device_info.name.contains("OpenCL") {
            return false;
        }

        // Common OpenCL ICD loader locations.
        let opencl_paths = [
            "/usr/lib/libOpenCL.so",
            "/usr/lib/libOpenCL.so.1",
            "/usr/lib64/libOpenCL.so",
            "/usr/lib64/libOpenCL.so.1",
            "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
            "/usr/lib/x86_64-linux-gnu/libOpenCL.so.1",
            "/opt/intel/opencl/lib64/libOpenCL.so",
            "/System/Library/Frameworks/OpenCL.framework/OpenCL",
            "C:\\Windows\\System32\\OpenCL.dll",
        ];

        let opencl_lib_available = opencl_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // Vendor-specific OpenCL installation directories.
        let vendor_opencl_paths = [
            "/usr/lib/x86_64-linux-gnu/mesa",
            "/opt/amdgpu-pro",
            "/opt/intel/opencl",
        ];

        let vendor_opencl_available = vendor_opencl_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // `clinfo` reporting at least one platform is another strong signal.
        let clinfo_available = std::process::Command::new("clinfo")
            .output()
            .map(|output| output.status.success() && !output.stdout.is_empty())
            .unwrap_or(false);

        opencl_lib_available || vendor_opencl_available || clinfo_available
    }

    fn query_cuda_device_info(device_id: u32) -> Result<GpuDeviceInfo> {
        // Simulated device information; a real implementation would query the CUDA runtime.
        Ok(GpuDeviceInfo {
            name: format!("NVIDIA GPU {device_id}"),
            total_memory_mb: 8192,
            available_memory_mb: 7168,
            compute_units: 80,
            max_work_group_size: 1024,
            compute_capability: "8.6".to_string(),
            supports_double_precision: true,
        })
    }

    fn make_classification_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating classification data on CUDA device: {}",
            self.device_info.name
        );

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_classification(
            n_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 20;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA generation completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_regression_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating regression data on CUDA device: {}",
            self.device_info.name
        );

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_regression(
            n_samples,
            n_features,
            n_informative,
            noise,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 15;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA regression completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_blobs_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!("Generating blobs on CUDA device: {}", self.device_info.name);

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_blobs(
            n_samples,
            n_features,
            n_centers,
            cluster_std,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 25;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA blobs completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn cuda_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        println!(
            "Performing CUDA matrix multiplication: {}x{} * {}x{}",
            a.nrows(),
            a.ncols(),
            b.nrows(),
            b.ncols()
        );

        // Currently computed on the CPU via `ndarray`.
        let result = a.dot(b);
        println!("CUDA matrix multiply completed");

        Ok(result)
    }

    fn cuda_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64,
    {
        println!(
            "Performing CUDA elementwise operation on {}x{} matrix",
            data.nrows(),
            data.ncols()
        );

        // Currently computed on the CPU via `mapv`.
        let result = data.mapv(op);
        println!("CUDA elementwise operation completed");

        Ok(result)
    }

    fn query_opencl_device_info(platform_id: u32, device_id: u32) -> Result<GpuDeviceInfo> {
        // Simulated device information; a real implementation would query the OpenCL platform.
        Ok(GpuDeviceInfo {
            name: format!("OpenCL Device P{platform_id}.D{device_id}"),
            total_memory_mb: 4096,
            available_memory_mb: 3584,
            compute_units: 40,
            max_work_group_size: 512,
            compute_capability: "2.0".to_string(),
            supports_double_precision: true,
        })
    }

    fn make_classification_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating classification data on OpenCL device: {}",
            self.device_info.name
        );

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_classification(
            n_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 12;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL generation completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_regression_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating regression data on OpenCL device: {}",
            self.device_info.name
        );

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_regression(
            n_samples,
            n_features,
            n_informative,
            noise,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 10;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL regression completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_blobs_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating blobs on OpenCL device: {}",
            self.device_info.name
        );

        // Generate on the CPU and simulate the (faster) GPU timing.
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_blobs(
            n_samples,
            n_features,
            n_centers,
            cluster_std,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 18;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL blobs completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn opencl_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        println!(
            "Performing OpenCL matrix multiplication: {}x{} * {}x{}",
            a.nrows(),
            a.ncols(),
            b.nrows(),
            b.ncols()
        );

        // Currently computed on the CPU via `ndarray`.
        let result = a.dot(b);
        println!("OpenCL matrix multiply completed");

        Ok(result)
    }

    fn opencl_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64,
    {
        println!(
            "Performing OpenCL elementwise operation on {}x{} matrix",
            data.nrows(),
            data.ncols()
        );

        // Currently computed on the CPU via `mapv`.
        let result = data.mapv(op);
        println!("OpenCL elementwise operation completed");

        Ok(result)
    }
}

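/// Placeholder memory pool that only records its configuration; no device
/// memory is allocated yet.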
struct GpuMemoryPool {
    #[allow(dead_code)]
    config: GpuMemoryConfig,
}

impl GpuMemoryPool {
    fn new(config: &GpuMemoryConfig) -> Result<Self> {
        Ok(Self {
            config: config.clone(),
        })
    }
}

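/// Runs data-generation and matrix-operation benchmarks on a [`GpuContext`].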
pub struct GpuBenchmark {
    context: GpuContext,
}

impl GpuBenchmark {
    pub fn new(config: GpuConfig) -> Result<Self> {
        let context = GpuContext::new(config)?;
        Ok(Self { context })
    }

    /// Benchmarks classification, regression, and clustering data generation
    /// across a range of dataset sizes.
    pub fn benchmark_data_generation(&self) -> Result<GpuBenchmarkResults> {
        let sizes = vec![1_000, 10_000, 100_000, 1_000_000];
        let mut results = GpuBenchmarkResults::new();

        for &size in &sizes {
            let start = std::time::Instant::now();
            let _dataset = self
                .context
                .make_classification_gpu(size, 20, 5, 2, 15, Some(42))?;
            let classification_time = start.elapsed();

            let start = std::time::Instant::now();
            let _dataset = self
                .context
                .make_regression_gpu(size, 20, 15, 0.1, Some(42))?;
            let regression_time = start.elapsed();

            let start = std::time::Instant::now();
            let _dataset = self.context.make_blobs_gpu(size, 10, 5, 1.0, Some(42))?;
            let clustering_time = start.elapsed();

            results.add_result(size, "classification", classification_time);
            results.add_result(size, "regression", regression_time);
            results.add_result(size, "clustering", clustering_time);
        }

        Ok(results)
    }

    /// Benchmarks matrix multiplication and element-wise operations across a
    /// range of matrix sizes.
    pub fn benchmark_matrix_operations(&self) -> Result<GpuBenchmarkResults> {
        let sizes = vec![(100, 100), (500, 500), (1000, 1000), (2000, 2000)];
        let mut results = GpuBenchmarkResults::new();

        for &(rows, cols) in &sizes {
            let a = Array2::ones((rows, cols));
            let b = Array2::ones((cols, rows));

            let start = std::time::Instant::now();
            let _result = self.context.gpu_matrix_multiply(&a, &b)?;
            let matmul_time = start.elapsed();

            let start = std::time::Instant::now();
            let _result = self.context.gpu_elementwise_op(&a, |x| x.sqrt())?;
            let elementwise_time = start.elapsed();

            let size_key = rows * cols;
            results.add_result(size_key, "matrix_multiply", matmul_time);
            results.add_result(size_key, "elementwise_sqrt", elementwise_time);
        }

        Ok(results)
    }
}

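/// Collected benchmark timings keyed by problem size and operation name.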
#[derive(Debug)]
pub struct GpuBenchmarkResults {
    results: Vec<(usize, String, std::time::Duration)>,
}

impl GpuBenchmarkResults {
    fn new() -> Self {
        Self {
            results: Vec::new(),
        }
    }

    fn add_result(&mut self, size: usize, operation: &str, duration: std::time::Duration) {
        self.results.push((size, operation.to_string(), duration));
    }

    /// Prints a table of timings and throughput (elements per second).
    pub fn print_results(&self) {
        println!("GPU Benchmark Results:");
        println!(
            "{:<12} {:<20} {:<15} {:<15}",
            "Size", "Operation", "Time (ms)", "Throughput"
        );
        let separator = "-".repeat(70);
        println!("{separator}");

        for (size, operation, duration) in &self.results {
            let time_ms = duration.as_millis();
            let throughput = *size as f64 / duration.as_secs_f64();

            println!("{size:<12} {operation:<20} {time_ms:<15} {throughput:<15.1}");
        }
    }

    /// Computes the speedup of these results relative to a baseline run,
    /// matched by size and operation name.
    pub fn calculate_speedup(&self, baseline: &GpuBenchmarkResults) -> Vec<(String, f64)> {
        let mut speedups = Vec::new();

        for (size, operation, gpu_duration) in &self.results {
            if let Some((_, _, cpu_duration)) = baseline
                .results
                .iter()
                .find(|(s, op_, _)| s == size && op_ == operation)
            {
                let speedup = cpu_duration.as_secs_f64() / gpu_duration.as_secs_f64();
                speedups.push((format!("{operation} ({size})"), speedup));
            }
        }

        speedups
    }
}

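/// Returns `true` if CUDA appears to be available on this system, based on
/// environment variables, well-known installation paths, `nvidia-smi`, and
/// `/proc/driver/nvidia`.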
#[allow(dead_code)]
pub fn is_cuda_available() -> bool {
    let cuda_env_available = std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
        || std::env::var("CUDA_PATH").is_ok()
        || std::env::var("CUDA_HOME").is_ok();

    let cuda_paths = [
        "/usr/local/cuda",
        "/opt/cuda",
        "/usr/lib/x86_64-linux-gnu/libcuda.so",
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        "/usr/lib64/libcuda.so",
        "/usr/lib64/libcuda.so.1",
        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA",
        "C:\\Windows\\System32\\nvcuda.dll",
        "/System/Library/Frameworks/CUDA.framework",
    ];

    let cuda_path_available = cuda_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    let nvidia_smi_available = std::process::Command::new("nvidia-smi")
        .arg("--list-gpus")
        .output()
        .map(|output| output.status.success() && !output.stdout.is_empty())
        .unwrap_or(false);

    let nvidia_proc_available = std::path::Path::new("/proc/driver/nvidia").exists();

    cuda_env_available || cuda_path_available || nvidia_smi_available || nvidia_proc_available
}

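/// Returns `true` if an OpenCL runtime appears to be available on this system,
/// based on ICD loader paths, vendor directories, `clinfo`, and environment
/// variables.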
#[allow(dead_code)]
pub fn is_opencl_available() -> bool {
    let opencl_paths = [
        "/usr/lib/libOpenCL.so",
        "/usr/lib/libOpenCL.so.1",
        "/usr/lib64/libOpenCL.so",
        "/usr/lib64/libOpenCL.so.1",
        "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
        "/usr/lib/x86_64-linux-gnu/libOpenCL.so.1",
        "/opt/intel/opencl/lib64/libOpenCL.so",
        "/System/Library/Frameworks/OpenCL.framework/OpenCL",
        "C:\\Windows\\System32\\OpenCL.dll",
    ];

    let opencl_lib_available = opencl_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    let vendor_opencl_paths = [
        "/usr/lib/x86_64-linux-gnu/mesa",
        "/opt/amdgpu-pro",
        "/opt/intel/opencl",
        "/usr/lib/x86_64-linux-gnu/libmali-bifrost-dev.so",
    ];

    let vendor_opencl_available = vendor_opencl_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    let clinfo_available = std::process::Command::new("clinfo")
        .output()
        .map(|output| output.status.success() && !output.stdout.is_empty())
        .unwrap_or(false);

    let opencl_env_available =
        std::env::var("OPENCL_VENDOR_PATH").is_ok() || std::env::var("OCL_ICD_FILENAMES").is_ok();

    opencl_lib_available || vendor_opencl_available || clinfo_available || opencl_env_available
}

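/// Picks a reasonable [`GpuConfig`] for this system: CUDA if it looks
/// available, then OpenCL, otherwise the CPU fallback.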
#[allow(dead_code)]
pub fn get_optimal_gpu_config() -> GpuConfig {
    if is_cuda_available() {
        GpuConfig {
            backend: GpuBackend::Cuda { device_id: 0 },
            threads_per_block: 256,
            enable_double_precision: true,
            use_fast_math: false,
            ..Default::default()
        }
    } else if is_opencl_available() {
        GpuConfig {
            backend: GpuBackend::OpenCl {
                platform_id: 0,
                device_id: 0,
            },
            threads_per_block: 256,
            enable_double_precision: true,
            ..Default::default()
        }
    } else {
        GpuConfig {
            backend: GpuBackend::Cpu,
            ..Default::default()
        }
    }
}

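/// Lists the devices this module can report on. CUDA and OpenCL entries are
/// probed over a few device/platform indices; a CPU fallback entry is always
/// included.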
#[allow(dead_code)]
pub fn list_gpu_devices() -> Result<Vec<GpuDeviceInfo>> {
    let mut devices = Vec::new();

    if is_cuda_available() {
        // Probe the first few CUDA device indices.
        for device_id in 0..4 {
            if let Ok(info) = GpuContext::query_cuda_device_info(device_id) {
                devices.push(info);
            }
        }
    }

    if is_opencl_available() {
        // Probe the first few OpenCL platform/device combinations.
        for platform_id in 0..2 {
            for device_id in 0..4 {
                if let Ok(info) = GpuContext::query_opencl_device_info(platform_id, device_id) {
                    devices.push(info);
                }
            }
        }
    }

    // Always include the CPU fallback device.
    devices.push(GpuDeviceInfo {
        name: "CPU (Fallback)".to_string(),
        total_memory_mb: 8192,
        available_memory_mb: 4096,
        compute_units: num_cpus::get() as u32,
        max_work_group_size: 1,
        compute_capability: "N/A".to_string(),
        supports_double_precision: true,
    });

    Ok(devices)
}

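/// Generates a classification dataset using the best available backend, as
/// chosen by [`get_optimal_gpu_config`].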
#[allow(dead_code)]
pub fn make_classification_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_classification_gpu(
        n_samples,
        n_features,
        n_classes,
        n_clusters_per_class,
        n_informative,
        random_state,
    )
}

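/// Generates a regression dataset using the best available backend, as chosen
/// by [`get_optimal_gpu_config`].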
#[allow(dead_code)]
pub fn make_regression_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_regression_gpu(n_samples, n_features, n_informative, noise, random_state)
}

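/// Generates Gaussian blobs using the best available backend, as chosen by
/// [`get_optimal_gpu_config`].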
#[allow(dead_code)]
pub fn make_blobs_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_centers: usize,
    cluster_std: f64,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_blobs_gpu(n_samples, n_features, n_centers, cluster_std, random_state)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_config_default() {
        let config = GpuConfig::default();
        assert!(matches!(config.backend, GpuBackend::Cuda { device_id: 0 }));
        assert_eq!(config.threads_per_block, 256);
        assert!(config.enable_double_precision);
    }

    #[test]
    fn test_gpu_context_cpu_fallback() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let context = GpuContext::new(config).unwrap();
        assert!(context.is_available());
        assert_eq!(context.device_info.name, "CPU Fallback");
    }

    #[test]
    fn test_gpu_classification_generation() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let context = GpuContext::new(config).unwrap();
        let dataset = context
            .make_classification_gpu(100, 10, 3, 2, 8, Some(42))
            .unwrap();

        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 10);
        assert!(dataset.target.is_some());
    }

    #[test]
    fn test_optimal_gpu_config() {
        let config = get_optimal_gpu_config();
        assert!(matches!(
            config.backend,
            GpuBackend::Cuda { .. } | GpuBackend::OpenCl { .. } | GpuBackend::Cpu
        ));
    }

    #[test]
    fn test_list_gpu_devices() {
        let devices = list_gpu_devices().unwrap();
        assert!(!devices.is_empty());

        // The CPU fallback entry should always be present.
        assert!(devices.iter().any(|d| d.name.contains("CPU")));
    }

    #[test]
    #[ignore = "timeout"]
    fn test_gpu_benchmark_creation() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let _benchmark = GpuBenchmark::new(config).unwrap();
    }
}