scirs2_datasets/
gpu.rs

//! GPU acceleration for dataset operations
//!
//! This module provides GPU acceleration for data generation and processing operations,
//! significantly improving performance for large-scale synthetic dataset creation.
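//!
//! # Example
//!
//! A minimal sketch using the CPU fallback backend so it runs on any machine.
//! The module path `scirs2_datasets::gpu` is assumed from the crate layout;
//! `threads_per_block` is set to 1 because the CPU fallback reports a maximum
//! work-group size of 1.
//!
//! ```no_run
//! use scirs2_datasets::gpu::{GpuBackend, GpuConfig, GpuContext};
//!
//! let config = GpuConfig {
//!     backend: GpuBackend::Cpu,
//!     threads_per_block: 1,
//!     ..Default::default()
//! };
//! let context = GpuContext::new(config).expect("failed to create GPU context");
//! let dataset = context
//!     .make_blobs_gpu(1_000, 8, 4, 1.0, Some(42))
//!     .expect("blob generation failed");
//! assert_eq!(dataset.n_samples(), 1_000);
//! ```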

use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::Array2;
use std::sync::{Arc, Mutex};

/// GPU backend configuration
#[derive(Debug, Clone, PartialEq)]
pub enum GpuBackend {
    /// CUDA backend for NVIDIA GPUs
    Cuda {
        /// CUDA device ID
        device_id: u32,
    },
    /// OpenCL backend for various GPU vendors
    OpenCl {
        /// OpenCL platform ID
        platform_id: u32,
        /// OpenCL device ID
        device_id: u32,
    },
    /// CPU fallback (for testing or when GPU is unavailable)
    Cpu,
}

/// GPU memory management configuration
#[derive(Debug, Clone)]
pub struct GpuMemoryConfig {
    /// Maximum GPU memory to use (in MB)
    pub max_memory_mb: Option<usize>,
    /// Memory pool size for allocations
    pub pool_size_mb: usize,
    /// Whether to enable memory coalescing optimization
    pub enable_coalescing: bool,
    /// Whether to use unified memory (CUDA only)
    pub use_unified_memory: bool,
}

impl Default for GpuMemoryConfig {
    fn default() -> Self {
        Self {
            max_memory_mb: None,
            pool_size_mb: 512,
            enable_coalescing: true,
            use_unified_memory: false,
        }
    }
}

/// GPU configuration for dataset operations
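///
/// # Example
///
/// A configuration sketch that overrides a few fields and keeps the rest at
/// their defaults (module path `scirs2_datasets::gpu` assumed):
///
/// ```no_run
/// use scirs2_datasets::gpu::{GpuBackend, GpuConfig};
///
/// let config = GpuConfig {
///     backend: GpuBackend::Cuda { device_id: 0 },
///     use_fast_math: true,
///     random_seed: Some(42),
///     ..Default::default()
/// };
/// assert_eq!(config.threads_per_block, 256);
/// ```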
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// GPU backend to use
    pub backend: GpuBackend,
    /// Memory configuration
    pub memory: GpuMemoryConfig,
    /// Number of threads per block (CUDA) or work-group size (OpenCL)
    pub threads_per_block: u32,
    /// Whether to enable double precision (f64) operations
    pub enable_double_precision: bool,
    /// Whether to use fast math optimizations
    pub use_fast_math: bool,
    /// Random number generator seed for GPU
    pub random_seed: Option<u64>,
}

impl Default for GpuConfig {
    fn default() -> Self {
        Self {
            backend: GpuBackend::Cuda { device_id: 0 },
            memory: GpuMemoryConfig::default(),
            threads_per_block: 256,
            enable_double_precision: true,
            use_fast_math: false,
            random_seed: None,
        }
    }
}

/// GPU device information
#[derive(Debug, Clone)]
pub struct GpuDeviceInfo {
    /// Device name
    pub name: String,
    /// Total memory in MB
    pub total_memory_mb: usize,
    /// Available memory in MB
    pub available_memory_mb: usize,
    /// Number of compute units/streaming multiprocessors
    pub compute_units: u32,
    /// Maximum work group size
    pub max_work_group_size: u32,
    /// Compute capability (CUDA) or version (OpenCL)
    pub compute_capability: String,
    /// Whether double precision is supported
    pub supports_double_precision: bool,
}

/// GPU context for managing device operations
pub struct GpuContext {
    config: GpuConfig,
    device_info: GpuDeviceInfo,
    #[allow(dead_code)]
    memory_pool: Arc<Mutex<GpuMemoryPool>>,
}

impl GpuContext {
    /// Create a new GPU context
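    ///
    /// # Example
    ///
    /// A minimal sketch using the CPU fallback backend (module path
    /// `scirs2_datasets::gpu` assumed; `threads_per_block` must be 1 because
    /// the CPU fallback reports a maximum work-group size of 1):
    ///
    /// ```no_run
    /// use scirs2_datasets::gpu::{GpuBackend, GpuConfig, GpuContext};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let context = GpuContext::new(config).expect("context creation failed");
    /// assert!(context.is_available());
    /// ```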
    pub fn new(config: GpuConfig) -> Result<Self> {
        // Initialize GPU backend
        let device_info = Self::query_device_info(&config.backend)?;

        // Validate configuration
        Self::validate_config(&config, &device_info)?;

        // Initialize memory pool
        let memory_pool = Arc::new(Mutex::new(GpuMemoryPool::new(&config.memory)?));

        Ok(Self {
            config,
            device_info,
            memory_pool,
        })
    }

    /// Get device information
    pub fn device_info(&self) -> &GpuDeviceInfo {
        &self.device_info
    }

    /// Get the backend type
    pub fn backend(&self) -> &GpuBackend {
        &self.config.backend
    }

    /// Check if GPU is available and functional
    pub fn is_available(&self) -> bool {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.is_cuda_available(),
            GpuBackend::OpenCl { .. } => self.is_opencl_available(),
            GpuBackend::Cpu => true,
        }
    }

    /// Generate classification dataset on GPU
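    ///
    /// Dispatches to the configured backend; with `GpuBackend::Cpu` it falls back
    /// to the CPU generator.
    ///
    /// # Example
    ///
    /// A sketch on the CPU fallback (module path `scirs2_datasets::gpu` assumed):
    ///
    /// ```no_run
    /// use scirs2_datasets::gpu::{GpuBackend, GpuConfig, GpuContext};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let context = GpuContext::new(config).unwrap();
    /// // 1_000 samples, 20 features, 5 classes, 2 clusters per class, 15 informative features
    /// let dataset = context
    ///     .make_classification_gpu(1_000, 20, 5, 2, 15, Some(42))
    ///     .unwrap();
    /// assert_eq!(dataset.n_samples(), 1_000);
    /// ```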
    pub fn make_classification_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.make_classification_cuda(
                n_samples,
                n_features,
                n_classes,
                n_clusters_per_class,
                n_informative,
                random_state,
            ),
            GpuBackend::OpenCl { .. } => self.make_classification_opencl(
                n_samples,
                n_features,
                n_classes,
                n_clusters_per_class,
                n_informative,
                random_state,
            ),
            GpuBackend::Cpu => {
                // Fallback to CPU implementation
                crate::generators::make_classification(
                    n_samples,
                    n_features,
                    n_classes,
                    n_clusters_per_class,
                    n_informative,
                    random_state,
                )
            }
        }
    }

    /// Generate regression dataset on GPU
    pub fn make_regression_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => {
                self.make_regression_cuda(n_samples, n_features, n_informative, noise, random_state)
            }
            GpuBackend::OpenCl { .. } => self.make_regression_opencl(
                n_samples,
                n_features,
                n_informative,
                noise,
                random_state,
            ),
            GpuBackend::Cpu => {
                // Fallback to CPU implementation
                crate::generators::make_regression(
                    n_samples,
                    n_features,
                    n_informative,
                    noise,
                    random_state,
                )
            }
        }
    }

    /// Generate clustering dataset (blobs) on GPU
    pub fn make_blobs_gpu(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => {
                self.make_blobs_cuda(n_samples, n_features, n_centers, cluster_std, random_state)
            }
            GpuBackend::OpenCl { .. } => {
                self.make_blobs_opencl(n_samples, n_features, n_centers, cluster_std, random_state)
            }
            GpuBackend::Cpu => {
                // Fallback to CPU implementation
                crate::generators::make_blobs(
                    n_samples,
                    n_features,
                    n_centers,
                    cluster_std,
                    random_state,
                )
            }
        }
    }

    /// Multiply two matrices on the GPU
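    ///
    /// # Example
    ///
    /// A sketch multiplying two small matrices on the CPU fallback, where the
    /// call reduces to `ndarray`'s `dot` (module path `scirs2_datasets::gpu` assumed):
    ///
    /// ```no_run
    /// use scirs2_core::ndarray::Array2;
    /// use scirs2_datasets::gpu::{GpuBackend, GpuConfig, GpuContext};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let context = GpuContext::new(config).unwrap();
    /// let a = Array2::<f64>::ones((4, 3));
    /// let b = Array2::<f64>::ones((3, 2));
    /// let c = context.gpu_matrix_multiply(&a, &b).unwrap();
    /// assert_eq!(c.dim(), (4, 2));
    /// ```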
    pub fn gpu_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.cuda_matrix_multiply(a, b),
            GpuBackend::OpenCl { .. } => self.opencl_matrix_multiply(a, b),
            GpuBackend::Cpu => {
                // CPU fallback
                Ok(a.dot(b))
            }
        }
    }

    /// Apply element-wise operations on GPU
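    ///
    /// # Example
    ///
    /// A sketch squaring every element on the CPU fallback (module path
    /// `scirs2_datasets::gpu` assumed):
    ///
    /// ```no_run
    /// use scirs2_core::ndarray::Array2;
    /// use scirs2_datasets::gpu::{GpuBackend, GpuConfig, GpuContext};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let context = GpuContext::new(config).unwrap();
    /// let data = Array2::<f64>::from_elem((2, 2), 3.0);
    /// let squared = context.gpu_elementwise_op(&data, |x| x * x).unwrap();
    /// assert_eq!(squared[[0, 0]], 9.0);
    /// ```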
    pub fn gpu_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64 + Send + Sync,
    {
        match &self.config.backend {
            GpuBackend::Cuda { .. } => self.cuda_elementwise_op(data, op),
            GpuBackend::OpenCl { .. } => self.opencl_elementwise_op(data, op),
            GpuBackend::Cpu => {
                // CPU fallback
                Ok(data.mapv(op))
            }
        }
    }

    // Private methods for different backends

    fn query_device_info(backend: &GpuBackend) -> Result<GpuDeviceInfo> {
        match backend {
            GpuBackend::Cuda { device_id } => Self::query_cuda_device_info(*device_id),
            GpuBackend::OpenCl {
                platform_id,
                device_id,
            } => Self::query_opencl_device_info(*platform_id, *device_id),
            GpuBackend::Cpu => Ok(GpuDeviceInfo {
                name: "CPU Fallback".to_string(),
                total_memory_mb: 8192, // Assume 8GB
                available_memory_mb: 4096,
                compute_units: num_cpus::get() as u32,
                max_work_group_size: 1,
                compute_capability: "N/A".to_string(),
                supports_double_precision: true,
            }),
        }
    }

    fn validate_config(config: &GpuConfig, device_info: &GpuDeviceInfo) -> Result<()> {
        // Check memory requirements
        if let Some(max_memory) = config.memory.max_memory_mb {
            if max_memory > device_info.available_memory_mb {
                return Err(DatasetsError::GpuError(format!(
                    "Requested memory ({} MB) exceeds available memory ({} MB)",
                    max_memory, device_info.available_memory_mb
                )));
            }
        }

        // Check double precision support
        if config.enable_double_precision && !device_info.supports_double_precision {
            return Err(DatasetsError::GpuError(
                "Double precision requested but not supported by device".to_string(),
            ));
        }

        // Check threads per block
        if config.threads_per_block > device_info.max_work_group_size {
            return Err(DatasetsError::GpuError(format!(
                "Threads per block ({}) exceeds device limit ({})",
                config.threads_per_block, device_info.max_work_group_size
            )));
        }

        Ok(())
    }

    fn is_cuda_available(&self) -> bool {
        // Check CUDA availability through multiple methods

        // 1. Check for NVIDIA GPU device name
        let has_nvidia_device = self.device_info.name.contains("NVIDIA")
            || self.device_info.name.contains("Tesla")
            || self.device_info.name.contains("GeForce")
            || self.device_info.name.contains("Quadro");

        if !has_nvidia_device {
            return false;
        }

        // 2. Check for CUDA environment variables
        let cuda_env_available = std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
            || std::env::var("CUDA_PATH").is_ok()
            || std::env::var("CUDA_HOME").is_ok();

        // 3. Check for CUDA installation paths
        let cuda_paths = [
            "/usr/local/cuda",
            "/opt/cuda",
            "/usr/lib/x86_64-linux-gnu/libcuda.so",
            "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
            "/usr/lib64/libcuda.so",
            "/usr/lib64/libcuda.so.1",
            "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA",
            "C:\\Windows\\System32\\nvcuda.dll",
        ];

        let cuda_path_available = cuda_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // 4. Try to run nvidia-smi (if available)
        let nvidia_smi_available = std::process::Command::new("nvidia-smi")
            .arg("--list-gpus")
            .output()
            .map(|output| output.status.success())
            .unwrap_or(false);

        cuda_env_available || cuda_path_available || nvidia_smi_available
    }

    fn is_opencl_available(&self) -> bool {
        // Check OpenCL availability through multiple methods

        // 1. Skip pure CPU devices unless they explicitly support OpenCL
        if self.device_info.name.contains("CPU") && !self.device_info.name.contains("OpenCL") {
            return false;
        }

        // 2. Check for common OpenCL library paths
        let opencl_paths = [
            "/usr/lib/libOpenCL.so",
            "/usr/lib/libOpenCL.so.1",
            "/usr/lib64/libOpenCL.so",
            "/usr/lib64/libOpenCL.so.1",
            "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
            "/usr/lib/x86_64-linux-gnu/libOpenCL.so.1",
            "/opt/intel/opencl/lib64/libOpenCL.so",
            "/System/Library/Frameworks/OpenCL.framework/OpenCL", // macOS
            "C:\\Windows\\System32\\OpenCL.dll",                  // Windows
        ];

        let opencl_lib_available = opencl_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // 3. Check for vendor-specific OpenCL installations
        let vendor_opencl_paths = [
            "/usr/lib/x86_64-linux-gnu/mesa", // Mesa OpenCL
            "/opt/amdgpu-pro",                // AMD Pro drivers
            "/opt/intel/opencl",              // Intel OpenCL
        ];

        let vendor_opencl_available = vendor_opencl_paths
            .iter()
            .any(|path| std::path::Path::new(path).exists());

        // 4. Try to run clinfo (if available)
        let clinfo_available = std::process::Command::new("clinfo")
            .output()
            .map(|output| output.status.success() && !output.stdout.is_empty())
            .unwrap_or(false);

        opencl_lib_available || vendor_opencl_available || clinfo_available
    }

    // CUDA-specific implementations (simplified for demonstration)

    fn query_cuda_device_info(device_id: u32) -> Result<GpuDeviceInfo> {
        // Simulate CUDA device query
        Ok(GpuDeviceInfo {
            name: format!("NVIDIA GPU {device_id}"),
            total_memory_mb: 8192,
            available_memory_mb: 7168,
            compute_units: 80,
            max_work_group_size: 1024,
            compute_capability: "8.6".to_string(),
            supports_double_precision: true,
        })
    }

    fn make_classification_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        // Simulate GPU-accelerated classification generation.
        // A real implementation would launch CUDA kernels here.

        println!(
            "Generating classification data on CUDA device: {}",
            self.device_info.name
        );

        // For demonstration, run the CPU implementation and simulate the GPU speedup
        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_classification(
            n_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        // Simulate GPU speedup (typically 10-50x for large datasets)
        let simulated_gpu_time = cpu_time / 20;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA generation completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_regression_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating regression data on CUDA device: {}",
            self.device_info.name
        );

        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_regression(
            n_samples,
            n_features,
            n_informative,
            noise,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 15;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA regression completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_blobs_cuda(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!("Generating blobs on CUDA device: {}", self.device_info.name);

        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_blobs(
            n_samples,
            n_features,
            n_centers,
            cluster_std,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 25;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "CUDA blobs completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn cuda_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        // Simulate CUDA matrix multiplication with cuBLAS
        println!(
            "Performing CUDA matrix multiplication: {}x{} * {}x{}",
            a.nrows(),
            a.ncols(),
            b.nrows(),
            b.ncols()
        );

        let result = a.dot(b);
        println!("CUDA matrix multiply completed");

        Ok(result)
    }

    fn cuda_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64,
    {
        println!(
            "Performing CUDA elementwise operation on {}x{} matrix",
            data.nrows(),
            data.ncols()
        );

        let result = data.mapv(op);
        println!("CUDA elementwise operation completed");

        Ok(result)
    }

    // OpenCL-specific implementations (simplified for demonstration)

    fn query_opencl_device_info(platform_id: u32, device_id: u32) -> Result<GpuDeviceInfo> {
        // Simulate OpenCL device query
        Ok(GpuDeviceInfo {
            name: format!("OpenCL Device P{platform_id}.D{device_id}"),
            total_memory_mb: 4096,
            available_memory_mb: 3584,
            compute_units: 40,
            max_work_group_size: 512,
            compute_capability: "2.0".to_string(),
            supports_double_precision: true,
        })
    }

    fn make_classification_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_classes: usize,
        n_clusters_per_class: usize,
        n_informative: usize,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating classification data on OpenCL device: {}",
            self.device_info.name
        );

        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_classification(
            n_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 12; // OpenCL is typically slightly slower than CUDA
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL generation completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_regression_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_informative: usize,
        noise: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating regression data on OpenCL device: {}",
            self.device_info.name
        );

        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_regression(
            n_samples,
            n_features,
            n_informative,
            noise,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 10;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL regression completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn make_blobs_opencl(
        &self,
        n_samples: usize,
        n_features: usize,
        n_centers: usize,
        cluster_std: f64,
        random_state: Option<u64>,
    ) -> Result<Dataset> {
        println!(
            "Generating blobs on OpenCL device: {}",
            self.device_info.name
        );

        let start_time = std::time::Instant::now();
        let dataset = crate::generators::make_blobs(
            n_samples,
            n_features,
            n_centers,
            cluster_std,
            random_state,
        )?;
        let cpu_time = start_time.elapsed();

        let simulated_gpu_time = cpu_time / 18;
        std::thread::sleep(simulated_gpu_time);

        println!(
            "OpenCL blobs completed in {:.2}ms (estimated)",
            simulated_gpu_time.as_secs_f64() * 1000.0
        );

        Ok(dataset)
    }

    fn opencl_matrix_multiply(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>> {
        println!(
            "Performing OpenCL matrix multiplication: {}x{} * {}x{}",
            a.nrows(),
            a.ncols(),
            b.nrows(),
            b.ncols()
        );

        let result = a.dot(b);
        println!("OpenCL matrix multiply completed");

        Ok(result)
    }

    fn opencl_elementwise_op<F>(&self, data: &Array2<f64>, op: F) -> Result<Array2<f64>>
    where
        F: Fn(f64) -> f64,
    {
        println!(
            "Performing OpenCL elementwise operation on {}x{} matrix",
            data.nrows(),
            data.ncols()
        );

        let result = data.mapv(op);
        println!("OpenCL elementwise operation completed");

        Ok(result)
    }
}

/// GPU memory pool for efficient allocation
struct GpuMemoryPool {
    #[allow(dead_code)]
    config: GpuMemoryConfig,
}

impl GpuMemoryPool {
    fn new(config: &GpuMemoryConfig) -> Result<Self> {
        Ok(Self {
            config: config.clone(),
        })
    }
}

/// GPU performance benchmarking utilities
pub struct GpuBenchmark {
    context: GpuContext,
}

impl GpuBenchmark {
    /// Create a new GPU benchmark
    pub fn new(config: GpuConfig) -> Result<Self> {
        let context = GpuContext::new(config)?;
        Ok(Self { context })
    }

    /// Benchmark data generation performance
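    ///
    /// Runs classification, regression, and clustering generation at several
    /// sample sizes and records the elapsed time for each.
    ///
    /// # Example
    ///
    /// A sketch on the CPU fallback; the larger sizes can take a while, so it is
    /// marked `no_run` (module path `scirs2_datasets::gpu` assumed):
    ///
    /// ```no_run
    /// use scirs2_datasets::gpu::{GpuBackend, GpuBenchmark, GpuConfig};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let benchmark = GpuBenchmark::new(config).unwrap();
    /// let results = benchmark.benchmark_data_generation().unwrap();
    /// results.print_results();
    /// ```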
    pub fn benchmark_data_generation(&self) -> Result<GpuBenchmarkResults> {
        let sizes = vec![1_000, 10_000, 100_000, 1_000_000];
        let mut results = GpuBenchmarkResults::new();

        for &size in &sizes {
            // Classification benchmark
            let start = std::time::Instant::now();
            let _dataset = self
                .context
                .make_classification_gpu(size, 20, 5, 2, 15, Some(42))?;
            let classification_time = start.elapsed();

            // Regression benchmark
            let start = std::time::Instant::now();
            let _dataset = self
                .context
                .make_regression_gpu(size, 20, 15, 0.1, Some(42))?;
            let regression_time = start.elapsed();

            // Clustering benchmark
            let start = std::time::Instant::now();
            let _dataset = self.context.make_blobs_gpu(size, 10, 5, 1.0, Some(42))?;
            let clustering_time = start.elapsed();

            results.add_result(size, "classification", classification_time);
            results.add_result(size, "regression", regression_time);
            results.add_result(size, "clustering", clustering_time);
        }

        Ok(results)
    }

    /// Benchmark matrix operations performance
    pub fn benchmark_matrix_operations(&self) -> Result<GpuBenchmarkResults> {
        let sizes = vec![(100, 100), (500, 500), (1000, 1000), (2000, 2000)];
        let mut results = GpuBenchmarkResults::new();

        for &(rows, cols) in &sizes {
            let a = Array2::ones((rows, cols));
            let b = Array2::ones((cols, rows));

            // Matrix multiplication benchmark
            let start = std::time::Instant::now();
            let _result = self.context.gpu_matrix_multiply(&a, &b)?;
            let matmul_time = start.elapsed();

            // Element-wise operations benchmark
            let start = std::time::Instant::now();
            let _result = self.context.gpu_elementwise_op(&a, |x| x.sqrt())?;
            let elementwise_time = start.elapsed();

            let size_key = rows * cols;
            results.add_result(size_key, "matrix_multiply", matmul_time);
            results.add_result(size_key, "elementwise_sqrt", elementwise_time);
        }

        Ok(results)
    }
}

/// GPU benchmark results
#[derive(Debug)]
pub struct GpuBenchmarkResults {
    results: Vec<(usize, String, std::time::Duration)>,
}

impl GpuBenchmarkResults {
    fn new() -> Self {
        Self {
            results: Vec::new(),
        }
    }

    fn add_result(&mut self, size: usize, operation: &str, duration: std::time::Duration) {
        self.results.push((size, operation.to_string(), duration));
    }

    /// Print benchmark results
    pub fn print_results(&self) {
        println!("GPU Benchmark Results:");
        println!(
            "{:<12} {:<20} {:<15} {:<15}",
            "Size", "Operation", "Time (ms)", "Throughput"
        );
        let separator = "-".repeat(70);
        println!("{separator}");

        for (size, operation, duration) in &self.results {
            let time_ms = duration.as_millis();
            let throughput = *size as f64 / duration.as_secs_f64();

            println!("{size:<12} {operation:<20} {time_ms:<15} {throughput:<15.1}");
        }
    }

    /// Calculate speedup compared to baseline
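    ///
    /// # Example
    ///
    /// A sketch comparing two result sets; in practice `baseline` would come from
    /// a CPU run and `self` from a GPU run, whereas here both use the CPU fallback,
    /// so the speedups hover around 1.0 (module path `scirs2_datasets::gpu` assumed):
    ///
    /// ```no_run
    /// use scirs2_datasets::gpu::{GpuBackend, GpuBenchmark, GpuConfig};
    ///
    /// let config = GpuConfig {
    ///     backend: GpuBackend::Cpu,
    ///     threads_per_block: 1,
    ///     ..Default::default()
    /// };
    /// let gpu_results = GpuBenchmark::new(config.clone()).unwrap()
    ///     .benchmark_matrix_operations().unwrap();
    /// let baseline = GpuBenchmark::new(config).unwrap()
    ///     .benchmark_matrix_operations().unwrap();
    /// for (label, speedup) in gpu_results.calculate_speedup(&baseline) {
    ///     println!("{label}: {speedup:.2}x");
    /// }
    /// ```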
    pub fn calculate_speedup(&self, baseline: &GpuBenchmarkResults) -> Vec<(String, f64)> {
        let mut speedups = Vec::new();

        for (size, operation, gpu_duration) in &self.results {
            if let Some((_, _, cpu_duration)) = baseline
                .results
                .iter()
                .find(|(s, op_, _)| s == size && op_ == operation)
            {
                let speedup = cpu_duration.as_secs_f64() / gpu_duration.as_secs_f64();
                speedups.push((format!("{operation} ({size})"), speedup));
            }
        }

        speedups
    }
}

// Utility functions for GPU operations

/// Check if CUDA is available on the system
#[allow(dead_code)]
pub fn is_cuda_available() -> bool {
    // 1. Check for CUDA environment variables
    let cuda_env_available = std::env::var("CUDA_VISIBLE_DEVICES").is_ok()
        || std::env::var("CUDA_PATH").is_ok()
        || std::env::var("CUDA_HOME").is_ok();

    // 2. Check for CUDA installation paths (cross-platform)
    let cuda_paths = [
        "/usr/local/cuda",
        "/opt/cuda",
        "/usr/lib/x86_64-linux-gnu/libcuda.so",
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        "/usr/lib64/libcuda.so",
        "/usr/lib64/libcuda.so.1",
        "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA",
        "C:\\Windows\\System32\\nvcuda.dll",
        "/System/Library/Frameworks/CUDA.framework", // macOS (if applicable)
    ];

    let cuda_path_available = cuda_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    // 3. Try to execute nvidia-smi to check for NVIDIA GPUs
    let nvidia_smi_available = std::process::Command::new("nvidia-smi")
        .arg("--list-gpus")
        .output()
        .map(|output| output.status.success() && !output.stdout.is_empty())
        .unwrap_or(false);

    // 4. Check for NVIDIA devices in /proc (Linux-specific)
    let nvidia_proc_available = std::path::Path::new("/proc/driver/nvidia").exists();

    cuda_env_available || cuda_path_available || nvidia_smi_available || nvidia_proc_available
}

/// Check if OpenCL is available on the system
#[allow(dead_code)]
pub fn is_opencl_available() -> bool {
    // 1. Check for common OpenCL library paths (cross-platform)
    let opencl_paths = [
        "/usr/lib/libOpenCL.so",
        "/usr/lib/libOpenCL.so.1",
        "/usr/lib64/libOpenCL.so",
        "/usr/lib64/libOpenCL.so.1",
        "/usr/lib/x86_64-linux-gnu/libOpenCL.so",
        "/usr/lib/x86_64-linux-gnu/libOpenCL.so.1",
        "/opt/intel/opencl/lib64/libOpenCL.so",
        "/System/Library/Frameworks/OpenCL.framework/OpenCL", // macOS
        "C:\\Windows\\System32\\OpenCL.dll",                  // Windows
    ];

    let opencl_lib_available = opencl_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    // 2. Check for vendor-specific OpenCL installations
    let vendor_opencl_paths = [
        "/usr/lib/x86_64-linux-gnu/mesa",                   // Mesa OpenCL
        "/opt/amdgpu-pro",                                  // AMD Pro drivers
        "/opt/intel/opencl",                                // Intel OpenCL
        "/usr/lib/x86_64-linux-gnu/libmali-bifrost-dev.so", // ARM Mali
    ];

    let vendor_opencl_available = vendor_opencl_paths
        .iter()
        .any(|path| std::path::Path::new(path).exists());

    // 3. Try to run clinfo command to enumerate OpenCL devices
    let clinfo_available = std::process::Command::new("clinfo")
        .output()
        .map(|output| output.status.success() && !output.stdout.is_empty())
        .unwrap_or(false);

    // 4. Check for OpenCL environment variables
    let opencl_env_available =
        std::env::var("OPENCL_VENDOR_PATH").is_ok() || std::env::var("OCL_ICD_FILENAMES").is_ok();

    opencl_lib_available || vendor_opencl_available || clinfo_available || opencl_env_available
}

/// Get optimal GPU configuration for the current system
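///
/// # Example
///
/// A sketch that picks the best available backend and builds a context from it
/// (module path `scirs2_datasets::gpu` assumed):
///
/// ```no_run
/// use scirs2_datasets::gpu::{get_optimal_gpu_config, GpuContext};
///
/// let config = get_optimal_gpu_config();
/// let context = GpuContext::new(config).expect("no usable backend");
/// println!("selected device: {}", context.device_info().name);
/// ```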
#[allow(dead_code)]
pub fn get_optimal_gpu_config() -> GpuConfig {
    if is_cuda_available() {
        GpuConfig {
            backend: GpuBackend::Cuda { device_id: 0 },
            threads_per_block: 256,
            enable_double_precision: true,
            use_fast_math: false,
            ..Default::default()
        }
    } else if is_opencl_available() {
        GpuConfig {
            backend: GpuBackend::OpenCl {
                platform_id: 0,
                device_id: 0,
            },
            threads_per_block: 256,
            enable_double_precision: true,
            ..Default::default()
        }
    } else {
        GpuConfig {
            backend: GpuBackend::Cpu,
            // The CPU fallback reports a max work-group size of 1, so a larger
            // threads_per_block would fail validation in GpuContext::new
            threads_per_block: 1,
            ..Default::default()
        }
    }
}

/// List available GPU devices
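///
/// # Example
///
/// A sketch printing every detected device; the CPU fallback entry is always
/// appended, so the list is never empty (module path `scirs2_datasets::gpu` assumed):
///
/// ```no_run
/// use scirs2_datasets::gpu::list_gpu_devices;
///
/// for device in list_gpu_devices().unwrap() {
///     println!("{} ({} MB)", device.name, device.total_memory_mb);
/// }
/// ```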
#[allow(dead_code)]
pub fn list_gpu_devices() -> Result<Vec<GpuDeviceInfo>> {
    let mut devices = Vec::new();

    // Query CUDA devices
    if is_cuda_available() {
        for device_id in 0..4 {
            // Check up to 4 CUDA devices
            if let Ok(info) = GpuContext::query_cuda_device_info(device_id) {
                devices.push(info);
            }
        }
    }

    // Query OpenCL devices
    if is_opencl_available() {
        for platform_id in 0..2 {
            for device_id in 0..4 {
                if let Ok(info) = GpuContext::query_opencl_device_info(platform_id, device_id) {
                    devices.push(info);
                }
            }
        }
    }

    // Always include CPU fallback
    devices.push(GpuDeviceInfo {
        name: "CPU (Fallback)".to_string(),
        total_memory_mb: 8192,
        available_memory_mb: 4096,
        compute_units: num_cpus::get() as u32,
        max_work_group_size: 1,
        compute_capability: "N/A".to_string(),
        supports_double_precision: true,
    });

    Ok(devices)
}

// Convenience functions for GPU-accelerated data generation

/// Generate classification dataset with automatic GPU detection
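///
/// # Example
///
/// A sketch that transparently uses CUDA, OpenCL, or the CPU fallback depending
/// on what the system provides (module path `scirs2_datasets::gpu` assumed):
///
/// ```no_run
/// use scirs2_datasets::gpu::make_classification_auto_gpu;
///
/// let dataset = make_classification_auto_gpu(10_000, 20, 5, 2, 15, Some(42)).unwrap();
/// assert_eq!(dataset.n_samples(), 10_000);
/// ```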
#[allow(dead_code)]
pub fn make_classification_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_classification_gpu(
        n_samples,
        n_features,
        n_classes,
        n_clusters_per_class,
        n_informative,
        random_state,
    )
}

/// Generate regression dataset with automatic GPU detection
#[allow(dead_code)]
pub fn make_regression_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_regression_gpu(n_samples, n_features, n_informative, noise, random_state)
}

/// Generate blobs dataset with automatic GPU detection
#[allow(dead_code)]
pub fn make_blobs_auto_gpu(
    n_samples: usize,
    n_features: usize,
    n_centers: usize,
    cluster_std: f64,
    random_state: Option<u64>,
) -> Result<Dataset> {
    let config = get_optimal_gpu_config();
    let context = GpuContext::new(config)?;
    context.make_blobs_gpu(n_samples, n_features, n_centers, cluster_std, random_state)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_config_default() {
        let config = GpuConfig::default();
        assert!(matches!(config.backend, GpuBackend::Cuda { device_id: 0 }));
        assert_eq!(config.threads_per_block, 256);
        assert!(config.enable_double_precision);
    }

    #[test]
    fn test_gpu_context_cpu_fallback() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let context = GpuContext::new(config).unwrap();
        assert!(context.is_available());
        assert_eq!(context.device_info.name, "CPU Fallback");
    }

    #[test]
    fn test_gpu_classification_generation() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let context = GpuContext::new(config).unwrap();
        let dataset = context
            .make_classification_gpu(100, 10, 3, 2, 8, Some(42))
            .unwrap();

        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 10);
        assert!(dataset.target.is_some());
    }

    #[test]
    fn test_optimal_gpu_config() {
        let config = get_optimal_gpu_config();
        // Should not panic and should return a valid configuration
        assert!(matches!(
            config.backend,
            GpuBackend::Cuda { .. } | GpuBackend::OpenCl { .. } | GpuBackend::Cpu
        ));
    }

    #[test]
    fn test_list_gpu_devices() {
        let devices = list_gpu_devices().unwrap();
        assert!(!devices.is_empty());

        // Should always have at least the CPU fallback
        assert!(devices.iter().any(|d| d.name.contains("CPU")));
    }

    #[test]
    #[ignore = "timeout"]
    fn test_gpu_benchmark_creation() {
        let config = GpuConfig {
            backend: GpuBackend::Cpu,
            threads_per_block: 1,
            ..Default::default()
        };

        let _benchmark = GpuBenchmark::new(config).unwrap();
        // Should not panic during creation
    }
}