scirs2_cluster/gpu/
mod.rs

1//! GPU acceleration interfaces and implementations for clustering algorithms
2//!
3//! This module provides GPU acceleration capabilities for clustering algorithms,
4//! supporting multiple backends including CUDA, OpenCL, ROCm, Intel OneAPI, and Metal.
5//! When GPU acceleration is not available, algorithms automatically fall back to
6//! optimized CPU implementations.
7//!
8//! # Features
9//!
10//! * **Multiple GPU Backends**: Support for CUDA, OpenCL, ROCm, Intel OneAPI, and Metal
11//! * **Automatic Fallback**: Seamless fallback to CPU when GPU is not available
12//! * **Memory Management**: Efficient GPU memory allocation and pooling
13//! * **Distance Computations**: Optimized distance matrix calculations
14//! * **Performance Monitoring**: Built-in benchmarking and performance statistics
15//! * **Device Selection**: Automatic or manual GPU device selection strategies
16//! * **Tensor Core Support**: Mixed precision and tensor core acceleration (v0.2.0)
17//! * **Advanced Memory Strategies**: Conservative, aggressive, and adaptive memory management
18//!
19//! # Examples
20//!
21//! ## Basic GPU Configuration
22//!
23//! ```rust
24//! use scirs2_cluster::gpu::{GpuConfig, GpuBackend, DeviceSelection};
25//!
26//! // Create CUDA configuration with automatic device selection
27//! let config = GpuConfig::cuda()
28//!     .with_device_selection(DeviceSelection::Auto)
29//!     .with_memory_pool_size(1024 * 1024 * 1024); // 1GB pool
30//!
31//! // OpenCL configuration for cross-platform support
32//! let opencl_config = GpuConfig::opencl()
33//!     .with_device_selection(DeviceSelection::MostMemory);
34//! ```
35//!
36//! ## GPU Distance Matrix Computation
37//!
38//! ```rust
39//! use scirs2_cluster::gpu::{GpuDistanceMatrix, DistanceMetric, GpuConfig};
40//! use scirs2_core::ndarray::Array2;
41//!
42//! // Create sample data
43//! let data = Array2::from_shape_vec((1000, 10), (0..10000).map(|x| x as f64).collect()).expect("operation should succeed");
44//!
45//! // Create GPU distance matrix
46//! let config = GpuConfig::default();
47//! let mut gpu_matrix = GpuDistanceMatrix::new(config, DistanceMetric::Euclidean, None).expect("operation should succeed");
48//!
49//! // Preload data to GPU for faster repeated computations
50//! gpu_matrix.preload_data(data.view()).expect("operation should succeed");
51//!
52//! // Compute distance matrix
53//! let distances = gpu_matrix.compute_distance_matrix(data.view()).expect("operation should succeed");
54//! ```
55//!
56//! ## Memory Management
57//!
58//! ```rust
59//! use scirs2_cluster::gpu::{GpuMemoryManager, MemoryStrategy};
60//!
61//! // Create memory manager with 256-byte alignment and pool size of 100
62//! let mut memory_manager = GpuMemoryManager::new(256, 100);
63//!
64//! // Allocate GPU memory
65//! let block = memory_manager.allocate(1024 * 1024).expect("operation should succeed"); // 1MB
66//!
67//! // Memory is automatically pooled for reuse
68//! memory_manager.deallocate(block).expect("operation should succeed");
69//!
70//! // Check memory statistics
71//! let stats = memory_manager.get_stats();
72//! println!("Pool efficiency: {:.2}%", memory_manager.pool_efficiency() * 100.0);
73//! ```
74//!
75//! ## GPU-Accelerated K-means (v0.2.0)
76//!
77//! ```rust,ignore
78//! use scirs2_cluster::gpu::{GpuKMeans, GpuAccelerationConfig};
79//! use scirs2_core::ndarray::Array2;
80//!
81//! // Create GPU K-means with CUDA backend
82//! let config = GpuAccelerationConfig::cuda();
83//! let mut kmeans = GpuKMeans::<f64>::new(config).expect("operation should succeed");
84//!
85//! // Create sample data
86//! let data = Array2::from_shape_fn((1000, 10), |(i, j)| (i + j) as f64);
87//!
88//! // Fit K-means with GPU acceleration
89//! let result = kmeans.fit(data.view(), 10, 100, 1e-4).expect("operation should succeed");
90//!
91//! println!("Converged: {}", result.converged);
92//! println!("Iterations: {}", result.n_iterations);
93//! println!("GPU used: {}", result.metrics.used_gpu);
94//! ```
95
96pub mod acceleration;
97pub mod core;
98pub mod distance;
99pub mod kernels;
100pub mod memory;
101
102// Re-export main types for convenience
103pub use core::{BackendContext, DeviceSelection, GpuBackend, GpuConfig, GpuContext, GpuDevice};
104
105pub use distance::{DistanceMetric, GpuArray, GpuDistanceMatrix};
106
107pub use memory::{
108    BandwidthMonitor, GpuMemoryBlock, GpuMemoryManager, MemoryStats, MemoryStrategy, MemoryTransfer,
109};
110
111// Re-export advanced acceleration features (v0.2.0)
112pub use acceleration::{
113    detect_tensor_core_capabilities, AdvancedDeviceSelection, AdvancedGpuMemoryManager,
114    AdvancedMemoryStrategy, AllocationRecord, DeviceBenchmark, DeviceSelector,
115    GpuAccelerationConfig, GpuKMeans, GpuKMeansResult, KMeansMetrics, KernelOptimizations,
116    MemoryUsageStats, PrecisionMode, ProfilingRecord, TensorCoreCapabilities, TensorCoreConfig,
117};
118
119// Re-export kernel types
120pub use kernels::{
121    calculate_kernel_config, generate_cuda_batch_distance_kernel,
122    generate_cuda_distance_matrix_kernel, generate_cuda_kmeans_assign_kernel,
123    generate_metal_distance_kernel, generate_opencl_distance_matrix_kernel,
124    generate_rocm_distance_kernel, get_kernel_source, get_kmeans_kernel_source, DistanceKernelType,
125    KernelConfig, KernelDataType,
126};
127
128// Additional convenience functions and types
129
130/// Create a GPU configuration for the best available backend
131pub fn auto_config() -> GpuConfig {
132    // Try backends in order of preference
133    let preferred_backends = [
134        GpuBackend::Cuda,
135        GpuBackend::OpenCl,
136        GpuBackend::Rocm,
137        GpuBackend::Metal,
138        GpuBackend::OneApi,
139    ];
140
141    for &backend in &preferred_backends {
142        if is_backend_available(backend) {
143            return GpuConfig::new(backend);
144        }
145    }
146
147    // Fallback to CPU
148    GpuConfig::default()
149}
150
151/// Check if a specific GPU backend is available
152pub fn is_backend_available(backend: GpuBackend) -> bool {
153    match backend {
154        GpuBackend::CpuFallback => true,
155        _ => {
156            // This is a stub implementation
157            // Real implementation would check for:
158            // - CUDA: nvidia-ml-py, cupy, or pycuda availability
159            // - OpenCL: pyopencl availability
160            // - ROCm: rocm installation
161            // - Metal: Metal framework availability (macOS)
162            // - OneAPI: Intel OneAPI toolkit installation
163            false
164        }
165    }
166}
167
168/// List all available GPU devices
169pub fn list_devices() -> Vec<GpuDevice> {
170    // This is a stub implementation
171    // Real implementation would enumerate actual devices
172    vec![GpuDevice::new(
173        0,
174        "Integrated GPU".to_string(),
175        4_000_000_000, // 4GB
176        3_500_000_000, // 3.5GB available
177        "1.0".to_string(),
178        512,
179        GpuBackend::CpuFallback,
180        false,
181    )]
182}
183
184/// Get the best available GPU device
185pub fn get_best_device() -> Option<GpuDevice> {
186    let devices = list_devices();
187    devices
188        .into_iter()
189        .filter(|d| d.backend != GpuBackend::CpuFallback)
190        .max_by(|a, b| {
191            a.get_device_score()
192                .partial_cmp(&b.get_device_score())
193                .unwrap_or(std::cmp::Ordering::Equal)
194        })
195}
196
197/// Benchmark GPU vs CPU performance for distance computations
198pub fn benchmark_gpu_vs_cpu(
199    data_size: usize,
200    n_features: usize,
201    metric: DistanceMetric,
202) -> Result<BenchmarkResult, crate::error::ClusteringError> {
203    use scirs2_core::ndarray::Array2;
204    use std::time::Instant;
205
206    // Generate test data
207    let data = Array2::from_shape_fn((data_size, n_features), |(i, j)| {
208        (i * n_features + j) as f64 / 1000.0
209    });
210
211    // CPU benchmark
212    let cpu_start = Instant::now();
213    let cpu_config = GpuConfig::new(GpuBackend::CpuFallback);
214    let cpu_matrix = GpuDistanceMatrix::new(cpu_config, metric, None)?;
215    let _cpu_result = cpu_matrix.compute_distance_matrix_cpu(data.view())?;
216    let cpu_duration = cpu_start.elapsed();
217
218    // GPU benchmark (will fallback to CPU in stub implementation)
219    let gpu_start = Instant::now();
220    let gpu_config = auto_config();
221    let mut gpu_matrix = GpuDistanceMatrix::new(gpu_config, metric, None)?;
222    let _gpu_result = gpu_matrix.compute_distance_matrix(data.view())?;
223    let gpu_duration = gpu_start.elapsed();
224
225    Ok(BenchmarkResult {
226        cpu_duration_ms: cpu_duration.as_millis() as f64,
227        gpu_duration_ms: gpu_duration.as_millis() as f64,
228        speedup: cpu_duration.as_secs_f64() / gpu_duration.as_secs_f64(),
229        data_size,
230        n_features,
231        metric,
232    })
233}
234
235/// Result of GPU vs CPU benchmark
236#[derive(Debug, Clone)]
237pub struct BenchmarkResult {
238    /// CPU computation time in milliseconds
239    pub cpu_duration_ms: f64,
240    /// GPU computation time in milliseconds
241    pub gpu_duration_ms: f64,
242    /// Speedup factor (CPU time / GPU time)
243    pub speedup: f64,
244    /// Size of test data
245    pub data_size: usize,
246    /// Number of features
247    pub n_features: usize,
248    /// Distance metric used
249    pub metric: DistanceMetric,
250}
251
252impl BenchmarkResult {
253    /// Get performance summary
254    pub fn summary(&self) -> String {
255        format!(
256            "GPU vs CPU Benchmark Results:\n\
257             Data size: {} samples x {} features\n\
258             Distance metric: {}\n\
259             CPU time: {:.2} ms\n\
260             GPU time: {:.2} ms\n\
261             Speedup: {:.2}x",
262            self.data_size,
263            self.n_features,
264            self.metric,
265            self.cpu_duration_ms,
266            self.gpu_duration_ms,
267            self.speedup
268        )
269    }
270
271    /// Check if GPU provided a speedup
272    pub fn gpu_is_faster(&self) -> bool {
273        self.speedup > 1.0
274    }
275
276    /// Get efficiency rating
277    pub fn efficiency_rating(&self) -> &'static str {
278        match self.speedup {
279            x if x >= 10.0 => "Excellent",
280            x if x >= 5.0 => "Very Good",
281            x if x >= 2.0 => "Good",
282            x if x >= 1.1 => "Marginal",
283            _ => "No Benefit",
284        }
285    }
286}
287
288/// GPU feature detection and capabilities
289pub struct GpuCapabilities {
290    /// Available backends
291    pub available_backends: Vec<GpuBackend>,
292    /// Best device for each backend
293    pub best_devices: std::collections::HashMap<GpuBackend, GpuDevice>,
294    /// Total GPU memory across all devices
295    pub total_gpu_memory: usize,
296    /// Supports unified memory
297    pub supports_unified_memory: bool,
298    /// Supports double precision
299    pub supports_double_precision: bool,
300}
301
302impl GpuCapabilities {
303    /// Detect GPU capabilities
304    pub fn detect() -> Self {
305        let available_backends: Vec<GpuBackend> = [
306            GpuBackend::Cuda,
307            GpuBackend::OpenCl,
308            GpuBackend::Rocm,
309            GpuBackend::Metal,
310            GpuBackend::OneApi,
311        ]
312        .iter()
313        .cloned()
314        .filter(|&backend| is_backend_available(backend))
315        .collect();
316
317        let mut best_devices = std::collections::HashMap::new();
318        let mut total_memory = 0;
319        let mut supports_unified = false;
320        let mut supports_double = false;
321
322        // Stub implementation
323        for backend in available_backends.iter() {
324            if let Some(device) = Self::get_best_device_for_backend(*backend) {
325                total_memory += device.total_memory;
326                supports_unified |= *backend == GpuBackend::Cuda; // CUDA typically supports unified memory
327                supports_double |= device.supports_double_precision;
328                best_devices.insert(*backend, device);
329            }
330        }
331
332        Self {
333            available_backends,
334            best_devices,
335            total_gpu_memory: total_memory,
336            supports_unified_memory: supports_unified,
337            supports_double_precision: supports_double,
338        }
339    }
340
341    /// Get summary of GPU capabilities
342    pub fn summary(&self) -> String {
343        let mut summary = String::new();
344        summary.push_str("GPU Capabilities Summary:\n");
345        summary.push_str(&format!(
346            "Available backends: {:?}\n",
347            self.available_backends
348        ));
349        summary.push_str(&format!(
350            "Total GPU memory: {:.2} GB\n",
351            self.total_gpu_memory as f64 / (1024.0 * 1024.0 * 1024.0)
352        ));
353        summary.push_str(&format!(
354            "Unified memory support: {}\n",
355            self.supports_unified_memory
356        ));
357        summary.push_str(&format!(
358            "Double precision support: {}\n",
359            self.supports_double_precision
360        ));
361
362        for (backend, device) in &self.best_devices {
363            summary.push_str(&format!(
364                "Best {} device: {} ({:.2} GB)\n",
365                backend,
366                device.name,
367                device.total_memory as f64 / (1024.0 * 1024.0 * 1024.0)
368            ));
369        }
370
371        summary
372    }
373
374    fn get_best_device_for_backend(backend: GpuBackend) -> Option<GpuDevice> {
375        // Stub implementation
376        match backend {
377            GpuBackend::CpuFallback => None,
378            _ => Some(GpuDevice::new(
379                0,
380                format!("{} Device", backend),
381                8_000_000_000,
382                7_000_000_000,
383                "1.0".to_string(),
384                1024,
385                backend,
386                true,
387            )),
388        }
389    }
390}
391
/// Convenience function to check if GPU acceleration is recommended for a given problem size.
///
/// Simple heuristic: the GPU is recommended when the problem has more than
/// 10,000 elements in total (`n_samples * n_features`) and more than 100
/// samples.
///
/// The element count is computed with a saturating multiplication, so very
/// large inputs count as "large" instead of overflowing (which would panic in
/// debug builds and wrap to a meaningless value in release builds).
pub fn is_gpu_recommended(n_samples: usize, n_features: usize) -> bool {
    let problem_size = n_samples.saturating_mul(n_features);
    problem_size > 10_000 && n_samples > 100
}
398
399/// Get recommended tile size for GPU computations
400pub fn get_recommended_tile_size(device: &GpuDevice, element_size: usize) -> usize {
401    // Calculate based on available memory and compute units
402    let memory_per_tile = device.available_memory / 16; // Use 1/16 of available memory per tile
403    let elements_per_tile = memory_per_tile / element_size;
404    let sqrt_elements = (elements_per_tile as f64).sqrt() as usize;
405
406    // Clamp to reasonable range and align to compute units
407    let base_tile_size = sqrt_elements.max(32).min(1024);
408    let compute_aligned = ((base_tile_size + device.compute_units as usize - 1)
409        / device.compute_units as usize)
410        * device.compute_units as usize;
411
412    compute_aligned.min(1024)
413}
414
#[cfg(test)]
mod tests {
    use super::*;

    /// The automatically selected configuration keeps CPU fallback enabled.
    #[test]
    fn test_auto_config() {
        assert!(auto_config().auto_fallback);
    }

    /// The CPU fallback is always available; CUDA is not in the stub.
    #[test]
    fn test_backend_availability() {
        let cpu_available = is_backend_available(GpuBackend::CpuFallback);
        // Other backends return false in stub implementation
        let cuda_available = is_backend_available(GpuBackend::Cuda);

        assert!(cpu_available);
        assert!(!cuda_available);
    }

    /// Device listing yields at least one entry.
    #[test]
    fn test_list_devices() {
        assert!(!list_devices().is_empty());
    }

    /// Small problems stay on the CPU, large ones are recommended for the GPU.
    #[test]
    fn test_gpu_recommendation() {
        let small_problem = is_gpu_recommended(10, 10);
        let large_problem = is_gpu_recommended(1000, 100);

        assert!(!small_problem);
        assert!(large_problem);
    }

    /// Capability detection produces a non-empty summary.
    #[test]
    fn test_capabilities_detection() {
        let report = GpuCapabilities::detect().summary();
        assert!(!report.is_empty());
    }

    /// The recommended tile size stays within the 32..=1024 range.
    #[test]
    fn test_recommended_tile_size() {
        let test_device = GpuDevice::new(
            0,
            String::from("Test"),
            8_000_000_000,
            6_000_000_000,
            String::from("1.0"),
            1024,
            GpuBackend::Cuda,
            true,
        );

        let recommended = get_recommended_tile_size(&test_device, 8);
        assert!(recommended >= 32);
        assert!(recommended <= 1024);
    }

    /// A 5x speedup counts as faster, rates "Very Good" and has a summary.
    #[test]
    fn test_benchmark_result() {
        let outcome = BenchmarkResult {
            cpu_duration_ms: 100.0,
            gpu_duration_ms: 20.0,
            speedup: 5.0,
            data_size: 1000,
            n_features: 10,
            metric: DistanceMetric::Euclidean,
        };

        assert!(outcome.gpu_is_faster());
        assert_eq!(outcome.efficiency_rating(), "Very Good");
        assert!(!outcome.summary().is_empty());
    }
}
scirs2_cluster/gpu/mod.rs

scirs2_cluster/gpu/
mod.rs