scirs2_series/gpu_acceleration/device_manager.rs

//! GPU device detection and management
//!
//! This module handles the detection and management of GPU devices across
//! different backends (CUDA, OpenCL, Metal, ROCm) with automatic fallback to CPU.

6use std::fmt::Debug;
7
8use super::config::{GpuBackend, GpuCapabilities, TensorCoresGeneration};
9use crate::error::{Result, TimeSeriesError};
10
/// GPU device manager for detecting and managing GPU devices
#[derive(Debug)]
pub struct GpuDeviceManager {
    /// Capabilities of every detected device; `new()` guarantees at least a
    /// CPU-fallback entry, but the `Default` fallback may leave this empty.
    devices: Vec<GpuCapabilities>,
    /// Index into `devices` for the currently selected device, if any.
    current_device: Option<usize>,
}
19
20impl GpuDeviceManager {
21    /// Create a new device manager
22    pub fn new() -> Result<Self> {
23        // Detect actual GPU devices when dependencies are available
24        let mut devices = Vec::new();
25
26        // Try to detect CUDA devices
27        if let Some(cuda_devices) = Self::detect_cuda_devices() {
28            devices.extend(cuda_devices);
29        }
30
31        // Try to detect OpenCL devices
32        if let Some(opencl_devices) = Self::detect_opencl_devices() {
33            devices.extend(opencl_devices);
34        }
35
36        // Try to detect Metal devices (Apple Silicon)
37        if let Some(metal_devices) = Self::detect_metal_devices() {
38            devices.extend(metal_devices);
39        }
40
41        // Try to detect ROCm devices (AMD)
42        if let Some(rocm_devices) = Self::detect_rocm_devices() {
43            devices.extend(rocm_devices);
44        }
45
46        // Always provide CPU fallback if no GPU devices found
47        if devices.is_empty() {
48            devices.push(GpuCapabilities {
49                backend: GpuBackend::CpuFallback,
50                compute_capability: None,
51                memory: Self::get_system_memory(),
52                multiprocessors: Self::get_cpu_cores(),
53                supports_fp16: false,
54                supports_tensor_cores: false,
55                max_threads_per_block: 1,
56                tensor_cores_generation: None,
57                memory_bandwidth: 100.0, // GB/s - rough estimate for system memory
58                tensor_performance: None,
59            });
60        }
61
62        Ok(Self {
63            devices,
64            current_device: Some(0), // Default to first device
65        })
66    }
67
68    /// Get available devices
69    pub fn get_devices(&self) -> &[GpuCapabilities] {
70        &self.devices
71    }
72
73    /// Set current device
74    pub fn set_device(&mut self, deviceid: usize) -> Result<()> {
75        if deviceid >= self.devices.len() {
76            return Err(TimeSeriesError::InvalidInput(format!(
77                "Device {deviceid} not available"
78            )));
79        }
80        self.current_device = Some(deviceid);
81        Ok(())
82    }
83
84    /// Get current device capabilities
85    pub fn current_device_capabilities(&self) -> Option<&GpuCapabilities> {
86        self.current_device.map(|id| &self.devices[id])
87    }
88
89    /// Check if GPU acceleration is available
90    pub fn is_gpu_available(&self) -> bool {
91        self.devices
92            .iter()
93            .any(|dev| !matches!(dev.backend, GpuBackend::CpuFallback))
94    }
95
96    /// Detect CUDA devices
97    fn detect_cuda_devices() -> Option<Vec<GpuCapabilities>> {
98        // In a real implementation, this would use CUDA Runtime API
99        // For now, simulate detection by checking for common NVIDIA indicators
100        #[cfg(target_os = "linux")]
101        {
102            if std::path::Path::new("/dev/nvidia0").exists()
103                || std::path::Path::new("/proc/driver/nvidia").exists()
104            {
105                return Some(vec![GpuCapabilities {
106                    backend: GpuBackend::Cuda,
107                    compute_capability: Some((8, 0)), // Simulated A100 capability
108                    memory: 40 * 1024 * 1024 * 1024,  // 40GB simulated
109                    multiprocessors: 108,
110                    supports_fp16: true,
111                    supports_tensor_cores: true,
112                    max_threads_per_block: 1024,
113                    tensor_cores_generation: Some(TensorCoresGeneration::V3), // A100 is gen 3
114                    memory_bandwidth: 1555.0,                                 // GB/s for A100
115                    tensor_performance: Some(312.0),                          // TOPS for A100 BF16
116                }]);
117            }
118        }
119
120        #[cfg(target_os = "windows")]
121        {
122            // On Windows, could check for nvidia-ml.dll or query WMI
123            // For simulation, assume no CUDA devices
124        }
125
126        None
127    }
128
129    /// Detect OpenCL devices
130    fn detect_opencl_devices() -> Option<Vec<GpuCapabilities>> {
131        // In a real implementation, this would use OpenCL API
132        // Check for common OpenCL indicators
133        #[cfg(any(target_os = "linux", target_os = "windows", target_os = "macos"))]
134        {
135            // Simulated OpenCL device detection
136            // In real implementation, would enumerate platforms and devices
137            if Self::has_opencl_drivers() {
138                return Some(vec![GpuCapabilities {
139                    backend: GpuBackend::OpenCL,
140                    compute_capability: None,
141                    memory: 8 * 1024 * 1024 * 1024, // 8GB simulated
142                    multiprocessors: 64,
143                    supports_fp16: true,
144                    supports_tensor_cores: false,
145                    max_threads_per_block: 256,
146                    tensor_cores_generation: None,
147                    memory_bandwidth: 500.0, // GB/s estimate
148                    tensor_performance: None,
149                }]);
150            }
151        }
152
153        None
154    }
155
156    /// Detect Metal devices (Apple Silicon)
157    fn detect_metal_devices() -> Option<Vec<GpuCapabilities>> {
158        #[cfg(target_os = "macos")]
159        {
160            // Check for Apple Silicon or dedicated GPU
161            if Self::is_apple_silicon() || Self::has_metal_gpu() {
162                return Some(vec![GpuCapabilities {
163                    backend: GpuBackend::Metal,
164                    compute_capability: None,
165                    memory: 16 * 1024 * 1024 * 1024, // 16GB unified memory
166                    multiprocessors: 32,             // GPU cores
167                    supports_fp16: true,
168                    supports_tensor_cores: true, // Neural Engine
169                    max_threads_per_block: 1024,
170                    tensor_cores_generation: Some(TensorCoresGeneration::V3), // Apple Silicon Neural Engine
171                    memory_bandwidth: 400.0,                                  // GB/s for M1 Pro/Max
172                    tensor_performance: Some(15.8), // TOPS for M1 Neural Engine
173                }]);
174            }
175        }
176
177        None
178    }
179
180    /// Detect ROCm devices (AMD)
181    fn detect_rocm_devices() -> Option<Vec<GpuCapabilities>> {
182        #[cfg(target_os = "linux")]
183        {
184            // Check for AMD ROCm installation
185            if std::path::Path::new("/opt/rocm").exists()
186                || std::path::Path::new("/dev/kfd").exists()
187            {
188                return Some(vec![GpuCapabilities {
189                    backend: GpuBackend::Rocm,
190                    compute_capability: None,
191                    memory: 32 * 1024 * 1024 * 1024, // 32GB simulated
192                    multiprocessors: 120,
193                    supports_fp16: true,
194                    supports_tensor_cores: false, // AMD uses Matrix Cores, not Tensor Cores
195                    max_threads_per_block: 1024,
196                    tensor_cores_generation: None, // AMD has MFMA instructions instead
197                    memory_bandwidth: 1600.0,      // GB/s for MI250X
198                    tensor_performance: Some(383.0), // TOPS for MI250X BF16
199                }]);
200            }
201        }
202
203        None
204    }
205
206    /// Check for OpenCL drivers
207    fn has_opencl_drivers() -> bool {
208        #[cfg(target_os = "linux")]
209        {
210            std::path::Path::new("/usr/lib/x86_64-linux-gnu/libOpenCL.so").exists()
211                || std::path::Path::new("/usr/lib64/libOpenCL.so").exists()
212        }
213        #[cfg(target_os = "windows")]
214        {
215            std::path::Path::new("C:/Windows/System32/OpenCL.dll").exists()
216        }
217        #[cfg(target_os = "macos")]
218        {
219            std::path::Path::new("/System/Library/Frameworks/OpenCL.framework").exists()
220        }
221        #[cfg(not(any(target_os = "linux", target_os = "windows", target_os = "macos")))]
222        {
223            false
224        }
225    }
226
227    /// Check if running on Apple Silicon
228    #[cfg(target_os = "macos")]
229    #[allow(dead_code)]
230    fn is_apple_silicon() -> bool {
231        std::env::consts::ARCH == "aarch64"
232    }
233
234    #[cfg(not(target_os = "macos"))]
235    #[allow(dead_code)]
236    fn is_apple_silicon() -> bool {
237        false
238    }
239
240    /// Check for Metal GPU
241    #[cfg(target_os = "macos")]
242    #[allow(dead_code)]
243    fn has_metal_gpu() -> bool {
244        std::path::Path::new("/System/Library/Frameworks/Metal.framework").exists()
245    }
246
247    #[cfg(not(target_os = "macos"))]
248    #[allow(dead_code)]
249    fn has_metal_gpu() -> bool {
250        false
251    }
252
253    /// Get system memory size
254    fn get_system_memory() -> usize {
255        #[cfg(target_os = "linux")]
256        {
257            // Try to read from /proc/meminfo
258            if let Ok(contents) = std::fs::read_to_string("/proc/meminfo") {
259                for line in contents.lines() {
260                    if line.starts_with("MemTotal:") {
261                        if let Some(kb_str) = line.split_whitespace().nth(1) {
262                            if let Ok(kb) = kb_str.parse::<usize>() {
263                                return kb * 1024; // Convert KB to bytes
264                            }
265                        }
266                    }
267                }
268            }
269        }
270
271        // Default to 8GB if detection fails
272        8 * 1024 * 1024 * 1024
273    }
274
275    /// Get number of CPU cores
276    fn get_cpu_cores() -> usize {
277        std::thread::available_parallelism()
278            .map(|p| p.get())
279            .unwrap_or(4) // Default to 4 cores
280    }
281}
282
283impl Default for GpuDeviceManager {
284    fn default() -> Self {
285        Self::new().unwrap_or_else(|_| Self {
286            devices: vec![],
287            current_device: None,
288        })
289    }
290}