// trueno_gpu/monitor/cuda/mod.rs
//! CUDA GPU Monitoring (TRUENO-SPEC-010)
//!
//! Provides native CUDA device monitoring via the CUDA Driver API.
//! This module enables accurate device information and real-time memory metrics.
//!
//! # Design Philosophy
//!
//! **Native CUDA**: Direct access via cuDeviceGetName, cuMemGetInfo provides
//! accurate information (e.g., "NVIDIA GeForce RTX 4090") compared to wgpu's
//! generic backend queries.
//!
//! # Example
//!
//! ```rust,ignore
//! use trueno_gpu::monitor::{CudaDeviceInfo, CudaMemoryInfo};
//!
//! // Query device info
//! let info = CudaDeviceInfo::query(0)?;
//! println!("GPU: {} ({} GB)", info.name, info.total_memory_gb());
//!
//! // Query memory usage (requires an active CUDA context)
//! let ctx = trueno_gpu::driver::CudaContext::new(0)?;
//! let mem = CudaMemoryInfo::query(&ctx)?;
//! println!("Free: {} / {} MB", mem.free_mb(), mem.total_mb());
//! ```
//!
//! # References
//!
//! - NVIDIA CUDA Driver API: cuDeviceGetName, cuDeviceTotalMem, cuMemGetInfo
//! - TRUENO-SPEC-010: GPU Monitoring, Tracing, and Visualization

#[cfg(feature = "cuda")]
use crate::driver::{cuda_available, device_count, CudaContext};
use crate::GpuError;

// ============================================================================
// CUDA Device Information (TRUENO-SPEC-010 Section 3.1)
// ============================================================================

/// Native CUDA device information queried through the driver API.
///
/// Captures the device identity that wgpu's generic backend queries
/// cannot report accurately:
/// - Device name (e.g., "NVIDIA GeForce RTX 4090")
/// - Total VRAM in bytes
/// - Device ordinal
#[derive(Debug, Clone)]
pub struct CudaDeviceInfo {
    /// Device ordinal (0-based index)
    pub index: u32,
    /// Device name as reported by cuDeviceGetName
    pub name: String,
    /// Total VRAM in bytes as reported by cuDeviceTotalMem
    pub total_memory: u64,
}

55impl CudaDeviceInfo {
56    /// Query device information for the specified device index
57    ///
58    /// # Arguments
59    ///
60    /// * `device_index` - Device ordinal (0 for first GPU)
61    ///
62    /// # Errors
63    ///
64    /// Returns error if device is not found or query fails.
65    ///
66    /// # Example
67    ///
68    /// ```rust,ignore
69    /// let info = CudaDeviceInfo::query(0)?;
70    /// println!("GPU: {}", info.name);
71    /// ```
72    #[cfg(feature = "cuda")]
73    #[allow(clippy::cast_possible_wrap)]
74    pub fn query(device_index: u32) -> Result<Self, GpuError> {
75        let ctx = CudaContext::new(device_index as i32)?;
76        let name = ctx.device_name()?;
77        let total_memory = ctx.total_memory()? as u64;
78
79        Ok(Self {
80            index: device_index,
81            name,
82            total_memory,
83        })
84    }
85
86    /// Query device information (non-CUDA stub)
87    #[cfg(not(feature = "cuda"))]
88    pub fn query(_device_index: u32) -> Result<Self, GpuError> {
89        Err(GpuError::CudaNotAvailable(
90            "cuda feature not enabled".to_string(),
91        ))
92    }
93
94    /// Enumerate all available CUDA devices
95    ///
96    /// # Errors
97    ///
98    /// Returns error if enumeration fails.
99    #[cfg(feature = "cuda")]
100    pub fn enumerate() -> Result<Vec<Self>, GpuError> {
101        let count = device_count()?;
102        let mut devices = Vec::with_capacity(count);
103
104        for i in 0..count {
105            devices.push(Self::query(i as u32)?);
106        }
107
108        Ok(devices)
109    }
110
111    /// Enumerate devices (non-CUDA stub)
112    #[cfg(not(feature = "cuda"))]
113    pub fn enumerate() -> Result<Vec<Self>, GpuError> {
114        Err(GpuError::CudaNotAvailable(
115            "cuda feature not enabled".to_string(),
116        ))
117    }
118
119    /// Get total memory in megabytes
120    #[must_use]
121    pub fn total_memory_mb(&self) -> u64 {
122        self.total_memory / (1024 * 1024)
123    }
124
125    /// Get total memory in gigabytes
126    #[must_use]
127    pub fn total_memory_gb(&self) -> f64 {
128        self.total_memory as f64 / (1024.0 * 1024.0 * 1024.0)
129    }
130}
131
132impl std::fmt::Display for CudaDeviceInfo {
133    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
134        write!(
135            f,
136            "[{}] {} ({:.1} GB)",
137            self.index,
138            self.name,
139            self.total_memory_gb()
140        )
141    }
142}
143
// ============================================================================
// CUDA Memory Information (TRUENO-SPEC-010 Section 4.1.2)
// ============================================================================

/// Real-time CUDA memory snapshot taken via cuMemGetInfo.
///
/// Reflects current memory usage on the active CUDA context.
#[derive(Debug, Clone, Copy)]
pub struct CudaMemoryInfo {
    /// Free memory in bytes
    pub free: u64,
    /// Total memory in bytes
    pub total: u64,
}

impl CudaMemoryInfo {
    /// Bytes per mebibyte, shared by the `*_mb` conversions below.
    const BYTES_PER_MIB: u64 = 1024 * 1024;

    /// Query current memory information.
    ///
    /// Requires an active CUDA context.
    ///
    /// # Errors
    ///
    /// Returns an error if no context is active or the query fails.
    #[cfg(feature = "cuda")]
    pub fn query(ctx: &CudaContext) -> Result<Self, GpuError> {
        ctx.memory_info().map(|(free, total)| Self {
            free: free as u64,
            total: total as u64,
        })
    }

    /// Used memory in bytes (total minus free, clamped at zero).
    #[must_use]
    pub fn used(&self) -> u64 {
        self.total.checked_sub(self.free).unwrap_or(0)
    }

    /// Free memory in whole megabytes.
    #[must_use]
    pub fn free_mb(&self) -> u64 {
        self.free / Self::BYTES_PER_MIB
    }

    /// Total memory in whole megabytes.
    #[must_use]
    pub fn total_mb(&self) -> u64 {
        self.total / Self::BYTES_PER_MIB
    }

    /// Used memory in whole megabytes.
    #[must_use]
    pub fn used_mb(&self) -> u64 {
        self.used() / Self::BYTES_PER_MIB
    }

    /// Memory usage percentage in `[0.0, 100.0]`.
    ///
    /// Returns 0.0 when `total` is zero to avoid division by zero.
    #[must_use]
    pub fn usage_percent(&self) -> f64 {
        if self.total == 0 {
            return 0.0;
        }
        (self.used() as f64 / self.total as f64) * 100.0
    }
}

impl std::fmt::Display for CudaMemoryInfo {
    /// Renders as `used / total MB (P.P% used)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (used, total, pct) = (self.used_mb(), self.total_mb(), self.usage_percent());
        write!(f, "{} / {} MB ({:.1}% used)", used, total, pct)
    }
}

// ============================================================================
// Convenience Functions
// ============================================================================

/// Check if CUDA monitoring is available.
///
/// Returns `true` if CUDA driver is installed and at least one device exists.
/// Always `false` when built without the `cuda` feature.
#[must_use]
pub fn cuda_monitoring_available() -> bool {
    // Exactly one of these bindings survives cfg expansion.
    #[cfg(feature = "cuda")]
    let available = cuda_available();
    #[cfg(not(feature = "cuda"))]
    let available = false;
    available
}

242/// Get the number of CUDA devices
243///
244/// # Errors
245///
246/// Returns error if CUDA is not available.
247pub fn cuda_device_count() -> Result<usize, GpuError> {
248    #[cfg(feature = "cuda")]
249    {
250        device_count()
251    }
252    #[cfg(not(feature = "cuda"))]
253    {
254        Err(GpuError::CudaNotAvailable(
255            "cuda feature not enabled".to_string(),
256        ))
257    }
258}

// ============================================================================
// Tests (EXTREME TDD)
// ============================================================================

// Unit tests live in `tests.rs` alongside this module; compiled only for tests.
#[cfg(test)]
mod tests;