trueno_gpu/monitor/cuda/mod.rs
1//! CUDA GPU Monitoring (TRUENO-SPEC-010)
2//!
3//! Provides native CUDA device monitoring via the CUDA Driver API.
4//! This module enables accurate device information and real-time memory metrics.
5//!
6//! # Design Philosophy
7//!
8//! **Native CUDA**: Direct access via cuDeviceGetName, cuMemGetInfo provides
9//! accurate information (e.g., "NVIDIA GeForce RTX 4090") compared to wgpu's
10//! generic backend queries.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use trueno_gpu::monitor::{CudaDeviceInfo, CudaMemoryInfo};
16//!
17//! // Query device info
18//! let info = CudaDeviceInfo::query(0)?;
19//! println!("GPU: {} ({} GB)", info.name, info.total_memory_gb());
20//!
//! // Query memory usage (requires an active CUDA context)
//! let ctx = trueno_gpu::driver::CudaContext::new(0)?;
//! let mem = CudaMemoryInfo::query(&ctx)?;
23//! println!("Free: {} / {} MB", mem.free_mb(), mem.total_mb());
24//! ```
25//!
26//! # References
27//!
28//! - NVIDIA CUDA Driver API: cuDeviceGetName, cuDeviceTotalMem, cuMemGetInfo
29//! - TRUENO-SPEC-010: GPU Monitoring, Tracing, and Visualization
30
31#[cfg(feature = "cuda")]
32use crate::driver::{cuda_available, device_count, CudaContext};
33use crate::GpuError;
34
35// ============================================================================
36// CUDA Device Information (TRUENO-SPEC-010 Section 3.1)
37// ============================================================================
38
/// CUDA device information from native driver API
///
/// Populated by [`CudaDeviceInfo::query`] through the CUDA Driver API,
/// giving the exact device name (e.g., "NVIDIA GeForce RTX 4090") rather
/// than a generic backend string. Provides:
/// - Device name (from `cuDeviceGetName`)
/// - Total VRAM in bytes (from `cuDeviceTotalMem`)
/// - Device ordinal
#[derive(Debug, Clone)]
pub struct CudaDeviceInfo {
    /// Device ordinal (0-based index)
    pub index: u32,
    /// Device name from cuDeviceGetName
    pub name: String,
    /// Total VRAM in bytes from cuDeviceTotalMem
    pub total_memory: u64,
}
54
55impl CudaDeviceInfo {
56 /// Query device information for the specified device index
57 ///
58 /// # Arguments
59 ///
60 /// * `device_index` - Device ordinal (0 for first GPU)
61 ///
62 /// # Errors
63 ///
64 /// Returns error if device is not found or query fails.
65 ///
66 /// # Example
67 ///
68 /// ```rust,ignore
69 /// let info = CudaDeviceInfo::query(0)?;
70 /// println!("GPU: {}", info.name);
71 /// ```
72 #[cfg(feature = "cuda")]
73 #[allow(clippy::cast_possible_wrap)]
74 pub fn query(device_index: u32) -> Result<Self, GpuError> {
75 let ctx = CudaContext::new(device_index as i32)?;
76 let name = ctx.device_name()?;
77 let total_memory = ctx.total_memory()? as u64;
78
79 Ok(Self {
80 index: device_index,
81 name,
82 total_memory,
83 })
84 }
85
86 /// Query device information (non-CUDA stub)
87 #[cfg(not(feature = "cuda"))]
88 pub fn query(_device_index: u32) -> Result<Self, GpuError> {
89 Err(GpuError::CudaNotAvailable(
90 "cuda feature not enabled".to_string(),
91 ))
92 }
93
94 /// Enumerate all available CUDA devices
95 ///
96 /// # Errors
97 ///
98 /// Returns error if enumeration fails.
99 #[cfg(feature = "cuda")]
100 pub fn enumerate() -> Result<Vec<Self>, GpuError> {
101 let count = device_count()?;
102 let mut devices = Vec::with_capacity(count);
103
104 for i in 0..count {
105 devices.push(Self::query(i as u32)?);
106 }
107
108 Ok(devices)
109 }
110
111 /// Enumerate devices (non-CUDA stub)
112 #[cfg(not(feature = "cuda"))]
113 pub fn enumerate() -> Result<Vec<Self>, GpuError> {
114 Err(GpuError::CudaNotAvailable(
115 "cuda feature not enabled".to_string(),
116 ))
117 }
118
119 /// Get total memory in megabytes
120 #[must_use]
121 pub fn total_memory_mb(&self) -> u64 {
122 self.total_memory / (1024 * 1024)
123 }
124
125 /// Get total memory in gigabytes
126 #[must_use]
127 pub fn total_memory_gb(&self) -> f64 {
128 self.total_memory as f64 / (1024.0 * 1024.0 * 1024.0)
129 }
130}
131
132impl std::fmt::Display for CudaDeviceInfo {
133 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
134 write!(
135 f,
136 "[{}] {} ({:.1} GB)",
137 self.index,
138 self.name,
139 self.total_memory_gb()
140 )
141 }
142}
143
144// ============================================================================
145// CUDA Memory Information (TRUENO-SPEC-010 Section 4.1.2)
146// ============================================================================
147
/// Real-time CUDA memory information from cuMemGetInfo
///
/// A point-in-time snapshot of free/total device memory for the active
/// CUDA context; values are not updated after the snapshot is taken.
/// `Copy` because it is two plain integers.
#[derive(Debug, Clone, Copy)]
pub struct CudaMemoryInfo {
    /// Free memory in bytes
    pub free: u64,
    /// Total memory in bytes
    pub total: u64,
}
158
159impl CudaMemoryInfo {
160 /// Query current memory information
161 ///
162 /// Requires an active CUDA context.
163 ///
164 /// # Errors
165 ///
166 /// Returns error if no context is active or query fails.
167 #[cfg(feature = "cuda")]
168 pub fn query(ctx: &CudaContext) -> Result<Self, GpuError> {
169 let (free, total) = ctx.memory_info()?;
170 Ok(Self {
171 free: free as u64,
172 total: total as u64,
173 })
174 }
175
176 /// Get used memory in bytes
177 #[must_use]
178 pub fn used(&self) -> u64 {
179 self.total.saturating_sub(self.free)
180 }
181
182 /// Get free memory in megabytes
183 #[must_use]
184 pub fn free_mb(&self) -> u64 {
185 self.free / (1024 * 1024)
186 }
187
188 /// Get total memory in megabytes
189 #[must_use]
190 pub fn total_mb(&self) -> u64 {
191 self.total / (1024 * 1024)
192 }
193
194 /// Get used memory in megabytes
195 #[must_use]
196 pub fn used_mb(&self) -> u64 {
197 self.used() / (1024 * 1024)
198 }
199
200 /// Get memory usage percentage (0.0 - 100.0)
201 #[must_use]
202 pub fn usage_percent(&self) -> f64 {
203 if self.total == 0 {
204 0.0
205 } else {
206 (self.used() as f64 / self.total as f64) * 100.0
207 }
208 }
209}
210
211impl std::fmt::Display for CudaMemoryInfo {
212 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
213 write!(
214 f,
215 "{} / {} MB ({:.1}% used)",
216 self.used_mb(),
217 self.total_mb(),
218 self.usage_percent()
219 )
220 }
221}
222
223// ============================================================================
224// Convenience Functions
225// ============================================================================
226
/// Check if CUDA monitoring is available
///
/// Returns `true` if CUDA driver is installed and at least one device exists.
/// Always `false` when the `cuda` feature is disabled.
#[must_use]
pub fn cuda_monitoring_available() -> bool {
    // Exactly one cfg arm survives compilation and becomes the tail expression.
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
    #[cfg(feature = "cuda")]
    {
        cuda_available()
    }
}
241
242/// Get the number of CUDA devices
243///
244/// # Errors
245///
246/// Returns error if CUDA is not available.
247pub fn cuda_device_count() -> Result<usize, GpuError> {
248 #[cfg(feature = "cuda")]
249 {
250 device_count()
251 }
252 #[cfg(not(feature = "cuda"))]
253 {
254 Err(GpuError::CudaNotAvailable(
255 "cuda feature not enabled".to_string(),
256 ))
257 }
258}
259
260// ============================================================================
261// Tests (EXTREME TDD)
262// ============================================================================
263
264#[cfg(test)]
265mod tests;