1use serde::{Deserialize, Serialize};
11use tracing::{debug, warn};
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct GpuUtilizationReport {
16 pub index: u32,
18 pub utilization_percent: f32,
20 pub memory_used_mb: u64,
22 pub memory_total_mb: u64,
24 pub temperature_c: Option<u32>,
26 pub power_draw_w: Option<f32>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
32pub enum GpuHealthStatus {
33 Healthy,
35 ThermalThrottle,
37 EccError,
39 Unresponsive,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct GpuHealthReport {
46 pub index: u32,
48 pub status: GpuHealthStatus,
50 pub detail: Option<String>,
52}
53
54pub async fn collect_gpu_metrics(vendor: &str, gpu_count: u32) -> Vec<GpuUtilizationReport> {
59 match vendor {
60 "nvidia" => collect_nvidia_metrics(gpu_count).await,
61 "amd" => collect_amd_metrics(gpu_count),
62 "intel" => collect_intel_metrics(gpu_count),
63 _ => Vec::new(),
64 }
65}
66
67pub async fn check_gpu_health(vendor: &str, gpu_count: u32) -> Vec<GpuHealthReport> {
69 match vendor {
70 "nvidia" => check_nvidia_health(gpu_count).await,
71 "amd" => check_amd_health(gpu_count),
72 _ => (0..gpu_count)
73 .map(|i| GpuHealthReport {
74 index: i,
75 status: GpuHealthStatus::Healthy,
76 detail: None,
77 })
78 .collect(),
79 }
80}
81
82async fn collect_nvidia_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
85 let output = match tokio::process::Command::new("nvidia-smi")
86 .args([
87 "--query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw",
88 "--format=csv,noheader,nounits",
89 ])
90 .output()
91 .await
92 {
93 Ok(o) if o.status.success() => o,
94 Ok(o) => {
95 let stderr = String::from_utf8_lossy(&o.stderr);
96 warn!("nvidia-smi failed: {stderr}");
97 return Vec::new();
98 }
99 Err(e) => {
100 debug!("nvidia-smi not available: {e}");
101 return Vec::new();
102 }
103 };
104
105 let stdout = String::from_utf8_lossy(&output.stdout);
106 stdout
107 .lines()
108 .filter_map(|line| {
109 let parts: Vec<&str> = line.split(',').map(str::trim).collect();
110 if parts.len() < 6 {
111 return None;
112 }
113 Some(GpuUtilizationReport {
114 index: parts[0].parse().ok()?,
115 utilization_percent: parts[1].parse().ok()?,
116 memory_used_mb: parts[2].parse().ok()?,
117 memory_total_mb: parts[3].parse().ok()?,
118 temperature_c: parts[4].parse().ok(),
119 power_draw_w: parts[5].parse().ok(),
120 })
121 })
122 .take(gpu_count as usize)
123 .collect()
124}
125
126async fn check_nvidia_health(gpu_count: u32) -> Vec<GpuHealthReport> {
127 let output = match tokio::process::Command::new("nvidia-smi")
129 .args([
130 "--query-gpu=index,temperature.gpu,ecc.errors.uncorrected.volatile.total",
131 "--format=csv,noheader,nounits",
132 ])
133 .output()
134 .await
135 {
136 Ok(o) if o.status.success() => o,
137 _ => {
138 return (0..gpu_count)
139 .map(|i| GpuHealthReport {
140 index: i,
141 status: GpuHealthStatus::Unresponsive,
142 detail: Some("nvidia-smi unavailable".to_string()),
143 })
144 .collect();
145 }
146 };
147
148 let stdout = String::from_utf8_lossy(&output.stdout);
149 stdout
150 .lines()
151 .filter_map(|line| {
152 let parts: Vec<&str> = line.split(',').map(str::trim).collect();
153 if parts.len() < 3 {
154 return None;
155 }
156 let index: u32 = parts[0].parse().ok()?;
157 let temp: u32 = parts[1].parse().unwrap_or(0);
158 let ecc_errors: u64 = parts[2].parse().unwrap_or(0);
159
160 let (status, detail) = if ecc_errors > 0 {
161 (
162 GpuHealthStatus::EccError,
163 Some(format!("{ecc_errors} uncorrected ECC errors")),
164 )
165 } else if temp > 90 {
166 (
167 GpuHealthStatus::ThermalThrottle,
168 Some(format!("Temperature: {temp}\u{00b0}C (throttle threshold)")),
169 )
170 } else {
171 (GpuHealthStatus::Healthy, None)
172 };
173
174 Some(GpuHealthReport {
175 index,
176 status,
177 detail,
178 })
179 })
180 .take(gpu_count as usize)
181 .collect()
182}
183
184#[allow(clippy::cast_precision_loss)]
187fn collect_amd_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
188 (0..gpu_count)
189 .map(|i| {
190 let base = format!("/sys/class/drm/card{i}/device");
191 let utilization = read_sysfs_u32(&format!("{base}/gpu_busy_percent")).unwrap_or(0);
192 let mem_used = read_sysfs_u64(&format!("{base}/mem_info_vram_used"))
193 .map_or(0, |b| b / (1024 * 1024));
194 let mem_total = read_sysfs_u64(&format!("{base}/mem_info_vram_total"))
195 .map_or(0, |b| b / (1024 * 1024));
196 let temp =
197 read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map(|t| t / 1000); let power = read_sysfs_u32(&format!("{base}/hwmon/hwmon0/power1_average"))
199 .map(|p| p as f32 / 1_000_000.0); GpuUtilizationReport {
202 index: i,
203 utilization_percent: utilization as f32,
204 memory_used_mb: mem_used,
205 memory_total_mb: mem_total,
206 temperature_c: temp,
207 power_draw_w: power,
208 }
209 })
210 .collect()
211}
212
213fn check_amd_health(gpu_count: u32) -> Vec<GpuHealthReport> {
214 (0..gpu_count)
215 .map(|i| {
216 let base = format!("/sys/class/drm/card{i}/device");
217 let temp =
218 read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map_or(0, |t| t / 1000);
219
220 if temp > 100 {
221 GpuHealthReport {
222 index: i,
223 status: GpuHealthStatus::ThermalThrottle,
224 detail: Some(format!("Temperature: {temp}\u{00b0}C")),
225 }
226 } else {
227 GpuHealthReport {
228 index: i,
229 status: GpuHealthStatus::Healthy,
230 detail: None,
231 }
232 }
233 })
234 .collect()
235}
236
237#[allow(clippy::cast_precision_loss)]
240fn collect_intel_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
241 (0..gpu_count)
243 .map(|i| {
244 let base = format!("/sys/class/drm/card{i}/device");
245 let temp =
246 read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map(|t| t / 1000);
247 let power = read_sysfs_u32(&format!("{base}/hwmon/hwmon0/power1_average"))
248 .map(|p| p as f32 / 1_000_000.0);
249
250 GpuUtilizationReport {
251 index: i,
252 utilization_percent: 0.0, memory_used_mb: 0,
254 memory_total_mb: 0,
255 temperature_c: temp,
256 power_draw_w: power,
257 }
258 })
259 .collect()
260}
261
/// Reads the file at `path` and parses its whitespace-trimmed contents as a
/// `u32`.
///
/// Returns `None` when the file cannot be read or does not contain a valid
/// `u32`.
fn read_sysfs_u32(path: &str) -> Option<u32> {
    let contents = std::fs::read_to_string(path).ok()?;
    contents.trim().parse::<u32>().ok()
}
267
/// Reads the file at `path` and parses its whitespace-trimmed contents as a
/// `u64`.
///
/// Returns `None` when the file cannot be read or does not contain a valid
/// `u64`.
fn read_sysfs_u64(path: &str) -> Option<u64> {
    std::fs::read_to_string(path)
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// A utilization report survives a JSON round trip with every field intact.
    #[test]
    fn test_gpu_utilization_report_serialization() {
        let report = GpuUtilizationReport {
            index: 0,
            utilization_percent: 85.5,
            memory_used_mb: 4096,
            memory_total_mb: 8192,
            temperature_c: Some(72),
            power_draw_w: Some(250.0),
        };

        let json = serde_json::to_string(&report).unwrap();
        let deserialized: GpuUtilizationReport = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.index, 0);
        assert!((deserialized.utilization_percent - 85.5).abs() < f32::EPSILON);
        assert_eq!(deserialized.memory_used_mb, 4096);
        assert_eq!(deserialized.memory_total_mb, 8192);
        assert_eq!(deserialized.temperature_c, Some(72));
        // The optional power reading must round-trip as well.
        let power = deserialized
            .power_draw_w
            .expect("power_draw_w should survive the round trip");
        assert!((power - 250.0).abs() < f32::EPSILON);
    }

    /// A health report survives a JSON round trip, including its detail text.
    #[test]
    fn test_gpu_health_report_serialization() {
        let report = GpuHealthReport {
            index: 1,
            status: GpuHealthStatus::ThermalThrottle,
            detail: Some("Temperature: 95\u{00b0}C".to_string()),
        };

        let json = serde_json::to_string(&report).unwrap();
        let deserialized: GpuHealthReport = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.index, 1);
        assert_eq!(deserialized.status, GpuHealthStatus::ThermalThrottle);
        assert!(deserialized.detail.unwrap().contains("95"));
    }

    /// Every status variant survives a JSON round trip.
    #[test]
    fn test_gpu_health_status_variants() {
        let statuses = [
            GpuHealthStatus::Healthy,
            GpuHealthStatus::ThermalThrottle,
            GpuHealthStatus::EccError,
            GpuHealthStatus::Unresponsive,
        ];

        for status in &statuses {
            let json = serde_json::to_string(status).unwrap();
            let deserialized: GpuHealthStatus = serde_json::from_str(&json).unwrap();
            assert_eq!(&deserialized, status);
        }
    }

    /// An unknown vendor yields no metrics at all.
    #[tokio::test]
    async fn test_collect_gpu_metrics_unknown_vendor() {
        let metrics = collect_gpu_metrics("unknown_vendor", 1).await;
        assert!(metrics.is_empty());
    }

    /// An unknown vendor yields one healthy report per requested GPU index.
    #[tokio::test]
    async fn test_check_gpu_health_unknown_vendor() {
        let reports = check_gpu_health("unknown_vendor", 2).await;
        assert_eq!(reports.len(), 2);
        for (expected_index, report) in (0u32..).zip(&reports) {
            assert_eq!(report.index, expected_index);
            assert_eq!(report.status, GpuHealthStatus::Healthy);
            assert!(report.detail.is_none());
        }
    }
}