Skip to main content

zlayer_agent/
gpu_metrics.rs

1//! GPU metrics collection for observability.
2//!
3//! Collects per-GPU utilization, memory, temperature, and power metrics
4//! using vendor-specific interfaces:
5//! - NVIDIA: `nvidia-smi` CLI (avoids hard NVML dependency)
6//! - AMD: sysfs under `/sys/class/drm/card{N}/device/`
7//! - Intel: sysfs under `/sys/class/drm/card{N}/device/`
//! - Apple: intended to use `IOKit` via `powermetrics` (macOS only); no Apple
//!   collector is wired into the dispatch functions in this file yet
9
10use serde::{Deserialize, Serialize};
11use tracing::{debug, warn};
12
13/// Per-GPU utilization snapshot
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct GpuUtilizationReport {
16    /// GPU index on this node
17    pub index: u32,
18    /// GPU compute utilization percentage (0-100)
19    pub utilization_percent: f32,
20    /// GPU memory currently used in MB
21    pub memory_used_mb: u64,
22    /// GPU total memory in MB
23    pub memory_total_mb: u64,
24    /// GPU temperature in Celsius (if available)
25    pub temperature_c: Option<u32>,
26    /// GPU power draw in Watts (if available)
27    pub power_draw_w: Option<f32>,
28}
29
30/// GPU health status
31#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
32pub enum GpuHealthStatus {
33    /// GPU is operating normally
34    Healthy,
35    /// GPU is throttling due to temperature
36    ThermalThrottle,
37    /// GPU has ECC errors
38    EccError,
39    /// GPU is not responding
40    Unresponsive,
41}
42
43/// Per-GPU health check result
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct GpuHealthReport {
46    /// GPU index
47    pub index: u32,
48    /// Health status
49    pub status: GpuHealthStatus,
50    /// Human-readable detail message
51    pub detail: Option<String>,
52}
53
54/// Collect GPU utilization metrics for all detected GPUs.
55///
56/// Uses `nvidia-smi` for NVIDIA GPUs and sysfs for AMD/Intel.
57/// Returns an empty vec if no GPUs are detected or metrics cannot be read.
58pub async fn collect_gpu_metrics(vendor: &str, gpu_count: u32) -> Vec<GpuUtilizationReport> {
59    match vendor {
60        "nvidia" => collect_nvidia_metrics(gpu_count).await,
61        "amd" => collect_amd_metrics(gpu_count),
62        "intel" => collect_intel_metrics(gpu_count),
63        _ => Vec::new(),
64    }
65}
66
67/// Collect GPU health reports.
68pub async fn check_gpu_health(vendor: &str, gpu_count: u32) -> Vec<GpuHealthReport> {
69    match vendor {
70        "nvidia" => check_nvidia_health(gpu_count).await,
71        "amd" => check_amd_health(gpu_count),
72        _ => (0..gpu_count)
73            .map(|i| GpuHealthReport {
74                index: i,
75                status: GpuHealthStatus::Healthy,
76                detail: None,
77            })
78            .collect(),
79    }
80}
81
82// -- NVIDIA -------------------------------------------------------------------
83
84async fn collect_nvidia_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
85    let output = match tokio::process::Command::new("nvidia-smi")
86        .args([
87            "--query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw",
88            "--format=csv,noheader,nounits",
89        ])
90        .output()
91        .await
92    {
93        Ok(o) if o.status.success() => o,
94        Ok(o) => {
95            let stderr = String::from_utf8_lossy(&o.stderr);
96            warn!("nvidia-smi failed: {stderr}");
97            return Vec::new();
98        }
99        Err(e) => {
100            debug!("nvidia-smi not available: {e}");
101            return Vec::new();
102        }
103    };
104
105    let stdout = String::from_utf8_lossy(&output.stdout);
106    stdout
107        .lines()
108        .filter_map(|line| {
109            let parts: Vec<&str> = line.split(',').map(str::trim).collect();
110            if parts.len() < 6 {
111                return None;
112            }
113            Some(GpuUtilizationReport {
114                index: parts[0].parse().ok()?,
115                utilization_percent: parts[1].parse().ok()?,
116                memory_used_mb: parts[2].parse().ok()?,
117                memory_total_mb: parts[3].parse().ok()?,
118                temperature_c: parts[4].parse().ok(),
119                power_draw_w: parts[5].parse().ok(),
120            })
121        })
122        .take(gpu_count as usize)
123        .collect()
124}
125
126async fn check_nvidia_health(gpu_count: u32) -> Vec<GpuHealthReport> {
127    // Check for Xid errors and thermal throttling via nvidia-smi
128    let output = match tokio::process::Command::new("nvidia-smi")
129        .args([
130            "--query-gpu=index,temperature.gpu,ecc.errors.uncorrected.volatile.total",
131            "--format=csv,noheader,nounits",
132        ])
133        .output()
134        .await
135    {
136        Ok(o) if o.status.success() => o,
137        _ => {
138            return (0..gpu_count)
139                .map(|i| GpuHealthReport {
140                    index: i,
141                    status: GpuHealthStatus::Unresponsive,
142                    detail: Some("nvidia-smi unavailable".to_string()),
143                })
144                .collect();
145        }
146    };
147
148    let stdout = String::from_utf8_lossy(&output.stdout);
149    stdout
150        .lines()
151        .filter_map(|line| {
152            let parts: Vec<&str> = line.split(',').map(str::trim).collect();
153            if parts.len() < 3 {
154                return None;
155            }
156            let index: u32 = parts[0].parse().ok()?;
157            let temp: u32 = parts[1].parse().unwrap_or(0);
158            let ecc_errors: u64 = parts[2].parse().unwrap_or(0);
159
160            let (status, detail) = if ecc_errors > 0 {
161                (
162                    GpuHealthStatus::EccError,
163                    Some(format!("{ecc_errors} uncorrected ECC errors")),
164                )
165            } else if temp > 90 {
166                (
167                    GpuHealthStatus::ThermalThrottle,
168                    Some(format!("Temperature: {temp}\u{00b0}C (throttle threshold)")),
169                )
170            } else {
171                (GpuHealthStatus::Healthy, None)
172            };
173
174            Some(GpuHealthReport {
175                index,
176                status,
177                detail,
178            })
179        })
180        .take(gpu_count as usize)
181        .collect()
182}
183
184// -- AMD ----------------------------------------------------------------------
185
186#[allow(clippy::cast_precision_loss)]
187fn collect_amd_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
188    (0..gpu_count)
189        .map(|i| {
190            let base = format!("/sys/class/drm/card{i}/device");
191            let utilization = read_sysfs_u32(&format!("{base}/gpu_busy_percent")).unwrap_or(0);
192            let mem_used = read_sysfs_u64(&format!("{base}/mem_info_vram_used"))
193                .map_or(0, |b| b / (1024 * 1024));
194            let mem_total = read_sysfs_u64(&format!("{base}/mem_info_vram_total"))
195                .map_or(0, |b| b / (1024 * 1024));
196            let temp =
197                read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map(|t| t / 1000); // millidegrees to degrees
198            let power = read_sysfs_u32(&format!("{base}/hwmon/hwmon0/power1_average"))
199                .map(|p| p as f32 / 1_000_000.0); // microwatts to watts
200
201            GpuUtilizationReport {
202                index: i,
203                utilization_percent: utilization as f32,
204                memory_used_mb: mem_used,
205                memory_total_mb: mem_total,
206                temperature_c: temp,
207                power_draw_w: power,
208            }
209        })
210        .collect()
211}
212
213fn check_amd_health(gpu_count: u32) -> Vec<GpuHealthReport> {
214    (0..gpu_count)
215        .map(|i| {
216            let base = format!("/sys/class/drm/card{i}/device");
217            let temp =
218                read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map_or(0, |t| t / 1000);
219
220            if temp > 100 {
221                GpuHealthReport {
222                    index: i,
223                    status: GpuHealthStatus::ThermalThrottle,
224                    detail: Some(format!("Temperature: {temp}\u{00b0}C")),
225                }
226            } else {
227                GpuHealthReport {
228                    index: i,
229                    status: GpuHealthStatus::Healthy,
230                    detail: None,
231                }
232            }
233        })
234        .collect()
235}
236
237// -- Intel --------------------------------------------------------------------
238
239#[allow(clippy::cast_precision_loss)]
240fn collect_intel_metrics(gpu_count: u32) -> Vec<GpuUtilizationReport> {
241    // Intel discrete GPUs expose some metrics via i915 sysfs
242    (0..gpu_count)
243        .map(|i| {
244            let base = format!("/sys/class/drm/card{i}/device");
245            let temp =
246                read_sysfs_u32(&format!("{base}/hwmon/hwmon0/temp1_input")).map(|t| t / 1000);
247            let power = read_sysfs_u32(&format!("{base}/hwmon/hwmon0/power1_average"))
248                .map(|p| p as f32 / 1_000_000.0);
249
250            GpuUtilizationReport {
251                index: i,
252                utilization_percent: 0.0, // Intel sysfs doesn't expose utilization directly
253                memory_used_mb: 0,
254                memory_total_mb: 0,
255                temperature_c: temp,
256                power_draw_w: power,
257            }
258        })
259        .collect()
260}
261
262// -- Helpers ------------------------------------------------------------------
263
/// Read the file at `path` and parse its whitespace-trimmed contents as a `u32`.
///
/// Returns `None` if the file cannot be read or the contents are not a valid
/// `u32`.
fn read_sysfs_u32(path: &str) -> Option<u32> {
    let contents = std::fs::read_to_string(path).ok()?;
    contents.trim().parse::<u32>().ok()
}
267
/// Read the file at `path` and parse its whitespace-trimmed contents as a `u64`.
///
/// Returns `None` if the file cannot be read or the contents are not a valid
/// `u64`.
fn read_sysfs_u64(path: &str) -> Option<u64> {
    match std::fs::read_to_string(path) {
        Ok(contents) => contents.trim().parse::<u64>().ok(),
        Err(_) => None,
    }
}
271
272// -- Tests --------------------------------------------------------------------
273
#[cfg(test)]
mod tests {
    use super::*;

    /// A utilization report must survive a JSON round-trip with every field intact.
    #[test]
    fn test_gpu_utilization_report_serialization() {
        let report = GpuUtilizationReport {
            index: 0,
            utilization_percent: 85.5,
            memory_used_mb: 4096,
            memory_total_mb: 8192,
            temperature_c: Some(72),
            power_draw_w: Some(250.0),
        };

        let json = serde_json::to_string(&report).unwrap();
        let deserialized: GpuUtilizationReport = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.index, 0);
        assert!((deserialized.utilization_percent - 85.5).abs() < f32::EPSILON);
        assert_eq!(deserialized.memory_used_mb, 4096);
        assert_eq!(deserialized.memory_total_mb, 8192);
        assert_eq!(deserialized.temperature_c, Some(72));
        // The power reading was previously never checked after the round-trip.
        let power = deserialized
            .power_draw_w
            .expect("power_draw_w should survive the round-trip");
        assert!((power - 250.0).abs() < f32::EPSILON);
    }

    /// A health report must survive a JSON round-trip, including its detail text.
    #[test]
    fn test_gpu_health_report_serialization() {
        let report = GpuHealthReport {
            index: 1,
            status: GpuHealthStatus::ThermalThrottle,
            detail: Some("Temperature: 95\u{00b0}C".to_string()),
        };

        let json = serde_json::to_string(&report).unwrap();
        let deserialized: GpuHealthReport = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.index, 1);
        assert_eq!(deserialized.status, GpuHealthStatus::ThermalThrottle);
        assert!(deserialized.detail.unwrap().contains("95"));
    }

    /// Every health status variant must round-trip through JSON unchanged.
    #[test]
    fn test_gpu_health_status_variants() {
        let statuses = [
            GpuHealthStatus::Healthy,
            GpuHealthStatus::ThermalThrottle,
            GpuHealthStatus::EccError,
            GpuHealthStatus::Unresponsive,
        ];

        for status in &statuses {
            let json = serde_json::to_string(status).unwrap();
            let deserialized: GpuHealthStatus = serde_json::from_str(&json).unwrap();
            assert_eq!(&deserialized, status);
        }
    }

    #[tokio::test]
    async fn test_collect_gpu_metrics_unknown_vendor() {
        // Unknown vendor should return empty vec
        let metrics = collect_gpu_metrics("unknown_vendor", 1).await;
        assert!(metrics.is_empty());
    }

    #[tokio::test]
    async fn test_check_gpu_health_unknown_vendor() {
        // Unknown vendor should return default Healthy status, one report per
        // GPU index in ascending order.
        let reports = check_gpu_health("unknown_vendor", 2).await;
        assert_eq!(reports.len(), 2);
        for (expected_index, report) in (0u32..).zip(&reports) {
            assert_eq!(report.index, expected_index);
            assert_eq!(report.status, GpuHealthStatus::Healthy);
            assert!(report.detail.is_none());
        }
    }
}
346}