Skip to main content

cgp/profilers/
system.rs

1//! System health and VRAM collection via nvidia-smi and /proc.
2//! Spec sections 9.8 (VRAM), 9.10 (System Health), 9.11 (Energy).
3
4use crate::metrics::catalog::{EnergyMetrics, SystemHealthMetrics, VramMetrics};
5use std::process::Command;
6
7/// Collect system health metrics from nvidia-smi (NVML) and /proc.
8pub fn collect_system_health() -> Option<SystemHealthMetrics> {
9    let gpu = query_nvidia_smi(&[
10        "temperature.gpu",
11        "power.draw",
12        "clocks.current.sm",
13        "clocks.current.memory",
14        "memory.used",
15        "memory.total",
16    ])?;
17
18    let fields: Vec<&str> = gpu.split(", ").collect();
19    if fields.len() < 6 {
20        return None;
21    }
22
23    let cpu_freq = read_cpu_frequency().unwrap_or(0.0);
24    let cpu_temp = read_cpu_temperature().unwrap_or(0.0);
25
26    Some(SystemHealthMetrics {
27        gpu_temperature_celsius: parse_nvidia_val(fields[0]),
28        gpu_power_watts: parse_nvidia_val(fields[1]),
29        gpu_clock_mhz: parse_nvidia_val(fields[2]),
30        gpu_memory_clock_mhz: parse_nvidia_val(fields[3]),
31        cpu_frequency_mhz: cpu_freq,
32        cpu_temperature_celsius: cpu_temp,
33        gpu_memory_used_mb: parse_nvidia_val(fields[4]),
34        gpu_memory_total_mb: parse_nvidia_val(fields[5]),
35    })
36}
37
38/// Collect VRAM metrics from nvidia-smi.
39pub fn collect_vram() -> Option<VramMetrics> {
40    let gpu = query_nvidia_smi(&["memory.used", "memory.total", "memory.free"])?;
41
42    let fields: Vec<&str> = gpu.split(", ").collect();
43    if fields.len() < 3 {
44        return None;
45    }
46
47    let used = parse_nvidia_val(fields[0]);
48    let total = parse_nvidia_val(fields[1]);
49    let free = parse_nvidia_val(fields[2]);
50    let utilization = if total > 0.0 {
51        used / total * 100.0
52    } else {
53        0.0
54    };
55
56    Some(VramMetrics {
57        vram_used_mb: used,
58        vram_total_mb: total,
59        vram_free_mb: free,
60        vram_utilization_pct: utilization,
61        vram_peak_mb: used, // snapshot — no tracking history
62        vram_allocation_count: 0,
63        vram_fragmentation_pct: 0.0,
64    })
65}
66
67/// Compute energy efficiency from power and throughput.
68pub fn compute_energy(power_watts: f64, tflops: f64, duration_us: f64) -> Option<EnergyMetrics> {
69    if power_watts <= 0.0 {
70        return None;
71    }
72    let tflops_per_watt = if power_watts > 0.0 {
73        tflops / power_watts
74    } else {
75        0.0
76    };
77    let joules = power_watts * duration_us * 1e-6;
78    Some(EnergyMetrics {
79        tflops_per_watt,
80        joules_per_inference: joules,
81    })
82}
83
/// Run `nvidia-smi --query-gpu` for `fields` and return the first CSV row.
///
/// The command is invoked with `--format=csv,noheader,nounits`, so the row
/// holds bare values in the same order as `fields`. When the tool prints more
/// than one row, only the first non-empty row is returned.
///
/// Returns `None` when the process cannot be spawned, exits unsuccessfully,
/// prints no row, or reports `[N/A]` for every requested column.
fn query_nvidia_smi(fields: &[&str]) -> Option<String> {
    let query = fields.join(",");
    let output = Command::new("nvidia-smi")
        .args(["--query-gpu", &query, "--format=csv,noheader,nounits"])
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }
    first_usable_row(&String::from_utf8_lossy(&output.stdout))
}

/// Pick the first non-empty line of `stdout`, rejecting it when every
/// comma-separated column is `[N/A]`.
fn first_usable_row(stdout: &str) -> Option<String> {
    let row = stdout.lines().map(str::trim).find(|line| !line.is_empty())?;
    // A row made up solely of `[N/A]` columns carries no data at all.
    if row.split(',').all(|column| column.trim() == "[N/A]") {
        return None;
    }
    Some(row.to_string())
}
102
/// Extract the leading number from one `nvidia-smi` column.
///
/// Surrounding whitespace is ignored and only the first whitespace-separated
/// token is parsed, so values with a trailing unit such as `"123 W"` or
/// `"45 MiB"` yield the number alone.
///
/// Returns `0.0` for `"[N/A]"`, `"N/A"`, empty input, or a first token that
/// is not a valid `f64`.
fn parse_nvidia_val(s: &str) -> f64 {
    let cell = s.trim();
    if matches!(cell, "[N/A]" | "N/A") {
        return 0.0;
    }
    // Only the first token is considered; a trailing unit is discarded.
    match cell.split_whitespace().next() {
        Some(token) => token.parse::<f64>().unwrap_or(0.0),
        None => 0.0,
    }
}
115
/// Average the per-core `cpu MHz` readings listed in `/proc/cpuinfo`.
///
/// Every line starting with `cpu MHz` contributes the value after its first
/// `:`; lines whose value does not parse as a number are skipped.
///
/// Returns `None` when the file cannot be read or contains no parsable
/// `cpu MHz` entry.
fn read_cpu_frequency() -> Option<f64> {
    let cpuinfo = std::fs::read_to_string("/proc/cpuinfo").ok()?;

    // Accumulate the running sum and the number of cores seen in one pass.
    let (sum_mhz, cores) = cpuinfo
        .lines()
        .filter(|entry| entry.starts_with("cpu MHz"))
        .filter_map(|entry| entry.split(':').nth(1))
        .filter_map(|value| value.trim().parse::<f64>().ok())
        .fold((0.0_f64, 0_u32), |(sum, seen), mhz| (sum + mhz, seen + 1));

    if cores == 0 {
        None
    } else {
        Some(sum_mhz / f64::from(cores))
    }
}
138
/// Read a temperature in degrees Celsius from the sysfs thermal zones.
///
/// Probes `/sys/class/thermal/thermal_zone0` through `thermal_zone9` in order
/// and returns the first `temp` file that can be read and parsed. The raw
/// value is divided by 1000 (sysfs reports millidegrees).
///
/// The zone type is not inspected, so the reading comes from whichever zone
/// answers first. Returns `None` when none of the ten zones yields a value.
fn read_cpu_temperature() -> Option<f64> {
    (0..10).find_map(|zone| {
        let raw = std::fs::read_to_string(format!(
            "/sys/class/thermal/thermal_zone{zone}/temp"
        ))
        .ok()?;
        let millidegrees = raw.trim().parse::<f64>().ok()?;
        Some(millidegrees / 1000.0)
    })
}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for values that should agree up to `f64` rounding.
    ///
    /// Kept far below the magnitudes under test (the energy case is ~7e-3 J)
    /// so that a wrong scale factor cannot slip through.
    const EPS: f64 = 1e-9;

    /// True when `actual` is within `EPS` of `expected`.
    fn approx_eq(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < EPS
    }

    #[test]
    fn test_parse_nvidia_val() {
        assert!(approx_eq(parse_nvidia_val("285.32 W"), 285.32));
        assert!(approx_eq(parse_nvidia_val("24564 MiB"), 24564.0));
        assert!(approx_eq(parse_nvidia_val("62"), 62.0));
        assert!(approx_eq(parse_nvidia_val("[N/A]"), 0.0));
        assert!(approx_eq(parse_nvidia_val("N/A"), 0.0));
    }

    #[test]
    fn test_compute_energy() {
        let e = compute_energy(300.0, 11.6, 23.2).expect("positive power must yield metrics");
        assert!(approx_eq(e.tflops_per_watt, 11.6 / 300.0));
        assert!(approx_eq(e.joules_per_inference, 300.0 * 23.2e-6));
    }

    #[test]
    fn test_compute_energy_zero_power() {
        assert!(compute_energy(0.0, 11.6, 23.2).is_none());
    }

    /// System health collection should not panic even without nvidia-smi.
    #[test]
    fn test_collect_system_health_no_panic() {
        let _ = collect_system_health();
    }

    /// VRAM collection should not panic even without nvidia-smi.
    #[test]
    fn test_collect_vram_no_panic() {
        let _ = collect_vram();
    }

    /// Reading the CPU frequency should not panic when /proc/cpuinfo is absent.
    #[test]
    fn test_read_cpu_frequency_no_panic() {
        let _ = read_cpu_frequency();
    }

    /// Reading the CPU temperature should not panic when no thermal zone exists.
    #[test]
    fn test_read_cpu_temperature_no_panic() {
        let _ = read_cpu_temperature();
    }

    /// If nvidia-smi is available, system health must have valid data.
    #[test]
    fn test_system_health_with_gpu() {
        if which::which("nvidia-smi").is_err() {
            return; // skip on machines without GPU
        }
        let h = collect_system_health().expect("nvidia-smi exists but no health data");
        assert!(h.gpu_temperature_celsius > 0.0, "GPU temp should be > 0");
        assert!(
            h.gpu_memory_total_mb > 0.0,
            "GPU memory total should be > 0"
        );
    }

    /// If nvidia-smi is available, VRAM must have valid data.
    #[test]
    fn test_vram_with_gpu() {
        if which::which("nvidia-smi").is_err() {
            return; // skip on machines without GPU
        }
        let v = collect_vram().expect("nvidia-smi exists but no VRAM data");
        assert!(v.vram_total_mb > 0.0, "VRAM total should be > 0");
        assert!((0.0..=100.0).contains(&v.vram_utilization_pct));
    }
}