1use crate::metrics::catalog::{EnergyMetrics, SystemHealthMetrics, VramMetrics};
5use std::process::Command;
6
7pub fn collect_system_health() -> Option<SystemHealthMetrics> {
9 let gpu = query_nvidia_smi(&[
10 "temperature.gpu",
11 "power.draw",
12 "clocks.current.sm",
13 "clocks.current.memory",
14 "memory.used",
15 "memory.total",
16 ])?;
17
18 let fields: Vec<&str> = gpu.split(", ").collect();
19 if fields.len() < 6 {
20 return None;
21 }
22
23 let cpu_freq = read_cpu_frequency().unwrap_or(0.0);
24 let cpu_temp = read_cpu_temperature().unwrap_or(0.0);
25
26 Some(SystemHealthMetrics {
27 gpu_temperature_celsius: parse_nvidia_val(fields[0]),
28 gpu_power_watts: parse_nvidia_val(fields[1]),
29 gpu_clock_mhz: parse_nvidia_val(fields[2]),
30 gpu_memory_clock_mhz: parse_nvidia_val(fields[3]),
31 cpu_frequency_mhz: cpu_freq,
32 cpu_temperature_celsius: cpu_temp,
33 gpu_memory_used_mb: parse_nvidia_val(fields[4]),
34 gpu_memory_total_mb: parse_nvidia_val(fields[5]),
35 })
36}
37
38pub fn collect_vram() -> Option<VramMetrics> {
40 let gpu = query_nvidia_smi(&["memory.used", "memory.total", "memory.free"])?;
41
42 let fields: Vec<&str> = gpu.split(", ").collect();
43 if fields.len() < 3 {
44 return None;
45 }
46
47 let used = parse_nvidia_val(fields[0]);
48 let total = parse_nvidia_val(fields[1]);
49 let free = parse_nvidia_val(fields[2]);
50 let utilization = if total > 0.0 {
51 used / total * 100.0
52 } else {
53 0.0
54 };
55
56 Some(VramMetrics {
57 vram_used_mb: used,
58 vram_total_mb: total,
59 vram_free_mb: free,
60 vram_utilization_pct: utilization,
61 vram_peak_mb: used, vram_allocation_count: 0,
63 vram_fragmentation_pct: 0.0,
64 })
65}
66
67pub fn compute_energy(power_watts: f64, tflops: f64, duration_us: f64) -> Option<EnergyMetrics> {
69 if power_watts <= 0.0 {
70 return None;
71 }
72 let tflops_per_watt = if power_watts > 0.0 {
73 tflops / power_watts
74 } else {
75 0.0
76 };
77 let joules = power_watts * duration_us * 1e-6;
78 Some(EnergyMetrics {
79 tflops_per_watt,
80 joules_per_inference: joules,
81 })
82}
83
/// Runs `nvidia-smi --query-gpu <fields> --format=csv,noheader,nounits` and
/// returns the first row of its output.
///
/// # Arguments
///
/// * `fields` - property names to query; they are joined with `,` and the
///   reply columns follow the same order.
///
/// # Returns
///
/// `None` when:
/// * the process cannot be started (for example `nvidia-smi` is not installed),
/// * it exits with a non-success status,
/// * it prints no non-blank line, or
/// * every column of the row is `[N/A]` / `N/A`, i.e. the GPU reported nothing
///   usable.
///
/// Otherwise `Some(row)` with the first non-blank output line, trimmed. When
/// several GPUs are present only the first GPU's row is returned, so callers
/// never see embedded newlines.
fn query_nvidia_smi(fields: &[&str]) -> Option<String> {
    let query = fields.join(",");
    let output = Command::new("nvidia-smi")
        .args(["--query-gpu", &query, "--format=csv,noheader,nounits"])
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8_lossy(&output.stdout);

    // One row is printed per GPU; keep the first non-blank one.
    let row = stdout.lines().map(str::trim).find(|line| !line.is_empty())?;

    // A row made up solely of "not available" markers carries no data.
    let all_unavailable = row
        .split(',')
        .map(str::trim)
        .all(|field| matches!(field, "[N/A]" | "N/A"));
    if all_unavailable {
        return None;
    }

    Some(row.to_string())
}
102
/// Parses one `nvidia-smi` CSV column into a number.
///
/// Only the first whitespace-separated token is considered, so a trailing unit
/// such as `"285.32 W"` or `"24564 MiB"` is ignored. Returns `0.0` when the
/// input is blank, is the `[N/A]` / `N/A` marker, or the token is not a valid
/// floating-point number.
fn parse_nvidia_val(s: &str) -> f64 {
    match s.split_whitespace().next() {
        // Explicit "not available" markers map to zero.
        Some("[N/A]") | Some("N/A") => 0.0,
        Some(number) => number.parse::<f64>().unwrap_or(0.0),
        None => 0.0,
    }
}
115
/// Returns the mean of all `cpu MHz` entries in `/proc/cpuinfo`.
///
/// Each line starting with `cpu MHz` contributes the number after its first
/// `:`; lines whose value does not parse as a float are skipped.
///
/// Returns `None` when the file cannot be read or contains no parsable
/// `cpu MHz` entry.
fn read_cpu_frequency() -> Option<f64> {
    let cpuinfo = std::fs::read_to_string("/proc/cpuinfo").ok()?;

    // Accumulate the sum and the number of samples in a single pass.
    let (sum_mhz, samples) = cpuinfo
        .lines()
        .filter(|line| line.starts_with("cpu MHz"))
        .filter_map(|line| line.split(':').nth(1))
        .filter_map(|value| value.trim().parse::<f64>().ok())
        .fold((0.0_f64, 0_u32), |(sum, n), mhz| (sum + mhz, n + 1));

    if samples == 0 {
        None
    } else {
        Some(sum_mhz / f64::from(samples))
    }
}
138
/// Reads a temperature from the first usable sysfs thermal zone.
///
/// Probes `/sys/class/thermal/thermal_zone0/temp` through `thermal_zone9/temp`
/// in order and returns the first value that can be read and parsed, divided
/// by 1000 (the file content is treated as millidegrees).
///
/// Returns `None` when none of the ten zones yields a parsable value.
// NOTE(review): the first readable zone is assumed to be the CPU sensor —
// confirm on the target hardware.
fn read_cpu_temperature() -> Option<f64> {
    (0..10).find_map(|i| {
        let path = format!("/sys/class/thermal/thermal_zone{i}/temp");
        let raw = std::fs::read_to_string(&path).ok()?;
        let millidegrees = raw.trim().parse::<f64>().ok()?;
        Some(millidegrees / 1000.0)
    })
}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Values with units, bare numbers and "not available" markers all parse
    /// to the expected number (markers become 0.0).
    #[test]
    fn test_parse_nvidia_val() {
        assert!((parse_nvidia_val("285.32 W") - 285.32).abs() < 0.01);
        assert!((parse_nvidia_val("24564 MiB") - 24564.0).abs() < 1.0);
        assert!((parse_nvidia_val("62") - 62.0).abs() < 0.01);
        assert!((parse_nvidia_val("[N/A]")).abs() < 0.01);
        assert!((parse_nvidia_val("N/A")).abs() < 0.01);
    }

    /// Efficiency and per-inference energy follow directly from the inputs.
    ///
    /// The expected energy is only about 7e-3 J, so the tolerance must be far
    /// below that for the assertion to catch a wrong result.
    #[test]
    fn test_compute_energy() {
        let e = compute_energy(300.0, 11.6, 23.2).unwrap();
        assert!((e.tflops_per_watt - 11.6 / 300.0).abs() < 1e-9);
        assert!((e.joules_per_inference - 300.0 * 23.2e-6).abs() < 1e-9);
    }

    /// Zero power cannot produce an efficiency figure.
    #[test]
    fn test_compute_energy_zero_power() {
        assert!(compute_energy(0.0, 11.6, 23.2).is_none());
    }

    /// Smoke test: must not panic whether or not a GPU is present.
    #[test]
    fn test_collect_system_health_no_panic() {
        let _ = collect_system_health();
    }

    /// Smoke test: must not panic whether or not a GPU is present.
    #[test]
    fn test_collect_vram_no_panic() {
        let _ = collect_vram();
    }

    /// Smoke test: must not panic when `/proc/cpuinfo` is missing or unusual.
    #[test]
    fn test_read_cpu_frequency_no_panic() {
        let _ = read_cpu_frequency();
    }

    /// Smoke test: must not panic when no thermal zone is readable.
    #[test]
    fn test_read_cpu_temperature_no_panic() {
        let _ = read_cpu_temperature();
    }

    /// With `nvidia-smi` on the PATH, health collection must yield plausible
    /// values. Skipped silently when the tool is absent.
    #[test]
    fn test_system_health_with_gpu() {
        if which::which("nvidia-smi").is_err() {
            return;
        }
        let health = collect_system_health();
        assert!(health.is_some(), "nvidia-smi exists but no health data");
        let h = health.unwrap();
        assert!(h.gpu_temperature_celsius > 0.0, "GPU temp should be > 0");
        assert!(
            h.gpu_memory_total_mb > 0.0,
            "GPU memory total should be > 0"
        );
    }

    /// With `nvidia-smi` on the PATH, VRAM collection must yield a positive
    /// total and a utilization within 0..=100. Skipped silently when the tool
    /// is absent.
    #[test]
    fn test_vram_with_gpu() {
        if which::which("nvidia-smi").is_err() {
            return;
        }
        let vram = collect_vram();
        assert!(vram.is_some(), "nvidia-smi exists but no VRAM data");
        let v = vram.unwrap();
        assert!(v.vram_total_mb > 0.0, "VRAM total should be > 0");
        assert!(v.vram_utilization_pct >= 0.0 && v.vram_utilization_pct <= 100.0);
    }
}