use crate::metrics::catalog::{EnergyMetrics, SystemHealthMetrics, VramMetrics};
use std::process::Command;
/// Takes a one-shot snapshot of GPU and CPU health readings.
///
/// The GPU values come from a single `query_nvidia_smi` call asking for, in
/// order: temperature, power draw, SM clock, memory clock, used memory and
/// total memory. The reply is split on `", "` and each piece is converted with
/// `parse_nvidia_val`. The CPU values come from `read_cpu_frequency` and
/// `read_cpu_temperature`; when either yields nothing the field is set to `0.0`.
///
/// Returns `None` when the GPU query yields nothing or when the reply holds
/// fewer than six values.
pub fn collect_system_health() -> Option<SystemHealthMetrics> {
    // Order matters: the slice pattern below binds the columns positionally.
    const GPU_QUERY: [&str; 6] = [
        "temperature.gpu",
        "power.draw",
        "clocks.current.sm",
        "clocks.current.memory",
        "memory.used",
        "memory.total",
    ];

    let report = query_nvidia_smi(&GPU_QUERY)?;
    let columns: Vec<&str> = report.split(", ").collect();

    match columns.as_slice() {
        // Extra trailing columns are tolerated; only the first six are used.
        [temperature, power, sm_clock, mem_clock, mem_used, mem_total, ..] => {
            // CPU probes are best-effort: a missing reading becomes 0.0.
            let cpu_frequency_mhz = read_cpu_frequency().unwrap_or(0.0);
            let cpu_temperature_celsius = read_cpu_temperature().unwrap_or(0.0);

            Some(SystemHealthMetrics {
                gpu_temperature_celsius: parse_nvidia_val(temperature),
                gpu_power_watts: parse_nvidia_val(power),
                gpu_clock_mhz: parse_nvidia_val(sm_clock),
                gpu_memory_clock_mhz: parse_nvidia_val(mem_clock),
                cpu_frequency_mhz,
                cpu_temperature_celsius,
                gpu_memory_used_mb: parse_nvidia_val(mem_used),
                gpu_memory_total_mb: parse_nvidia_val(mem_total),
            })
        }
        _ => None,
    }
}
/// Reads current GPU memory figures and derives a utilisation percentage.
///
/// Queries `memory.used`, `memory.total` and `memory.free` through
/// `query_nvidia_smi`, splits the reply on `", "` and converts each piece with
/// `parse_nvidia_val`. Utilisation is `used / total * 100` when `total` is
/// greater than zero and `0.0` otherwise.
///
/// This function does no tracking over time: `vram_peak_mb` is simply the
/// current used value, `vram_allocation_count` is `0` and
/// `vram_fragmentation_pct` is `0.0`.
///
/// Returns `None` when the query yields nothing or fewer than three values.
pub fn collect_vram() -> Option<VramMetrics> {
    let report = query_nvidia_smi(&["memory.used", "memory.total", "memory.free"])?;

    // Pull the three leading columns; `?` bails out if any is missing.
    let mut columns = report.split(", ");
    let used_raw = columns.next()?;
    let total_raw = columns.next()?;
    let free_raw = columns.next()?;

    let vram_used_mb = parse_nvidia_val(used_raw);
    let vram_total_mb = parse_nvidia_val(total_raw);
    let vram_free_mb = parse_nvidia_val(free_raw);

    Some(VramMetrics {
        vram_used_mb,
        vram_total_mb,
        vram_free_mb,
        // Guard against dividing by a zero (or unparsed, hence 0.0) total.
        vram_utilization_pct: if vram_total_mb > 0.0 {
            vram_used_mb / vram_total_mb * 100.0
        } else {
            0.0
        },
        vram_peak_mb: vram_used_mb,
        vram_allocation_count: 0,
        vram_fragmentation_pct: 0.0,
    })
}
/// Derives energy-efficiency figures from a power reading, a throughput figure
/// and a duration.
///
/// * `power_watts` — power draw; must be a positive number.
/// * `tflops` — throughput figure; divided by `power_watts` to give
///   `tflops_per_watt`.
/// * `duration_us` — duration in microseconds; `power_watts * duration_us * 1e-6`
///   gives `joules_per_inference`.
///
/// Returns `None` when `power_watts` is zero, negative or NaN, because no
/// meaningful ratio or energy figure can be derived from such a reading.
pub fn compute_energy(power_watts: f64, tflops: f64, duration_us: f64) -> Option<EnergyMetrics> {
    // NaN fails every ordered comparison, so `<= 0.0` alone would let it
    // through and produce NaN energy figures; reject it explicitly.
    if power_watts.is_nan() || power_watts <= 0.0 {
        return None;
    }

    Some(EnergyMetrics {
        // Safe to divide: power is known to be strictly positive here.
        tflops_per_watt: tflops / power_watts,
        // Watts x seconds = joules; 1e-6 converts microseconds to seconds.
        joules_per_inference: power_watts * duration_us * 1e-6,
    })
}
/// Runs `nvidia-smi` for the given `--query-gpu` fields and returns the first
/// row of its CSV output.
///
/// `fields` are joined with `,` and passed as the `--query-gpu` value together
/// with `--format=csv,noheader,nounits`.
///
/// Returns `None` when:
/// * the process cannot be started or exits unsuccessfully,
/// * the output contains no non-blank line, or
/// * every comma-separated value on the first row is `[N/A]`.
///
/// Otherwise returns the first non-blank output line, trimmed. `nvidia-smi`
/// prints one row per GPU, so on a multi-GPU host this is the first GPU's row
/// and callers never see embedded newlines.
fn query_nvidia_smi(fields: &[&str]) -> Option<String> {
    let query = fields.join(",");
    let output = Command::new("nvidia-smi")
        .args(["--query-gpu", &query, "--format=csv,noheader,nounits"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    // Keep only the first non-blank row; `?` covers the "no output" case.
    let row = stdout.lines().map(str::trim).find(|line| !line.is_empty())?;

    // A row made up solely of `[N/A]` values carries no information. Each
    // field is checked individually: a whole-line character test can never
    // match because the `[N/A]` text itself is not commas or spaces.
    if row.split(',').all(|field| field.trim() == "[N/A]") {
        return None;
    }

    Some(row.to_string())
}
/// Converts one `nvidia-smi` value into a number.
///
/// Surrounding whitespace is ignored and only the first whitespace-separated
/// token is parsed, so a trailing unit such as `"285.32 W"` is accepted.
///
/// Returns `0.0` for the markers `[N/A]` and `N/A`, for empty input, and for
/// any first token that does not parse as `f64`.
fn parse_nvidia_val(s: &str) -> f64 {
    let trimmed = s.trim();
    if matches!(trimmed, "[N/A]" | "N/A") {
        return 0.0;
    }

    match trimmed.split_whitespace().next() {
        Some(number) => number.parse().unwrap_or(0.0),
        None => 0.0,
    }
}
/// Averages the `cpu MHz` entries listed in `/proc/cpuinfo`.
///
/// Every line that starts with `cpu MHz` contributes the number found after
/// its first `:`; lines whose value does not parse are skipped.
///
/// Returns `None` when the file cannot be read or no such line yields a
/// number.
fn read_cpu_frequency() -> Option<f64> {
    let cpuinfo = std::fs::read_to_string("/proc/cpuinfo").ok()?;

    // Accumulate the running sum and sample count in a single pass.
    let (sum_mhz, samples) = cpuinfo
        .lines()
        .filter(|line| line.starts_with("cpu MHz"))
        .filter_map(|line| line.split(':').nth(1))
        .filter_map(|value| value.trim().parse::<f64>().ok())
        .fold((0.0_f64, 0_i32), |(sum, n), mhz| (sum + mhz, n + 1));

    if samples > 0 {
        Some(sum_mhz / f64::from(samples))
    } else {
        None
    }
}
/// Reads a temperature from the first usable thermal zone.
///
/// Probes `/sys/class/thermal/thermal_zone0` through `thermal_zone9` in order
/// and returns the first `temp` value that can be read and parsed, divided by
/// 1000 (the raw value is treated as millidegrees).
///
/// NOTE(review): the zone's `type` is not inspected, so this assumes the first
/// readable zone is representative of the CPU — confirm on target hardware.
///
/// Returns `None` when none of the ten zones yields a parseable value.
fn read_cpu_temperature() -> Option<f64> {
    (0..10).find_map(|zone| {
        let path = format!("/sys/class/thermal/thermal_zone{zone}/temp");
        let raw = std::fs::read_to_string(&path).ok()?;
        let millidegrees = raw.trim().parse::<f64>().ok()?;
        Some(millidegrees / 1000.0)
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `actual` lies strictly within `tolerance` of `expected`.
    fn close(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// True when an `nvidia-smi` executable can be located on this machine.
    fn nvidia_smi_available() -> bool {
        which::which("nvidia-smi").is_ok()
    }

    // --- Pure helpers -----------------------------------------------------

    #[test]
    fn test_parse_nvidia_val() {
        assert!(close(parse_nvidia_val("285.32 W"), 285.32, 0.01));
        assert!(close(parse_nvidia_val("24564 MiB"), 24564.0, 1.0));
        assert!(close(parse_nvidia_val("62"), 62.0, 0.01));
        // Placeholder markers are reported as zero.
        assert!(close(parse_nvidia_val("[N/A]"), 0.0, 0.01));
        assert!(close(parse_nvidia_val("N/A"), 0.0, 0.01));
    }

    #[test]
    fn test_compute_energy() {
        let energy = compute_energy(300.0, 11.6, 23.2).unwrap();
        assert!(close(energy.tflops_per_watt, 11.6 / 300.0, 0.001));
        assert!(close(energy.joules_per_inference, 300.0 * 23.2e-6, 0.001));
    }

    #[test]
    fn test_compute_energy_zero_power() {
        assert!(compute_energy(0.0, 11.6, 23.2).is_none());
    }

    // --- Smoke tests: must not panic whatever the host provides -----------

    #[test]
    fn test_collect_system_health_no_panic() {
        let _ = collect_system_health();
    }

    #[test]
    fn test_collect_vram_no_panic() {
        let _ = collect_vram();
    }

    #[test]
    fn test_read_cpu_frequency_no_panic() {
        let _ = read_cpu_frequency();
    }

    #[test]
    fn test_read_cpu_temperature_no_panic() {
        let _ = read_cpu_temperature();
    }

    // --- Hardware-dependent tests: skipped when nvidia-smi is absent ------

    #[test]
    fn test_system_health_with_gpu() {
        if !nvidia_smi_available() {
            return;
        }
        let health = collect_system_health().expect("nvidia-smi exists but no health data");
        assert!(
            health.gpu_temperature_celsius > 0.0,
            "GPU temp should be > 0"
        );
        assert!(
            health.gpu_memory_total_mb > 0.0,
            "GPU memory total should be > 0"
        );
    }

    #[test]
    fn test_vram_with_gpu() {
        if !nvidia_smi_available() {
            return;
        }
        let vram = collect_vram().expect("nvidia-smi exists but no VRAM data");
        assert!(vram.vram_total_mb > 0.0, "VRAM total should be > 0");
        assert!((0.0..=100.0).contains(&vram.vram_utilization_pct));
    }
}