use std::collections::HashMap;
use std::ffi::OsStr;
use std::process::Command;
use std::time::Instant;
use anyhow::{Context, Result, bail};
use crate::backend::{GpuBackend, require_devices};
use crate::model::{GpuInfo, GpuProcess, GpuProcessKind, GpuSample};
// Field list passed to `--query-gpu=` when enumerating devices. The order matters:
// the parser in `NvidiaSmiBackend::new` reads columns by position (index, name, uuid).
const GPU_INFO_QUERY: &str = "index,name,uuid";
// Field list passed to `--query-gpu=` for every sample. `sample` indexes the resulting
// columns 0..=10 positionally, so any change here must be mirrored there.
const GPU_SAMPLE_QUERY: &str = "index,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,power.limit,temperature.gpu,fan.speed,clocks.gr,clocks.mem";
// Field list passed to `--query-compute-apps=`; `collect_processes_by_uuid` reads the
// columns by position (gpu uuid, pid, used memory, process name).
const PROCESS_QUERY: &str = "gpu_uuid,pid,used_memory,process_name";
/// GPU backend that obtains device information and telemetry by running the
/// `nvidia-smi` command-line tool and parsing its CSV output.
pub struct NvidiaSmiBackend {
/// Devices discovered by `new`, in the order `nvidia-smi` listed them.
devices: Vec<GpuInfo>,
}
impl NvidiaSmiBackend {
pub fn new() -> Result<Self> {
let stdout = nvidia_smi([
format!("--query-gpu={GPU_INFO_QUERY}"),
"--format=csv,noheader,nounits".to_owned(),
])?;
let mut devices = Vec::new();
for line in stdout.lines().filter(|line| !line.trim().is_empty()) {
let fields = split_csv_line(line);
if fields.len() < 3 {
continue;
}
let backend_index = fields[0]
.parse::<u32>()
.with_context(|| format!("invalid nvidia-smi GPU index {:?}", fields[0]))?;
devices.push(GpuInfo {
id: devices.len(),
backend_index,
name: nonempty_field(&fields[1])
.unwrap_or_else(|| format!("NVIDIA GPU {backend_index}")),
uuid: optional_string(&fields[2]),
});
}
require_devices(&devices, "nvidia-smi")?;
Ok(Self { devices })
}
}
impl GpuBackend for NvidiaSmiBackend {
fn label(&self) -> &str {
"nvidia-smi"
}
fn devices(&self) -> &[GpuInfo] {
&self.devices
}
fn sample(&mut self) -> Result<Vec<GpuSample>> {
let at = Instant::now();
let stdout = nvidia_smi([
format!("--query-gpu={GPU_SAMPLE_QUERY}"),
"--format=csv,noheader,nounits".to_owned(),
])?;
let processes = collect_processes_by_uuid();
let mut samples_by_backend_index = HashMap::new();
for line in stdout.lines().filter(|line| !line.trim().is_empty()) {
let fields = split_csv_line(line);
if fields.len() < 11 {
continue;
}
let backend_index = match fields[0].parse::<u32>() {
Ok(index) => index,
Err(_) => continue,
};
let Some(info) = self
.devices
.iter()
.find(|device| device.backend_index == backend_index)
else {
continue;
};
let gpu_processes = info
.uuid
.as_ref()
.and_then(|uuid| processes.get(uuid))
.cloned()
.unwrap_or_default();
let compute_processes = Some(gpu_processes.len() as u32);
samples_by_backend_index.insert(
backend_index,
GpuSample {
gpu_id: info.id,
at,
gpu_util_percent: optional_f64(&fields[1]),
mem_util_percent: optional_f64(&fields[2]),
vram_used_bytes: optional_mib(&fields[3]),
vram_total_bytes: optional_mib(&fields[4]),
power_watts: optional_f64(&fields[5]),
power_limit_watts: optional_f64(&fields[6]),
temperature_celsius: optional_f64(&fields[7]),
fan_percent: optional_f64(&fields[8]),
graphics_clock_mhz: optional_f64(&fields[9]),
memory_clock_mhz: optional_f64(&fields[10]),
compute_processes,
processes: gpu_processes,
},
);
}
Ok(self
.devices
.iter()
.map(|info| {
samples_by_backend_index
.remove(&info.backend_index)
.unwrap_or_else(|| empty_sample(info.id, at))
})
.collect())
}
}
/// Queries `nvidia-smi` for compute processes and groups them by GPU UUID.
///
/// This is best-effort: if the query fails, an empty map is returned. Rows with
/// fewer than four columns, an unavailable UUID, or an unparsable pid are
/// skipped. Each GPU's list is ordered by used memory (largest first, unknown
/// counted as zero) and then by ascending pid.
fn collect_processes_by_uuid() -> HashMap<String, Vec<GpuProcess>> {
    let mut grouped: HashMap<String, Vec<GpuProcess>> = HashMap::new();

    let query = nvidia_smi([
        format!("--query-compute-apps={PROCESS_QUERY}"),
        "--format=csv,noheader,nounits".to_owned(),
    ]);
    let stdout = match query {
        Ok(text) => text,
        Err(_) => return grouped,
    };

    for row in stdout.lines() {
        if row.trim().is_empty() {
            continue;
        }
        let columns = split_csv_line(row);
        let [uuid_text, pid_text, memory_text, name_text, ..] = columns.as_slice() else {
            continue;
        };
        let (Some(uuid), Ok(pid)) = (optional_string(uuid_text), pid_text.parse::<u32>()) else {
            continue;
        };
        let process = GpuProcess {
            pid,
            user: None,
            command: optional_string(name_text),
            kinds: vec![GpuProcessKind::Compute],
            used_gpu_memory_bytes: optional_mib(memory_text),
            gpu_instance_id: None,
            compute_instance_id: None,
        };
        grouped.entry(uuid).or_default().push(process);
    }

    for list in grouped.values_mut() {
        // Stable sort on (memory descending, pid ascending).
        list.sort_by_key(|process| {
            (
                std::cmp::Reverse(process.used_gpu_memory_bytes.unwrap_or(0)),
                process.pid,
            )
        });
    }
    grouped
}
fn nvidia_smi<I, S>(args: I) -> Result<String>
where
I: IntoIterator<Item = S>,
S: AsRef<OsStr>,
{
let output = Command::new("nvidia-smi")
.args(args)
.output()
.with_context(|| "failed to execute nvidia-smi")?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
bail!("nvidia-smi failed: {}", stderr.trim());
}
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}
/// Splits one CSV line into trimmed fields.
///
/// Commas inside double-quoted sections do not split the field, the quote
/// characters themselves are dropped, and a doubled quote (`""`) inside a
/// quoted section yields one literal `"`. Every field is trimmed of
/// surrounding whitespace. The result always contains at least one field
/// (an empty line yields a single empty string).
fn split_csv_line(line: &str) -> Vec<String> {
    let mut fields = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut chars = line.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            // Escaped quote inside a quoted section: keep one literal quote
            // and stay in quoted mode.
            '"' if quoted && chars.peek() == Some(&'"') => {
                chars.next();
                field.push('"');
            }
            '"' => quoted = !quoted,
            ',' if !quoted => {
                fields.push(field.trim().to_owned());
                field.clear();
            }
            _ => field.push(ch),
        }
    }

    // The last field has no trailing comma to flush it.
    fields.push(field.trim().to_owned());
    fields
}
/// Returns the trimmed field as an owned string, or `None` when it is blank
/// or one of the placeholders recognised by `is_unavailable`.
fn optional_string(value: &str) -> Option<String> {
    let field = nonempty_field(value)?;
    if is_unavailable(&field) {
        None
    } else {
        Some(field)
    }
}
/// Trims `value` and returns it as an owned string, or `None` if nothing
/// remains after trimming.
fn nonempty_field(value: &str) -> Option<String> {
    match value.trim() {
        "" => None,
        text => Some(text.to_owned()),
    }
}
/// Parses a numeric field, returning `None` when no usable number is present.
///
/// The value is trimmed first. `None` is returned when the field is one of the
/// placeholders recognised by `is_unavailable`, when it does not parse as an
/// `f64`, or when it parses to a non-finite value.
fn optional_f64(value: &str) -> Option<f64> {
    let value = value.trim();
    if is_unavailable(value) {
        return None;
    }
    // `f64::from_str` accepts spellings such as "nan" and "inf"; those are not
    // meaningful measurements, so they are treated as missing.
    value
        .parse::<f64>()
        .ok()
        .filter(|number| number.is_finite())
}
/// Parses a field holding a size in MiB and converts it to bytes.
///
/// Returns `None` when `optional_f64` yields no number, or when the number is
/// negative or non-finite. The byte count is rounded to the nearest integer.
fn optional_mib(value: &str) -> Option<u64> {
    const BYTES_PER_MIB: f64 = 1024.0 * 1024.0;

    let mib = optional_f64(value)?;
    // A float-to-integer `as` cast saturates silently (negative -> 0,
    // NaN -> 0, +inf -> u64::MAX); reject such inputs instead of reporting a
    // fabricated size.
    if !mib.is_finite() || mib < 0.0 {
        return None;
    }
    Some((mib * BYTES_PER_MIB).round() as u64)
}
/// Reports whether a field is a placeholder for "no value".
///
/// After trimming, a field counts as unavailable when it is empty or matches,
/// ignoring ASCII case, one of `n/a`, `not supported`, or `none`. Each marker
/// is accepted both bare and wrapped in a single pair of square brackets
/// (for example `[Not Supported]` or `[N/A]`).
fn is_unavailable(value: &str) -> bool {
    const MARKERS: [&str; 3] = ["n/a", "not supported", "none"];

    let trimmed = value.trim();
    if trimmed.is_empty() {
        return true;
    }
    // Compare against the text inside `[...]` when the whole field is
    // bracketed, so every marker is handled uniformly in both forms.
    let candidate = trimmed
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .map_or(trimmed, str::trim);
    MARKERS
        .iter()
        .any(|marker| candidate.eq_ignore_ascii_case(marker))
}
/// Builds a placeholder sample for `gpu_id` taken at `at`, with every metric
/// set to `None` and an empty process list.
fn empty_sample(gpu_id: usize, at: Instant) -> GpuSample {
    GpuSample {
        // Identity of the sample.
        gpu_id,
        at,
        // Utilisation.
        gpu_util_percent: None,
        mem_util_percent: None,
        // Memory.
        vram_used_bytes: None,
        vram_total_bytes: None,
        // Power, thermals and clocks.
        power_watts: None,
        power_limit_watts: None,
        temperature_celsius: None,
        fan_percent: None,
        graphics_clock_mhz: None,
        memory_clock_mhz: None,
        // Processes.
        compute_processes: None,
        processes: Vec::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::{optional_f64, optional_mib, split_csv_line};

    /// A comma inside a double-quoted section must not split the field.
    #[test]
    fn split_csv_handles_quoted_commas() {
        let parsed = split_csv_line("0, \"GPU, Name\", GPU-123");
        let expected = vec!["0", "GPU, Name", "GPU-123"];
        assert_eq!(parsed, expected);
    }

    /// Placeholders map to `None`; real numbers parse, and MiB become bytes.
    #[test]
    fn optional_numbers_handle_unavailable_values() {
        assert!(optional_f64("N/A").is_none());
        assert_eq!(optional_f64("17.5"), Some(17.5));

        let one_gib_in_bytes: u64 = 1024 * 1024 * 1024;
        assert_eq!(optional_mib("1024"), Some(one_gib_in_bytes));
    }
}