// gpu-histop 0.1.0
//
// High-resolution GPU history monitor for NVIDIA, AMDGPU, and Apple Silicon
// Documentation
use std::collections::HashMap;
use std::ffi::OsStr;
use std::process::Command;
use std::time::Instant;

use anyhow::{Context, Result, bail};

use crate::backend::{GpuBackend, require_devices};
use crate::model::{GpuInfo, GpuProcess, GpuProcessKind, GpuSample};

// Field lists passed to nvidia-smi. The parsers in this file read the CSV
// columns positionally, so the order of names here must stay in sync with the
// `fields[N]` indexes used by the code that consumes each query.

/// Columns requested by `NvidiaSmiBackend::new` to enumerate devices
/// (read as `fields[0]` = index, `fields[1]` = name, `fields[2]` = uuid).
const GPU_INFO_QUERY: &str = "index,name,uuid";
/// Columns requested by `sample`; eleven fields, read as `fields[0]` through
/// `fields[10]` in exactly this order.
const GPU_SAMPLE_QUERY: &str = "index,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,power.limit,temperature.gpu,fan.speed,clocks.gr,clocks.mem";
/// Columns requested by `collect_processes_by_uuid`
/// (read as uuid, pid, used memory, process name).
const PROCESS_QUERY: &str = "gpu_uuid,pid,used_memory,process_name";

/// GPU backend that gathers its data by running the `nvidia-smi` executable
/// and parsing its CSV output.
pub struct NvidiaSmiBackend {
    /// Devices discovered by `new`, in the order nvidia-smi listed them; each
    /// device's `id` is its position in this list.
    devices: Vec<GpuInfo>,
}

impl NvidiaSmiBackend {
    pub fn new() -> Result<Self> {
        let stdout = nvidia_smi([
            format!("--query-gpu={GPU_INFO_QUERY}"),
            "--format=csv,noheader,nounits".to_owned(),
        ])?;
        let mut devices = Vec::new();

        for line in stdout.lines().filter(|line| !line.trim().is_empty()) {
            let fields = split_csv_line(line);
            if fields.len() < 3 {
                continue;
            }

            let backend_index = fields[0]
                .parse::<u32>()
                .with_context(|| format!("invalid nvidia-smi GPU index {:?}", fields[0]))?;
            devices.push(GpuInfo {
                id: devices.len(),
                backend_index,
                name: nonempty_field(&fields[1])
                    .unwrap_or_else(|| format!("NVIDIA GPU {backend_index}")),
                uuid: optional_string(&fields[2]),
            });
        }

        require_devices(&devices, "nvidia-smi")?;
        Ok(Self { devices })
    }
}

impl GpuBackend for NvidiaSmiBackend {
    /// Name of this backend.
    fn label(&self) -> &str {
        "nvidia-smi"
    }

    /// Devices found when the backend was constructed.
    fn devices(&self) -> &[GpuInfo] {
        &self.devices
    }

    /// Takes one reading of every known device.
    ///
    /// Returns exactly one [`GpuSample`] per entry of `devices()`, in the same
    /// order. A device that is missing from the nvidia-smi output (or whose
    /// row could not be used) is represented by an empty sample.
    ///
    /// # Errors
    ///
    /// Fails only when the GPU query itself fails; a failing process query
    /// simply yields samples without processes.
    fn sample(&mut self) -> Result<Vec<GpuSample>> {
        // Timestamp is taken before the external commands run.
        let at = Instant::now();
        let report = nvidia_smi([
            format!("--query-gpu={GPU_SAMPLE_QUERY}"),
            "--format=csv,noheader,nounits".to_owned(),
        ])?;
        let processes_by_uuid = collect_processes_by_uuid();

        let mut by_index: HashMap<u32, GpuSample> = HashMap::new();
        for row in report.lines() {
            if row.trim().is_empty() {
                continue;
            }

            let columns = split_csv_line(row);
            if columns.len() < 11 {
                continue;
            }

            // Rows with an unparsable index or an index that was not seen at
            // construction time are skipped.
            let Ok(backend_index) = columns[0].parse::<u32>() else {
                continue;
            };
            let Some(info) = self
                .devices
                .iter()
                .find(|candidate| candidate.backend_index == backend_index)
            else {
                continue;
            };

            let processes = match info
                .uuid
                .as_ref()
                .and_then(|uuid| processes_by_uuid.get(uuid))
            {
                Some(found) => found.clone(),
                None => Vec::new(),
            };
            // Always `Some`: this is 0 both when the GPU has no processes and
            // when the process query failed or the device has no UUID.
            let compute_processes = Some(processes.len() as u32);

            let number = |column: usize| optional_f64(&columns[column]);
            let sample = GpuSample {
                gpu_id: info.id,
                at,
                gpu_util_percent: number(1),
                mem_util_percent: number(2),
                vram_used_bytes: optional_mib(&columns[3]),
                vram_total_bytes: optional_mib(&columns[4]),
                power_watts: number(5),
                power_limit_watts: number(6),
                temperature_celsius: number(7),
                fan_percent: number(8),
                graphics_clock_mhz: number(9),
                memory_clock_mhz: number(10),
                compute_processes,
                processes,
            };
            // A later row for the same index replaces an earlier one.
            by_index.insert(backend_index, sample);
        }

        let mut samples = Vec::with_capacity(self.devices.len());
        for info in &self.devices {
            let sample = by_index
                .remove(&info.backend_index)
                .unwrap_or_else(|| empty_sample(info.id, at));
            samples.push(sample);
        }
        Ok(samples)
    }
}

/// Queries nvidia-smi for compute processes and groups them by GPU UUID.
///
/// This is best-effort: if the query fails, an empty map is returned. Rows
/// with fewer than four columns, an unavailable UUID, or a PID that is not a
/// valid `u32` are skipped. Each GPU's list is ordered by used memory
/// (largest first, missing counted as 0), ties broken by ascending PID.
fn collect_processes_by_uuid() -> HashMap<String, Vec<GpuProcess>> {
    let mut grouped: HashMap<String, Vec<GpuProcess>> = HashMap::new();

    let listing = match nvidia_smi([
        format!("--query-compute-apps={PROCESS_QUERY}"),
        "--format=csv,noheader,nounits".to_owned(),
    ]) {
        Ok(text) => text,
        Err(_) => return grouped,
    };

    for row in listing.lines() {
        if row.trim().is_empty() {
            continue;
        }

        let columns = split_csv_line(row);
        if columns.len() < 4 {
            continue;
        }

        let (Some(uuid), Ok(pid)) = (optional_string(&columns[0]), columns[1].parse::<u32>())
        else {
            continue;
        };

        let process = GpuProcess {
            pid,
            user: None,
            command: optional_string(&columns[3]),
            kinds: vec![GpuProcessKind::Compute],
            used_gpu_memory_bytes: optional_mib(&columns[2]),
            gpu_instance_id: None,
            compute_instance_id: None,
        };
        grouped.entry(uuid).or_default().push(process);
    }

    for bucket in grouped.values_mut() {
        // Descending memory use first, then ascending PID.
        bucket.sort_by_key(|process| {
            (
                std::cmp::Reverse(process.used_gpu_memory_bytes.unwrap_or(0)),
                process.pid,
            )
        });
    }

    grouped
}

fn nvidia_smi<I, S>(args: I) -> Result<String>
where
    I: IntoIterator<Item = S>,
    S: AsRef<OsStr>,
{
    let output = Command::new("nvidia-smi")
        .args(args)
        .output()
        .with_context(|| "failed to execute nvidia-smi")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        bail!("nvidia-smi failed: {}", stderr.trim());
    }

    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}

/// Splits one CSV line into trimmed fields.
///
/// Commas inside double-quoted sections do not separate fields, and the
/// surrounding quotes are removed. Inside a quoted section a doubled quote
/// (`""`) produces one literal `"`. Every field is trimmed of surrounding
/// whitespace, including whitespace that was inside quotes.
///
/// Always returns at least one field; an empty line yields a single empty
/// string.
fn split_csv_line(line: &str) -> Vec<String> {
    let mut fields = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut chars = line.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            // Escaped quote: consume the second `"` and keep one literally.
            '"' if quoted && chars.next_if_eq(&'"').is_some() => field.push('"'),
            // Any other quote opens or closes a quoted section.
            '"' => quoted = !quoted,
            ',' if !quoted => {
                fields.push(field.trim().to_owned());
                field.clear();
            }
            _ => field.push(ch),
        }
    }
    // The final field has no trailing comma to flush it.
    fields.push(field.trim().to_owned());
    fields
}

/// Returns the trimmed text of `value`, or `None` when it is blank or one of
/// the placeholders recognised by `is_unavailable`.
fn optional_string(value: &str) -> Option<String> {
    match nonempty_field(value) {
        Some(text) if !is_unavailable(&text) => Some(text),
        _ => None,
    }
}

/// Returns `value` with surrounding whitespace removed, or `None` when
/// nothing but whitespace is present.
fn nonempty_field(value: &str) -> Option<String> {
    match value.trim() {
        "" => None,
        text => Some(text.to_owned()),
    }
}

/// Parses a numeric CSV field.
///
/// Returns `None` when the field is blank, is a placeholder recognised by
/// `is_unavailable`, does not parse as a number, or parses to a non-finite
/// value.
fn optional_f64(value: &str) -> Option<f64> {
    let value = value.trim();
    if is_unavailable(value) {
        return None;
    }

    // `str::parse::<f64>` also accepts spellings such as "nan", "inf" and
    // "infinity"; none of those is a meaningful reading, so drop them.
    value
        .parse::<f64>()
        .ok()
        .filter(|number| number.is_finite())
}

/// Parses a field holding a MiB quantity and converts it to bytes, rounded to
/// the nearest whole byte. Returns `None` whenever `optional_f64` does.
fn optional_mib(value: &str) -> Option<u64> {
    optional_f64(value).map(|mib| {
        let bytes = mib * 1024.0 * 1024.0;
        bytes.round() as u64
    })
}

/// Reports whether `value` is blank or a "no data" placeholder.
///
/// The placeholders `n/a`, `not supported` and `none` are matched without
/// regard to ASCII case, either bare or wrapped in one pair of square
/// brackets (for example `[N/A]` or `[Not Supported]`).
fn is_unavailable(value: &str) -> bool {
    const PLACEHOLDERS: [&str; 3] = ["n/a", "not supported", "none"];

    let trimmed = value.trim();
    if trimmed.is_empty() {
        return true;
    }

    // Accept the bracketed spelling of every placeholder, not only of
    // "not supported". Brackets are removed only when both are present.
    let token = trimmed
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .map_or(trimmed, str::trim);

    PLACEHOLDERS
        .iter()
        .any(|placeholder| token.eq_ignore_ascii_case(placeholder))
}

/// Builds a placeholder sample for `gpu_id` taken at `at`: every reading is
/// `None` and the process list is empty.
fn empty_sample(gpu_id: usize, at: Instant) -> GpuSample {
    GpuSample {
        // Identity of the reading.
        gpu_id,
        at,
        // Utilisation.
        gpu_util_percent: None,
        mem_util_percent: None,
        // Memory.
        vram_used_bytes: None,
        vram_total_bytes: None,
        // Power and thermals.
        power_watts: None,
        power_limit_watts: None,
        temperature_celsius: None,
        fan_percent: None,
        // Clocks.
        graphics_clock_mhz: None,
        memory_clock_mhz: None,
        // Processes.
        compute_processes: None,
        processes: Vec::new(),
    }
}

// Unit tests for the pure parsing helpers; nothing here runs nvidia-smi.
#[cfg(test)]
mod tests {
    use super::{optional_f64, optional_mib, split_csv_line};

    // A comma inside double quotes must not split the field, and the quotes
    // themselves are removed from the result.
    #[test]
    fn split_csv_handles_quoted_commas() {
        assert_eq!(
            split_csv_line("0, \"GPU, Name\", GPU-123"),
            vec!["0", "GPU, Name", "GPU-123"]
        );
    }

    // Placeholder text maps to `None`, ordinary numbers parse, and MiB values
    // are converted to bytes (1024 MiB == 1024 * 1024 * 1024 bytes).
    #[test]
    fn optional_numbers_handle_unavailable_values() {
        assert_eq!(optional_f64("N/A"), None);
        assert_eq!(optional_f64("17.5"), Some(17.5));
        assert_eq!(optional_mib("1024"), Some(1024 * 1024 * 1024));
    }
}