use std::collections::HashMap;
use std::fs;
use std::process::Command;
use std::time::Instant;
use anyhow::{Context, Result};
use nvml_wrapper::enum_wrappers::device::{Clock, TemperatureSensor};
use nvml_wrapper::enums::device::UsedGpuMemory;
use nvml_wrapper::struct_wrappers::device::ProcessInfo;
use nvml_wrapper::{Device, Nvml};
use crate::backend::{GpuBackend, require_devices};
use crate::model::{GpuInfo, GpuProcess, GpuProcessKind, GpuSample};
/// GPU backend that reads device metrics and per-process usage through NVML.
pub struct NvmlBackend {
// Initialized NVML library handle; device handles are re-opened from it on every sample.
nvml: Nvml,
// Static description of each GPU, collected once in `new`.
devices: Vec<GpuInfo>,
// Fan strategy per GPU, pushed in the same order as `devices` and looked up by `GpuInfo::id`.
fan_readouts: Vec<FanReadout>,
// Memoized user/command lookups for the PIDs NVML reports.
proc_cache: ProcInfoCache,
}
/// How the fan speed of one GPU is sampled; decided once by `detect_fan_readout`.
#[derive(Debug, Clone, Copy)]
enum FanReadout {
// No fan value is sampled; `sample_fan_percent` returns `None`.
Unavailable,
// Sample `Device::fan_speed(fan_index)` and report it as a percentage.
Percent { fan_index: u32 },
}
/// Owner and command line of a process, as last read from `/proc/<pid>`.
#[derive(Debug, Clone)]
struct ProcIdentity {
// Start time parsed from `/proc/<pid>/stat`; `ProcInfoCache::identity` compares it with the
// current value to decide whether a cached entry still describes the same process.
start_time_ticks: Option<u64>,
// User name (or the numeric uid as text when no name resolves); `None` when the uid is unreadable.
user: Option<String>,
// Command line from `/proc/<pid>/cmdline`, falling back to `/proc/<pid>/comm`.
command: Option<String>,
}
/// Memoizes per-process identity and uid-to-name lookups between samples.
#[derive(Debug, Default)]
struct ProcInfoCache {
// Last identity resolved for each PID.
identities: HashMap<u32, ProcIdentity>,
// Resolved user name (or the uid rendered as text) for each uid.
usernames: HashMap<u32, String>,
}
impl NvmlBackend {
    /// Initializes NVML and records every GPU it enumerates.
    ///
    /// For each device the name (falling back to `NVIDIA GPU <index>`), the
    /// UUID (when readable) and the fan readout strategy are captured once.
    ///
    /// # Errors
    ///
    /// Fails when NVML cannot be initialized, the device count cannot be read,
    /// a device handle cannot be opened, or `require_devices` rejects the
    /// resulting list (presumably when it is empty; confirm against that helper).
    pub fn new() -> Result<Self> {
        let nvml = Nvml::init().context("NVML initialization failed")?;
        let count = nvml.device_count().context("NVML device_count failed")?;

        let capacity = count as usize;
        let mut devices = Vec::with_capacity(capacity);
        let mut fan_readouts = Vec::with_capacity(capacity);

        for index in 0..count {
            let device = nvml
                .device_by_index(index)
                .with_context(|| format!("failed to open NVML device {index}"))?;

            let name = match device.name() {
                Ok(reported) => reported,
                Err(_) => format!("NVIDIA GPU {index}"),
            };
            let uuid = device.uuid().ok();
            // Decide the fan strategy while `name` is still borrowed locally,
            // before it moves into the `GpuInfo`.
            let fan_readout = detect_fan_readout(&device, &name);

            devices.push(GpuInfo {
                id: index as usize,
                backend_index: index,
                name,
                uuid,
            });
            fan_readouts.push(fan_readout);
        }

        require_devices(&devices, "NVML")?;

        Ok(Self {
            nvml,
            devices,
            fan_readouts,
            proc_cache: ProcInfoCache::default(),
        })
    }
}
impl GpuBackend for NvmlBackend {
    /// Short name identifying this backend.
    fn label(&self) -> &str {
        "NVML"
    }

    /// GPUs discovered when the backend was constructed.
    fn devices(&self) -> &[GpuInfo] {
        self.devices.as_slice()
    }

    /// Takes one sample per known GPU, all stamped with the same `Instant`.
    ///
    /// Each metric that NVML fails to report is stored as `None`; only a
    /// failure to open a device handle aborts the whole sample.
    ///
    /// # Errors
    ///
    /// Returns an error when `device_by_index` fails for any recorded GPU.
    fn sample(&mut self) -> Result<Vec<GpuSample>> {
        let at = Instant::now();
        let mut samples = Vec::with_capacity(self.devices.len());

        for info in self.devices.iter() {
            let backend_index = info.backend_index;
            let device = self
                .nvml
                .device_by_index(backend_index)
                .with_context(|| format!("failed to open NVML device {backend_index}"))?;

            let utilization = device.utilization_rates().ok();
            let memory = device.memory_info().ok();
            let processes = collect_processes(&device, &mut self.proc_cache);

            let gpu_util_percent = utilization.as_ref().map(|rates| rates.gpu as f64);
            let mem_util_percent = utilization.as_ref().map(|rates| rates.memory as f64);
            let vram_used_bytes = memory.as_ref().map(|usage| usage.used);
            let vram_total_bytes = memory.as_ref().map(|usage| usage.total);

            // Power readings are divided by 1000 to yield watts.
            let power_watts = device
                .power_usage()
                .ok()
                .map(|milliwatts| milliwatts as f64 / 1000.0);
            let power_limit_watts = device
                .enforced_power_limit()
                .ok()
                .map(|milliwatts| milliwatts as f64 / 1000.0);
            let temperature_celsius = device
                .temperature(TemperatureSensor::Gpu)
                .ok()
                .map(|degrees| degrees as f64);
            let fan_percent = match self.fan_readouts.get(info.id) {
                Some(&readout) => sample_fan_percent(&device, readout),
                None => None,
            };
            let graphics_clock_mhz = device
                .clock_info(Clock::Graphics)
                .ok()
                .map(|mhz| mhz as f64);
            let memory_clock_mhz = device
                .clock_info(Clock::Memory)
                .ok()
                .map(|mhz| mhz as f64);

            // NOTE(review): this counts every merged process (compute, graphics
            // and MPS), not only compute ones. Confirm that is what
            // `compute_processes` is meant to carry.
            let process_count = processes.len() as u32;

            samples.push(GpuSample {
                gpu_id: info.id,
                at,
                gpu_util_percent,
                mem_util_percent,
                vram_used_bytes,
                vram_total_bytes,
                power_watts,
                power_limit_watts,
                temperature_celsius,
                fan_percent,
                graphics_clock_mhz,
                memory_clock_mhz,
                compute_processes: Some(process_count),
                processes,
            });
        }

        Ok(samples)
    }
}
/// Chooses how fan speed will be sampled for `device`.
///
/// Yields `FanReadout::Unavailable` when the fan count cannot be read or is
/// zero, and also when a model recognized by `is_enclosure_cooled_nvidia`
/// reports both 0 % and 0 RPM on fan 0. Every other device is sampled from
/// fan index 0.
fn detect_fan_readout(device: &Device<'_>, name: &str) -> FanReadout {
    match device.num_fans() {
        Ok(count) if count > 0 => {}
        _ => return FanReadout::Unavailable,
    }

    let percent = device.fan_speed(0).ok();
    let rpm = device.fan_speed_rpm(0).ok();
    let reports_stopped_fan = percent == Some(0) && rpm == Some(0);

    if reports_stopped_fan && is_enclosure_cooled_nvidia(name) {
        FanReadout::Unavailable
    } else {
        FanReadout::Percent { fan_index: 0 }
    }
}
/// Reads the current fan speed according to `readout`.
///
/// Returns `None` when the readout is `Unavailable` or the NVML query fails.
fn sample_fan_percent(device: &Device<'_>, readout: FanReadout) -> Option<f64> {
    let fan_index = match readout {
        FanReadout::Unavailable => return None,
        FanReadout::Percent { fan_index } => fan_index,
    };
    let percent = device.fan_speed(fan_index).ok()?;
    Some(percent as f64)
}
/// Reports whether `name` is, ignoring ASCII case, exactly `NVIDIA A40`.
///
/// `detect_fan_readout` uses this to treat a 0 % / 0 RPM reading on that model
/// as "no fan value available".
fn is_enclosure_cooled_nvidia(name: &str) -> bool {
    name.eq_ignore_ascii_case("NVIDIA A40")
}
/// Gathers the compute, graphics and MPS process listings of `device` into one
/// entry per PID.
///
/// A listing NVML fails to return is skipped. The result is ordered by GPU
/// memory use, largest first (unknown counts as 0), with ties broken by
/// ascending PID.
fn collect_processes(device: &Device<'_>, proc_cache: &mut ProcInfoCache) -> Vec<GpuProcess> {
    let mut by_pid = HashMap::new();

    if let Ok(compute) = device.running_compute_processes() {
        merge_processes(&mut by_pid, compute, GpuProcessKind::Compute, proc_cache);
    }
    if let Ok(graphics) = device.running_graphics_processes() {
        merge_processes(&mut by_pid, graphics, GpuProcessKind::Graphics, proc_cache);
    }
    if let Ok(mps) = device.mps_running_compute_processes() {
        merge_processes(&mut by_pid, mps, GpuProcessKind::Mps, proc_cache);
    }

    let mut ranked: Vec<GpuProcess> = by_pid.into_values().collect();
    ranked.sort_by_key(|process| {
        (
            std::cmp::Reverse(process.used_gpu_memory_bytes.unwrap_or(0)),
            process.pid,
        )
    });
    ranked
}
fn merge_processes(
by_pid: &mut HashMap<u32, GpuProcess>,
processes: Vec<ProcessInfo>,
kind: GpuProcessKind,
proc_cache: &mut ProcInfoCache,
) {
for process in processes {
let identity = proc_cache.identity(process.pid);
let memory = used_gpu_memory_bytes(&process.used_gpu_memory);
by_pid
.entry(process.pid)
.and_modify(|existing| {
if !existing.kinds.contains(&kind) {
existing.kinds.push(kind);
}
existing.used_gpu_memory_bytes =
max_optional(existing.used_gpu_memory_bytes, memory);
existing.gpu_instance_id = existing.gpu_instance_id.or(process.gpu_instance_id);
existing.compute_instance_id =
existing.compute_instance_id.or(process.compute_instance_id);
})
.or_insert_with(|| GpuProcess {
pid: process.pid,
user: identity.user.clone(),
command: identity.command.clone(),
kinds: vec![kind],
used_gpu_memory_bytes: memory,
gpu_instance_id: process.gpu_instance_id,
compute_instance_id: process.compute_instance_id,
});
}
}
/// Converts NVML's memory-usage value to an optional byte count; `Unavailable`
/// becomes `None`.
fn used_gpu_memory_bytes(memory: &UsedGpuMemory) -> Option<u64> {
    if let UsedGpuMemory::Used(bytes) = memory {
        Some(*bytes)
    } else {
        None
    }
}
/// Returns the larger of the values that are present, or `None` when neither
/// side holds a value.
fn max_optional(left: Option<u64>, right: Option<u64>) -> Option<u64> {
    // Each `Option` contributes zero or one item, so `max` sees only the
    // values that exist.
    left.into_iter().chain(right).max()
}
impl ProcInfoCache {
    /// Returns the user and command of `pid`, reusing the cached entry while
    /// the process start time is unchanged.
    ///
    /// The start time is re-read on every call. A differing value (or a PID not
    /// seen before) triggers a fresh `/proc` lookup whose result replaces the
    /// cached entry.
    ///
    /// NOTE(review): entries are only ever inserted here, never removed, so the
    /// map grows with the number of distinct PIDs observed.
    fn identity(&mut self, pid: u32) -> ProcIdentity {
        let start_time_ticks = process_start_time_ticks(pid);

        match self.identities.get(&pid) {
            Some(cached) if cached.start_time_ticks == start_time_ticks => {
                return cached.clone();
            }
            _ => {}
        }

        let user = match process_uid(pid) {
            Some(uid) => Some(self.username(uid)),
            None => None,
        };
        let command = match process_cmdline(pid) {
            Some(cmdline) => Some(cmdline),
            None => process_comm(pid),
        };

        let fresh = ProcIdentity {
            start_time_ticks,
            user,
            command,
        };
        self.identities.insert(pid, fresh.clone());
        fresh
    }

    /// Resolves `uid` to a user name, remembering the answer.
    ///
    /// When the system lookup finds nothing, the uid rendered as decimal text
    /// is cached and returned instead.
    fn username(&mut self, uid: u32) -> String {
        self.usernames
            .entry(uid)
            .or_insert_with(|| username_from_system(uid).unwrap_or_else(|| uid.to_string()))
            .clone()
    }
}
/// Reads the first value on the `Uid:` line of `/proc/<pid>/status` (the real
/// UID according to proc(5)).
///
/// Returns `None` when the file cannot be read or no `Uid:` line carries a
/// parsable number.
fn process_uid(pid: u32) -> Option<u32> {
    let status = fs::read_to_string(format!("/proc/{pid}/status")).ok()?;
    for line in status.lines() {
        let mut fields = line.split_whitespace();
        if fields.next() != Some("Uid:") {
            continue;
        }
        if let Some(uid) = fields.next().and_then(|raw| raw.parse().ok()) {
            return Some(uid);
        }
    }
    None
}
/// Reads `/proc/<pid>/cmdline` and joins its NUL-separated arguments with
/// single spaces.
///
/// Empty arguments are dropped and invalid UTF-8 is replaced lossily. Returns
/// `None` when the file cannot be read or holds no arguments.
fn process_cmdline(pid: u32) -> Option<String> {
    let raw = fs::read(format!("/proc/{pid}/cmdline")).ok()?;

    let mut joined = String::new();
    for arg in raw.split(|&byte| byte == 0).filter(|arg| !arg.is_empty()) {
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(&String::from_utf8_lossy(arg));
    }

    if joined.is_empty() {
        None
    } else {
        Some(joined)
    }
}
/// Reads the trimmed contents of `/proc/<pid>/comm`.
///
/// Returns `None` when the file cannot be read or is blank.
fn process_comm(pid: u32) -> Option<String> {
    let raw = fs::read_to_string(format!("/proc/{pid}/comm")).ok()?;
    match raw.trim() {
        "" => None,
        name => Some(name.to_owned()),
    }
}
/// Reads the start time of `pid` from `/proc/<pid>/stat`.
///
/// The line is split at the last `") "` so that a command name containing
/// spaces or parentheses cannot shift the fields. The value returned is the
/// 20th field after that point, which proc(5) documents as `starttime`
/// (field 22 overall). Returns `None` when the file is unreadable or malformed.
fn process_start_time_ticks(pid: u32) -> Option<u64> {
    // Zero-based position of the start time among the fields after the command name.
    const START_TIME_OFFSET: usize = 19;

    let stat = fs::read_to_string(format!("/proc/{pid}/stat")).ok()?;
    let (_, after_comm) = stat.rsplit_once(") ")?;
    let raw = after_comm.split_whitespace().nth(START_TIME_OFFSET)?;
    raw.parse().ok()
}
/// Looks up the user name for `uid`, trying `/etc/passwd` first and the
/// `getent` command only when that yields nothing.
fn username_from_system(uid: u32) -> Option<String> {
    if let Some(name) = username_from_passwd(uid) {
        return Some(name);
    }
    username_from_getent(uid)
}
/// Searches `/etc/passwd` for `uid`; `None` when the file is unreadable or has
/// no matching entry.
fn username_from_passwd(uid: u32) -> Option<String> {
    let contents = fs::read_to_string("/etc/passwd").ok()?;
    passwd_name(&contents, uid)
}
/// Runs `getent passwd <uid>` and extracts the user name from its output.
///
/// Returns `None` when the command cannot be started, exits unsuccessfully,
/// prints invalid UTF-8, or prints no entry for `uid`.
fn username_from_getent(uid: u32) -> Option<String> {
    let uid_arg = uid.to_string();
    let std::process::Output { status, stdout, .. } = Command::new("getent")
        .args(["passwd", uid_arg.as_str()])
        .output()
        .ok()?;

    if !status.success() {
        return None;
    }

    let entries = String::from_utf8(stdout).ok()?;
    passwd_name(&entries, uid)
}
/// Finds the login name of the first passwd-format line whose third
/// colon-separated field equals `uid`.
///
/// Lines with fewer than three fields, or a non-numeric third field, are
/// skipped.
fn passwd_name(passwd: &str, uid: u32) -> Option<String> {
    for entry in passwd.lines() {
        // Only the first three columns matter; anything after stays unsplit.
        let mut columns = entry.splitn(4, ':');
        let (Some(login), Some(_password), Some(raw_uid)) =
            (columns.next(), columns.next(), columns.next())
        else {
            continue;
        };
        if raw_uid.parse::<u32>() == Ok(uid) {
            return Some(login.to_owned());
        }
    }
    None
}