use std::sync::atomic::{AtomicBool, Ordering};
use nvml_wrapper::Nvml;
use nvml_wrapper::enum_wrappers::device::{Clock, PcieUtilCounter, TemperatureSensor};
use nvml_wrapper::error::NvmlError;
use super::GpuReading;
/// GPU metrics sampler backed by NVML.
///
/// Built through `Sampler::new`, which only yields a value when NVML
/// initialized and reported at least one device.
pub(super) struct Sampler {
/// Initialized NVML library handle; the device is looked up through it on every poll.
nvml: Nvml,
}
impl Sampler {
pub(super) fn new() -> Option<Self> {
let nvml = match Nvml::init() {
Ok(nvml) => nvml,
Err(e) => {
tracing::warn!("NVML init failed; GPU rows unavailable: {e}");
return None;
}
};
match nvml.device_count() {
Ok(n) if n >= 1 => Some(Self { nvml }),
Ok(_) => {
tracing::warn!("NVML found no devices; GPU rows unavailable");
None
}
Err(e) => {
tracing::warn!("NVML device_count failed; GPU rows unavailable: {e}");
None
}
}
}
pub(super) fn poll(&mut self) -> GpuReading {
let device = self.nvml.device_by_index(0).unwrap();
let util = match device.utilization_rates() {
Ok(u) => u.gpu as f32,
Err(e) => {
warn_once(&e);
0.0
}
};
let power_w = match device.power_usage() {
Ok(milliwatts) => milliwatts as f32 / 1e3,
Err(e) => {
warn_once(&e);
0.0
}
};
let pcie_bps = match (
device.pcie_throughput(PcieUtilCounter::Send),
device.pcie_throughput(PcieUtilCounter::Receive),
) {
(Ok(tx), Ok(rx)) => (tx as u64 + rx as u64) * 1_000,
(Err(e), _) | (_, Err(e)) => {
warn_once(&e);
0
}
};
let mem_bytes = match device.memory_info() {
Ok(m) => m.used,
Err(e) => {
warn_once(&e);
0
}
};
let temp_c = match device.temperature(TemperatureSensor::Gpu) {
Ok(c) => c as f32,
Err(e) => {
warn_once(&e);
0.0
}
};
let clock_mhz = match device.clock_info(Clock::SM) {
Ok(mhz) => mhz as f32,
Err(e) => {
warn_once(&e);
0.0
}
};
GpuReading {
util,
sm: 0.0,
pcie_bps,
power_w,
mem_bytes,
temp_c,
clock_mhz,
}
}
}
/// Logs an NVML metric failure, but only for the first call in the process.
///
/// A single process-wide flag is shared by all callers, so once any error
/// has been logged every later error (of any kind) is silently dropped.
/// This keeps a metric that is permanently unsupported from flooding the
/// log on every poll.
fn warn_once(err: &NvmlError) {
    static WARNED: AtomicBool = AtomicBool::new(false);

    // `swap` returns the previous value, so exactly one caller observes
    // `false` and gets to log.
    let already_warned = WARNED.swap(true, Ordering::Relaxed);
    if already_warned {
        return;
    }
    tracing::warn!("NVML metric unavailable: {err}");
}