#![cfg(all(feature = "nvml", target_os = "linux"))]
use async_trait::async_trait;
use bytesize::ByteSize;
use concerto_core::GpuId;
use nvml_wrapper::{enum_wrappers::device::TemperatureSensor, error::NvmlError, Nvml};
use crate::monitor::{GpuMonitor, GpuMonitorError, GpuSnapshot};
/// GPU monitor that reads per-device state through NVML.
///
/// Built with `NvmlMonitor::new`, which initialises NVML and caches the
/// device count.
pub struct NvmlMonitor {
/// NVML handle. Kept behind an `Arc` so `snapshot` can move a clone of the
/// handle into the blocking task it spawns.
nvml: std::sync::Arc<Nvml>,
/// Number of devices NVML reported when the monitor was constructed; it is
/// not refreshed afterwards.
device_count: usize,
}
impl NvmlMonitor {
pub fn new() -> Result<Self, GpuMonitorError> {
let nvml = Nvml::init().map_err(|e| GpuMonitorError::NvmlInit(e.to_string()))?;
let device_count =
nvml.device_count()
.map_err(|e| GpuMonitorError::NvmlQuery(e.to_string()))? as usize;
Ok(Self {
nvml: std::sync::Arc::new(nvml),
device_count,
})
}
fn snapshot_blocking(nvml: &Nvml, device_count: usize) -> Vec<GpuSnapshot> {
let mut out = Vec::with_capacity(device_count);
for idx in 0..device_count {
match Self::snapshot_one(nvml, idx) {
Ok(snap) => out.push(snap),
Err(err) => {
tracing::warn!(
gpu = idx,
error = %err,
"failed to read NVML snapshot for device; skipping"
);
}
}
}
out
}
fn snapshot_one(nvml: &Nvml, idx: usize) -> Result<GpuSnapshot, NvmlError> {
let device = nvml.device_by_index(idx as u32)?;
let memory = device.memory_info()?;
let temperature = device.temperature(TemperatureSensor::Gpu)?;
let utilisation = device.utilization_rates()?;
let ecc = device
.total_ecc_errors(
nvml_wrapper::enum_wrappers::device::MemoryError::Uncorrected,
nvml_wrapper::enum_wrappers::device::EccCounter::Aggregate,
)
.unwrap_or(0);
Ok(GpuSnapshot {
id: GpuId(idx),
memory_total: ByteSize::b(memory.total),
memory_used: ByteSize::b(memory.used),
temperature_celsius: temperature,
utilisation_percent: utilisation.gpu,
ecc_errors_uncorrected: ecc,
})
}
}
#[async_trait]
impl GpuMonitor for NvmlMonitor {
    /// Returns the device count cached when the monitor was constructed.
    fn gpu_count(&self) -> usize {
        self.device_count
    }

    /// Takes a snapshot of every device without blocking the async executor.
    ///
    /// The synchronous NVML queries run inside `tokio::task::spawn_blocking`.
    /// If that task cannot be joined, the failure is logged at error level and
    /// an empty vector is returned.
    async fn snapshot(&self) -> Vec<GpuSnapshot> {
        // The blocking closure must own its captures, so give it its own
        // handle to the shared NVML instance and a copy of the count.
        let handle = std::sync::Arc::clone(&self.nvml);
        let total = self.device_count;

        let joined =
            tokio::task::spawn_blocking(move || Self::snapshot_blocking(&handle, total)).await;

        match joined {
            Ok(snapshots) => snapshots,
            Err(err) => {
                tracing::error!(error = %err, "NVML snapshot task panicked");
                Vec::new()
            }
        }
    }
}