vor 0.2.1 - Docs.rs

//! NVIDIA GPU backend via NVML.
//!
//! `nvml-wrapper` loads `libnvidia-ml` from the installed driver at
//! runtime (no CUDA toolkit needed to build), so this compiles
//! anywhere but only yields data on a machine with an NVIDIA driver.
//!
//! Maps onto the shared rows: `gpu_util` from utilization, `gpu_power`
//! from power draw, `pcie` from PCIe TX+RX. NVML exposes no SM-occupancy
//! counter, so that row is macOS-only and left at zero here.

use std::sync::atomic::{AtomicBool, Ordering};

use nvml_wrapper::Nvml;
use nvml_wrapper::enum_wrappers::device::{Clock, PcieUtilCounter, TemperatureSensor};
use nvml_wrapper::error::NvmlError;

use super::GpuReading;

/// NVML-backed GPU sampler.
///
/// Owns the initialized `Nvml` handle. Device handles borrow from it, so
/// they are looked up on each poll rather than stored next to it.
pub(super) struct Sampler {
    nvml: Nvml,
}

impl Sampler {
    /// `None` when NVML can't initialize or no device is present (e.g. a
    /// cuda-built binary run on a box with no NVIDIA driver), so the
    /// collector degrades to no GPU rows instead of crashing.
    pub(super) fn new() -> Option<Self> {
        let nvml = match Nvml::init() {
            Ok(nvml) => nvml,
            Err(e) => {
                tracing::warn!("NVML init failed; GPU rows unavailable: {e}");
                return None;
            }
        };
        match nvml.device_count() {
            Ok(n) if n >= 1 => Some(Self { nvml }),
            Ok(_) => {
                tracing::warn!("NVML found no devices; GPU rows unavailable");
                None
            }
            Err(e) => {
                tracing::warn!("NVML device_count failed; GPU rows unavailable: {e}");
                None
            }
        }
    }

    pub(super) fn poll(&mut self) -> GpuReading {
        // `Device` borrows the `Nvml` handle, so re-fetch it each poll
        // rather than storing a self-referential pair.
        let device = self.nvml.device_by_index(0).unwrap();
        let util = match device.utilization_rates() {
            Ok(u) => u.gpu as f32,
            Err(e) => {
                warn_once(&e);
                0.0
            }
        };
        let power_w = match device.power_usage() {
            Ok(milliwatts) => milliwatts as f32 / 1e3,
            Err(e) => {
                warn_once(&e);
                0.0
            }
        };
        // NVML reports PCIe throughput in KB/s; combine both directions.
        let pcie_bps = match (
            device.pcie_throughput(PcieUtilCounter::Send),
            device.pcie_throughput(PcieUtilCounter::Receive),
        ) {
            (Ok(tx), Ok(rx)) => (tx as u64 + rx as u64) * 1_000,
            (Err(e), _) | (_, Err(e)) => {
                warn_once(&e);
                0
            }
        };
        let mem_bytes = match device.memory_info() {
            Ok(m) => m.used,
            Err(e) => {
                warn_once(&e);
                0
            }
        };
        let temp_c = match device.temperature(TemperatureSensor::Gpu) {
            Ok(c) => c as f32,
            Err(e) => {
                warn_once(&e);
                0.0
            }
        };
        let clock_mhz = match device.clock_info(Clock::SM) {
            Ok(mhz) => mhz as f32,
            Err(e) => {
                warn_once(&e);
                0.0
            }
        };
        GpuReading {
            util,
            sm: 0.0,
            pcie_bps,
            power_w,
            mem_bytes,
            temp_c,
            clock_mhz,
        }
    }
}

/// Logs an unavailable NVML metric the first time one is seen.
///
/// Not every card supports every counter, so callers treat a failed
/// query as zero. The single warning makes a flat row traceable instead
/// of looking like an idle GPU. The flag is process-wide and shared by
/// all metrics, so only the very first failure is logged.
fn warn_once(err: &NvmlError) {
    static ALREADY_LOGGED: AtomicBool = AtomicBool::new(false);
    // `fetch_or` returns the previous value: `true` means an earlier
    // caller already emitted the warning.
    if ALREADY_LOGGED.fetch_or(true, Ordering::Relaxed) {
        return;
    }
    tracing::warn!("NVML metric unavailable: {err}");
}