// vor 0.2.1 - Docs.rs

//! GPU-system metrics rendered as system rows in the panel.
//!
//! [`ensure_collector`] spawns a background thread that polls the
//! platform GPU backend and stores the latest reading in atomics;
//! [`sample_now`](crate::system::sample_now) snapshots them into each
//! frame's [`SystemSample`](crate::SystemSample). The live panel and
//! the headless recorder each call [`ensure_collector`] when they
//! start, so callers wire up nothing.
//!
//! Backends are feature-gated:
//! - `mac`: IOKit `IOAccelerator` (util / SM) + the private IOReport
//!   framework (power), no `sudo`. Apple Silicon uses unified memory,
//!   so there is no PCIe traffic to report and that row stays zero.
//! - `cuda`: NVML (util / power / PCIe / temperature / clock). NVML
//!   has no SM-occupancy counter, so that row is dropped on NVIDIA.
//! - `web` / no platform feature: no API available; no GPU rows.

#[cfg(feature = "gpu")]
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};

#[cfg(feature = "mac")]
mod mac;
#[cfg(all(feature = "cuda", not(feature = "mac")))]
mod nvml;

// Pick the platform backend. `mac` wins if both are somehow enabled;
// they never coexist on real hardware.
#[cfg(feature = "mac")]
use mac::Sampler;
#[cfg(all(feature = "cuda", not(feature = "mac")))]
use nvml::Sampler;

// Latest reading published by the poller thread, one atomic per field.
// `f32` values are kept as their raw bit patterns (`f32::to_bits`) in an
// `AtomicU32`; integer values are stored directly in an `AtomicU64`.
// Everything starts at zero, which decodes to `0.0` for the float slots.

// GPU utilization, as `f32` bits.
#[cfg(feature = "gpu")]
static GPU_UTIL_BITS: AtomicU32 = AtomicU32::new(0);
// SM figure (only read back under the `mac` feature), as `f32` bits.
#[cfg(feature = "gpu")]
static GPU_SM_BITS: AtomicU32 = AtomicU32::new(0);
// PCIe throughput (only read back under the `cuda` feature).
#[cfg(feature = "gpu")]
static PCIE_BPS: AtomicU64 = AtomicU64::new(0);
// GPU power draw, as `f32` bits.
#[cfg(feature = "gpu")]
static GPU_POWER_BITS: AtomicU32 = AtomicU32::new(0);
// GPU memory figure in bytes.
#[cfg(feature = "gpu")]
static GPU_MEM_BYTES: AtomicU64 = AtomicU64::new(0);
// GPU temperature (only read back under the `cuda` feature), as `f32` bits.
#[cfg(feature = "gpu")]
static GPU_TEMP_BITS: AtomicU32 = AtomicU32::new(0);
// GPU clock (only read back under the `cuda` feature), as `f32` bits.
#[cfg(feature = "gpu")]
static GPU_CLOCK_BITS: AtomicU32 = AtomicU32::new(0);

/// One poll of the platform GPU backend. `sm` is macOS-only; `temp_c`
/// and `clock_mhz` are NVIDIA-only; backends leave fields they lack zero.
#[cfg(feature = "gpu")]
#[derive(Clone, Copy)]
struct GpuReading {
    /// GPU utilization; the backend tests expect `0.0..=100.0`.
    util: f32,
    /// SM figure; the macOS backend test expects `0.0..=100.0`.
    sm: f32,
    /// PCIe throughput. NOTE(review): presumably bytes per second, but
    /// the unit is set by the backend — confirm there.
    pcie_bps: u64,
    /// Power draw in watts (going by the name); the backend tests
    /// require it to be finite and non-negative.
    power_w: f32,
    /// GPU memory figure in bytes. NOTE(review): whether this is used
    /// or total memory is decided by the backend — confirm there.
    mem_bytes: u64,
    /// Temperature, in degrees Celsius going by the name.
    temp_c: f32,
    /// Clock, in MHz going by the name.
    clock_mhz: f32,
}

/// Ensure the background GPU poller is running.
///
/// Idempotent and cheap to re-call: the spawn is attempted at most
/// once per process. Called when the live panel or the headless
/// recorder starts, so collection begins with the first consumer.
///
/// A failed thread spawn is reported on stderr and otherwise ignored:
/// the `gpu_*` readings then stay at zero, exactly as when no backend
/// is available. Panicking here would poison the `Once` and make every
/// later call panic as well.
#[cfg(feature = "gpu")]
pub(crate) fn ensure_collector() {
    use std::sync::Once;
    use std::time::Duration;

    static START: Once = Once::new();
    START.call_once(|| {
        // ~4 Hz tracks load without busy-polling the source. On macOS
        // power is ΔEnergy / Δt, so the interval is also its window.
        const POLL: Duration = Duration::from_millis(250);
        let spawned = std::thread::Builder::new()
            .name("vor-gpu".into())
            .spawn(|| {
                // No backend (e.g. a cuda-built wheel on a box with no
                // NVIDIA driver): leave the gpu_* rows at zero rather
                // than crash. Sampler::new logs why.
                let Some(mut sampler) = Sampler::new() else {
                    return;
                };
                loop {
                    // Sleep first so the first reading already spans a
                    // full polling window.
                    std::thread::sleep(POLL);
                    store(sampler.poll());
                }
            });
        // GPU metrics are best-effort: an OS refusal to create the
        // thread must not take the host process down.
        if let Err(err) = spawned {
            eprintln!("vor: could not start GPU poller thread; GPU rows stay at zero: {err}");
        }
    });
}

/// No-op stand-in for builds without the `gpu` feature, so callers can
/// invoke `ensure_collector` unconditionally.
#[cfg(not(feature = "gpu"))]
pub(crate) const fn ensure_collector() {
    // Nothing to start: no GPU backend is compiled in.
}

/// Publish one backend reading into the module-level atomics.
///
/// Every field goes to its own atomic with `Relaxed` ordering, so a
/// concurrent reader may combine fields from two adjacent polls.
#[cfg(feature = "gpu")]
fn store(reading: GpuReading) {
    // `f32` has no atomic counterpart, so floats travel as raw bits.
    fn put_f32(slot: &AtomicU32, value: f32) {
        slot.store(value.to_bits(), Ordering::Relaxed);
    }

    // Exhaustive destructuring: a field added to `GpuReading` but not
    // published here becomes a compile error.
    let GpuReading {
        util: utilization,
        sm: sm_value,
        pcie_bps: pcie,
        power_w: watts,
        mem_bytes: memory,
        temp_c: celsius,
        clock_mhz: mhz,
    } = reading;

    put_f32(&GPU_UTIL_BITS, utilization);
    put_f32(&GPU_SM_BITS, sm_value);
    PCIE_BPS.store(pcie, Ordering::Relaxed);
    put_f32(&GPU_POWER_BITS, watts);
    GPU_MEM_BYTES.store(memory, Ordering::Relaxed);
    put_f32(&GPU_TEMP_BITS, celsius);
    put_f32(&GPU_CLOCK_BITS, mhz);
}

/// Most recently published GPU utilization; `0.0` until a reading has
/// been stored.
#[cfg(feature = "gpu")]
pub(crate) fn read_gpu_util() -> f32 {
    let bits = GPU_UTIL_BITS.load(Ordering::Relaxed);
    f32::from_bits(bits)
}
/// Most recently published SM figure (macOS builds only); `0.0` until
/// a reading has been stored.
#[cfg(feature = "mac")]
pub(crate) fn read_gpu_sm() -> f32 {
    let bits = GPU_SM_BITS.load(Ordering::Relaxed);
    f32::from_bits(bits)
}
/// Most recently published PCIe throughput (CUDA builds only); `0`
/// until a reading has been stored.
#[cfg(feature = "cuda")]
pub(crate) fn read_pcie_bps() -> u64 {
    let throughput = PCIE_BPS.load(Ordering::Relaxed);
    throughput
}
/// Most recently published GPU power draw; `0.0` until a reading has
/// been stored.
#[cfg(feature = "gpu")]
pub(crate) fn read_gpu_power_w() -> f32 {
    let bits = GPU_POWER_BITS.load(Ordering::Relaxed);
    f32::from_bits(bits)
}
/// Most recently published GPU memory figure in bytes; `0` until a
/// reading has been stored.
#[cfg(feature = "gpu")]
pub(crate) fn read_gpu_mem_bytes() -> u64 {
    let bytes = GPU_MEM_BYTES.load(Ordering::Relaxed);
    bytes
}
/// Most recently published GPU temperature (CUDA builds only); `0.0`
/// until a reading has been stored.
#[cfg(feature = "cuda")]
pub(crate) fn read_gpu_temp_c() -> f32 {
    let bits = GPU_TEMP_BITS.load(Ordering::Relaxed);
    f32::from_bits(bits)
}
/// Most recently published GPU clock (CUDA builds only); `0.0` until
/// a reading has been stored.
#[cfg(feature = "cuda")]
pub(crate) fn read_gpu_clock_mhz() -> f32 {
    let bits = GPU_CLOCK_BITS.load(Ordering::Relaxed);
    f32::from_bits(bits)
}

#[cfg(all(test, feature = "mac"))]
mod tests {
    use super::mac::Sampler;
    use std::thread;
    use std::time::Duration;

    /// Hardware smoke test for the IOKit + IOReport FFI path: after
    /// one polling window, utilization and SM must lie in 0–100 and
    /// power must be a finite, non-negative rate. Guards against
    /// signature / unit drift that a compile check cannot see.
    #[test]
    fn poll_yields_sane_readings() {
        let mut sampler = Sampler::new().unwrap();
        // Give the backend a window to measure over before polling.
        thread::sleep(Duration::from_millis(300));
        let reading = sampler.poll();

        let percent = 0.0..=100.0;
        assert!(percent.contains(&reading.util));
        assert!(percent.contains(&reading.sm));
        assert!(reading.power_w.is_finite() && reading.power_w >= 0.0);
    }
}

#[cfg(all(test, feature = "cuda", not(feature = "mac")))]
mod tests {
    /// Exercises NVML end-to-end on a machine with an NVIDIA driver:
    /// utilization in range, power a finite non-negative rate.
    ///
    /// `Sampler::new` yields no sampler when the backend is
    /// unavailable, so on a host without a driver this test fails at
    /// the `expect` below. Run it on the GPU host.
    #[test]
    fn poll_yields_sane_readings() {
        let mut sampler = super::nvml::Sampler::new()
            .expect("NVML sampler unavailable; run this test on a host with an NVIDIA driver");
        // Give the backend a window to measure over before polling.
        std::thread::sleep(std::time::Duration::from_millis(300));
        let r = sampler.poll();
        assert!((0.0..=100.0).contains(&r.util));
        assert!(r.power_w.is_finite() && r.power_w >= 0.0);
    }
}