Skip to main content

concerto_gpu/
monitor.rs

//! The [`GpuMonitor`] trait and its associated snapshot and error types.
2
3use async_trait::async_trait;
4use bytesize::ByteSize;
5use concerto_core::GpuId;
6use serde::{Deserialize, Serialize};
7
8/// A point-in-time view of a single GPU's telemetry.
9///
10/// `GpuSnapshot` is the lowest common denominator of what every [`GpuMonitor`]
11/// implementation must be able to report. Higher layers (e.g. `concerto-core`)
12/// turn these snapshots into richer state such as `GpuState`.
13#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
14pub struct GpuSnapshot {
15    /// The GPU's index on the node.
16    pub id: GpuId,
17    /// Total physical VRAM on the device.
18    pub memory_total: ByteSize,
19    /// VRAM currently in use (from all processes on the device, not just ours).
20    pub memory_used: ByteSize,
21    /// Core temperature in degrees Celsius.
22    pub temperature_celsius: u32,
23    /// GPU core utilisation as a percentage (0-100).
24    pub utilisation_percent: u32,
25    /// Cumulative count of uncorrectable ECC errors reported by the driver.
26    pub ecc_errors_uncorrected: u64,
27}
28
29/// Abstraction over a source of GPU telemetry.
30///
31/// Implementors must be `Send + Sync` so that the monitor can be shared across
32/// tasks behind an `Arc`.
33#[async_trait]
34pub trait GpuMonitor: Send + Sync {
35    /// The number of GPUs this monitor is tracking.
36    ///
37    /// This is the value at construction time; implementations that support
38    /// hot-plug / hot-unplug (such as `MockGpuMonitor` for test scenarios) may
39    /// return a different value after the monitor has been mutated.
40    fn gpu_count(&self) -> usize;
41
42    /// Take a snapshot of all GPUs currently visible to this monitor.
43    async fn snapshot(&self) -> Vec<GpuSnapshot>;
44}
45
46/// Errors that can occur when constructing or using a [`GpuMonitor`].
47#[derive(Debug, thiserror::Error)]
48pub enum GpuMonitorError {
49    /// NVML could not be initialised (driver missing, permission denied, etc.).
50    #[error("failed to initialise NVML: {0}")]
51    NvmlInit(String),
52
53    /// A query against the NVML backend failed.
54    #[error("NVML query failed: {0}")]
55    NvmlQuery(String),
56
57    /// A GPU index was requested that does not exist on this node.
58    #[error("GPU index {0} is out of range")]
59    GpuOutOfRange(usize),
60
61    /// The requested feature is not supported on this platform.
62    #[error("feature not supported on this platform: {0}")]
63    Unsupported(&'static str),
64}