concerto_gpu/monitor.rs
1//! The [`GpuMonitor`] trait and its associated snapshot and error types.
2
3use async_trait::async_trait;
4use bytesize::ByteSize;
5use concerto_core::GpuId;
6use serde::{Deserialize, Serialize};
7
8/// A point-in-time view of a single GPU's telemetry.
9///
10/// `GpuSnapshot` is the lowest common denominator of what every [`GpuMonitor`]
11/// implementation must be able to report. Higher layers (e.g. `concerto-core`)
12/// turn these snapshots into richer state such as `GpuState`.
13#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
14pub struct GpuSnapshot {
15 /// The GPU's index on the node.
16 pub id: GpuId,
17 /// Total physical VRAM on the device.
18 pub memory_total: ByteSize,
19 /// VRAM currently in use (from all processes on the device, not just ours).
20 pub memory_used: ByteSize,
21 /// Core temperature in degrees Celsius.
22 pub temperature_celsius: u32,
23 /// GPU core utilisation as a percentage (0-100).
24 pub utilisation_percent: u32,
25 /// Cumulative count of uncorrectable ECC errors reported by the driver.
26 pub ecc_errors_uncorrected: u64,
27}
28
29/// Abstraction over a source of GPU telemetry.
30///
31/// Implementors must be `Send + Sync` so that the monitor can be shared across
32/// tasks behind an `Arc`.
33#[async_trait]
34pub trait GpuMonitor: Send + Sync {
35 /// The number of GPUs this monitor is tracking.
36 ///
37 /// This is the value at construction time; implementations that support
38 /// hot-plug / hot-unplug (such as `MockGpuMonitor` for test scenarios) may
39 /// return a different value after the monitor has been mutated.
40 fn gpu_count(&self) -> usize;
41
42 /// Take a snapshot of all GPUs currently visible to this monitor.
43 async fn snapshot(&self) -> Vec<GpuSnapshot>;
44}
45
46/// Errors that can occur when constructing or using a [`GpuMonitor`].
47#[derive(Debug, thiserror::Error)]
48pub enum GpuMonitorError {
49 /// NVML could not be initialised (driver missing, permission denied, etc.).
50 #[error("failed to initialise NVML: {0}")]
51 NvmlInit(String),
52
53 /// A query against the NVML backend failed.
54 #[error("NVML query failed: {0}")]
55 NvmlQuery(String),
56
57 /// A GPU index was requested that does not exist on this node.
58 #[error("GPU index {0} is out of range")]
59 GpuOutOfRange(usize),
60
61 /// The requested feature is not supported on this platform.
62 #[error("feature not supported on this platform: {0}")]
63 Unsupported(&'static str),
64}