Skip to main content

hematite/ui/
gpu_monitor.rs

1//! Background GPU VRAM monitor.
2//!
//! Spawns a Tokio task that polls `nvidia-smi` every two seconds and stores
//! the VRAM figures in lock-free atomics so the TUI render loop can read them
//! cheaply; the GPU name and the sample history are kept behind `RwLock`s.
5
6use lazy_static::lazy_static;
7use std::collections::VecDeque;
8use std::sync::atomic::{AtomicU32, Ordering};
9use std::sync::{Arc, RwLock};
10
lazy_static! {
    /// Global access to GPU vitals for tool investigation (Zero-Shot Trends).
    ///
    /// Process-wide handle, created by `lazy_static!` on first access.
    /// `spawn_gpu_monitor` clones this same `Arc`, so everything the
    /// background poller writes is visible through this static as well.
    pub static ref GLOBAL_GPU_STATE: Arc<GpuState> = Arc::new(GpuState::new());
}
15
/// Shared GPU state — read by the TUI/Agent, written by the background poller.
///
/// The two VRAM counters are atomics, so reading them is a plain atomic
/// load; `name` and `history` are each guarded by an `RwLock`. While
/// `total_mib` is still zero the accessor methods report "no data"
/// (`ratio()` returns `0.0`, `label()` returns `"N/A"`).
#[derive(Debug)]
pub struct GpuState {
    /// VRAM used in MiB.
    pub used_mib: AtomicU32,
    /// VRAM total in MiB.
    pub total_mib: AtomicU32,
    /// GPU name. Holds the placeholder `"GPU"` until the poller replaces it
    /// with the first non-empty name it receives; it is not overwritten
    /// afterwards.
    pub name: RwLock<String>,
    /// Recent history points, oldest first. The poller trims the queue back
    /// to 10 entries after each push; the type itself enforces no limit.
    pub history: RwLock<VecDeque<HistoryPoint>>,
}
28
/// One sampled snapshot of GPU telemetry, stored in `GpuState::history`.
///
/// The values are copied from a parsed `nvidia-smi` reading that was
/// requested with `nounits`, so no unit travels with them. The units noted
/// on the fields below are presumed from nvidia-smi conventions — TODO
/// confirm against the nvidia-smi documentation.
#[derive(Debug, Clone)]
pub struct HistoryPoint {
    /// Local wall-clock time at which the sample was recorded.
    pub timestamp: chrono::DateTime<chrono::Local>,
    /// VRAM in use at sample time, in MiB.
    pub used_mib: u32,
    /// GPU temperature (`temperature.gpu`); presumably °C.
    pub temperature: u32,
    /// Current graphics clock (`clocks.current.graphics`); presumably MHz.
    pub core_clock: u32,
    /// Current memory clock (`clocks.current.memory`); presumably MHz.
    pub mem_clock: u32,
    /// Power draw (`power.draw`); presumably watts. `0.0` when the value
    /// could not be parsed.
    pub power_draw: f32,
    /// Fan speed (`fan.speed`); presumably percent. `0` when the value
    /// could not be parsed.
    pub fan_speed: u32,
    /// Raw `clocks_throttle_reasons.active` text, stored unparsed.
    pub throttle_reasons: String,
}
40
41impl Default for GpuState {
42    fn default() -> Self {
43        Self::new()
44    }
45}
46
47impl GpuState {
48    pub fn new() -> Self {
49        Self {
50            used_mib: AtomicU32::new(0),
51            total_mib: AtomicU32::new(0),
52            name: RwLock::new("GPU".into()),
53            history: RwLock::new(VecDeque::with_capacity(10)),
54        }
55    }
56
57    /// Returns (used_mib, total_mib).
58    pub fn read(&self) -> (u32, u32) {
59        (
60            self.used_mib.load(Ordering::Relaxed),
61            self.total_mib.load(Ordering::Relaxed),
62        )
63    }
64
65    /// Returns the ratio used/total, clamped to [0.0, 1.0].
66    pub fn ratio(&self) -> f64 {
67        let (used, total) = self.read();
68        if total == 0 {
69            return 0.0;
70        }
71        (used as f64 / total as f64).clamp(0.0, 1.0)
72    }
73
74    /// Returns a human-readable label like "7.5 GB / 12.0 GB".
75    pub fn label(&self) -> String {
76        let (used, total) = self.read();
77        if total == 0 {
78            return "N/A".into();
79        }
80        format!(
81            "{:.1} GB / {:.1} GB",
82            used as f64 / 1024.0,
83            total as f64 / 1024.0
84        )
85    }
86
87    /// Returns the GPU name (e.g. "NVIDIA GeForce RTX 4070").
88    pub fn gpu_name(&self) -> String {
89        self.name.read().unwrap().clone()
90    }
91}
92
93/// Spawn the background polling task. Returns the shared state handle.
94pub fn spawn_gpu_monitor() -> Arc<GpuState> {
95    let state = GLOBAL_GPU_STATE.clone();
96    let bg = state.clone();
97
98    tokio::spawn(async move {
99        let mut poll_count = 0u64;
100        loop {
101            if let Some(metrics) = poll_nvidia_smi().await {
102                bg.used_mib.store(metrics.used_mib, Ordering::Relaxed);
103                bg.total_mib.store(metrics.total_mib, Ordering::Relaxed);
104                if !metrics.name.is_empty() {
105                    let mut name = bg.name.write().unwrap();
106                    if *name == "GPU" {
107                        *name = metrics.name;
108                    }
109                }
110
111                // Add to history every ~2 minutes (60 iterations @ 2s each)
112                if poll_count.is_multiple_of(60) {
113                    let mut history = bg.history.write().unwrap();
114                    history.push_back(HistoryPoint {
115                        timestamp: chrono::Local::now(),
116                        used_mib: metrics.used_mib,
117                        temperature: metrics.temperature,
118                        core_clock: metrics.core_clock,
119                        mem_clock: metrics.mem_clock,
120                        power_draw: metrics.power_draw,
121                        fan_speed: metrics.fan_speed,
122                        throttle_reasons: metrics.throttle_reasons,
123                    });
124                    if history.len() > 10 {
125                        history.pop_front();
126                    }
127                }
128            }
129            poll_count += 1;
130            tokio::time::sleep(std::time::Duration::from_secs(2)).await;
131        }
132    });
133
134    state
135}
136
/// One parsed `nvidia-smi` reading.
///
/// The values come from a query made with `nounits`, so no unit travels
/// with them. Apart from the MiB counters, the units noted below are
/// presumed from nvidia-smi conventions — TODO confirm.
#[derive(Debug, Clone, PartialEq)]
pub struct GpuMetrics {
    /// VRAM in use (`memory.used`), in MiB.
    pub used_mib: u32,
    /// Total VRAM (`memory.total`), in MiB.
    pub total_mib: u32,
    /// GPU product name as reported by `nvidia-smi`.
    pub name: String,
    /// GPU temperature (`temperature.gpu`); presumably °C.
    pub temperature: u32,
    /// Current graphics clock (`clocks.current.graphics`); presumably MHz.
    pub core_clock: u32,
    /// Current memory clock (`clocks.current.memory`); presumably MHz.
    pub mem_clock: u32,
    /// Power draw (`power.draw`); presumably watts.
    pub power_draw: f32,
    /// Fan speed (`fan.speed`); presumably percent.
    pub fan_speed: u32,
    /// Raw `clocks_throttle_reasons.active` text, unparsed.
    pub throttle_reasons: String,
}
148
149/// Call nvidia-smi and parse the CSV output.
150async fn poll_nvidia_smi() -> Option<GpuMetrics> {
151    let output = tokio::process::Command::new("nvidia-smi")
152        .args([
153            "--query-gpu=memory.used,memory.total,name,temperature.gpu,clocks.current.graphics,clocks.current.memory,power.draw,fan.speed,clocks_throttle_reasons.active",
154            "--format=csv,noheader,nounits",
155        ])
156        .output()
157        .await
158        .ok()?;
159
160    if !output.status.success() {
161        return None;
162    }
163
164    let stdout = String::from_utf8_lossy(&output.stdout);
165    let line = stdout.trim();
166    let mut it = line.split(',').map(|s| s.trim());
167    let (Some(p0), Some(p1), Some(p2), Some(p3), Some(p4), Some(p5), Some(p6), Some(p7), Some(p8)) = (
168        it.next(),
169        it.next(),
170        it.next(),
171        it.next(),
172        it.next(),
173        it.next(),
174        it.next(),
175        it.next(),
176        it.next(),
177    ) else {
178        return None;
179    };
180
181    Some(GpuMetrics {
182        used_mib: p0.parse().ok()?,
183        total_mib: p1.parse().ok()?,
184        name: p2.to_string(),
185        temperature: p3.parse().ok()?,
186        core_clock: p4.parse().ok()?,
187        mem_clock: p5.parse().ok()?,
188        power_draw: p6.parse().unwrap_or(0.0),
189        fan_speed: p7.parse().unwrap_or(0),
190        throttle_reasons: p8.to_string(),
191    })
192}