Skip to main content

hematite/ui/
gpu_monitor.rs

//! Background GPU VRAM monitor.
//!
//! Spawns a Tokio task that polls `nvidia-smi` every two seconds. VRAM usage is
//! stored in atomics so the TUI render loop can read it without locking; the GPU
//! name and the sampled history are kept behind mutexes.
5
6use lazy_static::lazy_static;
7use std::collections::VecDeque;
8use std::sync::atomic::{AtomicU32, Ordering};
9use std::sync::{Arc, Mutex};
10
lazy_static! {
    /// Global access to GPU vitals for tool investigation (Zero-Shot Trends).
    ///
    /// Created lazily on first access with an empty `GpuState`.
    /// `spawn_gpu_monitor` returns a clone of this same `Arc`, so code that
    /// does not hold that handle can still read the polled values here.
    pub static ref GLOBAL_GPU_STATE: Arc<GpuState> = Arc::new(GpuState::new());
}
15
/// Shared GPU state — read by the TUI/Agent, written by the background poller.
///
/// The two VRAM counters are atomics, so reading them never takes a lock;
/// the name and the history are each guarded by their own mutex.
#[derive(Debug)]
pub struct GpuState {
    /// VRAM used in MiB. Stays 0 until the first successful poll.
    pub used_mib: AtomicU32,
    /// VRAM total in MiB. Stays 0 until the first successful poll.
    pub total_mib: AtomicU32,
    /// GPU name. Starts as the placeholder `"GPU"` and is replaced once, by
    /// the first successful poll that reports a non-empty name.
    pub name: Mutex<String>,
    /// Recent history points, oldest first (max 10). The poller appends at
    /// the back and drops from the front once the cap is exceeded.
    pub history: Mutex<VecDeque<HistoryPoint>>,
}
28
/// One sampled snapshot of GPU vitals, stored in `GpuState::history`.
///
/// Apart from `timestamp`, every field is copied from the `GpuMetrics` value
/// returned by `poll_nvidia_smi`; the query string there names the
/// `nvidia-smi` field each one comes from (queried with `nounits`).
#[derive(Debug, Clone)]
pub struct HistoryPoint {
    /// Local wall-clock time at which the sample was recorded.
    pub timestamp: chrono::DateTime<chrono::Local>,
    /// VRAM used in MiB at sample time.
    pub used_mib: u32,
    /// `temperature.gpu` reading (presumably °C — confirm against nvidia-smi docs).
    pub temperature: u32,
    /// `clocks.current.graphics` reading (presumably MHz — confirm).
    pub core_clock: u32,
    /// `clocks.current.memory` reading (presumably MHz — confirm).
    pub mem_clock: u32,
    /// `power.draw` reading (presumably watts — confirm); 0.0 when the value
    /// could not be parsed as a number.
    pub power_draw: f32,
    /// `fan.speed` reading (presumably percent — confirm); 0 when the value
    /// could not be parsed as a number.
    pub fan_speed: u32,
    /// Raw `clocks_throttle_reasons.active` text, stored unparsed.
    pub throttle_reasons: String,
}
40
41impl GpuState {
42    pub fn new() -> Self {
43        Self {
44            used_mib: AtomicU32::new(0),
45            total_mib: AtomicU32::new(0),
46            name: Mutex::new("GPU".into()),
47            history: Mutex::new(VecDeque::with_capacity(10)),
48        }
49    }
50
51    /// Returns (used_mib, total_mib).
52    pub fn read(&self) -> (u32, u32) {
53        (
54            self.used_mib.load(Ordering::Relaxed),
55            self.total_mib.load(Ordering::Relaxed),
56        )
57    }
58
59    /// Returns the ratio used/total, clamped to [0.0, 1.0].
60    pub fn ratio(&self) -> f64 {
61        let (used, total) = self.read();
62        if total == 0 {
63            return 0.0;
64        }
65        (used as f64 / total as f64).clamp(0.0, 1.0)
66    }
67
68    /// Returns a human-readable label like "7.5 GB / 12.0 GB".
69    pub fn label(&self) -> String {
70        let (used, total) = self.read();
71        if total == 0 {
72            return "N/A".into();
73        }
74        format!(
75            "{:.1} GB / {:.1} GB",
76            used as f64 / 1024.0,
77            total as f64 / 1024.0
78        )
79    }
80
81    /// Returns the GPU name (e.g. "NVIDIA GeForce RTX 4070").
82    pub fn gpu_name(&self) -> String {
83        self.name.lock().unwrap().clone()
84    }
85}
86
87/// Spawn the background polling task. Returns the shared state handle.
88pub fn spawn_gpu_monitor() -> Arc<GpuState> {
89    let state = GLOBAL_GPU_STATE.clone();
90    let bg = state.clone();
91
92    tokio::spawn(async move {
93        let mut poll_count = 0u64;
94        loop {
95            if let Some(metrics) = poll_nvidia_smi().await {
96                bg.used_mib.store(metrics.used_mib, Ordering::Relaxed);
97                bg.total_mib.store(metrics.total_mib, Ordering::Relaxed);
98                if !metrics.name.is_empty() {
99                    let mut name = bg.name.lock().unwrap();
100                    if *name == "GPU" {
101                        *name = metrics.name;
102                    }
103                }
104
105                // Add to history every ~2 minutes (60 iterations @ 2s each)
106                if poll_count % 60 == 0 {
107                    let mut history = bg.history.lock().unwrap();
108                    history.push_back(HistoryPoint {
109                        timestamp: chrono::Local::now(),
110                        used_mib: metrics.used_mib,
111                        temperature: metrics.temperature,
112                        core_clock: metrics.core_clock,
113                        mem_clock: metrics.mem_clock,
114                        power_draw: metrics.power_draw,
115                        fan_speed: metrics.fan_speed,
116                        throttle_reasons: metrics.throttle_reasons,
117                    });
118                    if history.len() > 10 {
119                        history.pop_front();
120                    }
121                }
122            }
123            poll_count += 1;
124            tokio::time::sleep(std::time::Duration::from_secs(2)).await;
125        }
126    });
127
128    state
129}
130
/// One parsed `nvidia-smi` sample, as returned by `poll_nvidia_smi`.
///
/// Values are the numbers `nvidia-smi` printed with `nounits`; the query
/// string in `poll_nvidia_smi` names the field each one comes from.
#[derive(Debug, Clone, PartialEq)]
pub struct GpuMetrics {
    /// VRAM used in MiB (`memory.used`).
    pub used_mib: u32,
    /// VRAM total in MiB (`memory.total`).
    pub total_mib: u32,
    /// GPU product name (`name`).
    pub name: String,
    /// `temperature.gpu` reading (presumably °C — confirm against nvidia-smi docs).
    pub temperature: u32,
    /// `clocks.current.graphics` reading (presumably MHz — confirm).
    pub core_clock: u32,
    /// `clocks.current.memory` reading (presumably MHz — confirm).
    pub mem_clock: u32,
    /// `power.draw` reading (presumably watts — confirm); 0.0 when the value
    /// could not be parsed as a number.
    pub power_draw: f32,
    /// `fan.speed` reading (presumably percent — confirm); 0 when the value
    /// could not be parsed as a number.
    pub fan_speed: u32,
    /// Raw `clocks_throttle_reasons.active` text, kept unparsed.
    pub throttle_reasons: String,
}
142
143/// Call nvidia-smi and parse the CSV output.
144async fn poll_nvidia_smi() -> Option<GpuMetrics> {
145    let output = tokio::process::Command::new("nvidia-smi")
146        .args([
147            "--query-gpu=memory.used,memory.total,name,temperature.gpu,clocks.current.graphics,clocks.current.memory,power.draw,fan.speed,clocks_throttle_reasons.active",
148            "--format=csv,noheader,nounits",
149        ])
150        .output()
151        .await
152        .ok()?;
153
154    if !output.status.success() {
155        return None;
156    }
157
158    let stdout = String::from_utf8_lossy(&output.stdout);
159    let line = stdout.trim();
160    let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
161    if parts.len() < 9 {
162        return None;
163    }
164
165    Some(GpuMetrics {
166        used_mib: parts[0].parse().ok()?,
167        total_mib: parts[1].parse().ok()?,
168        name: parts[2].to_string(),
169        temperature: parts[3].parse().ok()?,
170        core_clock: parts[4].parse().ok()?,
171        mem_clock: parts[5].parse().ok()?,
172        power_draw: parts[6].parse().unwrap_or(0.0),
173        fan_speed: parts[7].parse().unwrap_or(0),
174        throttle_reasons: parts[8].to_string(),
175    })
176}