Skip to main content

trueno/monitor/
gpu_monitor.rs

1//! GPU Monitor for real-time metrics collection (TRUENO-SPEC-010 Section 8.2)
2
3use std::collections::VecDeque;
4use std::sync::{Arc, Mutex, RwLock};
5
6use super::{GpuDeviceInfo, GpuMemoryMetrics, GpuMetrics, MonitorConfig, MonitorError};
7
/// GPU Monitor for real-time metrics collection (TRUENO-SPEC-010)
///
/// Bundles the description of one GPU device, the monitor configuration and a
/// shared history of collected [`GpuMetrics`] samples. Samples are taken on
/// demand with `collect()`; the oldest sample is evicted once the history
/// holds `config.history_size` entries.
///
/// NOTE(review): the struct reserves a background-thread handle and a stop
/// flag, but none of the constructors visible next to this definition start a
/// background thread — confirm where (or whether) polling is spawned.
///
/// # Example
///
/// ```rust,ignore
/// use trueno::monitor::{GpuMonitor, MonitorConfig};
///
/// // Create monitor for device 0
/// let monitor = GpuMonitor::new(0, MonitorConfig::default())?;
///
/// // Take a sample now. The history starts empty, so `latest()` returns an
/// // error until at least one sample has been collected.
/// let metrics = monitor.collect()?;
/// println!("GPU usage: {}%", metrics.utilization.gpu_percent);
///
/// // Re-read the most recent sample without collecting again
/// let latest = monitor.latest()?;
///
/// // Get history (ring buffer)
/// let history = monitor.history();
/// println!("Samples: {}", history.len());
/// ```
pub struct GpuMonitor {
    /// Description of the monitored device (returned by `device_info()`).
    device_info: GpuDeviceInfo,
    /// Monitor configuration; `history_size` bounds the history length.
    config: MonitorConfig,
    /// Metrics history (ring buffer): oldest sample at the front, newest at
    /// the back. Wrapped in `Arc<RwLock<..>>` so it can be shared and locked.
    history: Arc<RwLock<VecDeque<GpuMetrics>>>,
    /// Background thread handle. Only present with the `gpu` feature; the
    /// constructors shown alongside this struct always initialise it to `None`.
    #[cfg(feature = "gpu")]
    _background_handle: Option<std::thread::JoinHandle<()>>,
    /// Stop signal for background thread: `false` initially, set to `true`
    /// by `stop()` (and therefore on drop).
    stop_signal: Arc<Mutex<bool>>,
}
42
43impl GpuMonitor {
44    /// Create a new GPU monitor for the specified device
45    ///
46    /// # Errors
47    ///
48    /// Returns error if device is not found or initialization fails.
49    #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
50    pub fn new(device_index: u32, config: MonitorConfig) -> Result<Self, MonitorError> {
51        let device_info = GpuDeviceInfo::query_device(device_index)?;
52        let history = Arc::new(RwLock::new(VecDeque::with_capacity(config.history_size)));
53        let stop_signal = Arc::new(Mutex::new(false));
54
55        let monitor = Self { device_info, config, history, _background_handle: None, stop_signal };
56
57        Ok(monitor)
58    }
59
60    /// Create monitor without GPU feature (for testing)
61    #[cfg(not(feature = "gpu"))]
62    pub fn new(_device_index: u32, _config: MonitorConfig) -> Result<Self, MonitorError> {
63        Err(MonitorError::NotAvailable("GPU feature not enabled".to_string()))
64    }
65
66    /// Create a mock monitor for testing (no GPU required)
67    #[must_use]
68    pub fn mock(device_info: GpuDeviceInfo, config: MonitorConfig) -> Self {
69        Self {
70            device_info,
71            config,
72            history: Arc::new(RwLock::new(VecDeque::with_capacity(16))),
73            #[cfg(feature = "gpu")]
74            _background_handle: None,
75            stop_signal: Arc::new(Mutex::new(false)),
76        }
77    }
78
79    /// Get device info
80    #[must_use]
81    pub fn device_info(&self) -> &GpuDeviceInfo {
82        &self.device_info
83    }
84
85    /// Get current configuration
86    #[must_use]
87    pub fn config(&self) -> &MonitorConfig {
88        &self.config
89    }
90
91    /// Collect metrics sample now
92    ///
93    /// This performs an immediate collection and adds to history.
94    pub fn collect(&self) -> Result<GpuMetrics, MonitorError> {
95        // For now, return basic memory metrics
96        // Full implementation would query NVML/wgpu for utilization, thermal, etc.
97        let memory = GpuMemoryMetrics::new(
98            self.device_info.vram_total,
99            0, // Would query actual usage
100            self.device_info.vram_total,
101        );
102
103        let metrics = GpuMetrics::new(self.device_info.index, memory);
104
105        // Add to history
106        if let Ok(mut history) = self.history.write() {
107            if history.len() >= self.config.history_size {
108                history.pop_front();
109            }
110            history.push_back(metrics.clone());
111        }
112
113        Ok(metrics)
114    }
115
116    /// Get the latest metrics snapshot (without collecting)
117    pub fn latest(&self) -> Result<GpuMetrics, MonitorError> {
118        self.history
119            .read()
120            .ok()
121            .and_then(|h| h.back().cloned())
122            .ok_or(MonitorError::QueryFailed("No metrics available".to_string()))
123    }
124
125    /// Get history buffer (read-only snapshot)
126    #[must_use]
127    pub fn history(&self) -> Vec<GpuMetrics> {
128        self.history.read().map(|h| h.iter().cloned().collect()).unwrap_or_default()
129    }
130
131    /// Get number of samples in history
132    #[must_use]
133    pub fn sample_count(&self) -> usize {
134        self.history.read().map(|h| h.len()).unwrap_or(0)
135    }
136
137    /// Clear history buffer
138    pub fn clear_history(&self) {
139        if let Ok(mut history) = self.history.write() {
140            history.clear();
141        }
142    }
143
144    /// Check if background collection is active
145    #[must_use]
146    pub fn is_collecting(&self) -> bool {
147        #[cfg(feature = "gpu")]
148        {
149            self._background_handle.is_some()
150        }
151        #[cfg(not(feature = "gpu"))]
152        {
153            false
154        }
155    }
156
157    /// Stop background collection (if running)
158    pub fn stop(&self) {
159        if let Ok(mut stop) = self.stop_signal.lock() {
160            *stop = true;
161        }
162    }
163}
164
165impl Drop for GpuMonitor {
166    fn drop(&mut self) {
167        self.stop();
168    }
169}