Skip to main content

entrenar/monitor/gpu/
types.rs

1//! GPU metric types for monitoring.
2
3use serde::{Deserialize, Serialize};
4
// ===========================================================================
// Canned hardware figures for the mock GPU (modelled after NVIDIA RTX 4090).
// These feed the synthetic snapshot used in tests.
// ===========================================================================

/// VRAM capacity reported by the mock device, in megabytes (24 * 1024).
const MOCK_GPU_MEMORY_TOTAL_MB: u64 = 24_576;

/// Power draw reported by the mock device, in watts.
const MOCK_GPU_POWER_WATTS: f32 = 250.0;

/// Power limit reported by the mock device, in watts.
const MOCK_GPU_POWER_LIMIT_WATTS: f32 = 450.0;

/// Core clock reported by the mock device, in MHz.
const MOCK_GPU_CLOCK_MHZ: u32 = 2_100;

/// Memory clock reported by the mock device, in MHz.
const MOCK_GPU_MEMORY_CLOCK_MHZ: u32 = 10_000;

/// PCIe receive throughput reported by the mock device, in KB/s.
const MOCK_GPU_PCIE_RX_KBPS: u64 = 2_000;

/// Placeholder PID assigned to the mock training process.
const MOCK_PROCESS_PID: u32 = 12_345;
23
24/// Process using GPU resources
25#[derive(Debug, Clone, Default, Serialize, Deserialize)]
26pub struct GpuProcess {
27    /// Process ID
28    pub pid: u32,
29    /// Full path to executable
30    pub exe_path: String,
31    /// GPU memory used by this process in MB
32    pub gpu_memory_mb: u64,
33    /// CPU usage percentage (0-100)
34    pub cpu_percent: f32,
35    /// Resident set size (RSS) in MB
36    pub rss_mb: u64,
37}
38
39/// GPU metrics snapshot (inspired by btop's GPU visualization)
40///
41/// Reference: btop `src/btop_shared.hpp` lines 130-171
42#[derive(Debug, Clone, Default, Serialize, Deserialize)]
43pub struct GpuMetrics {
44    /// Device index
45    pub device_id: u32,
46    /// GPU name (e.g., "RTX 4090")
47    pub name: String,
48    /// GPU compute utilization (0-100%)
49    pub utilization_percent: u32,
50    /// Used VRAM in MB
51    pub memory_used_mb: u64,
52    /// Total VRAM in MB
53    pub memory_total_mb: u64,
54    /// Memory utilization (0-100%)
55    pub memory_utilization_percent: u32,
56    /// GPU temperature in Celsius
57    pub temperature_celsius: u32,
58    /// Current power draw in watts
59    pub power_watts: f32,
60    /// Power limit in watts
61    pub power_limit_watts: f32,
62    /// Graphics clock in MHz
63    pub clock_mhz: u32,
64    /// Memory clock in MHz
65    pub memory_clock_mhz: u32,
66    /// PCIe transmit throughput in KB/s
67    pub pcie_tx_kbps: u64,
68    /// PCIe receive throughput in KB/s
69    pub pcie_rx_kbps: u64,
70    /// Fan speed percentage (0-100%)
71    pub fan_speed_percent: u32,
72    /// Processes using this GPU
73    pub processes: Vec<GpuProcess>,
74}
75
76impl GpuMetrics {
77    /// Create mock metrics for testing
78    pub fn mock(device_id: u32) -> Self {
79        Self {
80            device_id,
81            name: format!("Mock GPU {device_id}"),
82            utilization_percent: 75,
83            memory_used_mb: 8192,
84            memory_total_mb: MOCK_GPU_MEMORY_TOTAL_MB,
85            memory_utilization_percent: 33,
86            temperature_celsius: 65,
87            power_watts: MOCK_GPU_POWER_WATTS,
88            power_limit_watts: MOCK_GPU_POWER_LIMIT_WATTS,
89            clock_mhz: MOCK_GPU_CLOCK_MHZ,
90            memory_clock_mhz: MOCK_GPU_MEMORY_CLOCK_MHZ,
91            pcie_tx_kbps: 1000,
92            pcie_rx_kbps: MOCK_GPU_PCIE_RX_KBPS,
93            fan_speed_percent: 50,
94            processes: vec![GpuProcess {
95                pid: MOCK_PROCESS_PID,
96                exe_path: "/usr/bin/mock_training".to_string(),
97                gpu_memory_mb: 4096,
98                cpu_percent: 95.0,
99                rss_mb: 2048,
100            }],
101        }
102    }
103
104    /// Calculate memory utilization percentage
105    pub fn memory_percent(&self) -> f64 {
106        if self.memory_total_mb == 0 {
107            return 0.0;
108        }
109        self.memory_used_mb as f64 / self.memory_total_mb as f64 * 100.0
110    }
111
112    /// Calculate power utilization percentage
113    pub fn power_percent(&self) -> f64 {
114        if self.power_limit_watts <= 0.0 {
115            return 0.0;
116        }
117        f64::from(self.power_watts) / f64::from(self.power_limit_watts) * 100.0
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    // The mock snapshot echoes the requested device and looks plausible.
    #[test]
    fn test_gpu_metrics_mock() {
        let snapshot = GpuMetrics::mock(0);
        assert_eq!(snapshot.device_id, 0);
        assert!(!snapshot.name.is_empty());
        assert!(snapshot.utilization_percent <= 100);
    }

    // Half of the VRAM in use reads as 50%.
    #[test]
    fn test_gpu_metrics_memory_percent() {
        let snapshot = GpuMetrics {
            memory_used_mb: 8000,
            memory_total_mb: 16000,
            ..GpuMetrics::mock(0)
        };
        let pct = snapshot.memory_percent();
        assert!((pct - 50.0).abs() < 0.1);
    }

    // A zero total must not divide by zero; it reads as 0%.
    #[test]
    fn test_gpu_metrics_memory_percent_zero_total() {
        let snapshot = GpuMetrics {
            memory_total_mb: 0,
            ..GpuMetrics::mock(0)
        };
        let pct = snapshot.memory_percent();
        assert!(pct.abs() < f64::EPSILON);
    }

    // Drawing half of the power limit reads as 50%.
    #[test]
    fn test_gpu_metrics_power_percent() {
        let snapshot = GpuMetrics {
            power_watts: 225.0,
            power_limit_watts: 450.0,
            ..GpuMetrics::mock(0)
        };
        let pct = snapshot.power_percent();
        assert!((pct - 50.0).abs() < 0.1);
    }

    // A zero power limit reads as 0%.
    #[test]
    fn test_gpu_metrics_power_percent_zero_limit() {
        let snapshot = GpuMetrics {
            power_limit_watts: 0.0,
            ..GpuMetrics::mock(0)
        };
        let pct = snapshot.power_percent();
        assert!(pct.abs() < f64::EPSILON);
    }

    // `Default` produces an all-zero, unnamed snapshot.
    #[test]
    fn test_gpu_metrics_default() {
        let blank = GpuMetrics::default();
        assert_eq!(blank.device_id, 0);
        assert!(blank.name.is_empty());
        assert_eq!(blank.utilization_percent, 0);
    }

    // Cloning carries the identifying fields across.
    #[test]
    fn test_gpu_metrics_clone() {
        let original = GpuMetrics::mock(0);
        let duplicate = original.clone();
        assert_eq!(original.device_id, duplicate.device_id);
        assert_eq!(original.name, duplicate.name);
    }

    // A JSON round trip preserves the checked fields.
    #[test]
    fn test_gpu_metrics_serde() {
        let original = GpuMetrics::mock(0);
        let encoded = serde_json::to_string(&original).expect("JSON serialization should succeed");
        let decoded: GpuMetrics =
            serde_json::from_str(&encoded).expect("JSON deserialization should succeed");
        assert_eq!(original.device_id, decoded.device_id);
        assert_eq!(original.utilization_percent, decoded.utilization_percent);
    }
}
189
#[cfg(test)]
mod property_tests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(200))]

        // Memory percentage is never negative for a non-zero total.
        // It may exceed 100 when used > total; that input is invalid but
        // must not crash, so no upper bound is asserted.
        #[test]
        fn prop_memory_percent_bounds(used in 0u64..100_000, total in 1u64..100_000) {
            let snapshot = GpuMetrics {
                memory_total_mb: total,
                memory_used_mb: used,
                ..GpuMetrics::default()
            };
            prop_assert!(snapshot.memory_percent() >= 0.0);
        }

        // Power percentage is never negative for positive draw and limit.
        #[test]
        fn prop_power_percent_bounds(power in 0.001f32..1000.0, limit in 0.1f32..1000.0) {
            let snapshot = GpuMetrics {
                power_limit_watts: limit,
                power_watts: power,
                ..GpuMetrics::default()
            };
            prop_assert!(snapshot.power_percent() >= 0.0);
        }
    }
}
221}