Skip to main content

trueno_gpu/monitor/memory/
mod.rs

//! Memory Hierarchy Monitoring (TRUENO-SPEC-021)
//!
//! Comprehensive memory metrics covering RAM, swap, and GPU VRAM,
//! with pressure level detection based on the LAMBDA-0002 specification.
//!
//! # Memory Pressure Levels (from lambda-lab-rust-development)
//!
//! | Level | Available | Action |
//! |-------|-----------|--------|
//! | Ok | >= 50% | Normal operation |
//! | Elevated | 30-50% | Monitor closely |
//! | Warning | 15-30% | Reduce parallelism |
//! | Critical | < 15% | Block new builds |
//!
//! # References
//!
//! - [Hennessy2017] Memory hierarchy model
//! - [McCalpin1995] STREAM bandwidth benchmarking
//! - [Drepper2007] Memory access patterns
20
21use std::collections::VecDeque;
22use std::fmt;
23
24use super::device::DeviceId;
25
// ============================================================================
// Memory Pressure Levels (LAMBDA-0002)
// ============================================================================
29
/// Memory pressure level derived from the percentage of RAM still available.
///
/// Thresholds follow the lambda-lab-rust-development LAMBDA-0002 specification.
/// Variants are declared from least severe to most severe.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PressureLevel {
    /// Normal operation (>= 50% available)
    Ok,
    /// Monitor closely (30-50% available)
    Elevated,
    /// Reduce parallelism (15-30% available)
    Warning,
    /// Block new builds (< 15% available)
    Critical,
}

impl PressureLevel {
    /// Classify an available-memory percentage into a pressure level.
    ///
    /// Lower bounds are inclusive: exactly `50.0` is [`PressureLevel::Ok`],
    /// exactly `30.0` is [`PressureLevel::Elevated`] and exactly `15.0` is
    /// [`PressureLevel::Warning`].
    ///
    /// Any value that fails all three comparisons is reported as
    /// [`PressureLevel::Critical`]. That includes negative numbers and `NaN`,
    /// so an unusable reading errs on the side of caution.
    #[must_use]
    pub fn from_available_percent(percent: f64) -> Self {
        if percent >= 50.0 {
            Self::Ok
        } else if percent >= 30.0 {
            Self::Elevated
        } else if percent >= 15.0 {
            Self::Warning
        } else {
            // Reached for values below 15.0 and for NaN (every `>=` is false).
            Self::Critical
        }
    }

    /// Return the human-readable recommendation for this pressure level.
    #[must_use]
    pub fn recommendation(&self) -> &'static str {
        match self {
            Self::Ok => "System healthy - normal operation",
            Self::Elevated => "Memory usage elevated - monitor closely",
            Self::Warning => "High memory usage - reduce parallel jobs",
            Self::Critical => "Critical memory pressure - block new allocations",
        }
    }

    /// Report whether new allocations should be blocked.
    ///
    /// Only [`PressureLevel::Critical`] returns `true`.
    #[must_use]
    pub fn should_block_allocations(&self) -> bool {
        matches!(self, Self::Critical)
    }

    /// Return the ANSI escape sequence used to colour this level in the TUI.
    #[must_use]
    pub fn ansi_color(&self) -> &'static str {
        match self {
            Self::Ok => "\x1b[32m",            // Green
            Self::Elevated => "\x1b[33m",      // Yellow
            Self::Warning => "\x1b[38;5;208m", // Orange
            Self::Critical => "\x1b[31m",      // Red
        }
    }
}

impl fmt::Display for PressureLevel {
    /// Write the upper-case label (`OK`, `ELEVATED`, `WARNING`, `CRITICAL`).
    ///
    /// The label is emitted through [`fmt::Formatter::pad`], so width, fill and
    /// alignment flags such as `{:<10}` or `{:>8}` are honoured. This matters
    /// when the level is rendered in an aligned table column.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Ok => "OK",
            Self::Elevated => "ELEVATED",
            Self::Warning => "WARNING",
            Self::Critical => "CRITICAL",
        };
        f.pad(label)
    }
}
96
// ============================================================================
// Memory Metrics (TRUENO-SPEC-021 Section 3.2)
// ============================================================================
100
101/// Comprehensive memory metrics for system and GPU
102#[derive(Debug, Clone)]
103pub struct MemoryMetrics {
104    // System RAM
105    /// RAM used in bytes
106    pub ram_used_bytes: u64,
107    /// RAM total in bytes
108    pub ram_total_bytes: u64,
109    /// RAM available in bytes (accounts for cache/buffers)
110    pub ram_available_bytes: u64,
111    /// RAM cached in bytes
112    pub ram_cached_bytes: u64,
113    /// RAM buffers in bytes
114    pub ram_buffers_bytes: u64,
115
116    // Swap
117    /// Swap used in bytes
118    pub swap_used_bytes: u64,
119    /// Swap total in bytes
120    pub swap_total_bytes: u64,
121
122    // Per-GPU VRAM
123    /// GPU VRAM metrics for each device
124    pub gpu_vram: Vec<GpuVramMetrics>,
125
126    // Derived metrics
127    /// Current pressure level
128    pub pressure_level: PressureLevel,
129    /// Safe number of parallel jobs (based on 3GB/job heuristic)
130    pub safe_parallel_jobs: u32,
131
132    // Bandwidth (if measurable)
133    /// RAM read bandwidth in GB/s
134    pub ram_read_bandwidth_gbps: Option<f64>,
135    /// RAM write bandwidth in GB/s
136    pub ram_write_bandwidth_gbps: Option<f64>,
137
138    // History (60-point sparkline, ~60 seconds at 1Hz)
139    /// RAM usage history (percentage, 0.0-100.0)
140    pub ram_history: VecDeque<f64>,
141    /// Swap usage history (percentage, 0.0-100.0)
142    pub swap_history: VecDeque<f64>,
143}
144
145impl MemoryMetrics {
146    /// Maximum history points (60 seconds at 1Hz)
147    pub const MAX_HISTORY_POINTS: usize = 60;
148
149    /// Create new memory metrics by reading system state
150    #[must_use]
151    pub fn new() -> Self {
152        let mut metrics = Self::default();
153        metrics.refresh();
154        metrics
155    }
156
157    /// Refresh all memory metrics from system
158    pub fn refresh(&mut self) {
159        self.read_meminfo();
160        self.read_swapinfo();
161        self.calculate_pressure();
162        self.update_history();
163    }
164
165    /// Read /proc/meminfo on Linux
166    fn read_meminfo(&mut self) {
167        #[cfg(target_os = "linux")]
168        {
169            if let Ok(content) = std::fs::read_to_string("/proc/meminfo") {
170                for line in content.lines() {
171                    let parts: Vec<&str> = line.split_whitespace().collect();
172                    if parts.len() >= 2 {
173                        let value_kb: u64 = parts[1].parse().unwrap_or(0);
174                        let value_bytes = value_kb * 1024;
175
176                        match parts[0] {
177                            "MemTotal:" => self.ram_total_bytes = value_bytes,
178                            "MemAvailable:" => self.ram_available_bytes = value_bytes,
179                            "Cached:" => self.ram_cached_bytes = value_bytes,
180                            "Buffers:" => self.ram_buffers_bytes = value_bytes,
181                            _ => {}
182                        }
183                    }
184                }
185                // Used = Total - Available
186                self.ram_used_bytes = self
187                    .ram_total_bytes
188                    .saturating_sub(self.ram_available_bytes);
189            }
190        }
191    }
192
193    /// Read swap information
194    fn read_swapinfo(&mut self) {
195        #[cfg(target_os = "linux")]
196        {
197            if let Ok(content) = std::fs::read_to_string("/proc/meminfo") {
198                for line in content.lines() {
199                    let parts: Vec<&str> = line.split_whitespace().collect();
200                    if parts.len() >= 2 {
201                        let value_kb: u64 = parts[1].parse().unwrap_or(0);
202                        let value_bytes = value_kb * 1024;
203
204                        match parts[0] {
205                            "SwapTotal:" => self.swap_total_bytes = value_bytes,
206                            "SwapFree:" => {
207                                self.swap_used_bytes =
208                                    self.swap_total_bytes.saturating_sub(value_bytes);
209                            }
210                            _ => {}
211                        }
212                    }
213                }
214            }
215        }
216    }
217
218    /// Calculate pressure level and safe jobs
219    fn calculate_pressure(&mut self) {
220        let available_pct = self.ram_available_percent();
221        self.pressure_level = PressureLevel::from_available_percent(available_pct);
222
223        // Safe jobs = min(available_gb / 3.0, cpu_cores)
224        // Based on 3GB/job heuristic [Volkov2008]
225        let available_gb = self.ram_available_bytes as f64 / (1024.0 * 1024.0 * 1024.0);
226        let cpu_cores = std::thread::available_parallelism()
227            .map(|n| n.get() as u32)
228            .unwrap_or(1);
229
230        self.safe_parallel_jobs = ((available_gb / 3.0) as u32).min(cpu_cores).max(1);
231    }
232
233    /// Update history sparklines
234    fn update_history(&mut self) {
235        // Add current RAM usage percentage
236        self.ram_history.push_back(self.ram_usage_percent());
237        if self.ram_history.len() > Self::MAX_HISTORY_POINTS {
238            self.ram_history.pop_front();
239        }
240
241        // Add current swap usage percentage
242        self.swap_history.push_back(self.swap_usage_percent());
243        if self.swap_history.len() > Self::MAX_HISTORY_POINTS {
244            self.swap_history.pop_front();
245        }
246    }
247
248    // =========================================================================
249    // Helper Methods
250    // =========================================================================
251
252    /// Get RAM usage percentage (0.0-100.0)
253    #[must_use]
254    pub fn ram_usage_percent(&self) -> f64 {
255        if self.ram_total_bytes == 0 {
256            return 0.0;
257        }
258        (self.ram_used_bytes as f64 / self.ram_total_bytes as f64) * 100.0
259    }
260
261    /// Get RAM available percentage (0.0-100.0)
262    #[must_use]
263    pub fn ram_available_percent(&self) -> f64 {
264        if self.ram_total_bytes == 0 {
265            return 100.0;
266        }
267        (self.ram_available_bytes as f64 / self.ram_total_bytes as f64) * 100.0
268    }
269
270    /// Get swap usage percentage (0.0-100.0)
271    #[must_use]
272    pub fn swap_usage_percent(&self) -> f64 {
273        batuta_common::math::usage_percent(self.swap_used_bytes, self.swap_total_bytes)
274    }
275
276    /// Get RAM used in GB
277    #[must_use]
278    pub fn ram_used_gb(&self) -> f64 {
279        self.ram_used_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
280    }
281
282    /// Get RAM total in GB
283    #[must_use]
284    pub fn ram_total_gb(&self) -> f64 {
285        self.ram_total_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
286    }
287
288    /// Get swap used in GB
289    #[must_use]
290    pub fn swap_used_gb(&self) -> f64 {
291        self.swap_used_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
292    }
293
294    /// Get swap total in GB
295    #[must_use]
296    pub fn swap_total_gb(&self) -> f64 {
297        self.swap_total_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
298    }
299
300    /// Get total GPU VRAM used across all devices
301    #[must_use]
302    pub fn total_vram_used_bytes(&self) -> u64 {
303        self.gpu_vram.iter().map(|v| v.used_bytes).sum()
304    }
305
306    /// Get total GPU VRAM capacity across all devices
307    #[must_use]
308    pub fn total_vram_total_bytes(&self) -> u64 {
309        self.gpu_vram.iter().map(|v| v.total_bytes).sum()
310    }
311}
312
313impl Default for MemoryMetrics {
314    fn default() -> Self {
315        Self {
316            ram_used_bytes: 0,
317            ram_total_bytes: 0,
318            ram_available_bytes: 0,
319            ram_cached_bytes: 0,
320            ram_buffers_bytes: 0,
321            swap_used_bytes: 0,
322            swap_total_bytes: 0,
323            gpu_vram: Vec::new(),
324            pressure_level: PressureLevel::Ok,
325            safe_parallel_jobs: 1,
326            ram_read_bandwidth_gbps: None,
327            ram_write_bandwidth_gbps: None,
328            ram_history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
329            swap_history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
330        }
331    }
332}
333
// ============================================================================
// GPU VRAM Metrics (TRUENO-SPEC-021 Section 3.2)
// ============================================================================
337
338/// GPU VRAM metrics for a single device
339#[derive(Debug, Clone)]
340pub struct GpuVramMetrics {
341    /// Device ID
342    pub device_id: DeviceId,
343    /// VRAM used in bytes
344    pub used_bytes: u64,
345    /// VRAM total in bytes
346    pub total_bytes: u64,
347    /// VRAM reserved by driver/system
348    pub reserved_bytes: u64,
349    /// PCIe BAR1 aperture usage (for large memory)
350    pub bar1_used_bytes: u64,
351    /// Usage history (percentage, 0.0-100.0)
352    pub history: VecDeque<f64>,
353}
354
355impl GpuVramMetrics {
356    /// Maximum history points
357    pub const MAX_HISTORY_POINTS: usize = 60;
358
359    /// Create new GPU VRAM metrics
360    #[must_use]
361    pub fn new(device_id: DeviceId, used: u64, total: u64) -> Self {
362        Self {
363            device_id,
364            used_bytes: used,
365            total_bytes: total,
366            reserved_bytes: 0,
367            bar1_used_bytes: 0,
368            history: VecDeque::with_capacity(Self::MAX_HISTORY_POINTS),
369        }
370    }
371
372    /// Get VRAM usage percentage (0.0-100.0)
373    #[must_use]
374    pub fn usage_percent(&self) -> f64 {
375        if self.total_bytes == 0 {
376            return 0.0;
377        }
378        (self.used_bytes as f64 / self.total_bytes as f64) * 100.0
379    }
380
381    /// Get VRAM available in bytes
382    #[must_use]
383    pub fn available_bytes(&self) -> u64 {
384        self.total_bytes.saturating_sub(self.used_bytes)
385    }
386
387    /// Get VRAM used in GB
388    #[must_use]
389    pub fn used_gb(&self) -> f64 {
390        self.used_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
391    }
392
393    /// Get VRAM total in GB
394    #[must_use]
395    pub fn total_gb(&self) -> f64 {
396        self.total_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
397    }
398
399    /// Update usage and add to history
400    pub fn update(&mut self, used: u64) {
401        self.used_bytes = used;
402        self.history.push_back(self.usage_percent());
403        if self.history.len() > Self::MAX_HISTORY_POINTS {
404            self.history.pop_front();
405        }
406    }
407}
408
// ============================================================================
// Pressure Analysis Result
// ============================================================================
412
413/// Detailed memory pressure analysis
414#[derive(Debug, Clone)]
415pub struct PressureAnalysis {
416    /// Current pressure level
417    pub level: PressureLevel,
418    /// Available memory percentage
419    pub available_percent: f64,
420    /// Available memory in GB
421    pub available_gb: f64,
422    /// Safe number of parallel jobs
423    pub safe_jobs: u32,
424    /// Whether to block new builds
425    pub block_builds: bool,
426    /// Human-readable recommendation
427    pub recommendation: String,
428}
429
430impl PressureAnalysis {
431    /// Analyze memory metrics and return detailed analysis
432    #[must_use]
433    pub fn from_metrics(metrics: &MemoryMetrics) -> Self {
434        let available_pct = metrics.ram_available_percent();
435        let available_gb = metrics.ram_available_bytes as f64 / (1024.0 * 1024.0 * 1024.0);
436        let level = metrics.pressure_level;
437
438        Self {
439            level,
440            available_percent: available_pct,
441            available_gb,
442            safe_jobs: metrics.safe_parallel_jobs,
443            block_builds: level.should_block_allocations(),
444            recommendation: level.recommendation().to_string(),
445        }
446    }
447}
448
449#[cfg(test)]
450mod tests;