Skip to main content

runtimo_core/
monitor.rs

1//! Health Monitoring Daemon — Background health checks with alerting.
2//!
3//! Monitors system health by capturing periodic snapshots of hardware telemetry
4//! and process state. Alerts on threshold violations:
5//! - Zombie processes > 10
6//! - CPU usage > 90% for 5 consecutive minutes
7//! - Memory monotonic increase (potential leak)
8//!
9//! # Example
10//!
11//! ```rust,ignore
12//! use runtimo_core::HealthMonitor;
13//!
14//! let monitor = HealthMonitor::start()?;
15//! // Monitor runs in background, checking every 60s
16//! // Access latest health state:
17//! let health = monitor.health();
18//! println!("CPU: {:.1}%, RAM: {:.1}%", health.cpu_percent, health.ram_percent);
19//! ```
20
21use crate::processes::ProcessSnapshot;
22use crate::telemetry::Telemetry;
23use serde::{Deserialize, Serialize};
24use std::sync::atomic::{AtomicBool, Ordering};
25use std::sync::{Arc, RwLock};
26use std::thread;
27use std::time::Duration;
28
// Alert thresholds and timing for health monitoring.

/// Zombie process count above which a `ZombieCount` alert is raised.
const ZOMBIE_THRESHOLD: usize = 10;
/// CPU usage percentage above which a check counts toward a `CpuHigh` alert.
const CPU_THRESHOLD: f32 = 90.0;
/// Number of consecutive over-threshold checks required before a `CpuHigh`
/// alert is raised (one check per minute at the default interval).
const CPU_ALERT_MINUTES: usize = 5;
/// Seconds the monitor waits between two health checks.
const CHECK_INTERVAL_SECS: u64 = 60;
34
35/// Current health state snapshot.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct HealthState {
38    /// Unix timestamp of last check.
39    pub timestamp: u64,
40    /// Total CPU usage percentage.
41    pub cpu_percent: f32,
42    /// Total memory usage percentage.
43    pub ram_percent: f32,
44    /// Number of zombie processes.
45    pub zombie_count: usize,
46    /// Total process count.
47    pub process_count: usize,
48    /// Top CPU consuming process name.
49    pub top_cpu_process: Option<String>,
50    /// Top memory consuming process name.
51    pub top_mem_process: Option<String>,
52    /// Number of consecutive minutes CPU exceeded threshold.
53    pub cpu_alert_count: usize,
54    /// Number of consecutive checks with monotonically increasing RAM.
55    pub ram_alert_count: usize,
56    /// Whether memory is monotonically increasing.
57    pub ram_increasing: bool,
58    /// Last RAM usage for monotonicity check.
59    pub last_ram_percent: Option<f32>,
60}
61
62impl Default for HealthState {
63    fn default() -> Self {
64        Self {
65            timestamp: 0,
66            cpu_percent: 0.0,
67            ram_percent: 0.0,
68            zombie_count: 0,
69            process_count: 0,
70            top_cpu_process: None,
71            top_mem_process: None,
72            cpu_alert_count: 0,
73            ram_alert_count: 0,
74            ram_increasing: false,
75            last_ram_percent: None,
76        }
77    }
78}
79
/// Health alert types.
///
/// Each variant carries the measurements that triggered it so the alert can be
/// rendered (see the `Display` impl) or serialized on its own.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthAlert {
    /// Zombie process count exceeded threshold.
    ZombieCount {
        /// Observed number of zombie processes.
        count: usize,
        /// Threshold that was exceeded.
        threshold: usize,
    },
    /// CPU usage exceeded threshold for consecutive minutes.
    CpuHigh {
        /// CPU usage percentage at the check that raised the alert.
        percent: f32,
        /// Number of consecutive over-threshold checks so far.
        minutes: usize,
    },
    /// Memory usage monotonically increasing (potential leak).
    MemoryLeak {
        /// RAM usage percentage at the check that raised the alert.
        ram_percent: f32,
    },
}
90
91impl std::fmt::Display for HealthAlert {
92    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93        match self {
94            HealthAlert::ZombieCount { count, threshold } => {
95                write!(f, "Zombie processes: {} (threshold: {})", count, threshold)
96            }
97            HealthAlert::CpuHigh { percent, minutes } => {
98                write!(f, "CPU usage: {:.1}% for {} minutes", percent, minutes)
99            }
100            HealthAlert::MemoryLeak { ram_percent } => {
101                write!(f, "Memory leak detected: {:.1}% RAM", ram_percent)
102            }
103        }
104    }
105}
106
/// Health monitoring daemon with background thread.
///
/// Captures snapshots every 60 seconds and alerts on threshold violations.
/// Thread-safe state access via RwLock.
///
/// Dropping the monitor sets the stop flag; the background thread is signalled
/// but not joined.
pub struct HealthMonitor {
    /// Shared health state.
    state: Arc<RwLock<HealthState>>,
    /// Stop flag for background thread.
    stop_flag: Arc<AtomicBool>,
    /// Background thread handle.
    // Kept only so the handle lives as long as the monitor; nothing in this
    // file joins it.
    _thread: thread::JoinHandle<()>,
    /// Alert history (last 100 alerts).
    alerts: Arc<RwLock<Vec<HealthAlert>>>,
}
121
122impl Drop for HealthMonitor {
123    fn drop(&mut self) {
124        self.stop_flag.store(true, Ordering::Relaxed);
125    }
126}
127
128impl HealthMonitor {
129    /// Starts the health monitoring background thread.
130    ///
131    /// The monitor checks system health every 60 seconds and updates
132    /// the shared health state. Alerts are generated for:
133    /// - Zombie count > 10
134    /// - CPU > 90% for 5+ consecutive minutes
135    /// - Monotonic RAM increase
136    ///
137    /// # Returns
138    ///
139    /// `Ok(HealthMonitor)` on success, or error if thread spawn fails.
140    pub fn start() -> Result<Self, String> {
141        let state = Arc::new(RwLock::new(HealthState::default()));
142        let alerts = Arc::new(RwLock::new(Vec::new()));
143        let stop_flag = Arc::new(AtomicBool::new(false));
144
145        let state_clone = Arc::clone(&state);
146        let alerts_clone = Arc::clone(&alerts);
147        let stop_flag_clone = Arc::clone(&stop_flag);
148
149        let handle = thread::spawn(move || {
150            while !stop_flag_clone.load(Ordering::Relaxed) {
151                // Capture health snapshot
152                let telemetry = Telemetry::capture();
153                let processes = ProcessSnapshot::capture();
154
155                let mut current_state = state_clone.write().unwrap_or_else(|e| {
156                    eprintln!("[HealthMonitor] State lock poisoned: {}", e);
157                    // Recover from poison by taking the broken lock
158                    e.into_inner()
159                });
160
161                // Update state
162                current_state.timestamp = telemetry.timestamp;
163                current_state.cpu_percent = processes.summary.total_cpu_percent;
164                current_state.ram_percent = parse_ram_percent(&telemetry.system.ram_total, &telemetry.system.ram_free);
165                current_state.zombie_count = processes.summary.zombie_count;
166                current_state.process_count = processes.summary.total_processes;
167                current_state.top_cpu_process = processes.summary.top_cpu_consumer.clone();
168                current_state.top_mem_process = processes.summary.top_mem_consumer.clone();
169
170                // Check CPU threshold
171                if current_state.cpu_percent > CPU_THRESHOLD {
172                    current_state.cpu_alert_count += 1;
173                    if current_state.cpu_alert_count >= CPU_ALERT_MINUTES {
174                        let alert = HealthAlert::CpuHigh {
175                            percent: current_state.cpu_percent,
176                            minutes: current_state.cpu_alert_count,
177                        };
178                        add_alert(&alerts_clone, alert);
179                    }
180                } else {
181                    current_state.cpu_alert_count = 0;
182                }
183
184                // Check memory monotonicity
185                if let Some(last_ram) = current_state.last_ram_percent {
186                    if current_state.ram_percent > last_ram {
187                        current_state.ram_increasing = true;
188                        current_state.ram_alert_count += 1;
189                        // Alert if RAM increased for 5 consecutive checks
190                        if current_state.ram_alert_count >= 5 {
191                            let alert = HealthAlert::MemoryLeak {
192                                ram_percent: current_state.ram_percent,
193                            };
194                            add_alert(&alerts_clone, alert);
195                        }
196                    } else {
197                        current_state.ram_increasing = false;
198                        current_state.ram_alert_count = 0;
199                    }
200                }
201                current_state.last_ram_percent = Some(current_state.ram_percent);
202
203                // Check zombie threshold
204                if current_state.zombie_count > ZOMBIE_THRESHOLD {
205                    let alert = HealthAlert::ZombieCount {
206                        count: current_state.zombie_count,
207                        threshold: ZOMBIE_THRESHOLD,
208                    };
209                    add_alert(&alerts_clone, alert);
210                }
211
212                // Sleep for check interval
213                for _ in 0..CHECK_INTERVAL_SECS {
214                    if stop_flag_clone.load(Ordering::Relaxed) {
215                        break;
216                    }
217                    thread::sleep(Duration::from_secs(1));
218                }
219            }
220        });
221
222        Ok(Self {
223            state,
224            stop_flag,
225            _thread: handle,
226            alerts,
227        })
228    }
229
230    /// Returns the current health state snapshot.
231    pub fn health(&self) -> HealthState {
232        self.state.read().unwrap_or_else(|e| e.into_inner()).clone()
233    }
234
235    /// Returns recent health alerts (up to 100).
236    pub fn alerts(&self) -> Vec<HealthAlert> {
237        self.alerts
238            .read()
239            .unwrap_or_else(|e| e.into_inner())
240            .clone()
241    }
242
243    /// Stops the background monitoring thread.
244    pub fn stop(&self) {
245        self.stop_flag.store(true, Ordering::Relaxed);
246    }
247
248    /// Returns whether the monitor is still running.
249    pub fn is_running(&self) -> bool {
250        !self.stop_flag.load(Ordering::Relaxed)
251    }
252}
253
254/// Helper to compute RAM usage percentage from total and free values.
255///
256/// Accepts raw telemetry strings like "16Gi" (total) and "13Gi" (free).
257/// Returns used percentage: ((total - free) / total) * 100.
258fn parse_ram_percent(ram_total: &str, ram_free: &str) -> f32 {
259    let total_val = parse_size_value(ram_total.trim());
260    let free_val = parse_size_value(ram_free.trim());
261
262    if total_val > 0.0 {
263        ((total_val - free_val) / total_val) * 100.0
264    } else {
265        0.0
266    }
267}
268
/// Parses a size string (e.g., "13Gi", "512Mi", "16384MB") into a numeric value in GB.
///
/// Recognised unit suffixes are the binary `Ki`, `Mi`, `Gi`, `Ti` (scaled by
/// powers of 1024) and the decimal `KB`/`kB`, `MB`, `GB`, `TB` (scaled by
/// powers of 1000). Surrounding whitespace, and whitespace between the number
/// and the unit ("13 Gi"), is ignored.
///
/// Returns `0.0` when the unit is missing or unknown, when the numeric part
/// does not parse, or when it parses to a negative or non-finite value —
/// callers treat `0.0` as "unknown".
fn parse_size_value(size_str: &str) -> f32 {
    // (suffix, multiplier, divisor): value_in_gb = number * multiplier / divisor.
    // Kept as two factors so the existing units use exactly the same division
    // as before (e.g. Mi -> v / 1024.0).
    const UNITS: [(&str, f32, f32); 9] = [
        ("Ti", 1024.0, 1.0),
        ("Gi", 1.0, 1.0),
        ("Mi", 1.0, 1024.0),
        ("Ki", 1.0, 1024.0 * 1024.0),
        ("TB", 1000.0, 1.0),
        ("GB", 1.0, 1.0),
        ("MB", 1.0, 1000.0),
        ("KB", 1.0, 1000.0 * 1000.0),
        ("kB", 1.0, 1000.0 * 1000.0),
    ];

    let text = size_str.trim();

    // `strip_suffix` removes the unit exactly once; `trim_end_matches` would
    // also accept malformed input such as "13GiGi".
    let gigabytes = UNITS
        .iter()
        .find_map(|&(suffix, multiplier, divisor)| {
            text.strip_suffix(suffix)
                .map(|number| (number.trim_end(), multiplier, divisor))
        })
        .and_then(|(number, multiplier, divisor)| {
            number
                .parse::<f32>()
                .ok()
                .map(|value| value * multiplier / divisor)
        });

    match gigabytes {
        // "NaN", "inf" and negative numbers parse as floats but are not sizes.
        Some(value) if value.is_finite() && value >= 0.0 => value,
        _ => 0.0,
    }
}
301
302/// Adds an alert to the alert history (max 100 alerts).
303fn add_alert(alerts: &Arc<RwLock<Vec<HealthAlert>>>, alert: HealthAlert) {
304    let mut alerts_vec = alerts.write().expect("Alerts lock poisoned");
305    alerts_vec.push(alert);
306    if alerts_vec.len() > 100 {
307        alerts_vec.remove(0);
308    }
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// Applies one RAM sample to `state` using the same rule as the monitor
    /// loop: a rise extends the streak, anything else resets it.
    fn record_ram_sample(state: &mut HealthState, ram_percent: f32) {
        state.ram_percent = ram_percent;
        if let Some(last_ram) = state.last_ram_percent {
            if state.ram_percent > last_ram {
                state.ram_increasing = true;
                state.ram_alert_count += 1;
            } else {
                state.ram_increasing = false;
                state.ram_alert_count = 0;
            }
        }
        state.last_ram_percent = Some(state.ram_percent);
    }

    #[test]
    fn test_health_monitor_lifecycle() {
        let monitor = HealthMonitor::start().expect("Failed to start monitor");
        assert!(monitor.is_running());
        // `is_running` reads the stop flag directly, so the change is visible
        // immediately — no need to wait for the background thread.
        monitor.stop();
        assert!(!monitor.is_running());
    }

    #[test]
    fn test_health_state_defaults() {
        let state = HealthState::default();
        assert_eq!(state.cpu_alert_count, 0);
        assert_eq!(state.ram_alert_count, 0);
        assert!(!state.ram_increasing);
        assert!(state.last_ram_percent.is_none());
    }

    #[test]
    fn test_cpu_alert_after_consecutive_checks() {
        let mut state = HealthState::default();
        // Simulate 5 consecutive minutes of high CPU
        for _ in 0..5 {
            state.cpu_percent = 95.0;
            if state.cpu_percent > CPU_THRESHOLD {
                state.cpu_alert_count += 1;
            }
        }
        assert_eq!(state.cpu_alert_count, 5);
    }

    #[test]
    fn test_ram_alert_uses_ram_counter_not_cpu() {
        let mut state = HealthState {
            last_ram_percent: Some(50.0),
            ..Default::default()
        };
        // Simulate RAM increasing each check while CPU is normal
        for i in 0..5 {
            state.cpu_percent = 10.0; // CPU is fine
            record_ram_sample(&mut state, 50.0 + (i as f32 + 1.0)); // 51..=55
        }
        // RAM alert should fire after 5 consecutive increases (independent of CPU)
        assert_eq!(state.ram_alert_count, 5);
        assert_eq!(state.cpu_alert_count, 0);
        assert!(state.ram_increasing);
    }

    #[test]
    fn test_ram_alert_resets_when_ram_decreases() {
        let mut state = HealthState {
            last_ram_percent: Some(50.0),
            ..Default::default()
        };

        // RAM increases twice
        record_ram_sample(&mut state, 52.0);
        record_ram_sample(&mut state, 55.0);
        assert_eq!(state.ram_alert_count, 2);

        // RAM decreases — counter should reset
        record_ram_sample(&mut state, 40.0);

        assert_eq!(state.ram_alert_count, 0);
        assert!(!state.ram_increasing);
        assert_eq!(state.last_ram_percent, Some(40.0));
    }

    #[test]
    fn test_parse_size_value() {
        assert!((parse_size_value("13Gi") - 13.0).abs() < 0.01);
        assert!((parse_size_value("512Mi") - 0.5).abs() < 0.01);
        assert!((parse_size_value("1048576Ki") - 1.0).abs() < 0.01);
        assert!((parse_size_value("16384MB") - 16.384).abs() < 0.01);
        assert!((parse_size_value("16GB") - 16.0).abs() < 0.01);
        assert_eq!(parse_size_value("invalid"), 0.0);
        assert_eq!(parse_size_value(""), 0.0);
    }

    #[test]
    fn test_parse_ram_percent() {
        assert!((parse_ram_percent("16Gi", "12Gi") - 25.0).abs() < 0.01);
        assert!((parse_ram_percent("16Gi", "8192Mi") - 50.0).abs() < 0.01);
        // An unparseable or zero total must not divide by zero.
        assert_eq!(parse_ram_percent("invalid", "4Gi"), 0.0);
        assert_eq!(parse_ram_percent("0Gi", "0Gi"), 0.0);
    }
}