Skip to main content

runtimo_core/
monitor.rs

1//! Health Monitoring Daemon — Background health checks with alerting.
2//!
3//! Monitors system health by capturing periodic snapshots of hardware telemetry
4//! and process state. Alerts on threshold violations:
5//! - Zombie processes > 10
6//! - CPU usage > 90% for 5 consecutive minutes
//! - Memory usage rising on 5 consecutive checks (potential leak)
8//!
9//! # Example
10//!
11//! ```rust,ignore
12//! use runtimo_core::HealthMonitor;
13//!
14//! let monitor = HealthMonitor::start()?;
15//! // Monitor runs in background, checking every 60s
16//! // Access latest health state:
17//! let health = monitor.health();
18//! println!("CPU: {:.1}%, RAM: {:.1}%", health.cpu_percent, health.ram_percent);
19//! ```
20
21use crate::processes::ProcessSnapshot;
22use crate::telemetry::Telemetry;
23use serde::{Deserialize, Serialize};
24use std::sync::atomic::{AtomicBool, Ordering};
25use std::sync::{Arc, RwLock};
26use std::thread;
27use std::time::Duration;
28
// Alert thresholds for health monitoring.

/// Zombie-process count that must be exceeded before a `ZombieCount` alert is raised.
const ZOMBIE_THRESHOLD: usize = 10;
/// CPU usage percentage that must be exceeded for a check to count toward a `CpuHigh` alert.
const CPU_THRESHOLD: f32 = 90.0;
/// Number of consecutive over-threshold checks required before a `CpuHigh` alert is raised.
const CPU_ALERT_MINUTES: usize = 5;
/// Pause between two health checks, in seconds (slept in one-second steps).
const CHECK_INTERVAL_SECS: u64 = 60;
34
/// Current health state snapshot.
///
/// Updated in place by the background thread on every check and handed out to
/// callers as a clone. The alert counters and `last_ram_percent` carry state
/// from one check to the next.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_structs)]
pub struct HealthState {
    /// Unix timestamp of last check (copied from the telemetry snapshot).
    pub timestamp: u64,
    /// Total CPU usage percentage (taken from the process summary).
    pub cpu_percent: f32,
    /// Total memory usage percentage, derived from the reported total and free RAM.
    pub ram_percent: f32,
    /// Number of zombie processes.
    pub zombie_count: usize,
    /// Total process count.
    pub process_count: usize,
    /// Top CPU consuming process name, if the process summary reported one.
    pub top_cpu_process: Option<String>,
    /// Top memory consuming process name, if the process summary reported one.
    pub top_mem_process: Option<String>,
    /// Number of consecutive checks in which CPU exceeded the threshold.
    /// Reset to 0 by the first check at or below the threshold.
    pub cpu_alert_count: usize,
    /// Number of consecutive checks with strictly increasing RAM usage.
    /// Reset to 0 by the first check that does not increase.
    pub ram_alert_count: usize,
    /// Whether RAM usage rose between the two most recent checks.
    pub ram_increasing: bool,
    /// RAM usage recorded by the previous check, used for the monotonicity
    /// comparison. `None` until the first check has run.
    pub last_ram_percent: Option<f32>,
}
62
63impl Default for HealthState {
64    fn default() -> Self {
65        Self {
66            timestamp: 0,
67            cpu_percent: 0.0,
68            ram_percent: 0.0,
69            zombie_count: 0,
70            process_count: 0,
71            top_cpu_process: None,
72            top_mem_process: None,
73            cpu_alert_count: 0,
74            ram_alert_count: 0,
75            ram_increasing: false,
76            last_ram_percent: None,
77        }
78    }
79}
80
/// Health alert types.
///
/// One value is appended to the monitor's alert history each time a check
/// finds the corresponding condition met.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(clippy::exhaustive_enums)]
pub enum HealthAlert {
    /// Zombie process count exceeded threshold.
    ///
    /// `count` is the observed number of zombies; `threshold` is the limit it exceeded.
    ZombieCount { count: usize, threshold: usize },
    /// CPU usage exceeded threshold for consecutive minutes.
    ///
    /// `percent` is the CPU usage at the check that raised the alert; `minutes`
    /// is the number of consecutive over-threshold checks so far.
    CpuHigh { percent: f32, minutes: usize },
    /// Memory usage monotonically increasing (potential leak).
    ///
    /// `ram_percent` is the RAM usage at the check that raised the alert.
    MemoryLeak { ram_percent: f32 },
}
92
93impl std::fmt::Display for HealthAlert {
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        match self {
96            Self::ZombieCount { count, threshold } => {
97                write!(f, "Zombie processes: {} (threshold: {})", count, threshold)
98            }
99            Self::CpuHigh { percent, minutes } => {
100                write!(f, "CPU usage: {:.1}% for {} minutes", percent, minutes)
101            }
102            Self::MemoryLeak { ram_percent } => {
103                write!(f, "Memory leak detected: {:.1}% RAM", ram_percent)
104            }
105        }
106    }
107}
108
/// Health monitoring daemon with background thread.
///
/// Captures snapshots every 60 seconds and alerts on threshold violations.
/// Thread-safe state access via RwLock.
///
/// Dropping the monitor sets the stop flag; the background thread is not joined.
#[allow(clippy::exhaustive_structs)]
pub struct HealthMonitor {
    /// Shared health state, written by the background thread and cloned out to readers.
    state: Arc<RwLock<HealthState>>,
    /// Stop flag for background thread; the thread polls it between one-second sleeps.
    stop_flag: Arc<AtomicBool>,
    /// Background thread handle. Held only to keep the handle alive; it is never joined.
    _thread: thread::JoinHandle<()>,
    /// Alert history (last 100 alerts), oldest first.
    alerts: Arc<RwLock<Vec<HealthAlert>>>,
}
124
125impl Drop for HealthMonitor {
126    fn drop(&mut self) {
127        self.stop_flag.store(true, Ordering::Relaxed);
128    }
129}
130
131impl HealthMonitor {
132    /// Starts the health monitoring background thread.
133    ///
134    /// The monitor checks system health every 60 seconds and updates
135    /// the shared health state. Alerts are generated for:
136    /// - Zombie count > 10
137    /// - CPU > 90% for 5+ consecutive minutes
138    /// - Monotonic RAM increase
139    ///
140    /// # Returns
141    ///
142    /// `Ok(HealthMonitor)` on success, or error if thread spawn fails.
143    ///
144    /// # Errors
145    ///
146    /// Returns `Err(String)` if the background monitoring thread fails to spawn.
147    #[allow(clippy::arithmetic_side_effects)] // alert counters are intentional increments
148    pub fn start() -> Result<Self, String> {
149        let state = Arc::new(RwLock::new(HealthState::default()));
150        let alerts = Arc::new(RwLock::new(Vec::new()));
151        let stop_flag = Arc::new(AtomicBool::new(false));
152
153        let state_clone = Arc::clone(&state);
154        let alerts_clone = Arc::clone(&alerts);
155        let stop_flag_clone = Arc::clone(&stop_flag);
156
157        let handle = thread::spawn(move || {
158            while !stop_flag_clone.load(Ordering::Relaxed) {
159                // Capture health snapshot
160                let telemetry = Telemetry::capture();
161                let processes = ProcessSnapshot::capture();
162
163                let mut current_state = state_clone.write().unwrap_or_else(|e| {
164                    eprintln!("[HealthMonitor] State lock poisoned: {}", e);
165                    // Recover from poison by taking the broken lock
166                    e.into_inner()
167                });
168
169                // Update state
170                current_state.timestamp = telemetry.timestamp;
171                current_state.cpu_percent = processes.summary.total_cpu_percent;
172                current_state.ram_percent =
173                    parse_ram_percent(&telemetry.system.ram_total, &telemetry.system.ram_free);
174                current_state.zombie_count = processes.summary.zombie_count;
175                current_state.process_count = processes.summary.total_processes;
176                current_state
177                    .top_cpu_process
178                    .clone_from(&processes.summary.top_cpu_consumer);
179                current_state
180                    .top_mem_process
181                    .clone_from(&processes.summary.top_mem_consumer);
182
183                // Check CPU threshold
184                if current_state.cpu_percent > CPU_THRESHOLD {
185                    current_state.cpu_alert_count += 1;
186                    if current_state.cpu_alert_count >= CPU_ALERT_MINUTES {
187                        let alert = HealthAlert::CpuHigh {
188                            percent: current_state.cpu_percent,
189                            minutes: current_state.cpu_alert_count,
190                        };
191                        add_alert(&alerts_clone, alert);
192                    }
193                } else {
194                    current_state.cpu_alert_count = 0;
195                }
196
197                // Check memory monotonicity
198                if let Some(last_ram) = current_state.last_ram_percent {
199                    if current_state.ram_percent > last_ram {
200                        current_state.ram_increasing = true;
201                        current_state.ram_alert_count += 1;
202                        // Alert if RAM increased for 5 consecutive checks
203                        if current_state.ram_alert_count >= 5 {
204                            let alert = HealthAlert::MemoryLeak {
205                                ram_percent: current_state.ram_percent,
206                            };
207                            add_alert(&alerts_clone, alert);
208                        }
209                    } else {
210                        current_state.ram_increasing = false;
211                        current_state.ram_alert_count = 0;
212                    }
213                }
214                current_state.last_ram_percent = Some(current_state.ram_percent);
215
216                // Check zombie threshold
217                if current_state.zombie_count > ZOMBIE_THRESHOLD {
218                    let alert = HealthAlert::ZombieCount {
219                        count: current_state.zombie_count,
220                        threshold: ZOMBIE_THRESHOLD,
221                    };
222                    add_alert(&alerts_clone, alert);
223                }
224
225                // Sleep for check interval
226                for _ in 0..CHECK_INTERVAL_SECS {
227                    if stop_flag_clone.load(Ordering::Relaxed) {
228                        break;
229                    }
230                    thread::sleep(Duration::from_secs(1));
231                }
232            }
233        });
234
235        Ok(Self {
236            state,
237            stop_flag,
238            _thread: handle,
239            alerts,
240        })
241    }
242
243    /// Returns the current health state snapshot.
244    #[must_use]
245    pub fn health(&self) -> HealthState {
246        self.state.read().unwrap_or_else(|e| e.into_inner()).clone()
247    }
248
249    /// Returns recent health alerts (up to 100).
250    #[must_use]
251    pub fn alerts(&self) -> Vec<HealthAlert> {
252        self.alerts
253            .read()
254            .unwrap_or_else(|e| e.into_inner())
255            .clone()
256    }
257
258    /// Stops the background monitoring thread.
259    pub fn stop(&self) {
260        self.stop_flag.store(true, Ordering::Relaxed);
261    }
262
263    /// Returns whether the monitor is still running.
264    #[must_use]
265    pub fn is_running(&self) -> bool {
266        !self.stop_flag.load(Ordering::Relaxed)
267    }
268}
269
270/// Helper to compute RAM usage percentage from total and free values.
271///
272/// Accepts raw telemetry strings like "16Gi" (total) and "13Gi" (free).
273/// Returns used percentage: ((total - free) / total) * 100.
274fn parse_ram_percent(ram_total: &str, ram_free: &str) -> f32 {
275    let total_val = parse_size_value(ram_total.trim()).unwrap_or(0.0);
276    let free_val = parse_size_value(ram_free.trim()).unwrap_or(0.0);
277
278    if total_val > 0.0 {
279        ((total_val - free_val) / total_val) * 100.0
280    } else {
281        0.0
282    }
283}
284
/// Parses a size string (e.g., "13Gi", "512Mi", "16384MB") into a numeric value in GB.
///
/// Binary suffixes are scaled by powers of 1024 relative to `Gi`, decimal
/// suffixes by powers of 1000 relative to `GB`; the two families are not
/// converted into each other (`1Gi` and `1GB` both yield `1.0`).
///
/// # Input
///
/// `size_str` — A size string with suffix (`Ti`, `Gi`, `Mi`, `Ki`, `TB`,
/// `GB`, `MB`, `KB`). Surrounding whitespace and whitespace between the
/// number and the suffix are ignored.
///
/// # Output
///
/// `Some(f32)` — Parsed value in GB.
/// `None` — Unrecognized suffix, non-numeric prefix (e.g. empty string,
/// "invalid", "1GiGi"), or a value that is negative or not finite.
fn parse_size_value(size_str: &str) -> Option<f32> {
    // (suffix, multiplier, divisor): value = number * multiplier / divisor.
    // No suffix in this table is a suffix of another, so order is irrelevant.
    const UNITS: [(&str, f32, f32); 8] = [
        ("Ti", 1024.0, 1.0),
        ("Gi", 1.0, 1.0),
        ("Mi", 1.0, 1024.0),
        ("Ki", 1.0, 1024.0 * 1024.0),
        ("TB", 1000.0, 1.0),
        ("GB", 1.0, 1.0),
        ("MB", 1.0, 1000.0),
        ("KB", 1.0, 1000.0 * 1000.0),
    ];

    let size_str = size_str.trim();
    UNITS.iter().find_map(|&(suffix, multiplier, divisor)| {
        // `strip_suffix` removes the unit exactly once, so "1GiGi" is rejected.
        let number: f32 = size_str.strip_suffix(suffix)?.trim_end().parse().ok()?;
        // `f32::from_str` accepts "NaN" and "inf"; neither is a usable size.
        if number.is_finite() && number >= 0.0 {
            Some(number * multiplier / divisor)
        } else {
            None
        }
    })
}
323
324/// Adds an alert to the alert history (max 100 alerts).
325fn add_alert(alerts: &Arc<RwLock<Vec<HealthAlert>>>, alert: HealthAlert) {
326    #[allow(clippy::expect_used)] // lock poisoning is irrecoverable
327    let mut alerts_vec = alerts.write().expect("Alerts lock poisoned");
328    alerts_vec.push(alert);
329    if alerts_vec.len() > 100 {
330        alerts_vec.remove(0);
331    }
332}
333
#[cfg(test)]
#[allow(clippy::float_cmp, clippy::use_self)]
mod tests {
    use super::*;

    /// Start/stop round trip. `is_running` only reads the stop flag, which
    /// `stop` sets synchronously, so no waiting is required.
    #[test]
    fn test_health_monitor_lifecycle() {
        let monitor = HealthMonitor::start().expect("Failed to start monitor");
        assert!(monitor.is_running());
        monitor.stop();
        assert!(!monitor.is_running());
    }

    #[test]
    fn test_health_state_defaults() {
        let state = HealthState::default();
        assert_eq!(state.cpu_alert_count, 0);
        assert_eq!(state.ram_alert_count, 0);
        assert!(!state.ram_increasing);
        assert!(state.last_ram_percent.is_none());
    }

    /// Mirrors the CPU streak counting done by the monitor loop.
    #[test]
    fn test_cpu_alert_after_consecutive_checks() {
        let mut state = HealthState::default();
        // Simulate 5 consecutive minutes of high CPU
        for _ in 0..5 {
            state.cpu_percent = 95.0;
            if state.cpu_percent > CPU_THRESHOLD {
                state.cpu_alert_count += 1;
            }
        }
        assert_eq!(state.cpu_alert_count, 5);
        assert!(state.cpu_alert_count >= CPU_ALERT_MINUTES);
    }

    /// Mirrors the RAM streak counting done by the monitor loop and checks
    /// that it is tracked independently of the CPU counter.
    #[test]
    fn test_ram_alert_uses_ram_counter_not_cpu() {
        let mut state = HealthState {
            last_ram_percent: Some(50.0),
            ..Default::default()
        };
        // Simulate RAM increasing each check while CPU is normal
        #[allow(clippy::cast_precision_loss)]
        for i in 0..5 {
            state.ram_percent = 50.0 + (i as f32 + 1.0); // 51, 52, 53, 54, 55
            state.cpu_percent = 10.0; // CPU is fine
            if state.ram_percent > state.last_ram_percent.unwrap() {
                state.ram_increasing = true;
                state.ram_alert_count += 1;
            } else {
                state.ram_increasing = false;
                state.ram_alert_count = 0;
            }
            state.last_ram_percent = Some(state.ram_percent);
        }
        // RAM alert should fire after 5 consecutive increases (independent of CPU)
        assert_eq!(state.ram_alert_count, 5);
        assert_eq!(state.cpu_alert_count, 0);
        assert!(state.ram_increasing);
    }

    #[test]
    fn test_ram_alert_resets_when_ram_decreases() {
        let mut state = HealthState {
            last_ram_percent: Some(50.0),
            ..Default::default()
        };

        // RAM increases twice
        state.ram_percent = 55.0;
        state.ram_alert_count = 2;
        state.last_ram_percent = Some(55.0);

        // RAM decreases — counter should reset
        state.ram_percent = 40.0;
        if state.ram_percent > state.last_ram_percent.unwrap() {
            state.ram_increasing = true;
            state.ram_alert_count += 1;
        } else {
            state.ram_increasing = false;
            state.ram_alert_count = 0;
        }
        state.last_ram_percent = Some(state.ram_percent);

        assert_eq!(state.ram_alert_count, 0);
        assert!(!state.ram_increasing);
    }

    #[test]
    fn test_parse_size_value() {
        assert!((parse_size_value("13Gi").unwrap() - 13.0).abs() < 0.01);
        assert!((parse_size_value("512Mi").unwrap() - 0.5).abs() < 0.01);
        assert!((parse_size_value("1048576Ki").unwrap() - 1.0).abs() < 0.01);
        assert!((parse_size_value("8GB").unwrap() - 8.0).abs() < 0.01);
        assert!((parse_size_value("16384MB").unwrap() - 16.384).abs() < 0.01);
        assert!((parse_size_value("  4Gi  ").unwrap() - 4.0).abs() < 0.01);
        assert_eq!(parse_size_value("invalid"), None);
        assert_eq!(parse_size_value(""), None);
    }

    #[test]
    fn test_parse_ram_percent() {
        // 4 of 16 used.
        assert!((parse_ram_percent("16Gi", "12Gi") - 25.0).abs() < 0.01);
        // Mixed units within the binary family.
        assert!((parse_ram_percent("2Gi", "1024Mi") - 50.0).abs() < 0.01);
        // Unknown total yields 0.
        assert_eq!(parse_ram_percent("unknown", "12Gi"), 0.0);
    }

    #[test]
    fn test_add_alert_caps_history_at_100() {
        let alerts = Arc::new(RwLock::new(Vec::new()));
        for count in 0..105 {
            add_alert(
                &alerts,
                HealthAlert::ZombieCount {
                    count,
                    threshold: ZOMBIE_THRESHOLD,
                },
            );
        }
        let history = alerts.read().unwrap();
        assert_eq!(history.len(), 100);
        // The five oldest alerts (counts 0..=4) were evicted.
        assert!(matches!(
            history.first(),
            Some(HealthAlert::ZombieCount { count: 5, .. })
        ));
    }

    #[test]
    fn test_health_alert_display() {
        let zombies = HealthAlert::ZombieCount {
            count: 12,
            threshold: ZOMBIE_THRESHOLD,
        };
        assert_eq!(zombies.to_string(), "Zombie processes: 12 (threshold: 10)");

        let cpu = HealthAlert::CpuHigh {
            percent: 95.5,
            minutes: 5,
        };
        assert_eq!(cpu.to_string(), "CPU usage: 95.5% for 5 minutes");

        let leak = HealthAlert::MemoryLeak { ram_percent: 81.0 };
        assert_eq!(leak.to_string(), "Memory leak detected: 81.0% RAM");
    }
}