omega_runtime/
health.rs

1//! Health Monitoring System
2//!
3//! Provides comprehensive health monitoring for subsystems with configurable
4//! thresholds, automatic degradation detection, and aggregated health status.
5
6use parking_lot::RwLock;
7use std::collections::HashMap;
8use std::time::{Duration, Instant};
9use thiserror::Error;
10use tracing::{debug, warn};
11
/// Coarse health classification reported for a monitored subsystem.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum HealthStatus {
    /// Operating normally
    Healthy,
    /// Impaired but still operational
    Degraded,
    /// Failing or non-operational
    Unhealthy,
}

impl HealthStatus {
    /// Returns `true` only for [`HealthStatus::Healthy`].
    pub fn is_healthy(&self) -> bool {
        *self == Self::Healthy
    }

    /// Returns `true` only for [`HealthStatus::Degraded`].
    pub fn is_degraded(&self) -> bool {
        *self == Self::Degraded
    }

    /// Returns `true` only for [`HealthStatus::Unhealthy`].
    pub fn is_unhealthy(&self) -> bool {
        *self == Self::Unhealthy
    }

    /// Maps the status onto a numeric score: 100 for healthy, 50 for
    /// degraded and 0 for unhealthy.
    pub fn score(&self) -> u8 {
        match *self {
            Self::Unhealthy => 0,
            Self::Degraded => 50,
            Self::Healthy => 100,
        }
    }
}
48
49/// Health information for a subsystem
50#[derive(Debug, Clone)]
51pub struct SubsystemHealth {
52    /// Subsystem name
53    pub name: String,
54    /// Current health status
55    pub status: HealthStatus,
56    /// Last health check timestamp
57    pub last_check: Instant,
58    /// Consecutive failure count
59    pub consecutive_failures: u32,
60    /// Consecutive success count
61    pub consecutive_successes: u32,
62    /// Total check count
63    pub total_checks: u64,
64    /// Total failure count
65    pub total_failures: u64,
66    /// Optional metadata
67    pub metadata: HashMap<String, String>,
68}
69
70impl SubsystemHealth {
71    /// Create new subsystem health
72    pub fn new(name: String) -> Self {
73        Self {
74            name,
75            status: HealthStatus::Healthy,
76            last_check: Instant::now(),
77            consecutive_failures: 0,
78            consecutive_successes: 0,
79            total_checks: 0,
80            total_failures: 0,
81            metadata: HashMap::new(),
82        }
83    }
84
85    /// Get time since last check
86    pub fn time_since_check(&self) -> Duration {
87        self.last_check.elapsed()
88    }
89
90    /// Get failure rate as percentage
91    pub fn failure_rate(&self) -> f64 {
92        if self.total_checks == 0 {
93            0.0
94        } else {
95            (self.total_failures as f64 / self.total_checks as f64) * 100.0
96        }
97    }
98
99    /// Check if subsystem is stale (hasn't been checked recently)
100    pub fn is_stale(&self, threshold: Duration) -> bool {
101        self.time_since_check() > threshold
102    }
103}
104
/// Thresholds that drive status transitions in the health monitor.
#[derive(Debug, Clone)]
pub struct HealthMonitorConfig {
    /// Consecutive failures required before a subsystem is marked degraded
    pub degraded_threshold: u32,
    /// Consecutive failures required before a subsystem is marked unhealthy
    pub unhealthy_threshold: u32,
    /// Consecutive successes required before a subsystem is marked healthy again
    pub recovery_threshold: u32,
    /// Time without a check after which a subsystem counts as stale
    pub stale_threshold: Duration,
}

impl Default for HealthMonitorConfig {
    /// Defaults: degraded after 3 failures, unhealthy after 5, recovered
    /// after 5 successes, stale after five minutes without a check.
    fn default() -> Self {
        let five_minutes = Duration::from_secs(5 * 60);
        HealthMonitorConfig {
            degraded_threshold: 3,
            unhealthy_threshold: 5,
            recovery_threshold: 5,
            stale_threshold: five_minutes,
        }
    }
}
128
/// Health monitoring error types
///
/// Each variant carries a `String` payload that is interpolated into the
/// `Display` message generated by `thiserror`.
#[derive(Debug, Error)]
pub enum HealthError {
    /// No subsystem is registered under the requested name; the payload is
    /// the name that was looked up.
    #[error("Subsystem not found: {0}")]
    SubsystemNotFound(String),
    /// Reports an invalid status transition; the payload is the text appended
    /// to the message.
    ///
    /// NOTE(review): nothing in the visible part of this file constructs this
    /// variant — confirm whether it is used elsewhere.
    #[error("Invalid health status transition: {0}")]
    InvalidTransition(String),
}
137
138/// Health monitor for tracking subsystem health
139pub struct HealthMonitor {
140    subsystems: RwLock<HashMap<String, SubsystemHealth>>,
141    config: HealthMonitorConfig,
142}
143
144impl HealthMonitor {
145    /// Create a new health monitor
146    pub fn new(config: HealthMonitorConfig) -> Self {
147        Self {
148            subsystems: RwLock::new(HashMap::new()),
149            config,
150        }
151    }
152
153    /// Create with default configuration
154    pub fn with_default_config() -> Self {
155        Self::new(HealthMonitorConfig::default())
156    }
157
158    /// Register a new subsystem for monitoring
159    pub fn register_subsystem(&self, name: String) {
160        let mut subsystems = self.subsystems.write();
161        if !subsystems.contains_key(&name) {
162            debug!("Registering subsystem for health monitoring: {}", name);
163            subsystems.insert(name.clone(), SubsystemHealth::new(name));
164        }
165    }
166
167    /// Unregister a subsystem
168    pub fn unregister_subsystem(&self, name: &str) -> Result<(), HealthError> {
169        let mut subsystems = self.subsystems.write();
170        subsystems
171            .remove(name)
172            .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
173        debug!("Unregistered subsystem: {}", name);
174        Ok(())
175    }
176
177    /// Update health status based on check result
178    pub fn update_health(&self, name: &str, is_healthy: bool) -> Result<(), HealthError> {
179        let mut subsystems = self.subsystems.write();
180        let health = subsystems
181            .get_mut(name)
182            .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
183
184        health.last_check = Instant::now();
185        health.total_checks += 1;
186
187        if is_healthy {
188            health.consecutive_successes += 1;
189            health.consecutive_failures = 0;
190
191            // Update status based on recovery
192            match health.status {
193                HealthStatus::Unhealthy | HealthStatus::Degraded => {
194                    if health.consecutive_successes >= self.config.recovery_threshold {
195                        debug!("Subsystem {} recovered to healthy", name);
196                        health.status = HealthStatus::Healthy;
197                    }
198                }
199                HealthStatus::Healthy => {}
200            }
201        } else {
202            health.consecutive_failures += 1;
203            health.consecutive_successes = 0;
204            health.total_failures += 1;
205
206            // Update status based on failures
207            let old_status = health.status;
208            if health.consecutive_failures >= self.config.unhealthy_threshold {
209                health.status = HealthStatus::Unhealthy;
210                if old_status != HealthStatus::Unhealthy {
211                    warn!("Subsystem {} marked as unhealthy", name);
212                }
213            } else if health.consecutive_failures >= self.config.degraded_threshold {
214                health.status = HealthStatus::Degraded;
215                if old_status != HealthStatus::Degraded {
216                    warn!("Subsystem {} marked as degraded", name);
217                }
218            }
219        }
220
221        Ok(())
222    }
223
224    /// Manually set health status
225    pub fn set_health(
226        &self,
227        name: &str,
228        status: HealthStatus,
229    ) -> Result<(), HealthError> {
230        let mut subsystems = self.subsystems.write();
231        let health = subsystems
232            .get_mut(name)
233            .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
234
235        health.status = status;
236        health.last_check = Instant::now();
237
238        Ok(())
239    }
240
241    /// Check health of a specific subsystem
242    pub fn check_health(&self, name: &str) -> Result<SubsystemHealth, HealthError> {
243        let subsystems = self.subsystems.read();
244        subsystems
245            .get(name)
246            .cloned()
247            .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))
248    }
249
250    /// Get overall system health status
251    pub fn overall_status(&self) -> HealthStatus {
252        let subsystems = self.subsystems.read();
253
254        if subsystems.is_empty() {
255            return HealthStatus::Healthy;
256        }
257
258        let mut has_unhealthy = false;
259        let mut has_degraded = false;
260
261        for health in subsystems.values() {
262            match health.status {
263                HealthStatus::Unhealthy => has_unhealthy = true,
264                HealthStatus::Degraded => has_degraded = true,
265                HealthStatus::Healthy => {}
266            }
267
268            // Check for stale subsystems
269            if health.is_stale(self.config.stale_threshold) {
270                warn!("Subsystem {} is stale (last check {:?} ago)", health.name, health.time_since_check());
271                has_degraded = true;
272            }
273        }
274
275        if has_unhealthy {
276            HealthStatus::Unhealthy
277        } else if has_degraded {
278            HealthStatus::Degraded
279        } else {
280            HealthStatus::Healthy
281        }
282    }
283
284    /// Get all subsystem health statuses
285    pub fn all_subsystems(&self) -> HashMap<String, SubsystemHealth> {
286        self.subsystems.read().clone()
287    }
288
289    /// Get count of subsystems by status
290    pub fn status_counts(&self) -> HashMap<HealthStatus, usize> {
291        let subsystems = self.subsystems.read();
292        let mut counts = HashMap::new();
293
294        for health in subsystems.values() {
295            *counts.entry(health.status).or_insert(0) += 1;
296        }
297
298        counts
299    }
300
301    /// Get unhealthy subsystems
302    pub fn unhealthy_subsystems(&self) -> Vec<String> {
303        let subsystems = self.subsystems.read();
304        subsystems
305            .values()
306            .filter(|h| h.status.is_unhealthy())
307            .map(|h| h.name.clone())
308            .collect()
309    }
310
311    /// Get degraded subsystems
312    pub fn degraded_subsystems(&self) -> Vec<String> {
313        let subsystems = self.subsystems.read();
314        subsystems
315            .values()
316            .filter(|h| h.status.is_degraded())
317            .map(|h| h.name.clone())
318            .collect()
319    }
320
321    /// Update subsystem metadata
322    pub fn update_metadata(
323        &self,
324        name: &str,
325        key: String,
326        value: String,
327    ) -> Result<(), HealthError> {
328        let mut subsystems = self.subsystems.write();
329        let health = subsystems
330            .get_mut(name)
331            .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
332
333        health.metadata.insert(key, value);
334        Ok(())
335    }
336
337    /// Get subsystem count
338    pub fn subsystem_count(&self) -> usize {
339        self.subsystems.read().len()
340    }
341
342    /// Clear all subsystems
343    pub fn clear(&self) {
344        self.subsystems.write().clear();
345    }
346}
347
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;

    /// Builds a monitor with default thresholds.
    ///
    /// Uses the explicit constructor so these tests compile without relying
    /// on a `Default` implementation for `HealthMonitor`.
    fn default_monitor() -> HealthMonitor {
        HealthMonitor::with_default_config()
    }

    /// Records `count` consecutive failed checks for `name`.
    fn fail_checks(monitor: &HealthMonitor, name: &str, count: usize) {
        for _ in 0..count {
            monitor.update_health(name, false).unwrap();
        }
    }

    /// Records `count` consecutive successful checks for `name`.
    fn pass_checks(monitor: &HealthMonitor, name: &str, count: usize) {
        for _ in 0..count {
            monitor.update_health(name, true).unwrap();
        }
    }

    /// Returns the current status recorded for `name`.
    fn status_of(monitor: &HealthMonitor, name: &str) -> HealthStatus {
        monitor.check_health(name).unwrap().status
    }

    #[test]
    fn test_health_monitor_creation() {
        let monitor = default_monitor();
        assert_eq!(monitor.subsystem_count(), 0);
        assert_eq!(monitor.overall_status(), HealthStatus::Healthy);
    }

    #[test]
    fn test_register_subsystem() {
        let monitor = default_monitor();
        monitor.register_subsystem("test-subsystem".to_string());
        assert_eq!(monitor.subsystem_count(), 1);

        let health = monitor.check_health("test-subsystem").unwrap();
        assert_eq!(health.name, "test-subsystem");
        assert_eq!(health.status, HealthStatus::Healthy);
    }

    #[test]
    fn test_register_duplicate_subsystem() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());
        monitor.register_subsystem("test".to_string());
        assert_eq!(monitor.subsystem_count(), 1);
    }

    #[test]
    fn test_unregister_subsystem() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());
        assert_eq!(monitor.subsystem_count(), 1);

        monitor.unregister_subsystem("test").unwrap();
        assert_eq!(monitor.subsystem_count(), 0);
    }

    #[test]
    fn test_update_health_success() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());

        monitor.update_health("test", true).unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.consecutive_successes, 1);
        assert_eq!(health.total_checks, 1);
        assert_eq!(health.status, HealthStatus::Healthy);
    }

    #[test]
    fn test_update_health_failure() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            unhealthy_threshold: 4,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        // First failure - still healthy
        fail_checks(&monitor, "test", 1);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Healthy);

        // Second failure - degraded
        fail_checks(&monitor, "test", 1);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Degraded);

        // Two more failures reach the unhealthy threshold
        fail_checks(&monitor, "test", 2);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Unhealthy);
    }

    #[test]
    fn test_recovery_from_degraded() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            recovery_threshold: 3,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        // Make it degraded
        fail_checks(&monitor, "test", 2);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Degraded);

        // Two successes are one short of the recovery threshold
        pass_checks(&monitor, "test", 2);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Degraded);

        // The third success completes the recovery
        pass_checks(&monitor, "test", 1);
        assert_eq!(status_of(&monitor, "test"), HealthStatus::Healthy);
    }

    #[test]
    fn test_overall_status_healthy() {
        let monitor = default_monitor();
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        pass_checks(&monitor, "test1", 1);
        pass_checks(&monitor, "test2", 1);

        assert_eq!(monitor.overall_status(), HealthStatus::Healthy);
    }

    #[test]
    fn test_overall_status_degraded() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        pass_checks(&monitor, "test1", 1);

        // Make test2 degraded
        fail_checks(&monitor, "test2", 2);

        assert_eq!(monitor.overall_status(), HealthStatus::Degraded);
    }

    #[test]
    fn test_overall_status_unhealthy() {
        let config = HealthMonitorConfig {
            unhealthy_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        pass_checks(&monitor, "test1", 1);

        // Make test2 unhealthy
        fail_checks(&monitor, "test2", 2);

        assert_eq!(monitor.overall_status(), HealthStatus::Unhealthy);
    }

    #[test]
    fn test_status_counts() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            unhealthy_threshold: 4,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);

        monitor.register_subsystem("healthy".to_string());
        monitor.register_subsystem("degraded".to_string());
        monitor.register_subsystem("unhealthy".to_string());

        // Make degraded
        fail_checks(&monitor, "degraded", 2);

        // Make unhealthy
        fail_checks(&monitor, "unhealthy", 4);

        let counts = monitor.status_counts();
        assert_eq!(counts.get(&HealthStatus::Healthy), Some(&1));
        assert_eq!(counts.get(&HealthStatus::Degraded), Some(&1));
        assert_eq!(counts.get(&HealthStatus::Unhealthy), Some(&1));

        assert_eq!(monitor.degraded_subsystems(), vec!["degraded".to_string()]);
    }

    #[test]
    fn test_unhealthy_subsystems() {
        let config = HealthMonitorConfig {
            unhealthy_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);

        monitor.register_subsystem("healthy".to_string());
        monitor.register_subsystem("unhealthy1".to_string());
        monitor.register_subsystem("unhealthy2".to_string());

        fail_checks(&monitor, "unhealthy1", 2);
        fail_checks(&monitor, "unhealthy2", 2);

        let unhealthy = monitor.unhealthy_subsystems();
        assert_eq!(unhealthy.len(), 2);
        assert!(unhealthy.contains(&"unhealthy1".to_string()));
        assert!(unhealthy.contains(&"unhealthy2".to_string()));
        assert!(monitor.degraded_subsystems().is_empty());
    }

    #[test]
    fn test_metadata() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());

        monitor
            .update_metadata("test", "version".to_string(), "1.0.0".to_string())
            .unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.metadata.get("version"), Some(&"1.0.0".to_string()));
    }

    #[test]
    fn test_failure_rate() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());

        monitor.update_health("test", true).unwrap();
        monitor.update_health("test", false).unwrap();
        monitor.update_health("test", true).unwrap();
        monitor.update_health("test", false).unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.failure_rate(), 50.0);
    }

    #[test]
    fn test_stale_detection() {
        let stale_after = Duration::from_millis(100);
        let config = HealthMonitorConfig {
            stale_threshold: stale_after,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        // Initial state - not stale
        let health = monitor.check_health("test").unwrap();
        assert!(!health.is_stale(stale_after));

        // Wait and check stale
        sleep(Duration::from_millis(150));
        let health = monitor.check_health("test").unwrap();
        assert!(health.is_stale(stale_after));

        // A stale subsystem degrades the aggregate status even though its
        // own recorded status is still healthy.
        assert_eq!(health.status, HealthStatus::Healthy);
        assert_eq!(monitor.overall_status(), HealthStatus::Degraded);
    }

    #[test]
    fn test_set_health_manually() {
        let monitor = default_monitor();
        monitor.register_subsystem("test".to_string());

        monitor
            .set_health("test", HealthStatus::Degraded)
            .unwrap();

        assert_eq!(status_of(&monitor, "test"), HealthStatus::Degraded);
    }

    #[test]
    fn test_clear_subsystems() {
        let monitor = default_monitor();
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());
        assert_eq!(monitor.subsystem_count(), 2);

        monitor.clear();
        assert_eq!(monitor.subsystem_count(), 0);
    }

    #[test]
    fn test_health_status_score() {
        assert_eq!(HealthStatus::Healthy.score(), 100);
        assert_eq!(HealthStatus::Degraded.score(), 50);
        assert_eq!(HealthStatus::Unhealthy.score(), 0);
    }

    #[test]
    fn test_subsystem_not_found_error() {
        let monitor = default_monitor();

        let result = monitor.check_health("nonexistent");
        assert!(result.is_err());
        assert!(matches!(result, Err(HealthError::SubsystemNotFound(_))));

        // Every mutating operation reports the same error for unknown names.
        assert!(matches!(
            monitor.update_health("nonexistent", true),
            Err(HealthError::SubsystemNotFound(_))
        ));
        assert!(matches!(
            monitor.set_health("nonexistent", HealthStatus::Healthy),
            Err(HealthError::SubsystemNotFound(_))
        ));
        assert!(matches!(
            monitor.update_metadata("nonexistent", "k".to_string(), "v".to_string()),
            Err(HealthError::SubsystemNotFound(_))
        ));
        assert!(matches!(
            monitor.unregister_subsystem("nonexistent"),
            Err(HealthError::SubsystemNotFound(_))
        ));
    }
}