// clawft_kernel/health.rs
//! Health monitoring subsystem.
//!
//! The [`HealthSystem`] aggregates health checks from all registered
//! services and produces an overall [`OverallHealth`] status.

use std::sync::Arc;

use serde::{Deserialize, Serialize};
use tracing::{debug, warn};

use crate::service::ServiceRegistry;

/// Health status for a single service.
///
/// Reported by each service's health check; the `Degraded` and
/// `Unhealthy` variants carry a human-readable reason string.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealthStatus {
    /// Service is operating normally.
    Healthy,
    /// Service is operational but degraded.
    Degraded(String),
    /// Service is not operational.
    Unhealthy(String),
    /// Health status could not be determined (e.g. timeout).
    Unknown,
}

27impl std::fmt::Display for HealthStatus {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        match self {
30            HealthStatus::Healthy => write!(f, "healthy"),
31            HealthStatus::Degraded(msg) => write!(f, "degraded: {msg}"),
32            HealthStatus::Unhealthy(msg) => write!(f, "unhealthy: {msg}"),
33            HealthStatus::Unknown => write!(f, "unknown"),
34        }
35    }
36}
37
38/// Aggregated health status for the entire kernel.
39#[non_exhaustive]
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub enum OverallHealth {
42    /// All services are healthy.
43    Healthy,
44    /// Some services are degraded or unhealthy.
45    Degraded {
46        /// Services that are not fully healthy.
47        unhealthy_services: Vec<String>,
48    },
49    /// All services are unhealthy or no services registered.
50    Down,
51}
52
53impl std::fmt::Display for OverallHealth {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        match self {
56            OverallHealth::Healthy => write!(f, "healthy"),
57            OverallHealth::Degraded { unhealthy_services } => {
58                write!(f, "degraded ({})", unhealthy_services.join(", "))
59            }
60            OverallHealth::Down => write!(f, "down"),
61        }
62    }
63}
64
/// Health monitoring system.
///
/// Periodically checks all registered services and aggregates their
/// health into an overall status.
pub struct HealthSystem {
    // Interval between health-check cycles, in seconds. Exposed via
    // `check_interval_secs()`; `aggregate` runs a single cycle on demand,
    // so the periodic driver presumably lives elsewhere — TODO confirm.
    check_interval_secs: u64,
}

73impl HealthSystem {
74    /// Create a new health system with the given check interval.
75    pub fn new(check_interval_secs: u64) -> Self {
76        Self {
77            check_interval_secs,
78        }
79    }
80
81    /// Get the configured check interval in seconds.
82    pub fn check_interval_secs(&self) -> u64 {
83        self.check_interval_secs
84    }
85
86    /// Run a single health check cycle against all services.
87    pub async fn aggregate(
88        &self,
89        registry: &Arc<ServiceRegistry>,
90    ) -> (OverallHealth, Vec<(String, HealthStatus)>) {
91        let results = registry.health_all().await;
92
93        if results.is_empty() {
94            return (OverallHealth::Down, results);
95        }
96
97        let mut unhealthy = Vec::new();
98        let mut all_unhealthy = true;
99
100        for (name, status) in &results {
101            match status {
102                HealthStatus::Healthy => {
103                    debug!(service = %name, "health check: healthy");
104                    all_unhealthy = false;
105                }
106                HealthStatus::Degraded(msg) => {
107                    warn!(service = %name, reason = %msg, "health check: degraded");
108                    unhealthy.push(name.clone());
109                    all_unhealthy = false;
110                }
111                HealthStatus::Unhealthy(msg) => {
112                    warn!(service = %name, reason = %msg, "health check: unhealthy");
113                    unhealthy.push(name.clone());
114                }
115                HealthStatus::Unknown => {
116                    warn!(service = %name, "health check: unknown");
117                    unhealthy.push(name.clone());
118                }
119            }
120        }
121
122        let overall = if unhealthy.is_empty() {
123            OverallHealth::Healthy
124        } else if all_unhealthy {
125            OverallHealth::Down
126        } else {
127            OverallHealth::Degraded {
128                unhealthy_services: unhealthy,
129            }
130        };
131
132        (overall, results)
133    }
134}
135
136// ── K2b-G2: Liveness and readiness probes (os-patterns) ─────────
137
/// Result of a liveness or readiness probe.
///
/// Liveness and readiness variants are carried in one enum; the
/// `ProbeState::record_*` methods each ignore the other kind.
#[non_exhaustive]
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ProbeResult {
    /// Service is live (responding to probes).
    Live,
    /// Service is not live.
    NotLive {
        /// Human-readable reason the liveness probe failed.
        reason: String,
    },
    /// Service is ready to accept traffic.
    Ready,
    /// Service is not ready.
    NotReady {
        /// Human-readable reason the readiness probe failed.
        reason: String,
    },
}

/// Configuration for liveness and readiness probes.
///
/// Thresholds count *consecutive* probe results; see
/// [`ProbeState::record_liveness`] and [`ProbeState::record_readiness`].
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProbeConfig {
    /// How often to check liveness (default: 10s).
    pub liveness_interval_secs: u64,
    /// How often to check readiness (default: 5s).
    pub readiness_interval_secs: u64,
    /// Number of consecutive failures before marking as failed.
    pub failure_threshold: u32,
    /// Number of consecutive successes before marking as recovered.
    pub success_threshold: u32,
}

#[cfg(feature = "os-patterns")]
impl Default for ProbeConfig {
    /// Defaults: liveness probed every 10s, readiness every 5s; a service
    /// fails after 3 consecutive failures and recovers after 1 success.
    fn default() -> Self {
        ProbeConfig {
            failure_threshold: 3,
            success_threshold: 1,
            liveness_interval_secs: 10,
            readiness_interval_secs: 5,
        }
    }
}

/// Tracks consecutive probe results for threshold-based decisions.
///
/// Counters hold *consecutive* streaks and are reset when the opposite
/// result is observed (see `record_liveness` / `record_readiness`).
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone)]
pub struct ProbeState {
    /// Consecutive liveness failures.
    pub liveness_failures: u32,
    /// Consecutive readiness failures.
    pub readiness_failures: u32,
    /// Consecutive readiness successes (for recovery).
    pub readiness_successes: u32,
    /// Whether the service is currently considered live.
    pub is_live: bool,
    /// Whether the service is currently considered ready.
    pub is_ready: bool,
}

#[cfg(feature = "os-patterns")]
impl Default for ProbeState {
    /// A fresh state is optimistic — live and ready with no recorded
    /// failures — so a service is only demoted after real probe failures.
    fn default() -> Self {
        ProbeState {
            is_live: true,
            is_ready: true,
            liveness_failures: 0,
            readiness_failures: 0,
            readiness_successes: 0,
        }
    }
}

#[cfg(feature = "os-patterns")]
impl ProbeState {
    /// Record a liveness probe result.
    ///
    /// Readiness results passed here are ignored (they are handled by
    /// [`ProbeState::record_readiness`]), and vice versa.
    ///
    /// Returns `true` if the service should be restarted, i.e. the count
    /// of consecutive liveness failures reached `config.failure_threshold`.
    pub fn record_liveness(&mut self, result: &ProbeResult, config: &ProbeConfig) -> bool {
        match result {
            ProbeResult::Live => {
                // Any success fully resets the failure streak.
                self.liveness_failures = 0;
                self.is_live = true;
                false
            }
            ProbeResult::NotLive { .. } => {
                // saturating_add: a probe left failing indefinitely must not
                // overflow the counter (a debug-build panic with `+= 1`).
                self.liveness_failures = self.liveness_failures.saturating_add(1);
                if self.liveness_failures >= config.failure_threshold {
                    self.is_live = false;
                    true
                } else {
                    false
                }
            }
            // Readiness results are ignored here (see `record_readiness`).
            ProbeResult::Ready | ProbeResult::NotReady { .. } => false,
        }
    }

    /// Record a readiness probe result.
    ///
    /// Liveness results passed here are ignored (they are handled by
    /// [`ProbeState::record_liveness`]).
    ///
    /// Returns the readiness state change:
    /// - `Some(false)` if service should be removed from registry
    /// - `Some(true)` if service should be re-added (recovered)
    /// - `None` if no state change
    pub fn record_readiness(&mut self, result: &ProbeResult, config: &ProbeConfig) -> Option<bool> {
        match result {
            ProbeResult::Ready => {
                self.readiness_failures = 0;
                // saturating_add keeps a long success streak from overflowing.
                self.readiness_successes = self.readiness_successes.saturating_add(1);
                if !self.is_ready && self.readiness_successes >= config.success_threshold {
                    self.is_ready = true;
                    Some(true) // recovered
                } else {
                    None
                }
            }
            ProbeResult::NotReady { .. } => {
                self.readiness_successes = 0;
                self.readiness_failures = self.readiness_failures.saturating_add(1);
                if self.is_ready && self.readiness_failures >= config.failure_threshold {
                    self.is_ready = false;
                    Some(false) // became unready
                } else {
                    None
                }
            }
            // Liveness results are ignored here (see `record_liveness`).
            ProbeResult::Live | ProbeResult::NotLive { .. } => None,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::{ServiceType, SystemService};
    use async_trait::async_trait;

    /// Test double whose health check always reports [`HealthStatus::Healthy`].
    struct HealthyService;

    #[async_trait]
    impl SystemService for HealthyService {
        fn name(&self) -> &str {
            "healthy-svc"
        }
        fn service_type(&self) -> ServiceType {
            ServiceType::Core
        }
        // Lifecycle hooks are no-ops; these tests only exercise health reporting.
        async fn start(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn stop(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn health_check(&self) -> HealthStatus {
            HealthStatus::Healthy
        }
    }

    /// Test double whose health check always reports
    /// [`HealthStatus::Unhealthy`] with reason `"test failure"`.
    struct UnhealthyService;

    #[async_trait]
    impl SystemService for UnhealthyService {
        fn name(&self) -> &str {
            "unhealthy-svc"
        }
        fn service_type(&self) -> ServiceType {
            ServiceType::Core
        }
        // Lifecycle hooks are no-ops; these tests only exercise health reporting.
        async fn start(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn stop(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn health_check(&self) -> HealthStatus {
            HealthStatus::Unhealthy("test failure".into())
        }
    }

314    #[tokio::test]
315    async fn aggregate_all_healthy() {
316        let registry = Arc::new(ServiceRegistry::new());
317        registry.register(Arc::new(HealthyService)).unwrap();
318
319        let health = HealthSystem::new(30);
320        let (overall, results) = health.aggregate(&registry).await;
321
322        assert!(matches!(overall, OverallHealth::Healthy));
323        assert_eq!(results.len(), 1);
324    }
325
326    #[tokio::test]
327    async fn aggregate_mixed() {
328        let registry = Arc::new(ServiceRegistry::new());
329        registry.register(Arc::new(HealthyService)).unwrap();
330        registry.register(Arc::new(UnhealthyService)).unwrap();
331
332        let health = HealthSystem::new(30);
333        let (overall, results) = health.aggregate(&registry).await;
334
335        assert!(matches!(overall, OverallHealth::Degraded { .. }));
336        assert_eq!(results.len(), 2);
337    }
338
339    #[tokio::test]
340    async fn aggregate_all_unhealthy() {
341        let registry = Arc::new(ServiceRegistry::new());
342        registry.register(Arc::new(UnhealthyService)).unwrap();
343
344        let health = HealthSystem::new(30);
345        let (overall, _) = health.aggregate(&registry).await;
346
347        assert!(matches!(overall, OverallHealth::Down));
348    }
349
350    #[tokio::test]
351    async fn aggregate_empty_registry() {
352        let registry = Arc::new(ServiceRegistry::new());
353        let health = HealthSystem::new(30);
354        let (overall, results) = health.aggregate(&registry).await;
355
356        assert!(matches!(overall, OverallHealth::Down));
357        assert!(results.is_empty());
358    }
359
360    #[test]
361    fn health_status_display() {
362        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
363        assert_eq!(
364            HealthStatus::Degraded("slow".into()).to_string(),
365            "degraded: slow"
366        );
367        assert_eq!(
368            HealthStatus::Unhealthy("crash".into()).to_string(),
369            "unhealthy: crash"
370        );
371        assert_eq!(HealthStatus::Unknown.to_string(), "unknown");
372    }
373
374    #[test]
375    fn overall_health_display() {
376        assert_eq!(OverallHealth::Healthy.to_string(), "healthy");
377        assert_eq!(OverallHealth::Down.to_string(), "down");
378        assert_eq!(
379            OverallHealth::Degraded {
380                unhealthy_services: vec!["svc-a".into(), "svc-b".into()]
381            }
382            .to_string(),
383            "degraded (svc-a, svc-b)"
384        );
385    }
386
387    #[test]
388    fn check_interval() {
389        let health = HealthSystem::new(15);
390        assert_eq!(health.check_interval_secs(), 15);
391    }
392
    // ── K2b-G2: Probe tests (os-patterns) ────────────────────────

    #[cfg(feature = "os-patterns")]
    mod probe_tests {
        use super::super::*;

        /// Defaults match the documented values on `ProbeConfig`.
        #[test]
        fn probe_config_default() {
            let config = ProbeConfig::default();
            assert_eq!(config.liveness_interval_secs, 10);
            assert_eq!(config.readiness_interval_secs, 5);
            assert_eq!(config.failure_threshold, 3);
            assert_eq!(config.success_threshold, 1);
        }

        /// `ProbeConfig` survives a JSON serialize/deserialize round trip.
        #[test]
        fn probe_config_serde_roundtrip() {
            let config = ProbeConfig {
                liveness_interval_secs: 15,
                readiness_interval_secs: 10,
                failure_threshold: 5,
                success_threshold: 2,
            };
            let json = serde_json::to_string(&config).unwrap();
            let restored: ProbeConfig = serde_json::from_str(&json).unwrap();
            assert_eq!(restored.liveness_interval_secs, 15);
            assert_eq!(restored.failure_threshold, 5);
        }

        /// Every `ProbeResult` variant round-trips through JSON unchanged.
        #[test]
        fn probe_result_serde_roundtrip() {
            let results = vec![
                ProbeResult::Live,
                ProbeResult::NotLive { reason: "oom".into() },
                ProbeResult::Ready,
                ProbeResult::NotReady { reason: "init".into() },
            ];
            for result in results {
                let json = serde_json::to_string(&result).unwrap();
                let restored: ProbeResult = serde_json::from_str(&json).unwrap();
                assert_eq!(restored, result);
            }
        }

        /// A fresh `ProbeState` starts optimistic: live, ready, no failures.
        #[test]
        fn probe_state_default_is_live_and_ready() {
            let state = ProbeState::default();
            assert!(state.is_live);
            assert!(state.is_ready);
            assert_eq!(state.liveness_failures, 0);
            assert_eq!(state.readiness_failures, 0);
        }

        /// A single `Live` result clears an accumulated failure streak.
        #[test]
        fn liveness_resets_on_success() {
            let config = ProbeConfig {
                failure_threshold: 3,
                ..Default::default()
            };
            let mut state = ProbeState::default();
            state.liveness_failures = 2;

            let restart = state.record_liveness(&ProbeResult::Live, &config);
            assert!(!restart);
            assert_eq!(state.liveness_failures, 0);
            assert!(state.is_live);
        }

        /// Restart is requested exactly when consecutive failures hit the
        /// configured threshold, not before.
        #[test]
        fn liveness_triggers_restart_at_threshold() {
            let config = ProbeConfig {
                failure_threshold: 3,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            // 2 failures: no restart
            assert!(!state.record_liveness(&ProbeResult::NotLive { reason: "hang".into() }, &config));
            assert!(!state.record_liveness(&ProbeResult::NotLive { reason: "hang".into() }, &config));
            assert!(state.is_live);

            // 3rd failure: restart
            assert!(state.record_liveness(&ProbeResult::NotLive { reason: "hang".into() }, &config));
            assert!(!state.is_live);
        }

        /// Readiness removal (`Some(false)`) fires only once the failure
        /// threshold is reached.
        #[test]
        fn readiness_removes_at_threshold() {
            let config = ProbeConfig {
                failure_threshold: 2,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            assert!(state.record_readiness(&ProbeResult::NotReady { reason: "init".into() }, &config).is_none());
            let change = state.record_readiness(&ProbeResult::NotReady { reason: "init".into() }, &config);
            assert_eq!(change, Some(false)); // should be removed
            assert!(!state.is_ready);
        }

        /// After going unready, enough consecutive successes produce a
        /// `Some(true)` recovery signal.
        #[test]
        fn readiness_recovery_re_adds() {
            let config = ProbeConfig {
                failure_threshold: 1,
                success_threshold: 1,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            // Make it unready
            state.record_readiness(&ProbeResult::NotReady { reason: "init".into() }, &config);
            assert!(!state.is_ready);

            // Recover
            let change = state.record_readiness(&ProbeResult::Ready, &config);
            assert_eq!(change, Some(true));
            assert!(state.is_ready);
        }

        /// Sub-threshold flips in either direction cause no state change,
        /// so a single bad (or good) probe cannot flap the service.
        #[test]
        fn threshold_prevents_flapping() {
            let config = ProbeConfig {
                failure_threshold: 3,
                success_threshold: 2,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            // One failure shouldn't change anything
            assert!(state.record_readiness(&ProbeResult::NotReady { reason: "x".into() }, &config).is_none());
            assert!(state.is_ready);

            // One success resets failures
            assert!(state.record_readiness(&ProbeResult::Ready, &config).is_none());
            assert!(state.is_ready);
        }

        /// Sanity check on the positive probe variants' identity.
        #[test]
        fn default_probe_returns_live_ready() {
            // Default liveness/readiness should return Live/Ready
            let live = ProbeResult::Live;
            let ready = ProbeResult::Ready;
            assert_eq!(live, ProbeResult::Live);
            assert_eq!(ready, ProbeResult::Ready);
        }
    }
539}