Skip to main content

axon/
health_check.rs

1//! Health Check — structured health assessment for AxonServer subsystems.
2//!
3//! Provides readiness/liveness checks with per-component status:
//!   - `event_bus` — event bus activity counters (in-process channel, always reported healthy)
5//!   - `supervisor` — daemon supervisor (no dead daemons)
6//!   - `session_store` — session store accessible
7//!   - `version_registry` — flow version registry accessible
8//!   - `rate_limiter` — rate limiter status and configuration
9//!   - `request_logger` — request log buffer utilization
10//!   - `api_keys` — API key manager status
11//!   - `webhooks` — webhook registry and delivery health
12//!   - `audit_log` — audit trail buffer utilization
13//!
14//! Endpoints:
15//!   - `/v1/health` — full health report with component details
16//!   - `/v1/health/live` — liveness probe (always up if responding)
17//!   - `/v1/health/ready` — readiness probe (all components healthy or degraded)
18
19use serde::Serialize;
20use std::collections::HashMap;
21
22// ── Types ────────────────────────────────────────────────────────────────
23
24/// Overall health status.
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
26#[serde(rename_all = "lowercase")]
27pub enum HealthStatus {
28    /// All components operational.
29    Healthy,
30    /// Some components impaired but server can still serve requests.
31    Degraded,
32    /// Critical failure — server cannot serve requests reliably.
33    Unhealthy,
34}
35
36impl HealthStatus {
37    pub fn as_str(&self) -> &'static str {
38        match self {
39            HealthStatus::Healthy => "healthy",
40            HealthStatus::Degraded => "degraded",
41            HealthStatus::Unhealthy => "unhealthy",
42        }
43    }
44}
45
46/// Result of checking a single component.
47#[derive(Debug, Clone, Serialize)]
48pub struct ComponentCheck {
49    pub name: String,
50    pub status: HealthStatus,
51    #[serde(skip_serializing_if = "Option::is_none")]
52    pub message: Option<String>,
53    #[serde(skip_serializing_if = "Option::is_none")]
54    pub details: Option<serde_json::Value>,
55}
56
57/// Full health report.
58#[derive(Debug, Clone, Serialize)]
59pub struct HealthReport {
60    pub status: HealthStatus,
61    pub uptime_secs: u64,
62    pub axon_version: String,
63    pub components: Vec<ComponentCheck>,
64}
65
66// ── Input snapshot ───────────────────────────────────────────────────────
67
/// Lightweight snapshot of server state for health evaluation.
///
/// Decouples health logic from the locked ServerState: the caller copies the
/// counters it needs into this plain struct and hands it to [`evaluate`] or
/// [`readiness`].
///
/// `Default` yields an all-zero / all-disabled snapshot, which is convenient
/// as a base for struct-update syntax (`HealthInput { uptime_secs: 1, ..Default::default() }`).
#[derive(Debug, Clone, Default)]
pub struct HealthInput {
    /// Server uptime in seconds; copied verbatim into the report.
    pub uptime_secs: u64,
    /// Axon version string; copied verbatim into the report.
    pub axon_version: String,
    /// Total number of supervised daemons.
    pub daemon_count: usize,
    /// Daemon count per state name. The supervisor check reads the `"dead"` key.
    pub daemon_state_counts: HashMap<String, usize>,
    /// Events published on the event bus (reported as `events_published`).
    pub bus_events_published: u64,
    /// Event bus subscribers (reported as `subscriber_count`).
    pub bus_subscriber_count: usize,
    /// Session entries reported as `memory_entries`.
    pub session_memory_count: usize,
    /// Session entries reported as `persistent_entries`.
    pub session_store_count: usize,
    /// Flows tracked by the version registry.
    pub flows_tracked: usize,
    /// Total versions held by the version registry.
    pub versions_total: usize,
    // D45: new component fields
    /// Whether the rate limiter is enabled.
    pub rate_limiter_enabled: bool,
    /// Rate limiter setting reported as `max_requests`.
    pub rate_limiter_max_requests: u32,
    /// Rate limiter setting reported as `window_secs`.
    pub rate_limiter_window_secs: u64,
    /// Whether request logging is enabled.
    pub request_log_enabled: bool,
    /// Entries currently held in the request log buffer.
    pub request_log_entries: usize,
    /// Capacity of the request log buffer; `0` disables the utilization check.
    pub request_log_capacity: usize,
    /// Whether API key authentication is enabled.
    pub api_keys_enabled: bool,
    /// Number of active API keys.
    pub api_keys_active: usize,
    /// Total number of API keys (reported as `total_keys`).
    pub api_keys_total: usize,
    /// Number of active webhooks.
    pub webhooks_active: usize,
    /// Total number of registered webhooks.
    pub webhooks_total: usize,
    /// Delivery failures summed over all webhooks.
    pub webhooks_total_failures: u64,
    /// Entries currently buffered in the audit log.
    pub audit_log_entries: usize,
    /// Audit entries reported as `total_recorded`.
    pub audit_log_total_recorded: u64,
}
97
98// ── Evaluation ───────────────────────────────────────────────────────────
99
100/// Evaluate full health from a server snapshot.
101pub fn evaluate(input: &HealthInput) -> HealthReport {
102    let mut components = Vec::new();
103
104    // Event bus check
105    components.push(check_event_bus(input));
106
107    // Supervisor check
108    components.push(check_supervisor(input));
109
110    // Session store check
111    components.push(check_session_store(input));
112
113    // Version registry check
114    components.push(check_version_registry(input));
115
116    // Rate limiter check
117    components.push(check_rate_limiter(input));
118
119    // Request logger check
120    components.push(check_request_logger(input));
121
122    // API keys check
123    components.push(check_api_keys(input));
124
125    // Webhooks check
126    components.push(check_webhooks(input));
127
128    // Audit log check
129    components.push(check_audit_log(input));
130
131    // Aggregate status: unhealthy if any unhealthy, degraded if any degraded
132    let status = aggregate_status(&components);
133
134    HealthReport {
135        status,
136        uptime_secs: input.uptime_secs,
137        axon_version: input.axon_version.clone(),
138        components,
139    }
140}
141
142/// Liveness check — always alive if the server is responding.
143pub fn liveness() -> serde_json::Value {
144    serde_json::json!({
145        "status": "alive"
146    })
147}
148
149/// Readiness check — ready if no component is unhealthy.
150pub fn readiness(input: &HealthInput) -> serde_json::Value {
151    let report = evaluate(input);
152    let ready = report.status != HealthStatus::Unhealthy;
153    serde_json::json!({
154        "ready": ready,
155        "status": report.status.as_str()
156    })
157}
158
159// ── Component checks ─────────────────────────────────────────────────────
160
161fn check_event_bus(input: &HealthInput) -> ComponentCheck {
162    let details = serde_json::json!({
163        "events_published": input.bus_events_published,
164        "subscriber_count": input.bus_subscriber_count,
165    });
166
167    // Bus is always healthy — it's an in-process channel, never "down"
168    ComponentCheck {
169        name: "event_bus".to_string(),
170        status: HealthStatus::Healthy,
171        message: None,
172        details: Some(details),
173    }
174}
175
176fn check_supervisor(input: &HealthInput) -> ComponentCheck {
177    let dead = input.daemon_state_counts.get("dead").copied().unwrap_or(0);
178    let total = input.daemon_count;
179
180    let details = serde_json::json!({
181        "daemon_count": total,
182        "states": input.daemon_state_counts,
183    });
184
185    let (status, message) = if dead > 0 && dead == total && total > 0 {
186        (HealthStatus::Unhealthy, Some(format!("all {} daemons dead", total)))
187    } else if dead > 0 {
188        (HealthStatus::Degraded, Some(format!("{} of {} daemons dead", dead, total)))
189    } else {
190        (HealthStatus::Healthy, None)
191    };
192
193    ComponentCheck {
194        name: "supervisor".to_string(),
195        status,
196        message,
197        details: Some(details),
198    }
199}
200
201fn check_session_store(input: &HealthInput) -> ComponentCheck {
202    let details = serde_json::json!({
203        "memory_entries": input.session_memory_count,
204        "persistent_entries": input.session_store_count,
205    });
206
207    // Session store is in-process HashMap + file — always accessible
208    ComponentCheck {
209        name: "session_store".to_string(),
210        status: HealthStatus::Healthy,
211        message: None,
212        details: Some(details),
213    }
214}
215
216fn check_version_registry(input: &HealthInput) -> ComponentCheck {
217    let details = serde_json::json!({
218        "flows_tracked": input.flows_tracked,
219        "versions_total": input.versions_total,
220    });
221
222    ComponentCheck {
223        name: "version_registry".to_string(),
224        status: HealthStatus::Healthy,
225        message: None,
226        details: Some(details),
227    }
228}
229
230fn check_rate_limiter(input: &HealthInput) -> ComponentCheck {
231    let details = serde_json::json!({
232        "enabled": input.rate_limiter_enabled,
233        "max_requests": input.rate_limiter_max_requests,
234        "window_secs": input.rate_limiter_window_secs,
235    });
236
237    ComponentCheck {
238        name: "rate_limiter".to_string(),
239        status: HealthStatus::Healthy,
240        message: if !input.rate_limiter_enabled { Some("disabled".to_string()) } else { None },
241        details: Some(details),
242    }
243}
244
245fn check_request_logger(input: &HealthInput) -> ComponentCheck {
246    let details = serde_json::json!({
247        "enabled": input.request_log_enabled,
248        "entries": input.request_log_entries,
249        "capacity": input.request_log_capacity,
250    });
251
252    // Degraded if buffer is >90% full
253    let (status, message) = if !input.request_log_enabled {
254        (HealthStatus::Healthy, Some("disabled".to_string()))
255    } else if input.request_log_capacity > 0 && input.request_log_entries * 100 / input.request_log_capacity > 90 {
256        (HealthStatus::Degraded, Some(format!("buffer {}% full ({}/{})", input.request_log_entries * 100 / input.request_log_capacity, input.request_log_entries, input.request_log_capacity)))
257    } else {
258        (HealthStatus::Healthy, None)
259    };
260
261    ComponentCheck {
262        name: "request_logger".to_string(),
263        status,
264        message,
265        details: Some(details),
266    }
267}
268
269fn check_api_keys(input: &HealthInput) -> ComponentCheck {
270    let details = serde_json::json!({
271        "enabled": input.api_keys_enabled,
272        "active_keys": input.api_keys_active,
273        "total_keys": input.api_keys_total,
274    });
275
276    // Degraded if auth enabled but no active keys (locked out risk)
277    let (status, message) = if input.api_keys_enabled && input.api_keys_active == 0 && input.api_keys_total > 0 {
278        (HealthStatus::Degraded, Some("all keys revoked — only master token works".to_string()))
279    } else {
280        (HealthStatus::Healthy, None)
281    };
282
283    ComponentCheck {
284        name: "api_keys".to_string(),
285        status,
286        message,
287        details: Some(details),
288    }
289}
290
291fn check_webhooks(input: &HealthInput) -> ComponentCheck {
292    let details = serde_json::json!({
293        "active_webhooks": input.webhooks_active,
294        "total_webhooks": input.webhooks_total,
295        "total_failures": input.webhooks_total_failures,
296    });
297
298    // Degraded if >50% of webhooks have failures
299    let (status, message) = if input.webhooks_total > 0 && input.webhooks_total_failures > input.webhooks_total as u64 * 5 {
300        (HealthStatus::Degraded, Some(format!("{} delivery failures across {} webhooks", input.webhooks_total_failures, input.webhooks_total)))
301    } else {
302        (HealthStatus::Healthy, None)
303    };
304
305    ComponentCheck {
306        name: "webhooks".to_string(),
307        status,
308        message,
309        details: Some(details),
310    }
311}
312
313fn check_audit_log(input: &HealthInput) -> ComponentCheck {
314    let details = serde_json::json!({
315        "buffered_entries": input.audit_log_entries,
316        "total_recorded": input.audit_log_total_recorded,
317    });
318
319    ComponentCheck {
320        name: "audit_log".to_string(),
321        status: HealthStatus::Healthy,
322        message: None,
323        details: Some(details),
324    }
325}
326
327fn aggregate_status(components: &[ComponentCheck]) -> HealthStatus {
328    let mut worst = HealthStatus::Healthy;
329    for c in components {
330        match c.status {
331            HealthStatus::Unhealthy => return HealthStatus::Unhealthy,
332            HealthStatus::Degraded => worst = HealthStatus::Degraded,
333            HealthStatus::Healthy => {}
334        }
335    }
336    worst
337}
338
339// ── Tests ────────────────────────────────────────────────────────────────
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline snapshot in which every component evaluates as healthy.
    fn sample_input() -> HealthInput {
        let daemon_state_counts: HashMap<String, usize> = [("running", 2), ("waiting", 1)]
            .iter()
            .map(|&(state, count)| (state.to_string(), count))
            .collect();

        HealthInput {
            uptime_secs: 3600,
            axon_version: "0.31.0".to_string(),
            daemon_count: 3,
            daemon_state_counts,
            bus_events_published: 100,
            bus_subscriber_count: 3,
            session_memory_count: 5,
            session_store_count: 2,
            flows_tracked: 4,
            versions_total: 10,
            rate_limiter_enabled: true,
            rate_limiter_max_requests: 100,
            rate_limiter_window_secs: 60,
            request_log_enabled: true,
            request_log_entries: 50,
            request_log_capacity: 1000,
            api_keys_enabled: true,
            api_keys_active: 3,
            api_keys_total: 5,
            webhooks_active: 2,
            webhooks_total: 3,
            webhooks_total_failures: 0,
            audit_log_entries: 100,
            audit_log_total_recorded: 150,
        }
    }

    /// Baseline snapshot modified so that all `count` daemons are dead.
    fn all_dead_input(count: usize) -> HealthInput {
        let mut states = HashMap::new();
        states.insert("dead".to_string(), count);

        let mut input = sample_input();
        input.daemon_count = count;
        input.daemon_state_counts = states;
        input
    }

    /// Finds a component of `report` by name; panics when it is missing.
    fn component<'a>(report: &'a HealthReport, name: &str) -> &'a ComponentCheck {
        report
            .components
            .iter()
            .find(|c| c.name == name)
            .unwrap()
    }

    #[test]
    fn healthy_report_all_green() {
        let report = evaluate(&sample_input());
        assert_eq!(report.status, HealthStatus::Healthy);
        assert_eq!(report.components.len(), 9);
        for c in &report.components {
            assert_eq!(c.status, HealthStatus::Healthy, "component {} not healthy", c.name);
        }
    }

    #[test]
    fn degraded_when_some_daemons_dead() {
        let mut input = sample_input();
        input.daemon_state_counts.insert("dead".to_string(), 1);
        let report = evaluate(&input);
        assert_eq!(report.status, HealthStatus::Degraded);

        let sup = component(&report, "supervisor");
        assert_eq!(sup.status, HealthStatus::Degraded);
        assert!(sup.message.as_ref().unwrap().contains("1 of"));
    }

    #[test]
    fn unhealthy_when_all_daemons_dead() {
        let report = evaluate(&all_dead_input(3));
        assert_eq!(report.status, HealthStatus::Unhealthy);

        let sup = component(&report, "supervisor");
        assert_eq!(sup.status, HealthStatus::Unhealthy);
        assert!(sup.message.as_ref().unwrap().contains("all 3 daemons dead"));
    }

    #[test]
    fn healthy_when_no_daemons() {
        let mut input = sample_input();
        input.daemon_count = 0;
        input.daemon_state_counts.clear();
        assert_eq!(evaluate(&input).status, HealthStatus::Healthy);
    }

    #[test]
    fn liveness_always_alive() {
        assert_eq!(liveness()["status"], "alive");
    }

    #[test]
    fn readiness_true_when_healthy() {
        let ready = readiness(&sample_input());
        assert_eq!(ready["ready"], true);
        assert_eq!(ready["status"], "healthy");
    }

    #[test]
    fn readiness_true_when_degraded() {
        let mut input = sample_input();
        input.daemon_state_counts.insert("dead".to_string(), 1);
        let ready = readiness(&input);
        assert_eq!(ready["ready"], true);
        assert_eq!(ready["status"], "degraded");
    }

    #[test]
    fn readiness_false_when_unhealthy() {
        let ready = readiness(&all_dead_input(2));
        assert_eq!(ready["ready"], false);
        assert_eq!(ready["status"], "unhealthy");
    }

    #[test]
    fn report_includes_uptime_and_version() {
        let report = evaluate(&sample_input());
        assert_eq!(report.uptime_secs, 3600);
        assert_eq!(report.axon_version, "0.31.0");
    }

    #[test]
    fn component_details_present() {
        let report = evaluate(&sample_input());
        for c in &report.components {
            assert!(c.details.is_some(), "component {} missing details", c.name);
        }
    }

    #[test]
    fn event_bus_details_contain_counts() {
        let report = evaluate(&sample_input());
        let d = component(&report, "event_bus").details.as_ref().unwrap();
        assert_eq!(d["events_published"], 100);
        assert_eq!(d["subscriber_count"], 3);
    }

    #[test]
    fn supervisor_details_contain_states() {
        let report = evaluate(&sample_input());
        let d = component(&report, "supervisor").details.as_ref().unwrap();
        assert_eq!(d["daemon_count"], 3);
        assert!(d["states"].is_object());
    }

    #[test]
    fn session_store_details() {
        let report = evaluate(&sample_input());
        let d = component(&report, "session_store").details.as_ref().unwrap();
        assert_eq!(d["memory_entries"], 5);
        assert_eq!(d["persistent_entries"], 2);
    }

    #[test]
    fn version_registry_details() {
        let report = evaluate(&sample_input());
        let d = component(&report, "version_registry").details.as_ref().unwrap();
        assert_eq!(d["flows_tracked"], 4);
        assert_eq!(d["versions_total"], 10);
    }

    #[test]
    fn health_status_serialization() {
        let expected = [
            (HealthStatus::Healthy, "\"healthy\""),
            (HealthStatus::Degraded, "\"degraded\""),
            (HealthStatus::Unhealthy, "\"unhealthy\""),
        ];
        for (status, json) in expected.iter() {
            assert_eq!(serde_json::to_string(status).unwrap(), *json);
        }
    }

    #[test]
    fn full_report_serializable() {
        let report = evaluate(&sample_input());
        let json = serde_json::to_string(&report).unwrap();
        assert!(json.contains("\"healthy\""));

        let component_names = [
            "\"event_bus\"",
            "\"supervisor\"",
            "\"session_store\"",
            "\"version_registry\"",
            "\"rate_limiter\"",
            "\"request_logger\"",
            "\"api_keys\"",
            "\"webhooks\"",
            "\"audit_log\"",
        ];
        for quoted in component_names.iter() {
            assert!(json.contains(quoted));
        }
    }

    #[test]
    fn aggregate_picks_worst_status() {
        let bare = |name: &str, status: HealthStatus| ComponentCheck {
            name: name.into(),
            status,
            message: None,
            details: None,
        };

        let checks = vec![
            bare("a", HealthStatus::Healthy),
            bare("b", HealthStatus::Degraded),
            bare("c", HealthStatus::Healthy),
        ];
        assert_eq!(aggregate_status(&checks), HealthStatus::Degraded);

        let checks2 = vec![
            bare("a", HealthStatus::Degraded),
            bare("b", HealthStatus::Unhealthy),
        ];
        assert_eq!(aggregate_status(&checks2), HealthStatus::Unhealthy);
    }

    #[test]
    fn rate_limiter_details() {
        let report = evaluate(&sample_input());
        let rl = component(&report, "rate_limiter");
        assert_eq!(rl.status, HealthStatus::Healthy);

        let d = rl.details.as_ref().unwrap();
        assert_eq!(d["enabled"], true);
        assert_eq!(d["max_requests"], 100);
        assert_eq!(d["window_secs"], 60);
    }

    #[test]
    fn rate_limiter_disabled_shows_message() {
        let mut input = sample_input();
        input.rate_limiter_enabled = false;
        let report = evaluate(&input);
        let rl = component(&report, "rate_limiter");
        assert_eq!(rl.status, HealthStatus::Healthy);
        assert_eq!(rl.message.as_deref(), Some("disabled"));
    }

    #[test]
    fn request_logger_degraded_when_buffer_full() {
        let mut input = sample_input();
        input.request_log_entries = 950;
        input.request_log_capacity = 1000;
        let report = evaluate(&input);
        let rl = component(&report, "request_logger");
        assert_eq!(rl.status, HealthStatus::Degraded);
        assert!(rl.message.as_ref().unwrap().contains("95%"));
    }

    #[test]
    fn request_logger_healthy_when_low_usage() {
        let report = evaluate(&sample_input());
        let rl = component(&report, "request_logger");
        assert_eq!(rl.status, HealthStatus::Healthy);
        assert!(rl.message.is_none());
    }

    #[test]
    fn api_keys_degraded_when_all_revoked() {
        let mut input = sample_input();
        input.api_keys_active = 0;
        input.api_keys_total = 3;
        let report = evaluate(&input);
        let ak = component(&report, "api_keys");
        assert_eq!(ak.status, HealthStatus::Degraded);
        assert!(ak.message.as_ref().unwrap().contains("all keys revoked"));
    }

    #[test]
    fn api_keys_healthy_when_disabled() {
        let mut input = sample_input();
        input.api_keys_enabled = false;
        input.api_keys_active = 0;
        input.api_keys_total = 0;
        let report = evaluate(&input);
        assert_eq!(component(&report, "api_keys").status, HealthStatus::Healthy);
    }

    #[test]
    fn webhooks_degraded_when_many_failures() {
        let mut input = sample_input();
        input.webhooks_total = 2;
        input.webhooks_total_failures = 20; // > 2*5 = 10
        let report = evaluate(&input);
        let wh = component(&report, "webhooks");
        assert_eq!(wh.status, HealthStatus::Degraded);
        assert!(wh.message.as_ref().unwrap().contains("20 delivery failures"));
    }

    #[test]
    fn webhooks_healthy_with_low_failures() {
        let report = evaluate(&sample_input());
        assert_eq!(component(&report, "webhooks").status, HealthStatus::Healthy);
    }

    #[test]
    fn audit_log_details() {
        let report = evaluate(&sample_input());
        let al = component(&report, "audit_log");
        assert_eq!(al.status, HealthStatus::Healthy);

        let d = al.details.as_ref().unwrap();
        assert_eq!(d["buffered_entries"], 100);
        assert_eq!(d["total_recorded"], 150);
    }
}