stygian_graph/application/health.rs
1//! Health check reporting for Kubernetes liveness and readiness probes.
2//!
//! Provides structured health-check types and a [`HealthReporter`] for aggregating
//! component-level health into an overall [`HealthReport`].
5//!
6//! # Example
7//!
8//! ```
9//! use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
10//!
//! let reporter = HealthReporter::new();
12//! reporter.register("database", HealthStatus::Healthy);
13//! reporter.register("cache", HealthStatus::Degraded("High latency".to_string()));
14//!
15//! let report = reporter.report();
16//! assert!(report.is_ready()); // Degraded is still operational ⇒ ready
17//! assert!(report.is_live()); // Still alive while degraded
18//! ```
19
20use std::collections::HashMap;
21use std::time::SystemTime;
22
23use parking_lot::RwLock;
24
25use serde::{Deserialize, Serialize};
26
27// ─── HealthStatus ─────────────────────────────────────────────────────────────
28
/// The health status of a single component.
///
/// `Degraded` and `Unhealthy` each carry a free-form, human-readable reason.
///
/// The serde attributes below make the enum adjacently tagged: the lowercase
/// variant name is stored under the `"status"` key and the reason string (for
/// the variants that have one) under the `"reason"` key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "status", content = "reason", rename_all = "lowercase")]
pub enum HealthStatus {
    /// Component is operating normally.
    Healthy,
    /// Component is partially impaired but still serving requests.
    /// The payload is the reason for the degradation.
    Degraded(String),
    /// Component is non-functional; requests will fail.
    /// The payload is the reason for the failure.
    Unhealthy(String),
}
40
41impl HealthStatus {
42 /// Returns `true` only when the component is fully healthy.
43 ///
44 /// # Example
45 ///
46 /// ```
47 /// use stygian_graph::application::health::HealthStatus;
48 /// assert!(HealthStatus::Healthy.is_healthy());
49 /// assert!(!HealthStatus::Degraded("latency".into()).is_healthy());
50 /// ```
51 #[must_use]
52 pub const fn is_healthy(&self) -> bool {
53 matches!(self, Self::Healthy)
54 }
55
56 /// Returns `true` when the component can still serve requests (healthy or degraded).
57 ///
58 /// # Example
59 ///
60 /// ```
61 /// use stygian_graph::application::health::HealthStatus;
62 /// assert!(HealthStatus::Healthy.is_operational());
63 /// assert!(HealthStatus::Degraded("high latency".into()).is_operational());
64 /// assert!(!HealthStatus::Unhealthy("connection refused".into()).is_operational());
65 /// ```
66 #[must_use]
67 pub const fn is_operational(&self) -> bool {
68 !matches!(self, Self::Unhealthy(_))
69 }
70}
71
72impl std::fmt::Display for HealthStatus {
73 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74 match self {
75 Self::Healthy => write!(f, "healthy"),
76 Self::Degraded(r) => write!(f, "degraded: {r}"),
77 Self::Unhealthy(r) => write!(f, "unhealthy: {r}"),
78 }
79 }
80}
81
82// ─── ComponentHealth ─────────────────────────────────────────────────────────
83
/// Health state for a single named component.
///
/// Returned as part of a [`HealthReport`].
///
/// Build one with [`ComponentHealth::healthy`], [`ComponentHealth::degraded`]
/// or [`ComponentHealth::unhealthy`], optionally followed by
/// [`ComponentHealth::with_details`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    /// Component identifier (e.g. `"database"`, `"cache"`, `"worker_pool"`)
    pub name: String,
    /// Component status
    pub status: HealthStatus,
    /// Optional free-form details (timings, error messages, etc.).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub details: Option<serde_json::Value>,
}
97
98impl ComponentHealth {
99 /// Create a healthy component with no extra details.
100 ///
101 /// # Example
102 ///
103 /// ```
104 /// use stygian_graph::application::health::{ComponentHealth, HealthStatus};
105 ///
106 /// let c = ComponentHealth::healthy("cache");
107 /// assert_eq!(c.status, HealthStatus::Healthy);
108 /// ```
109 pub fn healthy(name: impl Into<String>) -> Self {
110 Self {
111 name: name.into(),
112 status: HealthStatus::Healthy,
113 details: None,
114 }
115 }
116
117 /// Create a degraded component.
118 ///
119 /// # Example
120 ///
121 /// ```
122 /// use stygian_graph::application::health::ComponentHealth;
123 ///
124 /// let c = ComponentHealth::degraded("database", "replication lag 5s");
125 /// assert!(!c.status.is_healthy());
126 /// ```
127 pub fn degraded(name: impl Into<String>, reason: impl Into<String>) -> Self {
128 Self {
129 name: name.into(),
130 status: HealthStatus::Degraded(reason.into()),
131 details: None,
132 }
133 }
134
135 /// Create an unhealthy component.
136 ///
137 /// # Example
138 ///
139 /// ```
140 /// use stygian_graph::application::health::ComponentHealth;
141 ///
142 /// let c = ComponentHealth::unhealthy("valkey", "connection refused");
143 /// assert!(!c.status.is_operational());
144 /// ```
145 pub fn unhealthy(name: impl Into<String>, reason: impl Into<String>) -> Self {
146 Self {
147 name: name.into(),
148 status: HealthStatus::Unhealthy(reason.into()),
149 details: None,
150 }
151 }
152
153 /// Attach arbitrary JSON details to this component.
154 ///
155 /// # Example
156 ///
157 /// ```
158 /// use stygian_graph::application::health::ComponentHealth;
159 ///
160 /// let c = ComponentHealth::healthy("http_pool")
161 /// .with_details(serde_json::json!({ "idle_connections": 8, "max": 32 }));
162 ///
163 /// assert!(c.details.is_some());
164 /// ```
165 #[must_use]
166 pub fn with_details(mut self, details: serde_json::Value) -> Self {
167 self.details = Some(details);
168 self
169 }
170}
171
172// ─── HealthReport ────────────────────────────────────────────────────────────
173
/// Aggregated health report for all registered components.
///
/// Returned by [`HealthReporter::report`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
    /// Overall system status (worst of all components)
    pub overall: HealthStatus,
    /// Per-component breakdown
    pub components: Vec<ComponentHealth>,
    /// When this report was generated.
    ///
    /// Serialized through `system_time_serde` as whole seconds since the
    /// Unix epoch.
    #[serde(with = "system_time_serde")]
    pub checked_at: SystemTime,
}
187
188impl HealthReport {
189 /// Returns `true` when the system is ready to serve traffic.
190 ///
191 /// The system is ready only when **all** components are healthy or degraded
192 /// (Kubernetes readiness probe).
193 ///
194 /// # Example
195 ///
196 /// ```
197 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
198 ///
199 /// let mut r = HealthReporter::new();
200 /// r.register("db", HealthStatus::Healthy);
201 /// assert!(r.report().is_ready());
202 /// ```
203 #[must_use]
204 pub fn is_ready(&self) -> bool {
205 self.components.iter().all(|c| c.status.is_operational())
206 }
207
208 /// Returns `true` while the process should continue running.
209 ///
210 /// The process is considered alive unless every component is unhealthy
211 /// (Kubernetes liveness probe).
212 ///
213 /// # Example
214 ///
215 /// ```
216 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
217 ///
218 /// let r = HealthReporter::new();
219 /// r.register("db", HealthStatus::Unhealthy("disk full".into()));
220 /// r.register("cache", HealthStatus::Healthy);
221 /// // One unhealthy component doesn't kill the process while others are healthy
222 /// assert!(r.report().is_live());
223 /// ```
224 #[must_use]
225 pub fn is_live(&self) -> bool {
226 // Dead when ALL components are unhealthy (or no components registered)
227 if self.components.is_empty() {
228 return true;
229 }
230 self.components.iter().any(|c| c.status.is_operational())
231 }
232
233 /// HTTP status code suitable for a health-check endpoint.
234 ///
235 /// Returns `200` when ready, `503` when not.
236 ///
237 /// # Example
238 ///
239 /// ```
240 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
241 ///
242 /// let mut r = HealthReporter::new();
243 /// r.register("db", HealthStatus::Healthy);
244 /// assert_eq!(r.report().http_status_code(), 200u16);
245 /// ```
246 #[must_use]
247 pub fn http_status_code(&self) -> u16 {
248 if self.is_ready() { 200 } else { 503 }
249 }
250}
251
252// ─── System-time serde helper ─────────────────────────────────────────────────
253
254mod system_time_serde {
255 use serde::{Deserialize, Deserializer, Serializer};
256 use std::time::{SystemTime, UNIX_EPOCH};
257
258 pub fn serialize<S: Serializer>(t: &SystemTime, s: S) -> Result<S::Ok, S::Error> {
259 let secs = t.duration_since(UNIX_EPOCH).unwrap_or_default().as_secs();
260 s.serialize_u64(secs)
261 }
262
263 pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<SystemTime, D::Error> {
264 let secs = u64::deserialize(d)?;
265 Ok(UNIX_EPOCH + std::time::Duration::from_secs(secs))
266 }
267}
268
269// ─── HealthReporter ──────────────────────────────────────────────────────────
270
271/// Collects component-level health checks and produces a [`HealthReport`].
272///
273/// Thread-safe; cheaply cloneable via `Arc` patterns.
274///
275/// # Example
276///
277/// ```
278/// use stygian_graph::application::health::{HealthReporter, HealthStatus, ComponentHealth};
279///
280/// let mut reporter = HealthReporter::new();
281/// reporter.register("database", HealthStatus::Healthy);
282/// reporter.register_component(
283/// ComponentHealth::degraded("cache", "latency p99 > 100ms")
284/// .with_details(serde_json::json!({ "p99_ms": 142 }))
285/// );
286///
287/// let report = reporter.report();
288/// assert_eq!(report.http_status_code(), 200u16); // degraded is operational
289/// ```
290pub struct HealthReporter {
291 components: RwLock<HashMap<String, ComponentHealth>>,
292}
293
294impl HealthReporter {
295 /// Create an empty reporter.
296 ///
297 /// # Example
298 ///
299 /// ```
300 /// use stygian_graph::application::health::HealthReporter;
301 ///
302 /// let r = HealthReporter::new();
303 /// assert!(r.report().components.is_empty());
304 /// ```
305 #[must_use]
306 pub fn new() -> Self {
307 Self {
308 components: RwLock::new(HashMap::new()),
309 }
310 }
311
312 /// Register or update a component's status by name.
313 ///
314 /// # Example
315 ///
316 /// ```
317 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
318 ///
319 /// let mut r = HealthReporter::new();
320 /// r.register("db", HealthStatus::Healthy);
321 /// assert_eq!(r.report().components.len(), 1);
322 /// ```
323 pub fn register(&self, name: impl Into<String>, status: HealthStatus) {
324 let name = name.into();
325 let component = ComponentHealth {
326 name: name.clone(),
327 status,
328 details: None,
329 };
330 self.components.write().insert(name, component);
331 }
332
333 /// Register or update a component with full [`ComponentHealth`].
334 ///
335 /// # Example
336 ///
337 /// ```
338 /// use stygian_graph::application::health::{HealthReporter, ComponentHealth};
339 ///
340 /// let mut r = HealthReporter::new();
341 /// r.register_component(ComponentHealth::healthy("cache"));
342 /// assert_eq!(r.report().components.len(), 1);
343 /// ```
344 pub fn register_component(&self, component: ComponentHealth) {
345 self.components
346 .write()
347 .insert(component.name.clone(), component);
348 }
349
350 /// Remove a component from reporting.
351 ///
352 /// # Example
353 ///
354 /// ```
355 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
356 ///
357 /// let mut r = HealthReporter::new();
358 /// r.register("db", HealthStatus::Healthy);
359 /// r.deregister("db");
360 /// assert!(r.report().components.is_empty());
361 /// ```
362 pub fn deregister(&self, name: &str) {
363 self.components.write().remove(name);
364 }
365
366 /// Generate a [`HealthReport`] from current component states.
367 ///
368 /// # Example
369 ///
370 /// ```
371 /// use stygian_graph::application::health::{HealthReporter, HealthStatus};
372 ///
373 /// let r = HealthReporter::new();
374 /// let report = r.report();
375 /// assert_eq!(report.overall, HealthStatus::Healthy);
376 /// assert!(report.is_live());
377 /// ```
378 pub fn report(&self) -> HealthReport {
379 let components: Vec<ComponentHealth> = self.components.read().values().cloned().collect();
380
381 let overall = aggregate_status(&components);
382 HealthReport {
383 overall,
384 components,
385 checked_at: SystemTime::now(),
386 }
387 }
388}
389
390impl Default for HealthReporter {
391 fn default() -> Self {
392 Self::new()
393 }
394}
395
396fn aggregate_status(components: &[ComponentHealth]) -> HealthStatus {
397 let mut worst = HealthStatus::Healthy;
398 for c in components {
399 match &c.status {
400 HealthStatus::Unhealthy(r) => {
401 return HealthStatus::Unhealthy(r.clone());
402 }
403 HealthStatus::Degraded(r) => {
404 if worst == HealthStatus::Healthy {
405 worst = HealthStatus::Degraded(r.clone());
406 }
407 }
408 HealthStatus::Healthy => {}
409 }
410 }
411 worst
412}
413
414// ─── Tests ────────────────────────────────────────────────────────────────────
415
// Unit tests for status predicates, report aggregation, reporter bookkeeping
// and JSON serialization.
#[cfg(test)]
// `unwrap`/`expect` are acceptable in tests: a failure should abort the test.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn healthy_status_is_healthy() {
        assert!(HealthStatus::Healthy.is_healthy());
        assert!(HealthStatus::Healthy.is_operational());
    }

    #[test]
    fn degraded_status_is_not_healthy_but_operational() {
        let s = HealthStatus::Degraded("reason".into());
        assert!(!s.is_healthy());
        assert!(s.is_operational());
    }

    #[test]
    fn unhealthy_status_is_not_operational() {
        let s = HealthStatus::Unhealthy("crashed".into());
        assert!(!s.is_healthy());
        assert!(!s.is_operational());
    }

    #[test]
    fn display_formats_status_with_reason() {
        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
        assert_eq!(
            HealthStatus::Degraded("slow".into()).to_string(),
            "degraded: slow"
        );
        assert_eq!(
            HealthStatus::Unhealthy("down".into()).to_string(),
            "unhealthy: down"
        );
    }

    #[test]
    fn empty_reporter_overall_is_healthy() {
        let reporter = HealthReporter::new();
        assert_eq!(reporter.report().overall, HealthStatus::Healthy);
    }

    #[test]
    fn empty_report_is_ready_and_live() {
        let report = HealthReporter::new().report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn all_healthy_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Healthy);
        let report = r.report();
        assert!(report.is_ready());
        assert!(report.is_live());
        assert_eq!(report.http_status_code(), 200);
    }

    #[test]
    fn degraded_component_report_is_ready_and_live() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("cache", HealthStatus::Degraded("high latency".into()));
        let report = r.report();
        // Degraded is operational so is_ready returns true
        assert!(report.is_ready());
        assert!(report.is_live());
    }

    #[test]
    fn unhealthy_component_makes_report_not_ready() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Unhealthy("connection refused".into()));
        let report = r.report();
        assert!(!report.is_ready());
        assert_eq!(report.http_status_code(), 503);
    }

    #[test]
    fn all_unhealthy_not_live() {
        let r = HealthReporter::new();
        r.register("a", HealthStatus::Unhealthy("x".into()));
        r.register("b", HealthStatus::Unhealthy("y".into()));
        assert!(!r.report().is_live());
    }

    #[test]
    fn register_same_component_updates_status() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.register("db", HealthStatus::Unhealthy("disk full".into()));
        let report = r.report();
        assert_eq!(report.components.len(), 1);
        assert!(!report.is_ready());
    }

    #[test]
    fn deregister_removes_component() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        r.deregister("db");
        assert!(r.report().components.is_empty());
    }

    #[test]
    fn component_health_builders() {
        assert!(ComponentHealth::healthy("x").status.is_healthy());
        assert!(
            ComponentHealth::degraded("x", "slow")
                .status
                .is_operational()
        );
        assert!(
            !ComponentHealth::unhealthy("x", "down")
                .status
                .is_operational()
        );
    }

    #[test]
    fn component_with_details_serializes() {
        let c = ComponentHealth::healthy("pool").with_details(serde_json::json!({ "idle": 8 }));
        assert!(c.details.is_some());
        let json = serde_json::to_string(&c).unwrap();
        assert!(json.contains("idle"));
    }

    #[test]
    fn health_report_serializes_to_json() {
        let r = HealthReporter::new();
        r.register("db", HealthStatus::Healthy);
        let report = r.report();
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("healthy"));
    }

    #[test]
    fn aggregate_status_worst_wins() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
            ComponentHealth::unhealthy("c", "down"),
        ];
        let status = aggregate_status(&components);
        assert!(matches!(status, HealthStatus::Unhealthy(_)));
    }

    #[test]
    fn aggregate_status_degraded_when_no_unhealthy() {
        let components = vec![
            ComponentHealth::healthy("a"),
            ComponentHealth::degraded("b", "slow"),
        ];
        assert_eq!(
            aggregate_status(&components),
            HealthStatus::Degraded("slow".into())
        );
    }
}