guts_node/health/
mod.rs

1//! # Health Check Module
2//!
3//! Comprehensive health checks for production deployments including:
4//!
5//! - **Liveness Probe**: Is the process running?
6//! - **Readiness Probe**: Is the service ready to accept traffic?
7//! - **Startup Probe**: Has initial startup completed?
8//!
9//! ## Usage
10//!
11//! ```rust,ignore
12//! use axum::Router;
13//! use guts_node::health::{health_routes, HealthState};
14//!
15//! let health_state = HealthState::new();
16//! health_state.set_ready(true);
17//!
18//! let app: Router<()> = Router::new()
19//!     .merge(health_routes(health_state));
20//! ```
21
22use axum::{
23    extract::State,
24    http::StatusCode,
25    response::{IntoResponse, Response},
26    routing::get,
27    Json, Router,
28};
29use parking_lot::RwLock;
30use serde::Serialize;
31use std::sync::atomic::{AtomicBool, Ordering};
32use std::sync::Arc;
33use std::time::{Duration, Instant};
34
35/// Health status values.
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
37#[serde(rename_all = "lowercase")]
38pub enum HealthStatus {
39    /// Component is healthy.
40    Up,
41    /// Component is unhealthy.
42    Down,
43    /// Component status is unknown.
44    Unknown,
45}
46
47/// Individual component health.
48#[derive(Debug, Clone, Serialize)]
49pub struct ComponentHealth {
50    /// Component status.
51    pub status: HealthStatus,
52    /// Optional latency in milliseconds.
53    #[serde(skip_serializing_if = "Option::is_none")]
54    pub latency_ms: Option<u64>,
55    /// Optional additional details.
56    #[serde(skip_serializing_if = "Option::is_none")]
57    pub details: Option<serde_json::Value>,
58}
59
60impl ComponentHealth {
61    /// Create a healthy component.
62    pub fn up() -> Self {
63        Self {
64            status: HealthStatus::Up,
65            latency_ms: None,
66            details: None,
67        }
68    }
69
70    /// Create a healthy component with latency.
71    pub fn up_with_latency(latency: Duration) -> Self {
72        Self {
73            status: HealthStatus::Up,
74            latency_ms: Some(latency.as_millis() as u64),
75            details: None,
76        }
77    }
78
79    /// Create an unhealthy component.
80    pub fn down() -> Self {
81        Self {
82            status: HealthStatus::Down,
83            latency_ms: None,
84            details: None,
85        }
86    }
87
88    /// Create an unhealthy component with reason.
89    pub fn down_with_reason(reason: &str) -> Self {
90        Self {
91            status: HealthStatus::Down,
92            latency_ms: None,
93            details: Some(serde_json::json!({ "reason": reason })),
94        }
95    }
96}
97
98/// Liveness probe response.
99#[derive(Debug, Clone, Serialize)]
100pub struct LivenessResponse {
101    /// Overall status.
102    pub status: HealthStatus,
103    /// Uptime in seconds.
104    pub uptime_seconds: u64,
105}
106
107/// Readiness probe response.
108#[derive(Debug, Clone, Serialize)]
109pub struct ReadinessResponse {
110    /// Overall status.
111    pub status: HealthStatus,
112    /// Component health checks.
113    pub checks: ReadinessChecks,
114}
115
116/// Readiness component checks.
117#[derive(Debug, Clone, Serialize)]
118pub struct ReadinessChecks {
119    /// Storage subsystem health.
120    pub storage: ComponentHealth,
121    /// P2P network health.
122    pub p2p: ComponentHealth,
123    /// Real-time (WebSocket) health.
124    pub realtime: ComponentHealth,
125}
126
127/// Startup probe response.
128#[derive(Debug, Clone, Serialize)]
129pub struct StartupResponse {
130    /// Overall status.
131    pub status: HealthStatus,
132    /// Startup duration in milliseconds.
133    #[serde(skip_serializing_if = "Option::is_none")]
134    pub startup_duration_ms: Option<u64>,
135}
136
137/// Overall health response.
138#[derive(Debug, Clone, Serialize)]
139pub struct HealthResponse {
140    /// Overall status.
141    pub status: HealthStatus,
142    /// Version info.
143    pub version: String,
144    /// Uptime in seconds.
145    pub uptime_seconds: u64,
146    /// Component checks.
147    pub checks: ReadinessChecks,
148}
149
150/// Health state for tracking component health.
151#[derive(Clone)]
152pub struct HealthState {
153    /// When the service started.
154    start_time: Instant,
155    /// Whether startup is complete.
156    startup_complete: Arc<AtomicBool>,
157    /// Whether the service is ready.
158    ready: Arc<AtomicBool>,
159    /// Component health states.
160    components: Arc<RwLock<ComponentStates>>,
161}
162
/// Latest reported state of each component.
///
/// The derived `Default` starts every flag at `false` and every counter at `0`.
#[derive(Default)]
struct ComponentStates {
    /// Whether storage was last reported healthy.
    storage_healthy: bool,
    /// Whether the P2P layer was last reported connected.
    p2p_connected: bool,
    /// Peer count from the most recent P2P update.
    p2p_peer_count: usize,
    /// Whether the real-time layer was last reported healthy.
    realtime_healthy: bool,
    /// WebSocket connection count from the most recent real-time update.
    websocket_connections: usize,
}
172
173impl Default for HealthState {
174    fn default() -> Self {
175        Self::new()
176    }
177}
178
179impl HealthState {
180    /// Create a new health state.
181    pub fn new() -> Self {
182        Self {
183            start_time: Instant::now(),
184            startup_complete: Arc::new(AtomicBool::new(false)),
185            ready: Arc::new(AtomicBool::new(false)),
186            components: Arc::new(RwLock::new(ComponentStates::default())),
187        }
188    }
189
190    /// Get uptime in seconds.
191    pub fn uptime(&self) -> u64 {
192        self.start_time.elapsed().as_secs()
193    }
194
195    /// Mark startup as complete.
196    pub fn set_startup_complete(&self, complete: bool) {
197        self.startup_complete.store(complete, Ordering::SeqCst);
198    }
199
200    /// Check if startup is complete.
201    pub fn is_startup_complete(&self) -> bool {
202        self.startup_complete.load(Ordering::SeqCst)
203    }
204
205    /// Set readiness state.
206    pub fn set_ready(&self, ready: bool) {
207        self.ready.store(ready, Ordering::SeqCst);
208    }
209
210    /// Check if service is ready.
211    pub fn is_ready(&self) -> bool {
212        self.ready.load(Ordering::SeqCst)
213    }
214
215    /// Update storage health.
216    pub fn set_storage_healthy(&self, healthy: bool) {
217        self.components.write().storage_healthy = healthy;
218    }
219
220    /// Update P2P health.
221    pub fn set_p2p_connected(&self, connected: bool, peer_count: usize) {
222        let mut components = self.components.write();
223        components.p2p_connected = connected;
224        components.p2p_peer_count = peer_count;
225    }
226
227    /// Update realtime health.
228    pub fn set_realtime_healthy(&self, healthy: bool, connection_count: usize) {
229        let mut components = self.components.write();
230        components.realtime_healthy = healthy;
231        components.websocket_connections = connection_count;
232    }
233
234    /// Get storage component health.
235    fn storage_health(&self) -> ComponentHealth {
236        let components = self.components.read();
237        if components.storage_healthy {
238            ComponentHealth::up()
239        } else {
240            ComponentHealth::down()
241        }
242    }
243
244    /// Get P2P component health.
245    fn p2p_health(&self) -> ComponentHealth {
246        let components = self.components.read();
247        if components.p2p_connected {
248            ComponentHealth {
249                status: HealthStatus::Up,
250                latency_ms: None,
251                details: Some(serde_json::json!({
252                    "peer_count": components.p2p_peer_count
253                })),
254            }
255        } else {
256            // P2P might not be enabled, so unknown is acceptable
257            ComponentHealth {
258                status: HealthStatus::Unknown,
259                latency_ms: None,
260                details: Some(serde_json::json!({
261                    "reason": "P2P not connected or not enabled"
262                })),
263            }
264        }
265    }
266
267    /// Get realtime component health.
268    fn realtime_health(&self) -> ComponentHealth {
269        let components = self.components.read();
270        ComponentHealth {
271            status: if components.realtime_healthy {
272                HealthStatus::Up
273            } else {
274                HealthStatus::Down
275            },
276            latency_ms: None,
277            details: Some(serde_json::json!({
278                "connections": components.websocket_connections
279            })),
280        }
281    }
282
283    /// Get readiness checks.
284    fn readiness_checks(&self) -> ReadinessChecks {
285        ReadinessChecks {
286            storage: self.storage_health(),
287            p2p: self.p2p_health(),
288            realtime: self.realtime_health(),
289        }
290    }
291}
292
293/// Create health check routes.
294pub fn health_routes<S>(state: HealthState) -> Router<S>
295where
296    S: Clone + Send + Sync + 'static,
297{
298    Router::new()
299        .route("/health", get(health_handler))
300        .route("/health/live", get(liveness_handler))
301        .route("/health/ready", get(readiness_handler))
302        .route("/health/startup", get(startup_handler))
303        .with_state(state)
304}
305
306/// Overall health handler.
307async fn health_handler(State(state): State<HealthState>) -> Response {
308    let checks = state.readiness_checks();
309    let overall_status = if state.is_ready()
310        && checks.storage.status == HealthStatus::Up
311        && checks.realtime.status == HealthStatus::Up
312    {
313        HealthStatus::Up
314    } else {
315        HealthStatus::Down
316    };
317
318    let response = HealthResponse {
319        status: overall_status,
320        version: env!("CARGO_PKG_VERSION").to_string(),
321        uptime_seconds: state.uptime(),
322        checks,
323    };
324
325    let status_code = match overall_status {
326        HealthStatus::Up => StatusCode::OK,
327        _ => StatusCode::SERVICE_UNAVAILABLE,
328    };
329
330    (status_code, Json(response)).into_response()
331}
332
333/// Liveness probe handler.
334async fn liveness_handler(State(state): State<HealthState>) -> Response {
335    let response = LivenessResponse {
336        status: HealthStatus::Up,
337        uptime_seconds: state.uptime(),
338    };
339
340    (StatusCode::OK, Json(response)).into_response()
341}
342
343/// Readiness probe handler.
344async fn readiness_handler(State(state): State<HealthState>) -> Response {
345    if !state.is_ready() {
346        let response = ReadinessResponse {
347            status: HealthStatus::Down,
348            checks: state.readiness_checks(),
349        };
350        return (StatusCode::SERVICE_UNAVAILABLE, Json(response)).into_response();
351    }
352
353    let checks = state.readiness_checks();
354    let overall_status = if checks.storage.status == HealthStatus::Up {
355        HealthStatus::Up
356    } else {
357        HealthStatus::Down
358    };
359
360    let response = ReadinessResponse {
361        status: overall_status,
362        checks,
363    };
364
365    let status_code = match overall_status {
366        HealthStatus::Up => StatusCode::OK,
367        _ => StatusCode::SERVICE_UNAVAILABLE,
368    };
369
370    (status_code, Json(response)).into_response()
371}
372
373/// Startup probe handler.
374async fn startup_handler(State(state): State<HealthState>) -> Response {
375    if state.is_startup_complete() {
376        let response = StartupResponse {
377            status: HealthStatus::Up,
378            startup_duration_ms: None,
379        };
380        (StatusCode::OK, Json(response)).into_response()
381    } else {
382        let response = StartupResponse {
383            status: HealthStatus::Down,
384            startup_duration_ms: None,
385        };
386        (StatusCode::SERVICE_UNAVAILABLE, Json(response)).into_response()
387    }
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    /// A new state starts neither started nor ready; both flags can be switched on.
    #[test]
    fn test_health_state() {
        let health = HealthState::new();

        // Initial values.
        assert!(!health.is_startup_complete());
        assert!(!health.is_ready());

        health.set_startup_complete(true);
        health.set_ready(true);

        // Values after flipping both flags.
        assert!(health.is_startup_complete());
        assert!(health.is_ready());
    }

    /// The constructors set the expected status, and a reason populates `details`.
    #[test]
    fn test_component_health() {
        let healthy = ComponentHealth::up();
        assert_eq!(healthy.status, HealthStatus::Up);

        let failed = ComponentHealth::down_with_reason("test failure");
        assert_eq!(failed.status, HealthStatus::Down);
        assert!(failed.details.is_some());
    }
}
417}