mockforge_http/
health.rs

1//! Kubernetes-native health check endpoints
2//!
3//! This module provides comprehensive health check endpoints following Kubernetes best practices:
4//! - **Liveness probe**: Indicates if the container is alive
5//! - **Readiness probe**: Indicates if the container is ready to accept traffic
6//! - **Startup probe**: Indicates if the container has finished initialization
7//!
8//! These endpoints are essential for:
9//! - Kubernetes deployment orchestration
10//! - Load balancer health checks
11//! - Service discovery integration
12//! - Graceful shutdown coordination
13
14use axum::{extract::State, http::StatusCode, response::Json, routing::get};
15use serde::{Deserialize, Serialize};
16use std::sync::Arc;
17use std::time::{Duration, Instant};
18use tokio::sync::RwLock;
19use tracing::{debug, error, info, warn};
20
/// Lifecycle phase of the service as reported by the health endpoints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ServiceStatus {
    /// Still starting up; traffic must not be routed here yet
    Initializing,
    /// Fully started and able to serve requests
    Ready,
    /// Draining; no new requests should be accepted
    ShuttingDown,
    /// Unrecoverable failure; the service is unhealthy
    Failed,
}

impl ServiceStatus {
    /// Returns `true` only for [`ServiceStatus::Ready`].
    pub fn is_ready(&self) -> bool {
        // Exhaustive on purpose: a new variant must be classified explicitly.
        match self {
            ServiceStatus::Ready => true,
            ServiceStatus::Initializing
            | ServiceStatus::ShuttingDown
            | ServiceStatus::Failed => false,
        }
    }

    /// Returns `true` for every state except [`ServiceStatus::Failed`].
    pub fn is_alive(&self) -> bool {
        match self {
            ServiceStatus::Failed => false,
            ServiceStatus::Initializing
            | ServiceStatus::Ready
            | ServiceStatus::ShuttingDown => true,
        }
    }
}
45
46/// Health check manager for tracking service state
47#[derive(Debug, Clone)]
48pub struct HealthManager {
49    /// Current service status
50    status: Arc<RwLock<ServiceStatus>>,
51    /// Server startup time
52    start_time: Arc<Instant>,
53    /// Service initialization deadline (timeout)
54    init_deadline: Arc<Option<Instant>>,
55    /// Shutdown signal for graceful termination
56    shutdown_signal: Arc<RwLock<Option<tokio::sync::oneshot::Sender<()>>>>,
57}
58
59impl HealthManager {
60    /// Create a new health manager
61    pub fn new() -> Self {
62        Self {
63            status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
64            start_time: Arc::new(Instant::now()),
65            init_deadline: Arc::new(None),
66            shutdown_signal: Arc::new(RwLock::new(None)),
67        }
68    }
69
70    /// Create a new health manager with initialization timeout
71    pub fn with_init_timeout(timeout: Duration) -> Self {
72        let deadline = Instant::now() + timeout;
73        Self {
74            status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
75            start_time: Arc::new(Instant::now()),
76            init_deadline: Arc::new(Some(deadline)),
77            shutdown_signal: Arc::new(RwLock::new(None)),
78        }
79    }
80
81    /// Mark service as ready
82    pub async fn set_ready(&self) {
83        let mut status = self.status.write().await;
84        *status = ServiceStatus::Ready;
85        info!("Service marked as ready");
86    }
87
88    /// Mark service as failed
89    pub async fn set_failed(&self, reason: &str) {
90        let mut status = self.status.write().await;
91        *status = ServiceStatus::Failed;
92        error!("Service marked as failed: {}", reason);
93    }
94
95    /// Mark service as shutting down
96    pub async fn set_shutting_down(&self) {
97        let mut status = self.status.write().await;
98        *status = ServiceStatus::ShuttingDown;
99        info!("Service marked as shutting down");
100    }
101
102    /// Get current service status
103    pub async fn get_status(&self) -> ServiceStatus {
104        *self.status.read().await
105    }
106
107    /// Get server uptime in seconds
108    pub fn uptime_seconds(&self) -> u64 {
109        self.start_time.elapsed().as_secs()
110    }
111
112    /// Check if initialization has timed out
113    pub fn is_init_timeout(&self) -> bool {
114        if let Some(deadline) = *self.init_deadline {
115            Instant::now() > deadline
116        } else {
117            false
118        }
119    }
120
121    /// Set shutdown signal receiver for graceful shutdown
122    pub async fn set_shutdown_signal(&self, sender: tokio::sync::oneshot::Sender<()>) {
123        let mut signal = self.shutdown_signal.write().await;
124        *signal = Some(sender);
125    }
126
127    /// Trigger graceful shutdown
128    pub async fn trigger_shutdown(&self) {
129        self.set_shutting_down().await;
130        let mut signal = self.shutdown_signal.write().await;
131        if let Some(sender) = signal.take() {
132            let _ = sender.send(());
133            info!("Graceful shutdown signal sent");
134        }
135    }
136}
137
138impl Default for HealthManager {
139    fn default() -> Self {
140        Self::new()
141    }
142}
143
144/// Health check response structure
145#[derive(Debug, Serialize, Deserialize)]
146pub struct HealthResponse {
147    /// Service status
148    pub status: String,
149    /// ISO 8601 timestamp
150    pub timestamp: String,
151    /// Server uptime in seconds
152    pub uptime_seconds: u64,
153    /// Service version
154    pub version: String,
155    /// Additional status information
156    #[serde(skip_serializing_if = "Option::is_none")]
157    pub details: Option<HealthDetails>,
158}
159
160/// Detailed health information
161#[derive(Debug, Serialize, Deserialize)]
162pub struct HealthDetails {
163    /// Service initialization status
164    pub initialization: String,
165    /// Active connection count (if available)
166    #[serde(skip_serializing_if = "Option::is_none")]
167    pub connections: Option<u64>,
168    /// Memory usage in bytes (if available)
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub memory_bytes: Option<u64>,
171}
172
173/// Liveness probe endpoint
174///
175/// Kubernetes uses this to determine if the container should be restarted.
176/// Returns 200 if the service is alive, 503 if it has failed.
177///
178/// This should be a lightweight check that doesn't depend on external services.
179async fn liveness_probe(
180    State(health): State<Arc<HealthManager>>,
181) -> Result<Json<HealthResponse>, StatusCode> {
182    let status = health.get_status().await;
183    let uptime = health.uptime_seconds();
184
185    // Liveness checks if the process is alive (not failed)
186    if status.is_alive() {
187        let response = HealthResponse {
188            status: "alive".to_string(),
189            timestamp: chrono::Utc::now().to_rfc3339(),
190            uptime_seconds: uptime,
191            version: env!("CARGO_PKG_VERSION").to_string(),
192            details: None,
193        };
194        Ok(Json(response))
195    } else {
196        Err(StatusCode::SERVICE_UNAVAILABLE)
197    }
198}
199
200/// Readiness probe endpoint
201///
202/// Kubernetes uses this to determine if the container is ready to receive traffic.
203/// Returns 200 if the service is ready, 503 if it's not ready or shutting down.
204///
205/// This checks if the service has completed initialization and is ready to serve requests.
206async fn readiness_probe(
207    State(health): State<Arc<HealthManager>>,
208) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
209    let status = health.get_status().await;
210    let uptime = health.uptime_seconds();
211
212    // Readiness checks if the service is ready to accept traffic
213    if status.is_ready() {
214        let response = HealthResponse {
215            status: "ready".to_string(),
216            timestamp: chrono::Utc::now().to_rfc3339(),
217            uptime_seconds: uptime,
218            version: env!("CARGO_PKG_VERSION").to_string(),
219            details: Some(HealthDetails {
220                initialization: "complete".to_string(),
221                connections: None,
222                memory_bytes: None,
223            }),
224        };
225        Ok(Json(response))
226    } else {
227        let details = match status {
228            ServiceStatus::Initializing => {
229                if health.is_init_timeout() {
230                    "initialization_timeout".to_string()
231                } else {
232                    "initializing".to_string()
233                }
234            }
235            ServiceStatus::ShuttingDown => "shutting_down".to_string(),
236            ServiceStatus::Failed => "failed".to_string(),
237            ServiceStatus::Ready => unreachable!(),
238        };
239
240        let response = HealthResponse {
241            status: "not_ready".to_string(),
242            timestamp: chrono::Utc::now().to_rfc3339(),
243            uptime_seconds: uptime,
244            version: env!("CARGO_PKG_VERSION").to_string(),
245            details: Some(HealthDetails {
246                initialization: details,
247                connections: None,
248                memory_bytes: None,
249            }),
250        };
251
252        Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
253    }
254}
255
256/// Startup probe endpoint
257///
258/// Kubernetes uses this to determine if the container has finished initialization.
259/// This is useful for services that take a long time to start.
260/// Returns 200 once initialization is complete, 503 while still initializing.
261async fn startup_probe(
262    State(health): State<Arc<HealthManager>>,
263) -> Result<Json<HealthResponse>, StatusCode> {
264    let status = health.get_status().await;
265    let uptime = health.uptime_seconds();
266
267    // Startup probe checks if initialization is complete
268    match status {
269        ServiceStatus::Ready => {
270            let response = HealthResponse {
271                status: "startup_complete".to_string(),
272                timestamp: chrono::Utc::now().to_rfc3339(),
273                uptime_seconds: uptime,
274                version: env!("CARGO_PKG_VERSION").to_string(),
275                details: Some(HealthDetails {
276                    initialization: "complete".to_string(),
277                    connections: None,
278                    memory_bytes: None,
279                }),
280            };
281            Ok(Json(response))
282        }
283        ServiceStatus::Initializing => {
284            if health.is_init_timeout() {
285                warn!("Startup probe: initialization timeout exceeded");
286                Err(StatusCode::SERVICE_UNAVAILABLE)
287            } else {
288                debug!("Startup probe: still initializing");
289                Err(StatusCode::SERVICE_UNAVAILABLE)
290            }
291        }
292        ServiceStatus::Failed => Err(StatusCode::SERVICE_UNAVAILABLE),
293        ServiceStatus::ShuttingDown => {
294            // During shutdown, startup probe should return ready (service was started)
295            let response = HealthResponse {
296                status: "startup_complete".to_string(),
297                timestamp: chrono::Utc::now().to_rfc3339(),
298                uptime_seconds: uptime,
299                version: env!("CARGO_PKG_VERSION").to_string(),
300                details: Some(HealthDetails {
301                    initialization: "complete".to_string(),
302                    connections: None,
303                    memory_bytes: None,
304                }),
305            };
306            Ok(Json(response))
307        }
308    }
309}
310
311/// Combined health check endpoint (backwards compatibility)
312///
313/// This endpoint provides a general health check that combines liveness and readiness.
314/// For Kubernetes deployments, prefer using the specific probe endpoints.
315async fn health_check(
316    State(health): State<Arc<HealthManager>>,
317) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
318    let status = health.get_status().await;
319    let uptime = health.uptime_seconds();
320
321    if status.is_ready() {
322        let response = HealthResponse {
323            status: "healthy".to_string(),
324            timestamp: chrono::Utc::now().to_rfc3339(),
325            uptime_seconds: uptime,
326            version: env!("CARGO_PKG_VERSION").to_string(),
327            details: Some(HealthDetails {
328                initialization: "complete".to_string(),
329                connections: None,
330                memory_bytes: None,
331            }),
332        };
333        Ok(Json(response))
334    } else {
335        let status_str = match status {
336            ServiceStatus::Initializing => "initializing",
337            ServiceStatus::ShuttingDown => "shutting_down",
338            ServiceStatus::Failed => "failed",
339            ServiceStatus::Ready => unreachable!(),
340        };
341
342        let response = HealthResponse {
343            status: status_str.to_string(),
344            timestamp: chrono::Utc::now().to_rfc3339(),
345            uptime_seconds: uptime,
346            version: env!("CARGO_PKG_VERSION").to_string(),
347            details: Some(HealthDetails {
348                initialization: status_str.to_string(),
349                connections: None,
350                memory_bytes: None,
351            }),
352        };
353
354        Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
355    }
356}
357
358/// Create health check router with all probe endpoints
359pub fn health_router(health_manager: Arc<HealthManager>) -> axum::Router {
360    use axum::Router;
361    Router::new()
362        .route("/health", get(health_check))
363        .route("/health/live", get(liveness_probe))
364        .route("/health/ready", get(readiness_probe))
365        .route("/health/startup", get(startup_probe))
366        .with_state(health_manager)
367}
368
369/// Create health check router with custom prefix
370pub fn health_router_with_prefix(health_manager: Arc<HealthManager>, prefix: &str) -> axum::Router {
371    use axum::Router;
372    Router::new()
373        .route(&format!("{}/health", prefix), get(health_check))
374        .route(&format!("{}/health/live", prefix), get(liveness_probe))
375        .route(&format!("{}/health/ready", prefix), get(readiness_probe))
376        .route(&format!("{}/health/startup", prefix), get(startup_probe))
377        .with_state(health_manager)
378}
379
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::Body;
    use axum::http::Request;
    use tower::ServiceExt;

    /// Sends a GET for `uri` to a fresh health router and returns the status code.
    async fn probe(health: &Arc<HealthManager>, uri: &str) -> StatusCode {
        let request = Request::builder().uri(uri).body(Body::empty()).unwrap();
        health_router(health.clone()).oneshot(request).await.unwrap().status()
    }

    #[tokio::test]
    async fn test_liveness_probe_alive() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/live").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_liveness_probe_failed() {
        let health = Arc::new(HealthManager::new());
        health.set_failed("test failure").await;

        assert_eq!(probe(&health, "/health/live").await, StatusCode::SERVICE_UNAVAILABLE);
    }

    #[tokio::test]
    async fn test_readiness_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/ready").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_readiness_probe_initializing() {
        // A fresh manager starts out in the Initializing state.
        let health = Arc::new(HealthManager::new());

        assert_eq!(probe(&health, "/health/ready").await, StatusCode::SERVICE_UNAVAILABLE);
    }

    #[tokio::test]
    async fn test_startup_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/startup").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_startup_probe_initializing() {
        // A fresh manager starts out in the Initializing state.
        let health = Arc::new(HealthManager::new());

        assert_eq!(probe(&health, "/health/startup").await, StatusCode::SERVICE_UNAVAILABLE);
    }

    #[tokio::test]
    async fn test_health_check_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health").await, StatusCode::OK);
    }

    #[test]
    fn test_service_status() {
        // Only Ready counts as ready.
        assert!(ServiceStatus::Ready.is_ready());
        assert!(!ServiceStatus::Initializing.is_ready());
        assert!(!ServiceStatus::ShuttingDown.is_ready());
        assert!(!ServiceStatus::Failed.is_ready());

        // Everything except Failed counts as alive.
        assert!(ServiceStatus::Ready.is_alive());
        assert!(ServiceStatus::Initializing.is_alive());
        assert!(ServiceStatus::ShuttingDown.is_alive());
        assert!(!ServiceStatus::Failed.is_alive());
    }
}