mockforge_http/
health.rs

1//! Kubernetes-native health check endpoints
2//!
3//! This module provides comprehensive health check endpoints following Kubernetes best practices:
4//! - **Liveness probe**: Indicates if the container is alive
5//! - **Readiness probe**: Indicates if the container is ready to accept traffic
6//! - **Startup probe**: Indicates if the container has finished initialization
7//!
8//! These endpoints are essential for:
9//! - Kubernetes deployment orchestration
10//! - Load balancer health checks
11//! - Service discovery integration
12//! - Graceful shutdown coordination
13
14use axum::{
15    extract::State,
16    http::StatusCode,
17    response::Json,
18    routing::{get, Router},
19};
20use serde::{Deserialize, Serialize};
21use std::sync::Arc;
22use std::time::{Duration, Instant};
23use tokio::sync::RwLock;
24use tracing::{debug, error, info, warn};
25
/// Lifecycle state of the service as reported by the health endpoints.
///
/// The value is a plain `Copy` tag; it carries no payload.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ServiceStatus {
    /// Startup is still in progress; the service is not ready yet.
    Initializing,
    /// Startup finished; the service can accept traffic.
    Ready,
    /// A shutdown has begun; new requests should no longer be routed here.
    ShuttingDown,
    /// The service hit a failure and is considered unhealthy.
    Failed,
}
38
39impl ServiceStatus {
40    /// Check if service is ready to accept traffic
41    pub fn is_ready(&self) -> bool {
42        matches!(self, ServiceStatus::Ready)
43    }
44
45    /// Check if service is alive (not failed)
46    pub fn is_alive(&self) -> bool {
47        !matches!(self, ServiceStatus::Failed)
48    }
49}
50
51/// Health check manager for tracking service state
52#[derive(Debug, Clone)]
53pub struct HealthManager {
54    /// Current service status
55    status: Arc<RwLock<ServiceStatus>>,
56    /// Server startup time
57    start_time: Arc<Instant>,
58    /// Service initialization deadline (timeout)
59    init_deadline: Arc<Option<Instant>>,
60    /// Shutdown signal for graceful termination
61    shutdown_signal: Arc<RwLock<Option<tokio::sync::oneshot::Sender<()>>>>,
62}
63
64impl HealthManager {
65    /// Create a new health manager
66    pub fn new() -> Self {
67        Self {
68            status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
69            start_time: Arc::new(Instant::now()),
70            init_deadline: Arc::new(None),
71            shutdown_signal: Arc::new(RwLock::new(None)),
72        }
73    }
74
75    /// Create a new health manager with initialization timeout
76    pub fn with_init_timeout(timeout: Duration) -> Self {
77        let deadline = Instant::now() + timeout;
78        Self {
79            status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
80            start_time: Arc::new(Instant::now()),
81            init_deadline: Arc::new(Some(deadline)),
82            shutdown_signal: Arc::new(RwLock::new(None)),
83        }
84    }
85
86    /// Mark service as ready
87    pub async fn set_ready(&self) {
88        let mut status = self.status.write().await;
89        *status = ServiceStatus::Ready;
90        info!("Service marked as ready");
91    }
92
93    /// Mark service as failed
94    pub async fn set_failed(&self, reason: &str) {
95        let mut status = self.status.write().await;
96        *status = ServiceStatus::Failed;
97        error!("Service marked as failed: {}", reason);
98    }
99
100    /// Mark service as shutting down
101    pub async fn set_shutting_down(&self) {
102        let mut status = self.status.write().await;
103        *status = ServiceStatus::ShuttingDown;
104        info!("Service marked as shutting down");
105    }
106
107    /// Get current service status
108    pub async fn get_status(&self) -> ServiceStatus {
109        *self.status.read().await
110    }
111
112    /// Get server uptime in seconds
113    pub fn uptime_seconds(&self) -> u64 {
114        self.start_time.elapsed().as_secs()
115    }
116
117    /// Check if initialization has timed out
118    pub fn is_init_timeout(&self) -> bool {
119        if let Some(deadline) = *self.init_deadline {
120            Instant::now() > deadline
121        } else {
122            false
123        }
124    }
125
126    /// Set shutdown signal receiver for graceful shutdown
127    pub async fn set_shutdown_signal(&self, sender: tokio::sync::oneshot::Sender<()>) {
128        let mut signal = self.shutdown_signal.write().await;
129        *signal = Some(sender);
130    }
131
132    /// Trigger graceful shutdown
133    pub async fn trigger_shutdown(&self) {
134        self.set_shutting_down().await;
135        let mut signal = self.shutdown_signal.write().await;
136        if let Some(sender) = signal.take() {
137            let _ = sender.send(());
138            info!("Graceful shutdown signal sent");
139        }
140    }
141}
142
143impl Default for HealthManager {
144    fn default() -> Self {
145        Self::new()
146    }
147}
148
149/// Health check response structure
150#[derive(Debug, Serialize, Deserialize)]
151pub struct HealthResponse {
152    /// Service status
153    pub status: String,
154    /// ISO 8601 timestamp
155    pub timestamp: String,
156    /// Server uptime in seconds
157    pub uptime_seconds: u64,
158    /// Service version
159    pub version: String,
160    /// Additional status information
161    #[serde(skip_serializing_if = "Option::is_none")]
162    pub details: Option<HealthDetails>,
163}
164
165/// Detailed health information
166#[derive(Debug, Serialize, Deserialize)]
167pub struct HealthDetails {
168    /// Service initialization status
169    pub initialization: String,
170    /// Active connection count (if available)
171    #[serde(skip_serializing_if = "Option::is_none")]
172    pub connections: Option<u64>,
173    /// Memory usage in bytes (if available)
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub memory_bytes: Option<u64>,
176}
177
178/// Liveness probe endpoint
179///
180/// Kubernetes uses this to determine if the container should be restarted.
181/// Returns 200 if the service is alive, 503 if it has failed.
182///
183/// This should be a lightweight check that doesn't depend on external services.
184async fn liveness_probe(
185    State(health): State<Arc<HealthManager>>,
186) -> Result<Json<HealthResponse>, StatusCode> {
187    let status = health.get_status().await;
188    let uptime = health.uptime_seconds();
189
190    // Liveness checks if the process is alive (not failed)
191    if status.is_alive() {
192        let response = HealthResponse {
193            status: "alive".to_string(),
194            timestamp: chrono::Utc::now().to_rfc3339(),
195            uptime_seconds: uptime,
196            version: env!("CARGO_PKG_VERSION").to_string(),
197            details: None,
198        };
199        Ok(Json(response))
200    } else {
201        Err(StatusCode::SERVICE_UNAVAILABLE)
202    }
203}
204
205/// Readiness probe endpoint
206///
207/// Kubernetes uses this to determine if the container is ready to receive traffic.
208/// Returns 200 if the service is ready, 503 if it's not ready or shutting down.
209///
210/// This checks if the service has completed initialization and is ready to serve requests.
211async fn readiness_probe(
212    State(health): State<Arc<HealthManager>>,
213) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
214    let status = health.get_status().await;
215    let uptime = health.uptime_seconds();
216
217    // Readiness checks if the service is ready to accept traffic
218    if status.is_ready() {
219        let response = HealthResponse {
220            status: "ready".to_string(),
221            timestamp: chrono::Utc::now().to_rfc3339(),
222            uptime_seconds: uptime,
223            version: env!("CARGO_PKG_VERSION").to_string(),
224            details: Some(HealthDetails {
225                initialization: "complete".to_string(),
226                connections: None,
227                memory_bytes: None,
228            }),
229        };
230        Ok(Json(response))
231    } else {
232        let details = match status {
233            ServiceStatus::Initializing => {
234                if health.is_init_timeout() {
235                    "initialization_timeout".to_string()
236                } else {
237                    "initializing".to_string()
238                }
239            }
240            ServiceStatus::ShuttingDown => "shutting_down".to_string(),
241            ServiceStatus::Failed => "failed".to_string(),
242            ServiceStatus::Ready => unreachable!(),
243        };
244
245        let response = HealthResponse {
246            status: "not_ready".to_string(),
247            timestamp: chrono::Utc::now().to_rfc3339(),
248            uptime_seconds: uptime,
249            version: env!("CARGO_PKG_VERSION").to_string(),
250            details: Some(HealthDetails {
251                initialization: details,
252                connections: None,
253                memory_bytes: None,
254            }),
255        };
256
257        Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
258    }
259}
260
261/// Startup probe endpoint
262///
263/// Kubernetes uses this to determine if the container has finished initialization.
264/// This is useful for services that take a long time to start.
265/// Returns 200 once initialization is complete, 503 while still initializing.
266async fn startup_probe(
267    State(health): State<Arc<HealthManager>>,
268) -> Result<Json<HealthResponse>, StatusCode> {
269    let status = health.get_status().await;
270    let uptime = health.uptime_seconds();
271
272    // Startup probe checks if initialization is complete
273    match status {
274        ServiceStatus::Ready => {
275            let response = HealthResponse {
276                status: "startup_complete".to_string(),
277                timestamp: chrono::Utc::now().to_rfc3339(),
278                uptime_seconds: uptime,
279                version: env!("CARGO_PKG_VERSION").to_string(),
280                details: Some(HealthDetails {
281                    initialization: "complete".to_string(),
282                    connections: None,
283                    memory_bytes: None,
284                }),
285            };
286            Ok(Json(response))
287        }
288        ServiceStatus::Initializing => {
289            if health.is_init_timeout() {
290                warn!("Startup probe: initialization timeout exceeded");
291                Err(StatusCode::SERVICE_UNAVAILABLE)
292            } else {
293                debug!("Startup probe: still initializing");
294                Err(StatusCode::SERVICE_UNAVAILABLE)
295            }
296        }
297        ServiceStatus::Failed => Err(StatusCode::SERVICE_UNAVAILABLE),
298        ServiceStatus::ShuttingDown => {
299            // During shutdown, startup probe should return ready (service was started)
300            let response = HealthResponse {
301                status: "startup_complete".to_string(),
302                timestamp: chrono::Utc::now().to_rfc3339(),
303                uptime_seconds: uptime,
304                version: env!("CARGO_PKG_VERSION").to_string(),
305                details: Some(HealthDetails {
306                    initialization: "complete".to_string(),
307                    connections: None,
308                    memory_bytes: None,
309                }),
310            };
311            Ok(Json(response))
312        }
313    }
314}
315
316/// Combined health check endpoint (backwards compatibility)
317///
318/// This endpoint provides a general health check that combines liveness and readiness.
319/// For Kubernetes deployments, prefer using the specific probe endpoints.
320async fn health_check(
321    State(health): State<Arc<HealthManager>>,
322) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
323    let status = health.get_status().await;
324    let uptime = health.uptime_seconds();
325
326    if status.is_ready() {
327        let response = HealthResponse {
328            status: "healthy".to_string(),
329            timestamp: chrono::Utc::now().to_rfc3339(),
330            uptime_seconds: uptime,
331            version: env!("CARGO_PKG_VERSION").to_string(),
332            details: Some(HealthDetails {
333                initialization: "complete".to_string(),
334                connections: None,
335                memory_bytes: None,
336            }),
337        };
338        Ok(Json(response))
339    } else {
340        let status_str = match status {
341            ServiceStatus::Initializing => "initializing",
342            ServiceStatus::ShuttingDown => "shutting_down",
343            ServiceStatus::Failed => "failed",
344            ServiceStatus::Ready => unreachable!(),
345        };
346
347        let response = HealthResponse {
348            status: status_str.to_string(),
349            timestamp: chrono::Utc::now().to_rfc3339(),
350            uptime_seconds: uptime,
351            version: env!("CARGO_PKG_VERSION").to_string(),
352            details: Some(HealthDetails {
353                initialization: status_str.to_string(),
354                connections: None,
355                memory_bytes: None,
356            }),
357        };
358
359        Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
360    }
361}
362
363/// Create health check router with all probe endpoints
364pub fn health_router(health_manager: Arc<HealthManager>) -> axum::Router {
365    use axum::Router;
366    Router::new()
367        .route("/health", get(health_check))
368        .route("/health/live", get(liveness_probe))
369        .route("/health/ready", get(readiness_probe))
370        .route("/health/startup", get(startup_probe))
371        .with_state(health_manager)
372}
373
374/// Create health check router with custom prefix
375pub fn health_router_with_prefix(health_manager: Arc<HealthManager>, prefix: &str) -> axum::Router {
376    use axum::Router;
377    Router::new()
378        .route(&format!("{}/health", prefix), get(health_check))
379        .route(&format!("{}/health/live", prefix), get(liveness_probe))
380        .route(&format!("{}/health/ready", prefix), get(readiness_probe))
381        .route(&format!("{}/health/startup", prefix), get(startup_probe))
382        .with_state(health_manager)
383}
384
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::Body;
    use axum::http::Request;
    use tower::ServiceExt;

    /// Send a GET for `uri` to a health router built over `health` and return
    /// the HTTP status of the response.
    async fn probe_status(health: &Arc<HealthManager>, uri: &str) -> StatusCode {
        let request = Request::builder().uri(uri).body(Body::empty()).unwrap();
        let response = health_router(health.clone()).oneshot(request).await.unwrap();
        response.status()
    }

    #[tokio::test]
    async fn test_liveness_probe_alive() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe_status(&health, "/health/live").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_liveness_probe_failed() {
        let health = Arc::new(HealthManager::new());
        health.set_failed("test failure").await;

        assert_eq!(
            probe_status(&health, "/health/live").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_readiness_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe_status(&health, "/health/ready").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_readiness_probe_initializing() {
        // A fresh manager starts out in the Initializing state.
        let health = Arc::new(HealthManager::new());

        assert_eq!(
            probe_status(&health, "/health/ready").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_startup_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe_status(&health, "/health/startup").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_startup_probe_initializing() {
        // A fresh manager starts out in the Initializing state.
        let health = Arc::new(HealthManager::new());

        assert_eq!(
            probe_status(&health, "/health/startup").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_health_check_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe_status(&health, "/health").await, StatusCode::OK);
    }

    #[test]
    fn test_service_status() {
        // (status, expected `is_ready`, expected `is_alive`)
        let expectations = [
            (ServiceStatus::Ready, true, true),
            (ServiceStatus::Initializing, false, true),
            (ServiceStatus::ShuttingDown, false, true),
            (ServiceStatus::Failed, false, false),
        ];

        for &(status, ready, alive) in expectations.iter() {
            assert_eq!(status.is_ready(), ready, "is_ready mismatch for {:?}", status);
            assert_eq!(status.is_alive(), alive, "is_alive mismatch for {:?}", status);
        }
    }
}