1use axum::{extract::State, http::StatusCode, response::Json, routing::get};
15use serde::{Deserialize, Serialize};
16use std::sync::Arc;
17use std::time::{Duration, Instant};
18use tokio::sync::RwLock;
19use tracing::{debug, error, info, warn};
20
/// Lifecycle state of the service as reported by the health endpoints.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ServiceStatus {
    /// The service has been created but has not been marked ready yet.
    Initializing,
    /// The service has been marked ready.
    Ready,
    /// The service has been marked as shutting down.
    ShuttingDown,
    /// The service has been marked as failed.
    Failed,
}
33
34impl ServiceStatus {
35 pub fn is_ready(&self) -> bool {
37 matches!(self, ServiceStatus::Ready)
38 }
39
40 pub fn is_alive(&self) -> bool {
42 !matches!(self, ServiceStatus::Failed)
43 }
44}
45
46#[derive(Debug, Clone)]
48pub struct HealthManager {
49 status: Arc<RwLock<ServiceStatus>>,
51 start_time: Arc<Instant>,
53 init_deadline: Arc<Option<Instant>>,
55 shutdown_signal: Arc<RwLock<Option<tokio::sync::oneshot::Sender<()>>>>,
57}
58
59impl HealthManager {
60 pub fn new() -> Self {
62 Self {
63 status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
64 start_time: Arc::new(Instant::now()),
65 init_deadline: Arc::new(None),
66 shutdown_signal: Arc::new(RwLock::new(None)),
67 }
68 }
69
70 pub fn with_init_timeout(timeout: Duration) -> Self {
72 let deadline = Instant::now() + timeout;
73 Self {
74 status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
75 start_time: Arc::new(Instant::now()),
76 init_deadline: Arc::new(Some(deadline)),
77 shutdown_signal: Arc::new(RwLock::new(None)),
78 }
79 }
80
81 pub async fn set_ready(&self) {
83 let mut status = self.status.write().await;
84 *status = ServiceStatus::Ready;
85 info!("Service marked as ready");
86 }
87
88 pub async fn set_failed(&self, reason: &str) {
90 let mut status = self.status.write().await;
91 *status = ServiceStatus::Failed;
92 error!("Service marked as failed: {}", reason);
93 }
94
95 pub async fn set_shutting_down(&self) {
97 let mut status = self.status.write().await;
98 *status = ServiceStatus::ShuttingDown;
99 info!("Service marked as shutting down");
100 }
101
102 pub async fn get_status(&self) -> ServiceStatus {
104 *self.status.read().await
105 }
106
107 pub fn uptime_seconds(&self) -> u64 {
109 self.start_time.elapsed().as_secs()
110 }
111
112 pub fn is_init_timeout(&self) -> bool {
114 if let Some(deadline) = *self.init_deadline {
115 Instant::now() > deadline
116 } else {
117 false
118 }
119 }
120
121 pub async fn set_shutdown_signal(&self, sender: tokio::sync::oneshot::Sender<()>) {
123 let mut signal = self.shutdown_signal.write().await;
124 *signal = Some(sender);
125 }
126
127 pub async fn trigger_shutdown(&self) {
129 self.set_shutting_down().await;
130 let mut signal = self.shutdown_signal.write().await;
131 if let Some(sender) = signal.take() {
132 let _ = sender.send(());
133 info!("Graceful shutdown signal sent");
134 }
135 }
136}
137
138impl Default for HealthManager {
139 fn default() -> Self {
140 Self::new()
141 }
142}
143
144#[derive(Debug, Serialize, Deserialize)]
146pub struct HealthResponse {
147 pub status: String,
149 pub timestamp: String,
151 pub uptime_seconds: u64,
153 pub version: String,
155 #[serde(skip_serializing_if = "Option::is_none")]
157 pub details: Option<HealthDetails>,
158}
159
160#[derive(Debug, Serialize, Deserialize)]
162pub struct HealthDetails {
163 pub initialization: String,
165 #[serde(skip_serializing_if = "Option::is_none")]
167 pub connections: Option<u64>,
168 #[serde(skip_serializing_if = "Option::is_none")]
170 pub memory_bytes: Option<u64>,
171}
172
173async fn liveness_probe(
180 State(health): State<Arc<HealthManager>>,
181) -> Result<Json<HealthResponse>, StatusCode> {
182 let status = health.get_status().await;
183 let uptime = health.uptime_seconds();
184
185 if status.is_alive() {
187 let response = HealthResponse {
188 status: "alive".to_string(),
189 timestamp: chrono::Utc::now().to_rfc3339(),
190 uptime_seconds: uptime,
191 version: env!("CARGO_PKG_VERSION").to_string(),
192 details: None,
193 };
194 Ok(Json(response))
195 } else {
196 Err(StatusCode::SERVICE_UNAVAILABLE)
197 }
198}
199
200async fn readiness_probe(
207 State(health): State<Arc<HealthManager>>,
208) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
209 let status = health.get_status().await;
210 let uptime = health.uptime_seconds();
211
212 if status.is_ready() {
214 let response = HealthResponse {
215 status: "ready".to_string(),
216 timestamp: chrono::Utc::now().to_rfc3339(),
217 uptime_seconds: uptime,
218 version: env!("CARGO_PKG_VERSION").to_string(),
219 details: Some(HealthDetails {
220 initialization: "complete".to_string(),
221 connections: None,
222 memory_bytes: None,
223 }),
224 };
225 Ok(Json(response))
226 } else {
227 let details = match status {
228 ServiceStatus::Initializing => {
229 if health.is_init_timeout() {
230 "initialization_timeout".to_string()
231 } else {
232 "initializing".to_string()
233 }
234 }
235 ServiceStatus::ShuttingDown => "shutting_down".to_string(),
236 ServiceStatus::Failed => "failed".to_string(),
237 ServiceStatus::Ready => unreachable!(),
238 };
239
240 let response = HealthResponse {
241 status: "not_ready".to_string(),
242 timestamp: chrono::Utc::now().to_rfc3339(),
243 uptime_seconds: uptime,
244 version: env!("CARGO_PKG_VERSION").to_string(),
245 details: Some(HealthDetails {
246 initialization: details,
247 connections: None,
248 memory_bytes: None,
249 }),
250 };
251
252 Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
253 }
254}
255
256async fn startup_probe(
262 State(health): State<Arc<HealthManager>>,
263) -> Result<Json<HealthResponse>, StatusCode> {
264 let status = health.get_status().await;
265 let uptime = health.uptime_seconds();
266
267 match status {
269 ServiceStatus::Ready => {
270 let response = HealthResponse {
271 status: "startup_complete".to_string(),
272 timestamp: chrono::Utc::now().to_rfc3339(),
273 uptime_seconds: uptime,
274 version: env!("CARGO_PKG_VERSION").to_string(),
275 details: Some(HealthDetails {
276 initialization: "complete".to_string(),
277 connections: None,
278 memory_bytes: None,
279 }),
280 };
281 Ok(Json(response))
282 }
283 ServiceStatus::Initializing => {
284 if health.is_init_timeout() {
285 warn!("Startup probe: initialization timeout exceeded");
286 Err(StatusCode::SERVICE_UNAVAILABLE)
287 } else {
288 debug!("Startup probe: still initializing");
289 Err(StatusCode::SERVICE_UNAVAILABLE)
290 }
291 }
292 ServiceStatus::Failed => Err(StatusCode::SERVICE_UNAVAILABLE),
293 ServiceStatus::ShuttingDown => {
294 let response = HealthResponse {
296 status: "startup_complete".to_string(),
297 timestamp: chrono::Utc::now().to_rfc3339(),
298 uptime_seconds: uptime,
299 version: env!("CARGO_PKG_VERSION").to_string(),
300 details: Some(HealthDetails {
301 initialization: "complete".to_string(),
302 connections: None,
303 memory_bytes: None,
304 }),
305 };
306 Ok(Json(response))
307 }
308 }
309}
310
311async fn health_check(
316 State(health): State<Arc<HealthManager>>,
317) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
318 let status = health.get_status().await;
319 let uptime = health.uptime_seconds();
320
321 if status.is_ready() {
322 let response = HealthResponse {
323 status: "healthy".to_string(),
324 timestamp: chrono::Utc::now().to_rfc3339(),
325 uptime_seconds: uptime,
326 version: env!("CARGO_PKG_VERSION").to_string(),
327 details: Some(HealthDetails {
328 initialization: "complete".to_string(),
329 connections: None,
330 memory_bytes: None,
331 }),
332 };
333 Ok(Json(response))
334 } else {
335 let status_str = match status {
336 ServiceStatus::Initializing => "initializing",
337 ServiceStatus::ShuttingDown => "shutting_down",
338 ServiceStatus::Failed => "failed",
339 ServiceStatus::Ready => unreachable!(),
340 };
341
342 let response = HealthResponse {
343 status: status_str.to_string(),
344 timestamp: chrono::Utc::now().to_rfc3339(),
345 uptime_seconds: uptime,
346 version: env!("CARGO_PKG_VERSION").to_string(),
347 details: Some(HealthDetails {
348 initialization: status_str.to_string(),
349 connections: None,
350 memory_bytes: None,
351 }),
352 };
353
354 Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
355 }
356}
357
358pub fn health_router(health_manager: Arc<HealthManager>) -> axum::Router {
360 use axum::Router;
361 Router::new()
362 .route("/health", get(health_check))
363 .route("/health/live", get(liveness_probe))
364 .route("/health/ready", get(readiness_probe))
365 .route("/health/startup", get(startup_probe))
366 .with_state(health_manager)
367}
368
369pub fn health_router_with_prefix(health_manager: Arc<HealthManager>, prefix: &str) -> axum::Router {
371 use axum::Router;
372 Router::new()
373 .route(&format!("{}/health", prefix), get(health_check))
374 .route(&format!("{}/health/live", prefix), get(liveness_probe))
375 .route(&format!("{}/health/ready", prefix), get(readiness_probe))
376 .route(&format!("{}/health/startup", prefix), get(startup_probe))
377 .with_state(health_manager)
378}
379
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::Body;
    use axum::http::Request;
    use tower::ServiceExt;

    /// Sends one bodiless request for `uri` to a fresh health router backed by
    /// `health` and returns the response status code.
    async fn probe(health: &Arc<HealthManager>, uri: &str) -> StatusCode {
        let request = Request::builder().uri(uri).body(Body::empty()).unwrap();
        let response = health_router(Arc::clone(health))
            .oneshot(request)
            .await
            .unwrap();
        response.status()
    }

    /// Returns a manager that has already been marked ready.
    async fn ready_manager() -> Arc<HealthManager> {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;
        health
    }

    #[tokio::test]
    async fn test_liveness_probe_alive() {
        let health = ready_manager().await;
        assert_eq!(probe(&health, "/health/live").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_liveness_probe_failed() {
        let health = Arc::new(HealthManager::new());
        health.set_failed("test failure").await;
        assert_eq!(
            probe(&health, "/health/live").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_readiness_probe_ready() {
        let health = ready_manager().await;
        assert_eq!(probe(&health, "/health/ready").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_readiness_probe_initializing() {
        // A fresh manager is still initializing.
        let health = Arc::new(HealthManager::new());
        assert_eq!(
            probe(&health, "/health/ready").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_startup_probe_ready() {
        let health = ready_manager().await;
        assert_eq!(probe(&health, "/health/startup").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_startup_probe_initializing() {
        // A fresh manager is still initializing.
        let health = Arc::new(HealthManager::new());
        assert_eq!(
            probe(&health, "/health/startup").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_health_check_ready() {
        let health = ready_manager().await;
        assert_eq!(probe(&health, "/health").await, StatusCode::OK);
    }

    #[test]
    fn test_service_status() {
        // Only `Ready` counts as ready.
        assert!(ServiceStatus::Ready.is_ready());
        assert!(!ServiceStatus::Initializing.is_ready());
        assert!(!ServiceStatus::ShuttingDown.is_ready());
        assert!(!ServiceStatus::Failed.is_ready());

        // Everything except `Failed` counts as alive.
        assert!(ServiceStatus::Ready.is_alive());
        assert!(ServiceStatus::Initializing.is_alive());
        assert!(ServiceStatus::ShuttingDown.is_alive());
        assert!(!ServiceStatus::Failed.is_alive());
    }
}