1use axum::{
15 extract::State,
16 http::StatusCode,
17 response::Json,
18 routing::{get, Router},
19};
20use serde::{Deserialize, Serialize};
21use std::sync::Arc;
22use std::time::{Duration, Instant};
23use tokio::sync::RwLock;
24use tracing::{debug, error, info, warn};
25
/// Lifecycle phase of the service as reported by the health endpoints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ServiceStatus {
    /// The service has been constructed but has not been marked ready yet.
    Initializing,
    /// The service has been marked ready.
    Ready,
    /// A shutdown has been requested.
    ShuttingDown,
    /// The service has been marked as failed.
    Failed,
}

impl ServiceStatus {
    /// Returns `true` only for [`ServiceStatus::Ready`].
    pub fn is_ready(&self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match self {
            Self::Ready => true,
            Self::Initializing | Self::ShuttingDown | Self::Failed => false,
        }
    }

    /// Returns `true` for every state except [`ServiceStatus::Failed`].
    pub fn is_alive(&self) -> bool {
        match self {
            Self::Failed => false,
            Self::Initializing | Self::Ready | Self::ShuttingDown => true,
        }
    }
}
50
51#[derive(Debug, Clone)]
53pub struct HealthManager {
54 status: Arc<RwLock<ServiceStatus>>,
56 start_time: Arc<Instant>,
58 init_deadline: Arc<Option<Instant>>,
60 shutdown_signal: Arc<RwLock<Option<tokio::sync::oneshot::Sender<()>>>>,
62}
63
64impl HealthManager {
65 pub fn new() -> Self {
67 Self {
68 status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
69 start_time: Arc::new(Instant::now()),
70 init_deadline: Arc::new(None),
71 shutdown_signal: Arc::new(RwLock::new(None)),
72 }
73 }
74
75 pub fn with_init_timeout(timeout: Duration) -> Self {
77 let deadline = Instant::now() + timeout;
78 Self {
79 status: Arc::new(RwLock::new(ServiceStatus::Initializing)),
80 start_time: Arc::new(Instant::now()),
81 init_deadline: Arc::new(Some(deadline)),
82 shutdown_signal: Arc::new(RwLock::new(None)),
83 }
84 }
85
86 pub async fn set_ready(&self) {
88 let mut status = self.status.write().await;
89 *status = ServiceStatus::Ready;
90 info!("Service marked as ready");
91 }
92
93 pub async fn set_failed(&self, reason: &str) {
95 let mut status = self.status.write().await;
96 *status = ServiceStatus::Failed;
97 error!("Service marked as failed: {}", reason);
98 }
99
100 pub async fn set_shutting_down(&self) {
102 let mut status = self.status.write().await;
103 *status = ServiceStatus::ShuttingDown;
104 info!("Service marked as shutting down");
105 }
106
107 pub async fn get_status(&self) -> ServiceStatus {
109 *self.status.read().await
110 }
111
112 pub fn uptime_seconds(&self) -> u64 {
114 self.start_time.elapsed().as_secs()
115 }
116
117 pub fn is_init_timeout(&self) -> bool {
119 if let Some(deadline) = *self.init_deadline {
120 Instant::now() > deadline
121 } else {
122 false
123 }
124 }
125
126 pub async fn set_shutdown_signal(&self, sender: tokio::sync::oneshot::Sender<()>) {
128 let mut signal = self.shutdown_signal.write().await;
129 *signal = Some(sender);
130 }
131
132 pub async fn trigger_shutdown(&self) {
134 self.set_shutting_down().await;
135 let mut signal = self.shutdown_signal.write().await;
136 if let Some(sender) = signal.take() {
137 let _ = sender.send(());
138 info!("Graceful shutdown signal sent");
139 }
140 }
141}
142
143impl Default for HealthManager {
144 fn default() -> Self {
145 Self::new()
146 }
147}
148
149#[derive(Debug, Serialize, Deserialize)]
151pub struct HealthResponse {
152 pub status: String,
154 pub timestamp: String,
156 pub uptime_seconds: u64,
158 pub version: String,
160 #[serde(skip_serializing_if = "Option::is_none")]
162 pub details: Option<HealthDetails>,
163}
164
165#[derive(Debug, Serialize, Deserialize)]
167pub struct HealthDetails {
168 pub initialization: String,
170 #[serde(skip_serializing_if = "Option::is_none")]
172 pub connections: Option<u64>,
173 #[serde(skip_serializing_if = "Option::is_none")]
175 pub memory_bytes: Option<u64>,
176}
177
178async fn liveness_probe(
185 State(health): State<Arc<HealthManager>>,
186) -> Result<Json<HealthResponse>, StatusCode> {
187 let status = health.get_status().await;
188 let uptime = health.uptime_seconds();
189
190 if status.is_alive() {
192 let response = HealthResponse {
193 status: "alive".to_string(),
194 timestamp: chrono::Utc::now().to_rfc3339(),
195 uptime_seconds: uptime,
196 version: env!("CARGO_PKG_VERSION").to_string(),
197 details: None,
198 };
199 Ok(Json(response))
200 } else {
201 Err(StatusCode::SERVICE_UNAVAILABLE)
202 }
203}
204
205async fn readiness_probe(
212 State(health): State<Arc<HealthManager>>,
213) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
214 let status = health.get_status().await;
215 let uptime = health.uptime_seconds();
216
217 if status.is_ready() {
219 let response = HealthResponse {
220 status: "ready".to_string(),
221 timestamp: chrono::Utc::now().to_rfc3339(),
222 uptime_seconds: uptime,
223 version: env!("CARGO_PKG_VERSION").to_string(),
224 details: Some(HealthDetails {
225 initialization: "complete".to_string(),
226 connections: None,
227 memory_bytes: None,
228 }),
229 };
230 Ok(Json(response))
231 } else {
232 let details = match status {
233 ServiceStatus::Initializing => {
234 if health.is_init_timeout() {
235 "initialization_timeout".to_string()
236 } else {
237 "initializing".to_string()
238 }
239 }
240 ServiceStatus::ShuttingDown => "shutting_down".to_string(),
241 ServiceStatus::Failed => "failed".to_string(),
242 ServiceStatus::Ready => unreachable!(),
243 };
244
245 let response = HealthResponse {
246 status: "not_ready".to_string(),
247 timestamp: chrono::Utc::now().to_rfc3339(),
248 uptime_seconds: uptime,
249 version: env!("CARGO_PKG_VERSION").to_string(),
250 details: Some(HealthDetails {
251 initialization: details,
252 connections: None,
253 memory_bytes: None,
254 }),
255 };
256
257 Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
258 }
259}
260
261async fn startup_probe(
267 State(health): State<Arc<HealthManager>>,
268) -> Result<Json<HealthResponse>, StatusCode> {
269 let status = health.get_status().await;
270 let uptime = health.uptime_seconds();
271
272 match status {
274 ServiceStatus::Ready => {
275 let response = HealthResponse {
276 status: "startup_complete".to_string(),
277 timestamp: chrono::Utc::now().to_rfc3339(),
278 uptime_seconds: uptime,
279 version: env!("CARGO_PKG_VERSION").to_string(),
280 details: Some(HealthDetails {
281 initialization: "complete".to_string(),
282 connections: None,
283 memory_bytes: None,
284 }),
285 };
286 Ok(Json(response))
287 }
288 ServiceStatus::Initializing => {
289 if health.is_init_timeout() {
290 warn!("Startup probe: initialization timeout exceeded");
291 Err(StatusCode::SERVICE_UNAVAILABLE)
292 } else {
293 debug!("Startup probe: still initializing");
294 Err(StatusCode::SERVICE_UNAVAILABLE)
295 }
296 }
297 ServiceStatus::Failed => Err(StatusCode::SERVICE_UNAVAILABLE),
298 ServiceStatus::ShuttingDown => {
299 let response = HealthResponse {
301 status: "startup_complete".to_string(),
302 timestamp: chrono::Utc::now().to_rfc3339(),
303 uptime_seconds: uptime,
304 version: env!("CARGO_PKG_VERSION").to_string(),
305 details: Some(HealthDetails {
306 initialization: "complete".to_string(),
307 connections: None,
308 memory_bytes: None,
309 }),
310 };
311 Ok(Json(response))
312 }
313 }
314}
315
316async fn health_check(
321 State(health): State<Arc<HealthManager>>,
322) -> Result<Json<HealthResponse>, (StatusCode, Json<HealthResponse>)> {
323 let status = health.get_status().await;
324 let uptime = health.uptime_seconds();
325
326 if status.is_ready() {
327 let response = HealthResponse {
328 status: "healthy".to_string(),
329 timestamp: chrono::Utc::now().to_rfc3339(),
330 uptime_seconds: uptime,
331 version: env!("CARGO_PKG_VERSION").to_string(),
332 details: Some(HealthDetails {
333 initialization: "complete".to_string(),
334 connections: None,
335 memory_bytes: None,
336 }),
337 };
338 Ok(Json(response))
339 } else {
340 let status_str = match status {
341 ServiceStatus::Initializing => "initializing",
342 ServiceStatus::ShuttingDown => "shutting_down",
343 ServiceStatus::Failed => "failed",
344 ServiceStatus::Ready => unreachable!(),
345 };
346
347 let response = HealthResponse {
348 status: status_str.to_string(),
349 timestamp: chrono::Utc::now().to_rfc3339(),
350 uptime_seconds: uptime,
351 version: env!("CARGO_PKG_VERSION").to_string(),
352 details: Some(HealthDetails {
353 initialization: status_str.to_string(),
354 connections: None,
355 memory_bytes: None,
356 }),
357 };
358
359 Err((StatusCode::SERVICE_UNAVAILABLE, Json(response)))
360 }
361}
362
363pub fn health_router(health_manager: Arc<HealthManager>) -> axum::Router {
365 use axum::Router;
366 Router::new()
367 .route("/health", get(health_check))
368 .route("/health/live", get(liveness_probe))
369 .route("/health/ready", get(readiness_probe))
370 .route("/health/startup", get(startup_probe))
371 .with_state(health_manager)
372}
373
374pub fn health_router_with_prefix(health_manager: Arc<HealthManager>, prefix: &str) -> axum::Router {
376 use axum::Router;
377 Router::new()
378 .route(&format!("{}/health", prefix), get(health_check))
379 .route(&format!("{}/health/live", prefix), get(liveness_probe))
380 .route(&format!("{}/health/ready", prefix), get(readiness_probe))
381 .route(&format!("{}/health/startup", prefix), get(startup_probe))
382 .with_state(health_manager)
383}
384
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::Body;
    use axum::http::Request;
    use tower::ServiceExt;

    /// Issues one request for `uri` against a fresh health router built on
    /// `health` and returns the response status code.
    async fn probe(health: &Arc<HealthManager>, uri: &str) -> StatusCode {
        let request = Request::builder().uri(uri).body(Body::empty()).unwrap();
        let response = health_router(health.clone())
            .oneshot(request)
            .await
            .unwrap();
        response.status()
    }

    #[tokio::test]
    async fn test_liveness_probe_alive() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/live").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_liveness_probe_failed() {
        let health = Arc::new(HealthManager::new());
        health.set_failed("test failure").await;

        assert_eq!(
            probe(&health, "/health/live").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_readiness_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/ready").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_readiness_probe_initializing() {
        // A freshly constructed manager is still initializing.
        let health = Arc::new(HealthManager::new());

        assert_eq!(
            probe(&health, "/health/ready").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_startup_probe_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health/startup").await, StatusCode::OK);
    }

    #[tokio::test]
    async fn test_startup_probe_initializing() {
        // A freshly constructed manager is still initializing.
        let health = Arc::new(HealthManager::new());

        assert_eq!(
            probe(&health, "/health/startup").await,
            StatusCode::SERVICE_UNAVAILABLE
        );
    }

    #[tokio::test]
    async fn test_health_check_ready() {
        let health = Arc::new(HealthManager::new());
        health.set_ready().await;

        assert_eq!(probe(&health, "/health").await, StatusCode::OK);
    }

    #[test]
    fn test_service_status() {
        // Only `Ready` counts as ready.
        assert!(ServiceStatus::Ready.is_ready());
        for not_ready in [
            ServiceStatus::Initializing,
            ServiceStatus::ShuttingDown,
            ServiceStatus::Failed,
        ] {
            assert!(!not_ready.is_ready());
        }

        // Everything except `Failed` counts as alive.
        for alive in [
            ServiceStatus::Ready,
            ServiceStatus::Initializing,
            ServiceStatus::ShuttingDown,
        ] {
            assert!(alive.is_alive());
        }
        assert!(!ServiceStatus::Failed.is_alive());
    }
}