1use std::sync::Arc;
7
8use serde::{Deserialize, Serialize};
9use tracing::{debug, warn};
10
11use crate::service::ServiceRegistry;
12
13#[non_exhaustive]
15#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
16pub enum HealthStatus {
17 Healthy,
19 Degraded(String),
21 Unhealthy(String),
23 Unknown,
25}
26
27impl std::fmt::Display for HealthStatus {
28 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29 match self {
30 HealthStatus::Healthy => write!(f, "healthy"),
31 HealthStatus::Degraded(msg) => write!(f, "degraded: {msg}"),
32 HealthStatus::Unhealthy(msg) => write!(f, "unhealthy: {msg}"),
33 HealthStatus::Unknown => write!(f, "unknown"),
34 }
35 }
36}
37
38#[non_exhaustive]
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub enum OverallHealth {
42 Healthy,
44 Degraded {
46 unhealthy_services: Vec<String>,
48 },
49 Down,
51}
52
53impl std::fmt::Display for OverallHealth {
54 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55 match self {
56 OverallHealth::Healthy => write!(f, "healthy"),
57 OverallHealth::Degraded { unhealthy_services } => {
58 write!(f, "degraded ({})", unhealthy_services.join(", "))
59 }
60 OverallHealth::Down => write!(f, "down"),
61 }
62 }
63}
64
/// Aggregates per-service health results into a single overall status.
///
/// Derives `Debug` and `Clone` so the type can be logged and copied like the
/// other public types in this module.
#[derive(Debug, Clone)]
pub struct HealthSystem {
    /// Interval between health checks, in seconds, as supplied to `new`.
    check_interval_secs: u64,
}
72
73impl HealthSystem {
74 pub fn new(check_interval_secs: u64) -> Self {
76 Self {
77 check_interval_secs,
78 }
79 }
80
81 pub fn check_interval_secs(&self) -> u64 {
83 self.check_interval_secs
84 }
85
86 pub async fn aggregate(
88 &self,
89 registry: &Arc<ServiceRegistry>,
90 ) -> (OverallHealth, Vec<(String, HealthStatus)>) {
91 let results = registry.health_all().await;
92
93 if results.is_empty() {
94 return (OverallHealth::Down, results);
95 }
96
97 let mut unhealthy = Vec::new();
98 let mut all_unhealthy = true;
99
100 for (name, status) in &results {
101 match status {
102 HealthStatus::Healthy => {
103 debug!(service = %name, "health check: healthy");
104 all_unhealthy = false;
105 }
106 HealthStatus::Degraded(msg) => {
107 warn!(service = %name, reason = %msg, "health check: degraded");
108 unhealthy.push(name.clone());
109 all_unhealthy = false;
110 }
111 HealthStatus::Unhealthy(msg) => {
112 warn!(service = %name, reason = %msg, "health check: unhealthy");
113 unhealthy.push(name.clone());
114 }
115 HealthStatus::Unknown => {
116 warn!(service = %name, "health check: unknown");
117 unhealthy.push(name.clone());
118 }
119 }
120 }
121
122 let overall = if unhealthy.is_empty() {
123 OverallHealth::Healthy
124 } else if all_unhealthy {
125 OverallHealth::Down
126 } else {
127 OverallHealth::Degraded {
128 unhealthy_services: unhealthy,
129 }
130 };
131
132 (overall, results)
133 }
134}
135
/// Outcome of a single liveness or readiness probe.
///
/// `Live`/`NotLive` are consumed by `ProbeState::record_liveness`;
/// `Ready`/`NotReady` are consumed by `ProbeState::record_readiness`.
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ProbeResult {
    /// The liveness probe succeeded.
    Live,
    /// The liveness probe failed.
    NotLive {
        /// Reason attached to the failed liveness probe.
        reason: String,
    },
    /// The readiness probe succeeded.
    Ready,
    /// The readiness probe failed.
    NotReady {
        /// Reason attached to the failed readiness probe.
        reason: String,
    },
}
152
/// Tunables for liveness and readiness probing.
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProbeConfig {
    /// Liveness probe interval, in seconds (not read by any code in this
    /// file; presumably used by the probe scheduler — verify against caller).
    pub liveness_interval_secs: u64,
    /// Readiness probe interval, in seconds (not read by any code in this
    /// file; presumably used by the probe scheduler — verify against caller).
    pub readiness_interval_secs: u64,
    /// Number of consecutive failed probes at which `ProbeState` flips to
    /// not-live / not-ready.
    pub failure_threshold: u32,
    /// Number of consecutive successful readiness probes required before a
    /// not-ready `ProbeState` becomes ready again.
    pub success_threshold: u32,
}
166
#[cfg(feature = "os-patterns")]
impl Default for ProbeConfig {
    /// Defaults: liveness every 10 s, readiness every 5 s, three failures to
    /// trip a probe, one success to recover readiness.
    fn default() -> Self {
        ProbeConfig {
            success_threshold: 1,
            failure_threshold: 3,
            readiness_interval_secs: 5,
            liveness_interval_secs: 10,
        }
    }
}
178
/// Running liveness/readiness bookkeeping, updated by
/// `record_liveness` and `record_readiness`.
#[cfg(feature = "os-patterns")]
#[derive(Debug, Clone)]
pub struct ProbeState {
    /// `NotLive` results seen since the last `Live` result.
    pub liveness_failures: u32,
    /// `NotReady` results seen since the last `Ready` result.
    pub readiness_failures: u32,
    /// `Ready` results seen since the last `NotReady` result.
    pub readiness_successes: u32,
    /// Current liveness verdict.
    pub is_live: bool,
    /// Current readiness verdict.
    pub is_ready: bool,
}
194
#[cfg(feature = "os-patterns")]
impl Default for ProbeState {
    /// Starts live and ready with every counter at zero.
    fn default() -> Self {
        ProbeState {
            is_live: true,
            is_ready: true,
            liveness_failures: 0,
            readiness_failures: 0,
            readiness_successes: 0,
        }
    }
}
207
#[cfg(feature = "os-patterns")]
impl ProbeState {
    /// Records the outcome of one liveness probe.
    ///
    /// * `Live` clears the failure counter and marks the state live.
    /// * `NotLive` increments the failure counter; once it reaches
    ///   `config.failure_threshold` the state is marked not live.
    /// * Readiness results (`Ready` / `NotReady`) are ignored.
    ///
    /// Returns `true` when the failure threshold has been reached (the tests
    /// in this file treat that as the restart signal), `false` otherwise.
    /// It keeps returning `true` for every further `NotLive` until a `Live`
    /// result resets the counter.
    pub fn record_liveness(&mut self, result: &ProbeResult, config: &ProbeConfig) -> bool {
        match result {
            ProbeResult::Live => {
                self.liveness_failures = 0;
                self.is_live = true;
                false
            }
            ProbeResult::NotLive { .. } => {
                // Saturate instead of overflowing: only "at or above the
                // threshold" matters, and a plain `+= 1` would panic in debug
                // builds (or wrap to 0 in release) on a long failure streak.
                self.liveness_failures = self.liveness_failures.saturating_add(1);
                let threshold_reached = self.liveness_failures >= config.failure_threshold;
                if threshold_reached {
                    self.is_live = false;
                }
                threshold_reached
            }
            // Listed explicitly (no `_` arm) so that adding a variant forces
            // a decision here at compile time.
            ProbeResult::Ready | ProbeResult::NotReady { .. } => false,
        }
    }

    /// Records the outcome of one readiness probe.
    ///
    /// * `Ready` clears the failure counter and counts a success; a
    ///   not-ready state becomes ready once `config.success_threshold`
    ///   successes have accumulated.
    /// * `NotReady` clears the success counter and counts a failure; a ready
    ///   state becomes not ready once `config.failure_threshold` failures
    ///   have accumulated.
    /// * Liveness results (`Live` / `NotLive`) are ignored.
    ///
    /// Returns `Some(true)` on a transition to ready, `Some(false)` on a
    /// transition to not ready, and `None` when the verdict did not change.
    pub fn record_readiness(&mut self, result: &ProbeResult, config: &ProbeConfig) -> Option<bool> {
        match result {
            ProbeResult::Ready => {
                self.readiness_failures = 0;
                // This counter grows on every successful probe while the
                // service stays ready, so it must saturate rather than
                // overflow.
                self.readiness_successes = self.readiness_successes.saturating_add(1);
                if !self.is_ready && self.readiness_successes >= config.success_threshold {
                    self.is_ready = true;
                    Some(true)
                } else {
                    None
                }
            }
            ProbeResult::NotReady { .. } => {
                self.readiness_successes = 0;
                self.readiness_failures = self.readiness_failures.saturating_add(1);
                if self.is_ready && self.readiness_failures >= config.failure_threshold {
                    self.is_ready = false;
                    Some(false)
                } else {
                    None
                }
            }
            // Listed explicitly (no `_` arm) so that adding a variant forces
            // a decision here at compile time.
            ProbeResult::Live | ProbeResult::NotLive { .. } => None,
        }
    }
}
265
#[cfg(test)]
mod tests {
    use super::*;
    use crate::service::{ServiceType, SystemService};
    use async_trait::async_trait;

    /// Test double whose health check always reports `Healthy`.
    struct HealthyService;

    #[async_trait]
    impl SystemService for HealthyService {
        fn name(&self) -> &str {
            "healthy-svc"
        }
        fn service_type(&self) -> ServiceType {
            ServiceType::Core
        }
        async fn start(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn stop(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn health_check(&self) -> HealthStatus {
            HealthStatus::Healthy
        }
    }

    /// Test double whose health check always reports `Unhealthy`.
    struct UnhealthyService;

    #[async_trait]
    impl SystemService for UnhealthyService {
        fn name(&self) -> &str {
            "unhealthy-svc"
        }
        fn service_type(&self) -> ServiceType {
            ServiceType::Core
        }
        async fn start(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn stop(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            Ok(())
        }
        async fn health_check(&self) -> HealthStatus {
            HealthStatus::Unhealthy("test failure".into())
        }
    }

    #[tokio::test]
    async fn aggregate_all_healthy() {
        let registry = Arc::new(ServiceRegistry::new());
        registry.register(Arc::new(HealthyService)).unwrap();

        let health = HealthSystem::new(30);
        let (overall, results) = health.aggregate(&registry).await;

        assert!(matches!(overall, OverallHealth::Healthy));
        assert_eq!(results.len(), 1);
    }

    #[tokio::test]
    async fn aggregate_mixed() {
        let registry = Arc::new(ServiceRegistry::new());
        registry.register(Arc::new(HealthyService)).unwrap();
        registry.register(Arc::new(UnhealthyService)).unwrap();

        let health = HealthSystem::new(30);
        let (overall, results) = health.aggregate(&registry).await;

        assert!(matches!(overall, OverallHealth::Degraded { .. }));
        assert_eq!(results.len(), 2);
    }

    #[tokio::test]
    async fn aggregate_all_unhealthy() {
        let registry = Arc::new(ServiceRegistry::new());
        registry.register(Arc::new(UnhealthyService)).unwrap();

        let health = HealthSystem::new(30);
        let (overall, _) = health.aggregate(&registry).await;

        assert!(matches!(overall, OverallHealth::Down));
    }

    #[tokio::test]
    async fn aggregate_empty_registry() {
        let registry = Arc::new(ServiceRegistry::new());
        let health = HealthSystem::new(30);
        let (overall, results) = health.aggregate(&registry).await;

        // An empty registry is reported as down, not healthy.
        assert!(matches!(overall, OverallHealth::Down));
        assert!(results.is_empty());
    }

    #[test]
    fn health_status_display() {
        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
        assert_eq!(
            HealthStatus::Degraded("slow".into()).to_string(),
            "degraded: slow"
        );
        assert_eq!(
            HealthStatus::Unhealthy("crash".into()).to_string(),
            "unhealthy: crash"
        );
        assert_eq!(HealthStatus::Unknown.to_string(), "unknown");
    }

    #[test]
    fn overall_health_display() {
        assert_eq!(OverallHealth::Healthy.to_string(), "healthy");
        assert_eq!(OverallHealth::Down.to_string(), "down");
        assert_eq!(
            OverallHealth::Degraded {
                unhealthy_services: vec!["svc-a".into(), "svc-b".into()]
            }
            .to_string(),
            "degraded (svc-a, svc-b)"
        );
    }

    #[test]
    fn check_interval() {
        let health = HealthSystem::new(15);
        assert_eq!(health.check_interval_secs(), 15);
    }

    #[cfg(feature = "os-patterns")]
    mod probe_tests {
        use super::super::*;

        #[test]
        fn probe_config_default() {
            let config = ProbeConfig::default();
            assert_eq!(config.liveness_interval_secs, 10);
            assert_eq!(config.readiness_interval_secs, 5);
            assert_eq!(config.failure_threshold, 3);
            assert_eq!(config.success_threshold, 1);
        }

        #[test]
        fn probe_config_serde_roundtrip() {
            let config = ProbeConfig {
                liveness_interval_secs: 15,
                readiness_interval_secs: 10,
                failure_threshold: 5,
                success_threshold: 2,
            };
            let json = serde_json::to_string(&config).unwrap();
            let restored: ProbeConfig = serde_json::from_str(&json).unwrap();
            assert_eq!(restored.liveness_interval_secs, 15);
            assert_eq!(restored.failure_threshold, 5);
        }

        #[test]
        fn probe_result_serde_roundtrip() {
            let results = vec![
                ProbeResult::Live,
                ProbeResult::NotLive {
                    reason: "oom".into(),
                },
                ProbeResult::Ready,
                ProbeResult::NotReady {
                    reason: "init".into(),
                },
            ];
            for result in results {
                let json = serde_json::to_string(&result).unwrap();
                let restored: ProbeResult = serde_json::from_str(&json).unwrap();
                assert_eq!(restored, result);
            }
        }

        #[test]
        fn probe_state_default_is_live_and_ready() {
            let state = ProbeState::default();
            assert!(state.is_live);
            assert!(state.is_ready);
            assert_eq!(state.liveness_failures, 0);
            assert_eq!(state.readiness_failures, 0);
        }

        #[test]
        fn liveness_resets_on_success() {
            let config = ProbeConfig {
                failure_threshold: 3,
                ..Default::default()
            };
            // Start one failure short of the threshold.
            let mut state = ProbeState {
                liveness_failures: 2,
                ..Default::default()
            };

            let restart = state.record_liveness(&ProbeResult::Live, &config);
            assert!(!restart);
            assert_eq!(state.liveness_failures, 0);
            assert!(state.is_live);
        }

        #[test]
        fn liveness_triggers_restart_at_threshold() {
            let config = ProbeConfig {
                failure_threshold: 3,
                ..Default::default()
            };
            let mut state = ProbeState::default();
            let hang = ProbeResult::NotLive {
                reason: "hang".into(),
            };

            // Two failures stay below the threshold.
            assert!(!state.record_liveness(&hang, &config));
            assert!(!state.record_liveness(&hang, &config));
            assert!(state.is_live);

            // The third failure reaches it.
            assert!(state.record_liveness(&hang, &config));
            assert!(!state.is_live);
        }

        #[test]
        fn readiness_removes_at_threshold() {
            let config = ProbeConfig {
                failure_threshold: 2,
                ..Default::default()
            };
            let mut state = ProbeState::default();
            let not_ready = ProbeResult::NotReady {
                reason: "init".into(),
            };

            assert!(state.record_readiness(&not_ready, &config).is_none());
            let change = state.record_readiness(&not_ready, &config);
            assert_eq!(change, Some(false));
            assert!(!state.is_ready);
        }

        #[test]
        fn readiness_recovery_re_adds() {
            let config = ProbeConfig {
                failure_threshold: 1,
                success_threshold: 1,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            state.record_readiness(
                &ProbeResult::NotReady {
                    reason: "init".into(),
                },
                &config,
            );
            assert!(!state.is_ready);

            let change = state.record_readiness(&ProbeResult::Ready, &config);
            assert_eq!(change, Some(true));
            assert!(state.is_ready);
        }

        #[test]
        fn threshold_prevents_flapping() {
            let config = ProbeConfig {
                failure_threshold: 3,
                success_threshold: 2,
                ..Default::default()
            };
            let mut state = ProbeState::default();

            // A single failure is below the failure threshold.
            assert!(state
                .record_readiness(&ProbeResult::NotReady { reason: "x".into() }, &config)
                .is_none());
            assert!(state.is_ready);

            // A success while already ready is not a transition.
            assert!(state
                .record_readiness(&ProbeResult::Ready, &config)
                .is_none());
            assert!(state.is_ready);
        }

        #[test]
        fn default_probe_returns_live_ready() {
            let live = ProbeResult::Live;
            let ready = ProbeResult::Ready;
            assert_eq!(live, ProbeResult::Live);
            assert_eq!(ready, ProbeResult::Ready);
        }
    }
}