1use parking_lot::RwLock;
7use std::collections::HashMap;
8use std::time::{Duration, Instant};
9use thiserror::Error;
10use tracing::{debug, warn};
11
/// Coarse health classification assigned to a monitored subsystem.
///
/// The type is `Copy` and hashable, so it can be used directly as a map key
/// (for example when tallying subsystems per status).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum HealthStatus {
    /// No problem is currently being reported.
    Healthy,
    /// Problems are being reported, but the subsystem is not yet considered
    /// unhealthy.
    Degraded,
    /// The most severe classification.
    Unhealthy,
}
22
23impl HealthStatus {
24 pub fn is_healthy(&self) -> bool {
26 matches!(self, HealthStatus::Healthy)
27 }
28
29 pub fn is_degraded(&self) -> bool {
31 matches!(self, HealthStatus::Degraded)
32 }
33
34 pub fn is_unhealthy(&self) -> bool {
36 matches!(self, HealthStatus::Unhealthy)
37 }
38
39 pub fn score(&self) -> u8 {
41 match self {
42 HealthStatus::Healthy => 100,
43 HealthStatus::Degraded => 50,
44 HealthStatus::Unhealthy => 0,
45 }
46 }
47}
48
49#[derive(Debug, Clone)]
51pub struct SubsystemHealth {
52 pub name: String,
54 pub status: HealthStatus,
56 pub last_check: Instant,
58 pub consecutive_failures: u32,
60 pub consecutive_successes: u32,
62 pub total_checks: u64,
64 pub total_failures: u64,
66 pub metadata: HashMap<String, String>,
68}
69
70impl SubsystemHealth {
71 pub fn new(name: String) -> Self {
73 Self {
74 name,
75 status: HealthStatus::Healthy,
76 last_check: Instant::now(),
77 consecutive_failures: 0,
78 consecutive_successes: 0,
79 total_checks: 0,
80 total_failures: 0,
81 metadata: HashMap::new(),
82 }
83 }
84
85 pub fn time_since_check(&self) -> Duration {
87 self.last_check.elapsed()
88 }
89
90 pub fn failure_rate(&self) -> f64 {
92 if self.total_checks == 0 {
93 0.0
94 } else {
95 (self.total_failures as f64 / self.total_checks as f64) * 100.0
96 }
97 }
98
99 pub fn is_stale(&self, threshold: Duration) -> bool {
101 self.time_since_check() > threshold
102 }
103}
104
/// Tunable thresholds that drive the health monitor's status transitions.
#[derive(Debug, Clone)]
pub struct HealthMonitorConfig {
    /// Consecutive failed checks needed before a subsystem is downgraded to
    /// degraded.
    pub degraded_threshold: u32,
    /// Consecutive failed checks needed before a subsystem is marked
    /// unhealthy.
    pub unhealthy_threshold: u32,
    /// Consecutive successful checks a degraded or unhealthy subsystem needs
    /// before it is considered healthy again.
    pub recovery_threshold: u32,
    /// Maximum time since a subsystem's last check before it counts as stale
    /// when the overall status is computed.
    pub stale_threshold: Duration,
}
117
118impl Default for HealthMonitorConfig {
119 fn default() -> Self {
120 Self {
121 degraded_threshold: 3,
122 unhealthy_threshold: 5,
123 recovery_threshold: 5,
124 stale_threshold: Duration::from_secs(300), }
126 }
127}
128
129#[derive(Debug, Error)]
131pub enum HealthError {
132 #[error("Subsystem not found: {0}")]
133 SubsystemNotFound(String),
134 #[error("Invalid health status transition: {0}")]
135 InvalidTransition(String),
136}
137
138pub struct HealthMonitor {
140 subsystems: RwLock<HashMap<String, SubsystemHealth>>,
141 config: HealthMonitorConfig,
142}
143
144impl HealthMonitor {
145 pub fn new(config: HealthMonitorConfig) -> Self {
147 Self {
148 subsystems: RwLock::new(HashMap::new()),
149 config,
150 }
151 }
152
153 pub fn with_default_config() -> Self {
155 Self::new(HealthMonitorConfig::default())
156 }
157
158 pub fn register_subsystem(&self, name: String) {
160 let mut subsystems = self.subsystems.write();
161 if !subsystems.contains_key(&name) {
162 debug!("Registering subsystem for health monitoring: {}", name);
163 subsystems.insert(name.clone(), SubsystemHealth::new(name));
164 }
165 }
166
167 pub fn unregister_subsystem(&self, name: &str) -> Result<(), HealthError> {
169 let mut subsystems = self.subsystems.write();
170 subsystems
171 .remove(name)
172 .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
173 debug!("Unregistered subsystem: {}", name);
174 Ok(())
175 }
176
177 pub fn update_health(&self, name: &str, is_healthy: bool) -> Result<(), HealthError> {
179 let mut subsystems = self.subsystems.write();
180 let health = subsystems
181 .get_mut(name)
182 .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
183
184 health.last_check = Instant::now();
185 health.total_checks += 1;
186
187 if is_healthy {
188 health.consecutive_successes += 1;
189 health.consecutive_failures = 0;
190
191 match health.status {
193 HealthStatus::Unhealthy | HealthStatus::Degraded => {
194 if health.consecutive_successes >= self.config.recovery_threshold {
195 debug!("Subsystem {} recovered to healthy", name);
196 health.status = HealthStatus::Healthy;
197 }
198 }
199 HealthStatus::Healthy => {}
200 }
201 } else {
202 health.consecutive_failures += 1;
203 health.consecutive_successes = 0;
204 health.total_failures += 1;
205
206 let old_status = health.status;
208 if health.consecutive_failures >= self.config.unhealthy_threshold {
209 health.status = HealthStatus::Unhealthy;
210 if old_status != HealthStatus::Unhealthy {
211 warn!("Subsystem {} marked as unhealthy", name);
212 }
213 } else if health.consecutive_failures >= self.config.degraded_threshold {
214 health.status = HealthStatus::Degraded;
215 if old_status != HealthStatus::Degraded {
216 warn!("Subsystem {} marked as degraded", name);
217 }
218 }
219 }
220
221 Ok(())
222 }
223
224 pub fn set_health(
226 &self,
227 name: &str,
228 status: HealthStatus,
229 ) -> Result<(), HealthError> {
230 let mut subsystems = self.subsystems.write();
231 let health = subsystems
232 .get_mut(name)
233 .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
234
235 health.status = status;
236 health.last_check = Instant::now();
237
238 Ok(())
239 }
240
241 pub fn check_health(&self, name: &str) -> Result<SubsystemHealth, HealthError> {
243 let subsystems = self.subsystems.read();
244 subsystems
245 .get(name)
246 .cloned()
247 .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))
248 }
249
250 pub fn overall_status(&self) -> HealthStatus {
252 let subsystems = self.subsystems.read();
253
254 if subsystems.is_empty() {
255 return HealthStatus::Healthy;
256 }
257
258 let mut has_unhealthy = false;
259 let mut has_degraded = false;
260
261 for health in subsystems.values() {
262 match health.status {
263 HealthStatus::Unhealthy => has_unhealthy = true,
264 HealthStatus::Degraded => has_degraded = true,
265 HealthStatus::Healthy => {}
266 }
267
268 if health.is_stale(self.config.stale_threshold) {
270 warn!("Subsystem {} is stale (last check {:?} ago)", health.name, health.time_since_check());
271 has_degraded = true;
272 }
273 }
274
275 if has_unhealthy {
276 HealthStatus::Unhealthy
277 } else if has_degraded {
278 HealthStatus::Degraded
279 } else {
280 HealthStatus::Healthy
281 }
282 }
283
284 pub fn all_subsystems(&self) -> HashMap<String, SubsystemHealth> {
286 self.subsystems.read().clone()
287 }
288
289 pub fn status_counts(&self) -> HashMap<HealthStatus, usize> {
291 let subsystems = self.subsystems.read();
292 let mut counts = HashMap::new();
293
294 for health in subsystems.values() {
295 *counts.entry(health.status).or_insert(0) += 1;
296 }
297
298 counts
299 }
300
301 pub fn unhealthy_subsystems(&self) -> Vec<String> {
303 let subsystems = self.subsystems.read();
304 subsystems
305 .values()
306 .filter(|h| h.status.is_unhealthy())
307 .map(|h| h.name.clone())
308 .collect()
309 }
310
311 pub fn degraded_subsystems(&self) -> Vec<String> {
313 let subsystems = self.subsystems.read();
314 subsystems
315 .values()
316 .filter(|h| h.status.is_degraded())
317 .map(|h| h.name.clone())
318 .collect()
319 }
320
321 pub fn update_metadata(
323 &self,
324 name: &str,
325 key: String,
326 value: String,
327 ) -> Result<(), HealthError> {
328 let mut subsystems = self.subsystems.write();
329 let health = subsystems
330 .get_mut(name)
331 .ok_or_else(|| HealthError::SubsystemNotFound(name.to_string()))?;
332
333 health.metadata.insert(key, value);
334 Ok(())
335 }
336
337 pub fn subsystem_count(&self) -> usize {
339 self.subsystems.read().len()
340 }
341
342 pub fn clear(&self) {
344 self.subsystems.write().clear();
345 }
346}
347
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;

    // `HealthMonitor` has no `Default` impl in this file, so every test
    // builds its monitor through `with_default_config()`.

    /// A fresh monitor is empty and reports healthy.
    #[test]
    fn test_health_monitor_creation() {
        let monitor = HealthMonitor::with_default_config();
        assert_eq!(monitor.subsystem_count(), 0);
        assert_eq!(monitor.overall_status(), HealthStatus::Healthy);
    }

    /// Registration creates a healthy record under the given name.
    #[test]
    fn test_register_subsystem() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test-subsystem".to_string());
        assert_eq!(monitor.subsystem_count(), 1);

        let health = monitor.check_health("test-subsystem").unwrap();
        assert_eq!(health.name, "test-subsystem");
        assert_eq!(health.status, HealthStatus::Healthy);
    }

    /// Registering the same name twice keeps a single record.
    #[test]
    fn test_register_duplicate_subsystem() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());
        monitor.register_subsystem("test".to_string());
        assert_eq!(monitor.subsystem_count(), 1);
    }

    /// Unregistering removes the record.
    #[test]
    fn test_unregister_subsystem() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());
        assert_eq!(monitor.subsystem_count(), 1);

        monitor.unregister_subsystem("test").unwrap();
        assert_eq!(monitor.subsystem_count(), 0);
    }

    /// A successful check bumps the counters and keeps the status healthy.
    #[test]
    fn test_update_health_success() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());

        monitor.update_health("test", true).unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.consecutive_successes, 1);
        assert_eq!(health.total_checks, 1);
        assert_eq!(health.status, HealthStatus::Healthy);
    }

    /// Consecutive failures walk the status healthy -> degraded -> unhealthy.
    #[test]
    fn test_update_health_failure() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            unhealthy_threshold: 4,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        // One failure is below the degraded threshold.
        monitor.update_health("test", false).unwrap();
        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.status, HealthStatus::Healthy);

        // The second failure reaches the degraded threshold.
        monitor.update_health("test", false).unwrap();
        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.status, HealthStatus::Degraded);

        // The fourth failure reaches the unhealthy threshold.
        monitor.update_health("test", false).unwrap();
        monitor.update_health("test", false).unwrap();
        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.status, HealthStatus::Unhealthy);
    }

    /// A degraded subsystem recovers only after `recovery_threshold`
    /// consecutive successes.
    #[test]
    fn test_recovery_from_degraded() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            recovery_threshold: 3,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        monitor.update_health("test", false).unwrap();
        monitor.update_health("test", false).unwrap();
        assert_eq!(
            monitor.check_health("test").unwrap().status,
            HealthStatus::Degraded
        );

        // Two successes are not enough yet.
        monitor.update_health("test", true).unwrap();
        monitor.update_health("test", true).unwrap();
        assert_eq!(
            monitor.check_health("test").unwrap().status,
            HealthStatus::Degraded
        );

        // The third success completes the recovery.
        monitor.update_health("test", true).unwrap();
        assert_eq!(
            monitor.check_health("test").unwrap().status,
            HealthStatus::Healthy
        );
    }

    /// All subsystems healthy gives an overall healthy status.
    #[test]
    fn test_overall_status_healthy() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        monitor.update_health("test1", true).unwrap();
        monitor.update_health("test2", true).unwrap();

        assert_eq!(monitor.overall_status(), HealthStatus::Healthy);
    }

    /// One degraded subsystem degrades the overall status.
    #[test]
    fn test_overall_status_degraded() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        monitor.update_health("test1", true).unwrap();

        monitor.update_health("test2", false).unwrap();
        monitor.update_health("test2", false).unwrap();

        assert_eq!(monitor.overall_status(), HealthStatus::Degraded);
    }

    /// One unhealthy subsystem makes the overall status unhealthy.
    #[test]
    fn test_overall_status_unhealthy() {
        let config = HealthMonitorConfig {
            unhealthy_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());

        monitor.update_health("test1", true).unwrap();

        monitor.update_health("test2", false).unwrap();
        monitor.update_health("test2", false).unwrap();

        assert_eq!(monitor.overall_status(), HealthStatus::Unhealthy);
    }

    /// `status_counts` tallies one subsystem per status.
    #[test]
    fn test_status_counts() {
        let config = HealthMonitorConfig {
            degraded_threshold: 2,
            unhealthy_threshold: 4,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);

        monitor.register_subsystem("healthy".to_string());
        monitor.register_subsystem("degraded".to_string());
        monitor.register_subsystem("unhealthy".to_string());

        monitor.update_health("degraded", false).unwrap();
        monitor.update_health("degraded", false).unwrap();

        for _ in 0..4 {
            monitor.update_health("unhealthy", false).unwrap();
        }

        let counts = monitor.status_counts();
        assert_eq!(counts.get(&HealthStatus::Healthy), Some(&1));
        assert_eq!(counts.get(&HealthStatus::Degraded), Some(&1));
        assert_eq!(counts.get(&HealthStatus::Unhealthy), Some(&1));
    }

    /// `unhealthy_subsystems` lists exactly the unhealthy names.
    #[test]
    fn test_unhealthy_subsystems() {
        let config = HealthMonitorConfig {
            unhealthy_threshold: 2,
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);

        monitor.register_subsystem("healthy".to_string());
        monitor.register_subsystem("unhealthy1".to_string());
        monitor.register_subsystem("unhealthy2".to_string());

        monitor.update_health("unhealthy1", false).unwrap();
        monitor.update_health("unhealthy1", false).unwrap();
        monitor.update_health("unhealthy2", false).unwrap();
        monitor.update_health("unhealthy2", false).unwrap();

        let unhealthy = monitor.unhealthy_subsystems();
        assert_eq!(unhealthy.len(), 2);
        assert!(unhealthy.contains(&"unhealthy1".to_string()));
        assert!(unhealthy.contains(&"unhealthy2".to_string()));
    }

    /// Metadata written through the monitor is visible in snapshots.
    #[test]
    fn test_metadata() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());

        monitor
            .update_metadata("test", "version".to_string(), "1.0.0".to_string())
            .unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.metadata.get("version"), Some(&"1.0.0".to_string()));
    }

    /// Two failures out of four checks is a 50% failure rate.
    #[test]
    fn test_failure_rate() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());

        monitor.update_health("test", true).unwrap();
        monitor.update_health("test", false).unwrap();
        monitor.update_health("test", true).unwrap();
        monitor.update_health("test", false).unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.failure_rate(), 50.0);
    }

    /// A record becomes stale once no check arrives within the threshold,
    /// and a stale record degrades the overall status.
    #[test]
    fn test_stale_detection() {
        let config = HealthMonitorConfig {
            stale_threshold: Duration::from_millis(100),
            ..Default::default()
        };
        let monitor = HealthMonitor::new(config);
        monitor.register_subsystem("test".to_string());

        let health = monitor.check_health("test").unwrap();
        assert!(!health.is_stale(Duration::from_millis(100)));

        sleep(Duration::from_millis(150));
        let health = monitor.check_health("test").unwrap();
        assert!(health.is_stale(Duration::from_millis(100)));
        // The stored status is still healthy; staleness alone degrades the
        // aggregate.
        assert_eq!(health.status, HealthStatus::Healthy);
        assert_eq!(monitor.overall_status(), HealthStatus::Degraded);
    }

    /// `set_health` overrides the stored status directly.
    #[test]
    fn test_set_health_manually() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test".to_string());

        monitor
            .set_health("test", HealthStatus::Degraded)
            .unwrap();

        let health = monitor.check_health("test").unwrap();
        assert_eq!(health.status, HealthStatus::Degraded);
    }

    /// `clear` removes every registered subsystem.
    #[test]
    fn test_clear_subsystems() {
        let monitor = HealthMonitor::with_default_config();
        monitor.register_subsystem("test1".to_string());
        monitor.register_subsystem("test2".to_string());
        assert_eq!(monitor.subsystem_count(), 2);

        monitor.clear();
        assert_eq!(monitor.subsystem_count(), 0);
    }

    /// Scores are 100 / 50 / 0 for healthy / degraded / unhealthy.
    #[test]
    fn test_health_status_score() {
        assert_eq!(HealthStatus::Healthy.score(), 100);
        assert_eq!(HealthStatus::Degraded.score(), 50);
        assert_eq!(HealthStatus::Unhealthy.score(), 0);
    }

    /// Looking up an unknown name yields `SubsystemNotFound`.
    #[test]
    fn test_subsystem_not_found_error() {
        let monitor = HealthMonitor::with_default_config();

        let result = monitor.check_health("nonexistent");
        assert!(result.is_err());
        assert!(matches!(result, Err(HealthError::SubsystemNotFound(_))));
    }
}