1use crate::{RragError, RragResult};
7use chrono::{DateTime, Duration, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
/// Configuration for the health-monitoring subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthConfig {
    /// Master switch for health monitoring.
    pub enabled: bool,
    /// Seconds between periodic check rounds run by the monitoring loop.
    pub check_interval_seconds: u64,
    /// Per-check timeout in seconds; a check exceeding it is reported Unhealthy.
    pub timeout_seconds: u64,
    /// Consecutive failures after which the monitoring loop raises an alert.
    pub max_consecutive_failures: u32,
    /// Consecutive successes considered a recovery.
    /// NOTE(review): not consulted by the visible monitoring loop — confirm intended use.
    pub recovery_threshold: u32,
    /// Enables richer per-component detail collection.
    /// NOTE(review): not read by the code visible in this file.
    pub enable_detailed_checks: bool,
    /// Enables checking of declared component dependencies.
    /// NOTE(review): not read by the code visible in this file.
    pub enable_dependency_checks: bool,
    /// User-defined checks in addition to built-in checkers.
    pub custom_checks: Vec<CustomHealthCheckConfig>,
}
25
26impl Default for HealthConfig {
27 fn default() -> Self {
28 Self {
29 enabled: true,
30 check_interval_seconds: 30,
31 timeout_seconds: 10,
32 max_consecutive_failures: 3,
33 recovery_threshold: 2,
34 enable_detailed_checks: true,
35 enable_dependency_checks: true,
36 custom_checks: Vec::new(),
37 }
38 }
39}
40
/// Declarative description of a user-defined health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomHealthCheckConfig {
    /// Unique name identifying the check.
    pub name: String,
    /// Human-readable description of what the check verifies.
    pub description: String,
    /// Which kind of probe to run.
    pub check_type: CustomCheckType,
    /// Free-form key/value settings specific to the check type.
    pub config: HashMap<String, String>,
    /// Whether a failure of this check is considered critical.
    pub critical: bool,
}
49
/// The kinds of probes a custom health check can perform.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum CustomCheckType {
    /// Probe an HTTP endpoint.
    HttpEndpoint,
    /// Probe a database connection.
    DatabaseConnection,
    /// Probe the local file system.
    FileSystemCheck,
    /// Probe general network reachability.
    NetworkConnectivity,
    /// Run a user-supplied script.
    CustomScript,
}
58
/// Health status of a single component.
///
/// Explicit discriminants order the variants by severity (higher = worse),
/// which the derived `Ord` relies on when aggregating an overall status
/// (see `get_health_report`). `Unknown` deliberately ranks above `Critical`:
/// an unobserved component is treated as the worst case.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ComponentStatus {
    Healthy = 1,
    Degraded = 2,
    Unhealthy = 3,
    Critical = 4,
    Unknown = 5,
}
68
69impl std::fmt::Display for ComponentStatus {
70 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71 match self {
72 Self::Healthy => write!(f, "HEALTHY"),
73 Self::Degraded => write!(f, "DEGRADED"),
74 Self::Unhealthy => write!(f, "UNHEALTHY"),
75 Self::Critical => write!(f, "CRITICAL"),
76 Self::Unknown => write!(f, "UNKNOWN"),
77 }
78 }
79}
80
/// Snapshot of one component's health at a single check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceHealth {
    /// Name of the component this snapshot describes.
    pub component_name: String,
    /// Status observed at `last_check`.
    pub status: ComponentStatus,
    /// When this check was performed.
    pub last_check: DateTime<Utc>,
    /// Most recent time the component was observed Healthy, if ever.
    pub last_healthy: Option<DateTime<Utc>>,
    /// Number of back-to-back failed checks (reset on success).
    pub consecutive_failures: u32,
    /// Number of back-to-back successful checks (reset on failure).
    pub consecutive_successes: u32,
    /// How long the check took, in milliseconds, if measured.
    pub response_time_ms: Option<f64>,
    /// Error description when the check failed.
    pub error_message: Option<String>,
    /// Free-form diagnostic details attached by the checker.
    pub details: HashMap<String, serde_json::Value>,
    /// Names of components this one depends on.
    pub dependencies: Vec<String>,
    /// Component uptime in seconds, if known.
    pub uptime_seconds: Option<i64>,
}
96
97impl ServiceHealth {
98 pub fn new(component_name: impl Into<String>) -> Self {
99 Self {
100 component_name: component_name.into(),
101 status: ComponentStatus::Unknown,
102 last_check: Utc::now(),
103 last_healthy: None,
104 consecutive_failures: 0,
105 consecutive_successes: 0,
106 response_time_ms: None,
107 error_message: None,
108 details: HashMap::new(),
109 dependencies: Vec::new(),
110 uptime_seconds: None,
111 }
112 }
113
114 pub fn with_status(mut self, status: ComponentStatus) -> Self {
115 self.status = status;
116 if status == ComponentStatus::Healthy {
117 self.last_healthy = Some(self.last_check);
118 }
119 self
120 }
121
122 pub fn with_response_time(mut self, response_time_ms: f64) -> Self {
123 self.response_time_ms = Some(response_time_ms);
124 self
125 }
126
127 pub fn with_error(mut self, error: impl Into<String>) -> Self {
128 self.error_message = Some(error.into());
129 self
130 }
131
132 pub fn with_detail(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
133 self.details.insert(key.into(), value);
134 self
135 }
136
137 pub fn with_dependencies(mut self, dependencies: Vec<String>) -> Self {
138 self.dependencies = dependencies;
139 self
140 }
141
142 pub fn with_uptime(mut self, uptime_seconds: i64) -> Self {
143 self.uptime_seconds = Some(uptime_seconds);
144 self
145 }
146}
147
/// Aggregated health view across all monitored components.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
    /// Worst status across all services (see `ComponentStatus` ordering).
    pub overall_status: ComponentStatus,
    /// When this report was assembled.
    pub timestamp: DateTime<Utc>,
    /// Latest snapshot per component, keyed by component name.
    pub services: HashMap<String, ServiceHealth>,
    /// Status of external dependencies, keyed by name.
    /// NOTE(review): always empty in the visible `get_health_report` — confirm whether populated elsewhere.
    pub dependencies_status: HashMap<String, ComponentStatus>,
    /// Host/process-level information.
    pub system_info: SystemInfo,
    /// All alerts known to the monitor (resolved and unresolved).
    pub alerts: Vec<HealthAlert>,
}
158
/// Host- and process-level metadata attached to a health report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// Seconds since the monitor was constructed.
    pub uptime_seconds: i64,
    /// Crate version (from `CARGO_PKG_VERSION`).
    pub version: String,
    /// Deployment environment label.
    pub environment: String,
    /// Machine hostname, or "unknown" if it cannot be determined.
    pub hostname: String,
    /// Total memory in MB; `None` when not collected.
    pub total_memory_mb: Option<f64>,
    /// Available memory in MB; `None` when not collected.
    pub available_memory_mb: Option<f64>,
    /// Number of logical CPUs, if known.
    pub cpu_count: Option<u32>,
    /// System load average; `None` when not collected.
    pub load_average: Option<f64>,
}
170
/// Alert raised when a component keeps failing its health checks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthAlert {
    /// Name of the affected component.
    pub component: String,
    /// Status of the component when the alert fired.
    pub severity: ComponentStatus,
    /// Human-readable alert description.
    pub message: String,
    /// When the alert was raised.
    pub timestamp: DateTime<Utc>,
    /// Whether the alert has been acknowledged/resolved.
    pub resolved: bool,
}
179
/// Contract implemented by every health-checkable component.
#[async_trait::async_trait]
pub trait HealthChecker: Send + Sync {
    /// Performs one health check and returns the resulting snapshot.
    async fn check_health(&self) -> RragResult<ServiceHealth>;
    /// Stable name identifying this component.
    fn component_name(&self) -> &str;
    /// Names of components this one depends on (none by default).
    fn dependencies(&self) -> Vec<String> {
        Vec::new()
    }
    /// Whether a failure of this component is critical (default: no).
    fn is_critical(&self) -> bool {
        false
    }
}
192
/// Health checker driven by a synchronous user-supplied closure.
pub struct BasicHealthChecker {
    // Component name reported in results.
    name: String,
    // Whether failures of this component are considered critical.
    is_critical: bool,
    // Closure producing the current status; an Err is reported as Unhealthy.
    check_fn: Arc<dyn Fn() -> RragResult<ComponentStatus> + Send + Sync>,
}
199
200impl BasicHealthChecker {
201 pub fn new<F>(name: impl Into<String>, check_fn: F) -> Self
202 where
203 F: Fn() -> RragResult<ComponentStatus> + Send + Sync + 'static,
204 {
205 Self {
206 name: name.into(),
207 is_critical: false,
208 check_fn: Arc::new(check_fn),
209 }
210 }
211
212 pub fn with_critical(mut self, critical: bool) -> Self {
213 self.is_critical = critical;
214 self
215 }
216}
217
218#[async_trait::async_trait]
219impl HealthChecker for BasicHealthChecker {
220 async fn check_health(&self) -> RragResult<ServiceHealth> {
221 let start_time = std::time::Instant::now();
222
223 match (self.check_fn)() {
224 Ok(status) => {
225 let response_time = start_time.elapsed().as_millis() as f64;
226 Ok(ServiceHealth::new(&self.name)
227 .with_status(status)
228 .with_response_time(response_time))
229 }
230 Err(e) => {
231 let response_time = start_time.elapsed().as_millis() as f64;
232 Ok(ServiceHealth::new(&self.name)
233 .with_status(ComponentStatus::Unhealthy)
234 .with_response_time(response_time)
235 .with_error(e.to_string()))
236 }
237 }
238 }
239
240 fn component_name(&self) -> &str {
241 &self.name
242 }
243
244 fn is_critical(&self) -> bool {
245 self.is_critical
246 }
247}
248
/// Health checker that probes an HTTP endpoint with a GET request.
pub struct HttpHealthChecker {
    // Component name reported in results.
    name: String,
    // URL probed on each check.
    url: String,
    // HTTP status code treated as healthy (anything else is Degraded).
    expected_status: u16,
    // Per-request timeout.
    timeout: Duration,
    // Reused HTTP client; only present when the "http" feature is enabled.
    #[cfg(feature = "http")]
    client: reqwest::Client,
    // Whether failures of this endpoint are considered critical.
    is_critical: bool,
}
259
260impl HttpHealthChecker {
261 pub fn new(name: impl Into<String>, url: impl Into<String>) -> Self {
262 Self {
263 name: name.into(),
264 url: url.into(),
265 expected_status: 200,
266 timeout: Duration::seconds(10),
267 #[cfg(feature = "http")]
268 client: reqwest::Client::new(),
269 is_critical: false,
270 }
271 }
272
273 pub fn with_expected_status(mut self, status: u16) -> Self {
274 self.expected_status = status;
275 self
276 }
277
278 pub fn with_timeout(mut self, timeout: Duration) -> Self {
279 self.timeout = timeout;
280 self
281 }
282
283 pub fn with_critical(mut self, critical: bool) -> Self {
284 self.is_critical = critical;
285 self
286 }
287}
288
#[async_trait::async_trait]
impl HealthChecker for HttpHealthChecker {
    /// Probes the configured URL with a GET request.
    ///
    /// Healthy when the response code equals `expected_status`, Degraded on
    /// any other response, Unhealthy on a transport error or timeout. With
    /// the "http" feature disabled this is a no-op that always reports
    /// Healthy (with a note in the details). Always returns `Ok` — failures
    /// are encoded in the snapshot, not propagated.
    async fn check_health(&self) -> RragResult<ServiceHealth> {
        #[cfg(feature = "http")]
        {
            let start_time = std::time::Instant::now();

            // Convert chrono::Duration to std Duration for tokio's timeout.
            // NOTE(review): a negative chrono duration would wrap via `as u64` — confirm callers never set one.
            let timeout_duration =
                std::time::Duration::from_millis(self.timeout.num_milliseconds() as u64);

            match tokio::time::timeout(timeout_duration, self.client.get(&self.url).send()).await {
                // Got a response: classify by status code.
                Ok(Ok(response)) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    let status_code = response.status().as_u16();

                    let status = if status_code == self.expected_status {
                        ComponentStatus::Healthy
                    } else {
                        ComponentStatus::Degraded
                    };

                    Ok(ServiceHealth::new(&self.name)
                        .with_status(status)
                        .with_response_time(response_time)
                        .with_detail("status_code", serde_json::json!(status_code))
                        .with_detail("url", serde_json::json!(self.url)))
                }
                // Transport-level failure (connection refused, DNS, TLS, ...).
                Ok(Err(e)) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    Ok(ServiceHealth::new(&self.name)
                        .with_status(ComponentStatus::Unhealthy)
                        .with_response_time(response_time)
                        .with_error(e.to_string())
                        .with_detail("url", serde_json::json!(self.url)))
                }
                // The request did not complete within the configured timeout.
                Err(_) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    Ok(ServiceHealth::new(&self.name)
                        .with_status(ComponentStatus::Unhealthy)
                        .with_response_time(response_time)
                        .with_error("Request timeout")
                        .with_detail(
                            "timeout_ms",
                            serde_json::json!(self.timeout.num_milliseconds()),
                        )
                        .with_detail("url", serde_json::json!(self.url)))
                }
            }
        }
        // Without the "http" feature there is no client; report a placeholder
        // Healthy result so the checker remains usable.
        #[cfg(not(feature = "http"))]
        {
            Ok(ServiceHealth::new(&self.name)
                .with_status(ComponentStatus::Healthy)
                .with_response_time(0.0)
                .with_detail("note", serde_json::json!("HTTP feature disabled"))
                .with_detail("url", serde_json::json!(self.url)))
        }
    }

    fn component_name(&self) -> &str {
        &self.name
    }

    fn is_critical(&self) -> bool {
        self.is_critical
    }
}
357
/// Health checker for a database connection (critical by default).
pub struct DatabaseHealthChecker {
    // Component name reported in results.
    name: String,
    // Connection string; never exposed in results (masked in details).
    connection_string: String,
    // Per-check timeout.
    // NOTE(review): stored but not consulted by the visible check_health — confirm intended use.
    timeout: Duration,
    // Whether failures are considered critical (defaults to true).
    is_critical: bool,
}
365
366impl DatabaseHealthChecker {
367 pub fn new(name: impl Into<String>, connection_string: impl Into<String>) -> Self {
368 Self {
369 name: name.into(),
370 connection_string: connection_string.into(),
371 timeout: Duration::seconds(5),
372 is_critical: true,
373 }
374 }
375
376 pub fn with_timeout(mut self, timeout: Duration) -> Self {
377 self.timeout = timeout;
378 self
379 }
380
381 pub fn with_critical(mut self, critical: bool) -> Self {
382 self.is_critical = critical;
383 self
384 }
385}
386
387#[async_trait::async_trait]
388impl HealthChecker for DatabaseHealthChecker {
389 async fn check_health(&self) -> RragResult<ServiceHealth> {
390 let start_time = std::time::Instant::now();
391
392 tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
395
396 let response_time = start_time.elapsed().as_millis() as f64;
397
398 let status = if rand::random::<f64>() > 0.1 {
400 ComponentStatus::Healthy
401 } else {
402 ComponentStatus::Unhealthy
403 };
404
405 let mut health = ServiceHealth::new(&self.name)
406 .with_status(status)
407 .with_response_time(response_time)
408 .with_detail("connection_string", serde_json::json!("***masked***"));
409
410 if status == ComponentStatus::Unhealthy {
411 health = health.with_error("Connection failed");
412 }
413
414 Ok(health)
415 }
416
417 fn component_name(&self) -> &str {
418 &self.name
419 }
420
421 fn is_critical(&self) -> bool {
422 self.is_critical
423 }
424}
425
/// Periodically runs registered checkers, tracks per-component history,
/// and raises alerts after repeated failures.
pub struct HealthMonitor {
    // Static configuration (intervals, timeouts, alert thresholds).
    config: HealthConfig,
    // Registered checkers keyed by component name.
    checkers: Arc<RwLock<HashMap<String, Box<dyn HealthChecker>>>>,
    // Rolling per-component history (capped at 100 entries per component).
    health_history: Arc<RwLock<HashMap<String, Vec<ServiceHealth>>>>,
    // Raised alerts (capped at 1000 entries).
    alerts: Arc<RwLock<Vec<HealthAlert>>>,
    // Construction time; basis for reported uptime.
    system_start_time: DateTime<Utc>,
    // Handle to the spawned monitoring task, when running.
    monitoring_handle: Arc<RwLock<Option<tokio::task::JoinHandle<()>>>>,
    // Whether the monitoring loop is currently active.
    is_running: Arc<RwLock<bool>>,
}
436
impl HealthMonitor {
    /// Creates a stopped monitor with no registered checkers.
    /// (Async for interface consistency; performs no awaiting work itself.)
    pub async fn new(config: HealthConfig) -> RragResult<Self> {
        Ok(Self {
            config,
            checkers: Arc::new(RwLock::new(HashMap::new())),
            health_history: Arc::new(RwLock::new(HashMap::new())),
            alerts: Arc::new(RwLock::new(Vec::new())),
            system_start_time: Utc::now(),
            monitoring_handle: Arc::new(RwLock::new(None)),
            is_running: Arc::new(RwLock::new(false)),
        })
    }

    /// Starts the background monitoring loop; errors if already running.
    ///
    /// The `is_running` write guard is held until the flag is flipped, so the
    /// freshly spawned loop blocks on its first `is_running.read()` and
    /// cannot observe a stale `false`.
    pub async fn start(&self) -> RragResult<()> {
        let mut running = self.is_running.write().await;
        if *running {
            return Err(RragError::config(
                "health_monitor",
                "stopped",
                "already running",
            ));
        }

        let handle = self.start_monitoring_loop().await?;
        {
            let mut handle_guard = self.monitoring_handle.write().await;
            *handle_guard = Some(handle);
        }

        *running = true;
        tracing::info!("Health monitor started");
        Ok(())
    }

    /// Stops the monitoring loop by aborting its task; no-op if not running.
    pub async fn stop(&self) -> RragResult<()> {
        let mut running = self.is_running.write().await;
        if !*running {
            return Ok(());
        }

        {
            let mut handle_guard = self.monitoring_handle.write().await;
            if let Some(handle) = handle_guard.take() {
                // Abort rather than signal: the loop may be mid-sleep on its interval.
                handle.abort();
            }
        }

        *running = false;
        tracing::info!("Health monitor stopped");
        Ok(())
    }

    /// Returns whether the monitoring loop is running.
    /// NOTE(review): this reports the monitor's own liveness, not the health
    /// of monitored components — see `get_health_report` for the latter.
    pub async fn is_healthy(&self) -> bool {
        *self.is_running.read().await
    }

    /// Spawns the periodic check loop and returns its task handle.
    ///
    /// Each tick the loop snapshots the checker names, then for each name:
    /// runs the check under `timeout_seconds`, carries the consecutive
    /// success/failure counters forward from the previous history entry,
    /// appends to the (100-entry) history, and raises an alert once
    /// `max_consecutive_failures` is reached.
    async fn start_monitoring_loop(&self) -> RragResult<tokio::task::JoinHandle<()>> {
        // Clone the shared state handles into the task.
        let config = self.config.clone();
        let checkers = self.checkers.clone();
        let health_history = self.health_history.clone();
        let alerts = self.alerts.clone();
        let is_running = self.is_running.clone();

        let handle = tokio::spawn(async move {
            let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
                config.check_interval_seconds,
            ));

            while *is_running.read().await {
                interval.tick().await;

                // Snapshot names first so the registry lock is not held
                // across an entire round of checks.
                let checker_map = checkers.read().await;
                let checker_names: Vec<String> = checker_map.keys().cloned().collect();
                drop(checker_map);

                for checker_name in checker_names {
                    // Re-acquire per checker; the entry may have been removed meanwhile.
                    let checker_map = checkers.read().await;
                    if let Some(checker) = checker_map.get(&checker_name) {
                        let timeout_duration =
                            std::time::Duration::from_secs(config.timeout_seconds);

                        let health_result =
                            tokio::time::timeout(timeout_duration, checker.check_health()).await;

                        // Normalize errors and timeouts into Unhealthy snapshots.
                        let mut service_health = match health_result {
                            Ok(Ok(health)) => health,
                            Ok(Err(e)) => ServiceHealth::new(&checker_name)
                                .with_status(ComponentStatus::Unhealthy)
                                .with_error(e.to_string()),
                            Err(_) => ServiceHealth::new(&checker_name)
                                .with_status(ComponentStatus::Unhealthy)
                                .with_error("Health check timeout"),
                        };

                        let mut history = health_history.write().await;
                        let component_history =
                            history.entry(checker_name.clone()).or_insert_with(Vec::new);

                        // Carry the streak counters forward from the previous
                        // entry: a success extends the success streak and
                        // zeroes failures, and vice versa. Note only Healthy
                        // counts as success — Degraded/Critical/Unknown all
                        // extend the failure streak.
                        if let Some(last_health) = component_history.last() {
                            if service_health.status == ComponentStatus::Healthy {
                                if last_health.status == ComponentStatus::Healthy {
                                    service_health.consecutive_successes =
                                        last_health.consecutive_successes + 1;
                                } else {
                                    service_health.consecutive_successes = 1;
                                }
                                service_health.consecutive_failures = 0;
                            } else {
                                if last_health.status != ComponentStatus::Healthy {
                                    service_health.consecutive_failures =
                                        last_health.consecutive_failures + 1;
                                } else {
                                    service_health.consecutive_failures = 1;
                                }
                                service_health.consecutive_successes = 0;
                            }
                        } else {
                            // First observation for this component.
                            if service_health.status == ComponentStatus::Healthy {
                                service_health.consecutive_successes = 1;
                            } else {
                                service_health.consecutive_failures = 1;
                            }
                        }

                        component_history.push(service_health.clone());

                        // Cap per-component history at the most recent 100 entries.
                        if component_history.len() > 100 {
                            component_history.drain(0..component_history.len() - 100);
                        }

                        // Raise an alert once the failure streak reaches the
                        // threshold. NOTE(review): this fires on every
                        // subsequent failing round too, so one outage can
                        // produce many alerts — confirm that is intended.
                        if service_health.consecutive_failures >= config.max_consecutive_failures {
                            let alert = HealthAlert {
                                component: checker_name.clone(),
                                severity: service_health.status,
                                message: format!(
                                    "Component {} has failed {} consecutive health checks",
                                    checker_name, service_health.consecutive_failures
                                ),
                                timestamp: Utc::now(),
                                resolved: false,
                            };

                            let mut alert_list = alerts.write().await;
                            alert_list.push(alert);

                            // Cap the alert log at the most recent 1000 entries.
                            let alert_list_len = alert_list.len();
                            if alert_list_len > 1000 {
                                alert_list.drain(0..alert_list_len - 1000);
                            }
                        }
                    }
                    drop(checker_map);
                }
            }
        });

        Ok(handle)
    }

    /// Registers a checker under its `component_name`, replacing any
    /// existing checker with the same name.
    pub async fn add_checker(&self, checker: Box<dyn HealthChecker>) -> RragResult<()> {
        let name = checker.component_name().to_string();
        let mut checkers = self.checkers.write().await;
        checkers.insert(name, checker);
        Ok(())
    }

    /// Unregisters a checker and discards its recorded history.
    pub async fn remove_checker(&self, name: &str) -> RragResult<()> {
        let mut checkers = self.checkers.write().await;
        checkers.remove(name);

        let mut history = self.health_history.write().await;
        history.remove(name);

        Ok(())
    }

    /// Builds a full report from each component's latest snapshot.
    ///
    /// The overall status is the worst individual status (per the
    /// `ComponentStatus` severity ordering); with no recorded history it
    /// defaults to Healthy.
    pub async fn get_health_report(&self) -> HealthReport {
        let health_history = self.health_history.read().await;
        let alerts = self.alerts.read().await;

        let mut services = HashMap::new();
        let mut overall_status = ComponentStatus::Healthy;

        for (component_name, history) in health_history.iter() {
            if let Some(latest_health) = history.last() {
                services.insert(component_name.clone(), latest_health.clone());

                // Higher ComponentStatus values are worse; keep the maximum.
                if latest_health.status > overall_status {
                    overall_status = latest_health.status;
                }
            }
        }

        let system_info = self.get_system_info().await;

        HealthReport {
            overall_status,
            timestamp: Utc::now(),
            services,
            dependencies_status: HashMap::new(), system_info,
            alerts: alerts.clone(),
        }
    }

    /// Returns the latest snapshot for a component, if any has been recorded.
    pub async fn get_component_health(&self, component_name: &str) -> Option<ServiceHealth> {
        let history = self.health_history.read().await;
        history.get(component_name)?.last().cloned()
    }

    /// Returns up to `limit` of the most recent snapshots for a component
    /// (oldest first); all of them when `limit` is `None`.
    pub async fn get_component_history(
        &self,
        component_name: &str,
        limit: Option<usize>,
    ) -> Vec<ServiceHealth> {
        let history = self.health_history.read().await;
        if let Some(component_history) = history.get(component_name) {
            let limit = limit.unwrap_or(component_history.len());
            // saturating_sub guards against limit > len.
            let start_index = component_history.len().saturating_sub(limit);
            component_history[start_index..].to_vec()
        } else {
            Vec::new()
        }
    }

    /// Returns alerts, optionally filtered by their resolved flag
    /// (`None` returns everything).
    pub async fn get_alerts(&self, resolved: Option<bool>) -> Vec<HealthAlert> {
        let alerts = self.alerts.read().await;
        if let Some(resolved_filter) = resolved {
            alerts
                .iter()
                .filter(|alert| alert.resolved == resolved_filter)
                .cloned()
                .collect()
        } else {
            alerts.clone()
        }
    }

    /// Marks as resolved the first alert matching `component` and `timestamp`
    /// exactly. Silently succeeds when no alert matches.
    pub async fn acknowledge_alert(
        &self,
        component: &str,
        timestamp: DateTime<Utc>,
    ) -> RragResult<()> {
        let mut alerts = self.alerts.write().await;
        if let Some(alert) = alerts
            .iter_mut()
            .find(|a| a.component == component && a.timestamp == timestamp)
        {
            alert.resolved = true;
        }
        Ok(())
    }

    /// Collects host/process metadata for a report.
    /// Memory and load fields are not collected here and are left `None`.
    async fn get_system_info(&self) -> SystemInfo {
        // Uptime measured from monitor construction, not process start.
        let uptime = (Utc::now() - self.system_start_time).num_seconds();

        SystemInfo {
            uptime_seconds: uptime,
            version: env!("CARGO_PKG_VERSION").to_string(),
            // NOTE(review): environment is hard-coded — confirm it should
            // come from configuration instead.
            environment: "production".to_string(),
            hostname: hostname::get()
                .unwrap_or_else(|_| "unknown".into())
                .to_string_lossy()
                .to_string(),
            total_memory_mb: None, available_memory_mb: None,
            cpu_count: Some(num_cpus::get() as u32),
            load_average: None,
        }
    }

    /// Runs a single on-demand check for one component, outside the periodic
    /// loop. The result is returned but NOT recorded into history. Errors
    /// only when no checker is registered under `component_name`; check
    /// failures and timeouts are encoded in the returned snapshot.
    pub async fn force_health_check(&self, component_name: &str) -> RragResult<ServiceHealth> {
        let checkers = self.checkers.read().await;
        let checker = checkers.get(component_name).ok_or_else(|| {
            RragError::agent(
                "health_monitor",
                format!("Component not found: {}", component_name),
            )
        })?;

        let timeout_duration = std::time::Duration::from_secs(self.config.timeout_seconds);

        let health_result = tokio::time::timeout(timeout_duration, checker.check_health()).await;

        match health_result {
            Ok(Ok(health)) => Ok(health),
            Ok(Err(e)) => Ok(ServiceHealth::new(component_name)
                .with_status(ComponentStatus::Unhealthy)
                .with_error(e.to_string())),
            Err(_) => Ok(ServiceHealth::new(component_name)
                .with_status(ComponentStatus::Unhealthy)
                .with_error("Health check timeout")),
        }
    }

    /// Condenses the current report into per-status counts plus the number
    /// of unresolved alerts.
    pub async fn get_health_summary(&self) -> HealthSummary {
        let report = self.get_health_report().await;

        let total_services = report.services.len();
        let healthy_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Healthy)
            .count();
        let degraded_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Degraded)
            .count();
        let unhealthy_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Unhealthy)
            .count();
        let critical_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Critical)
            .count();

        let active_alerts = report.alerts.iter().filter(|a| !a.resolved).count();

        HealthSummary {
            overall_status: report.overall_status,
            total_services,
            healthy_services,
            degraded_services,
            unhealthy_services,
            critical_services,
            active_alerts,
            uptime_seconds: report.system_info.uptime_seconds,
            last_check: report.timestamp,
        }
    }
}
779
/// Condensed view of a `HealthReport`: per-status service counts plus
/// the number of unresolved alerts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthSummary {
    /// Worst status across all services.
    pub overall_status: ComponentStatus,
    /// Total number of monitored services.
    pub total_services: usize,
    /// Count of services currently Healthy.
    pub healthy_services: usize,
    /// Count of services currently Degraded.
    pub degraded_services: usize,
    /// Count of services currently Unhealthy.
    pub unhealthy_services: usize,
    /// Count of services currently Critical.
    pub critical_services: usize,
    /// Number of alerts not yet resolved.
    pub active_alerts: usize,
    /// Monitor uptime in seconds.
    pub uptime_seconds: i64,
    /// Timestamp of the report this summary was derived from.
    pub last_check: DateTime<Utc>,
}
792
#[cfg(test)]
mod tests {
    use super::*;

    // Builder methods should populate every field they claim to set.
    #[tokio::test]
    async fn test_service_health_creation() {
        let health = ServiceHealth::new("test_service")
            .with_status(ComponentStatus::Healthy)
            .with_response_time(150.5)
            .with_detail("version", serde_json::json!("1.0.0"))
            .with_dependencies(vec!["database".to_string(), "cache".to_string()])
            .with_uptime(3600);

        assert_eq!(health.component_name, "test_service");
        assert_eq!(health.status, ComponentStatus::Healthy);
        assert_eq!(health.response_time_ms.unwrap(), 150.5);
        assert_eq!(health.dependencies.len(), 2);
        assert_eq!(health.uptime_seconds.unwrap(), 3600);
        assert!(health.details.contains_key("version"));
    }

    // A closure returning Ok(Healthy) should yield a timed Healthy snapshot.
    #[tokio::test]
    async fn test_basic_health_checker() {
        let checker = BasicHealthChecker::new("test_component", || Ok(ComponentStatus::Healthy))
            .with_critical(true);

        assert_eq!(checker.component_name(), "test_component");
        assert!(checker.is_critical());

        let health = checker.check_health().await.unwrap();
        assert_eq!(health.component_name, "test_component");
        assert_eq!(health.status, ComponentStatus::Healthy);
        assert!(health.response_time_ms.is_some());
    }

    // A closure returning Err should be converted to Unhealthy with the
    // error text preserved in the snapshot (not propagated as Err).
    #[tokio::test]
    async fn test_basic_health_checker_failure() {
        let checker = BasicHealthChecker::new("failing_component", || {
            Err(RragError::agent("test", "Simulated failure"))
        });

        let health = checker.check_health().await.unwrap();
        assert_eq!(health.status, ComponentStatus::Unhealthy);
        assert!(health.error_message.is_some());
        assert!(health
            .error_message
            .as_ref()
            .unwrap()
            .contains("Simulated failure"));
    }

    // The simulated database check always masks the connection string and
    // reports a response time. Status itself is random, so it is not asserted.
    #[tokio::test]
    async fn test_database_health_checker() {
        let checker = DatabaseHealthChecker::new("test_db", "postgresql://localhost:5432/test")
            .with_timeout(Duration::seconds(5))
            .with_critical(true);

        assert_eq!(checker.component_name(), "test_db");
        assert!(checker.is_critical());

        let health = checker.check_health().await.unwrap();
        assert_eq!(health.component_name, "test_db");
        assert!(health.response_time_ms.is_some());
        assert!(health.details.contains_key("connection_string"));
    }

    // End-to-end: start the loop, wait past one interval, then verify the
    // report and per-component lookup reflect the healthy checker.
    #[tokio::test]
    async fn test_health_monitor() {
        let config = HealthConfig {
            check_interval_seconds: 1, ..Default::default()
        };

        let mut monitor = HealthMonitor::new(config).await.unwrap();

        let checker = BasicHealthChecker::new("test_service", || Ok(ComponentStatus::Healthy));
        monitor.add_checker(Box::new(checker)).await.unwrap();

        // Not running until start() is called.
        assert!(!monitor.is_healthy().await);

        monitor.start().await.unwrap();
        assert!(monitor.is_healthy().await);

        // Sleep slightly past one check interval so at least one round runs.
        tokio::time::sleep(tokio::time::Duration::from_millis(1100)).await;

        let report = monitor.get_health_report().await;
        assert_eq!(report.overall_status, ComponentStatus::Healthy);
        assert!(report.services.contains_key("test_service"));

        let service_health = monitor.get_component_health("test_service").await;
        assert!(service_health.is_some());
        assert_eq!(service_health.unwrap().status, ComponentStatus::Healthy);

        monitor.stop().await.unwrap();
        assert!(!monitor.is_healthy().await);
    }

    // A checker that always fails should accumulate consecutive failures and
    // trigger an unresolved alert once the (lowered) threshold is reached.
    #[tokio::test]
    async fn test_health_monitor_consecutive_failures() {
        let config = HealthConfig {
            check_interval_seconds: 1,
            max_consecutive_failures: 2,
            ..Default::default()
        };

        let mut monitor = HealthMonitor::new(config).await.unwrap();

        let checker = BasicHealthChecker::new("failing_service", || {
            Err(RragError::agent("test", "Always fails"))
        });
        monitor.add_checker(Box::new(checker)).await.unwrap();

        monitor.start().await.unwrap();

        // Wait long enough for at least two failing rounds (1s interval).
        tokio::time::sleep(tokio::time::Duration::from_millis(2500)).await;

        let alerts = monitor.get_alerts(Some(false)).await; assert!(!alerts.is_empty());

        let service_health = monitor
            .get_component_health("failing_service")
            .await
            .unwrap();
        assert!(service_health.consecutive_failures >= 2);

        monitor.stop().await.unwrap();
    }

    // force_health_check runs a single on-demand check without starting the loop.
    #[tokio::test]
    async fn test_force_health_check() {
        let config = HealthConfig::default();
        let mut monitor = HealthMonitor::new(config).await.unwrap();

        let checker = BasicHealthChecker::new("manual_test", || Ok(ComponentStatus::Degraded));
        monitor.add_checker(Box::new(checker)).await.unwrap();

        let health = monitor.force_health_check("manual_test").await.unwrap();
        assert_eq!(health.component_name, "manual_test");
        assert_eq!(health.status, ComponentStatus::Degraded);
    }

    // The summary should count all registered services. Per-status counts
    // are not asserted because the default 30s interval means the loop may
    // not have completed a round within the 100ms sleep.
    #[tokio::test]
    async fn test_health_summary() {
        let config = HealthConfig::default();
        let mut monitor = HealthMonitor::new(config).await.unwrap();

        let healthy_checker =
            BasicHealthChecker::new("healthy_service", || Ok(ComponentStatus::Healthy));
        let degraded_checker =
            BasicHealthChecker::new("degraded_service", || Ok(ComponentStatus::Degraded));
        let unhealthy_checker = BasicHealthChecker::new("unhealthy_service", || {
            Err(RragError::agent("test", "Service down"))
        });

        monitor
            .add_checker(Box::new(healthy_checker))
            .await
            .unwrap();
        monitor
            .add_checker(Box::new(degraded_checker))
            .await
            .unwrap();
        monitor
            .add_checker(Box::new(unhealthy_checker))
            .await
            .unwrap();

        monitor.start().await.unwrap();

        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

        let summary = monitor.get_health_summary().await;
        // NOTE(review): total_services counts report.services, which is
        // populated from history — this relies on the interval's immediate
        // first tick having run all three checkers within 100ms.
        assert_eq!(summary.total_services, 3);
        assert!(summary.uptime_seconds >= 0);

        monitor.stop().await.unwrap();
    }

    // Severity ordering: higher discriminant = worse; Unknown ranks worst.
    #[test]
    fn test_component_status_ordering() {
        assert!(ComponentStatus::Critical > ComponentStatus::Unhealthy);
        assert!(ComponentStatus::Unhealthy > ComponentStatus::Degraded);
        assert!(ComponentStatus::Degraded > ComponentStatus::Healthy);
        assert!(ComponentStatus::Unknown > ComponentStatus::Critical);
    }

    // Display renders the uppercase labels used in logs.
    #[test]
    fn test_component_status_display() {
        assert_eq!(ComponentStatus::Healthy.to_string(), "HEALTHY");
        assert_eq!(ComponentStatus::Degraded.to_string(), "DEGRADED");
        assert_eq!(ComponentStatus::Unhealthy.to_string(), "UNHEALTHY");
        assert_eq!(ComponentStatus::Critical.to_string(), "CRITICAL");
        assert_eq!(ComponentStatus::Unknown.to_string(), "UNKNOWN");
    }
}
993}