rexis_rag/observability/
health.rs

1//! # Health Monitoring System
2//!
3//! Comprehensive health checking with component status monitoring,
4//! dependency verification, and automated health reporting.
5
6use crate::{RragError, RragResult};
7use chrono::{DateTime, Duration, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
/// Health monitoring configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthConfig {
    /// Master on/off switch. NOTE(review): not consulted anywhere in this
    /// file — presumably honored by the caller that constructs the monitor;
    /// TODO confirm.
    pub enabled: bool,
    /// Cadence of the background monitoring loop, in seconds.
    pub check_interval_seconds: u64,
    /// Per-check timeout, in seconds; a check exceeding it is reported Unhealthy.
    pub timeout_seconds: u64,
    /// Number of back-to-back failures that triggers an alert.
    pub max_consecutive_failures: u32,
    /// Successes needed before a component is considered recovered.
    /// NOTE(review): not read anywhere in this file — TODO confirm usage.
    pub recovery_threshold: u32,
    /// Enables extended per-component detail collection (not read here).
    pub enable_detailed_checks: bool,
    /// Enables dependency verification (not read here).
    pub enable_dependency_checks: bool,
    /// Declarative custom checks (HTTP, DB, filesystem, …).
    pub custom_checks: Vec<CustomHealthCheckConfig>,
}
25
26impl Default for HealthConfig {
27    fn default() -> Self {
28        Self {
29            enabled: true,
30            check_interval_seconds: 30,
31            timeout_seconds: 10,
32            max_consecutive_failures: 3,
33            recovery_threshold: 2,
34            enable_detailed_checks: true,
35            enable_dependency_checks: true,
36            custom_checks: Vec::new(),
37        }
38    }
39}
40
/// Declarative configuration for one user-supplied health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomHealthCheckConfig {
    /// Unique component name for the check.
    pub name: String,
    /// Human-readable description.
    pub description: String,
    /// Which built-in check mechanism to use.
    pub check_type: CustomCheckType,
    /// Free-form key/value settings interpreted per `check_type`.
    pub config: HashMap<String, String>,
    /// Whether failure of this check is considered critical.
    pub critical: bool,
}
49
/// Kinds of custom health checks that can be configured declaratively.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum CustomCheckType {
    /// Probe an HTTP endpoint and compare the status code.
    HttpEndpoint,
    /// Verify a database connection can be established.
    DatabaseConnection,
    /// Check filesystem availability/permissions.
    FileSystemCheck,
    /// Verify basic network reachability.
    NetworkConnectivity,
    /// Run a user-supplied script.
    CustomScript,
}
58
/// Component status levels
///
/// Ordered by severity: the derived `Ord` compares discriminants, so a
/// *larger* value means a *worse* status. Note that `Unknown` (5) ranks
/// above `Critical` (4), so an unreported component dominates the overall
/// status computed in `HealthMonitor::get_health_report`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ComponentStatus {
    Healthy = 1,
    Degraded = 2,
    Unhealthy = 3,
    Critical = 4,
    Unknown = 5,
}
68
69impl std::fmt::Display for ComponentStatus {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        match self {
72            Self::Healthy => write!(f, "HEALTHY"),
73            Self::Degraded => write!(f, "DEGRADED"),
74            Self::Unhealthy => write!(f, "UNHEALTHY"),
75            Self::Critical => write!(f, "CRITICAL"),
76            Self::Unknown => write!(f, "UNKNOWN"),
77        }
78    }
79}
80
/// Individual service health information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceHealth {
    /// Name of the monitored component.
    pub component_name: String,
    /// Most recent status observed for the component.
    pub status: ComponentStatus,
    /// When this record's check ran.
    pub last_check: DateTime<Utc>,
    /// Last time the component was observed Healthy, if ever.
    pub last_healthy: Option<DateTime<Utc>>,
    /// Run of non-Healthy checks ending at this one (0 if currently healthy).
    pub consecutive_failures: u32,
    /// Run of Healthy checks ending at this one (0 if currently failing).
    pub consecutive_successes: u32,
    /// How long the health probe took, in milliseconds, if measured.
    pub response_time_ms: Option<f64>,
    /// Error description when the check failed.
    pub error_message: Option<String>,
    /// Free-form per-check details (status codes, URLs, notes, …).
    pub details: HashMap<String, serde_json::Value>,
    /// Names of components this one depends on.
    pub dependencies: Vec<String>,
    /// Component uptime in seconds, if reported.
    pub uptime_seconds: Option<i64>,
}
96
97impl ServiceHealth {
98    pub fn new(component_name: impl Into<String>) -> Self {
99        Self {
100            component_name: component_name.into(),
101            status: ComponentStatus::Unknown,
102            last_check: Utc::now(),
103            last_healthy: None,
104            consecutive_failures: 0,
105            consecutive_successes: 0,
106            response_time_ms: None,
107            error_message: None,
108            details: HashMap::new(),
109            dependencies: Vec::new(),
110            uptime_seconds: None,
111        }
112    }
113
114    pub fn with_status(mut self, status: ComponentStatus) -> Self {
115        self.status = status;
116        if status == ComponentStatus::Healthy {
117            self.last_healthy = Some(self.last_check);
118        }
119        self
120    }
121
122    pub fn with_response_time(mut self, response_time_ms: f64) -> Self {
123        self.response_time_ms = Some(response_time_ms);
124        self
125    }
126
127    pub fn with_error(mut self, error: impl Into<String>) -> Self {
128        self.error_message = Some(error.into());
129        self
130    }
131
132    pub fn with_detail(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
133        self.details.insert(key.into(), value);
134        self
135    }
136
137    pub fn with_dependencies(mut self, dependencies: Vec<String>) -> Self {
138        self.dependencies = dependencies;
139        self
140    }
141
142    pub fn with_uptime(mut self, uptime_seconds: i64) -> Self {
143        self.uptime_seconds = Some(uptime_seconds);
144        self
145    }
146}
147
/// Complete health report
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
    /// Worst status across all services (per `ComponentStatus` ordering).
    pub overall_status: ComponentStatus,
    /// When the report was generated.
    pub timestamp: DateTime<Utc>,
    /// Latest health record per component name.
    pub services: HashMap<String, ServiceHealth>,
    /// Per-dependency status. NOTE(review): always empty today — see
    /// `HealthMonitor::get_health_report`, which inserts an empty map.
    pub dependencies_status: HashMap<String, ComponentStatus>,
    /// Process/host information at report time.
    pub system_info: SystemInfo,
    /// Full alert list (resolved and unresolved) at report time.
    pub alerts: Vec<HealthAlert>,
}
158
/// Host and process information included in health reports.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// Seconds since the `HealthMonitor` was constructed.
    pub uptime_seconds: i64,
    /// Crate version (from `CARGO_PKG_VERSION`).
    pub version: String,
    /// Deployment environment. NOTE(review): hard-coded to "production" in
    /// `get_system_info` — should likely come from configuration.
    pub environment: String,
    /// Machine hostname, or "unknown" if it cannot be resolved.
    pub hostname: String,
    /// Total system memory in MB. Currently never populated (always `None`).
    pub total_memory_mb: Option<f64>,
    /// Available memory in MB. Currently never populated (always `None`).
    pub available_memory_mb: Option<f64>,
    /// Number of logical CPUs.
    pub cpu_count: Option<u32>,
    /// System load average. Currently never populated (always `None`).
    pub load_average: Option<f64>,
}
170
/// An alert raised when a component crosses the consecutive-failure threshold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthAlert {
    /// Component the alert concerns.
    pub component: String,
    /// Status of the component when the alert fired.
    pub severity: ComponentStatus,
    /// Human-readable description of the condition.
    pub message: String,
    /// When the alert was raised; also used as its identity in
    /// `acknowledge_alert` together with `component`.
    pub timestamp: DateTime<Utc>,
    /// Set to true by `acknowledge_alert`; never cleared automatically.
    pub resolved: bool,
}
179
/// Health check trait for components
#[async_trait::async_trait]
pub trait HealthChecker: Send + Sync {
    /// Probes the component and returns its current health record.
    /// Implementations in this file fold probe failures into an `Unhealthy`
    /// record rather than returning `Err`.
    async fn check_health(&self) -> RragResult<ServiceHealth>;
    /// Stable name used as the key in the monitor's maps.
    fn component_name(&self) -> &str;
    /// Names of components this one depends on; defaults to none.
    fn dependencies(&self) -> Vec<String> {
        Vec::new()
    }
    /// Whether a failure of this component is critical; defaults to false.
    fn is_critical(&self) -> bool {
        false
    }
}
192
/// Basic component health checker
///
/// Wraps a synchronous closure that returns the component's status; the
/// closure runs inline inside the async `check_health` call.
pub struct BasicHealthChecker {
    // Component name reported to the monitor.
    name: String,
    // Whether failures of this component are critical.
    is_critical: bool,
    // The probe itself; shared so the checker could be cloned cheaply.
    check_fn: Arc<dyn Fn() -> RragResult<ComponentStatus> + Send + Sync>,
}
199
200impl BasicHealthChecker {
201    pub fn new<F>(name: impl Into<String>, check_fn: F) -> Self
202    where
203        F: Fn() -> RragResult<ComponentStatus> + Send + Sync + 'static,
204    {
205        Self {
206            name: name.into(),
207            is_critical: false,
208            check_fn: Arc::new(check_fn),
209        }
210    }
211
212    pub fn with_critical(mut self, critical: bool) -> Self {
213        self.is_critical = critical;
214        self
215    }
216}
217
218#[async_trait::async_trait]
219impl HealthChecker for BasicHealthChecker {
220    async fn check_health(&self) -> RragResult<ServiceHealth> {
221        let start_time = std::time::Instant::now();
222
223        match (self.check_fn)() {
224            Ok(status) => {
225                let response_time = start_time.elapsed().as_millis() as f64;
226                Ok(ServiceHealth::new(&self.name)
227                    .with_status(status)
228                    .with_response_time(response_time))
229            }
230            Err(e) => {
231                let response_time = start_time.elapsed().as_millis() as f64;
232                Ok(ServiceHealth::new(&self.name)
233                    .with_status(ComponentStatus::Unhealthy)
234                    .with_response_time(response_time)
235                    .with_error(e.to_string()))
236            }
237        }
238    }
239
240    fn component_name(&self) -> &str {
241        &self.name
242    }
243
244    fn is_critical(&self) -> bool {
245        self.is_critical
246    }
247}
248
/// HTTP endpoint health checker
pub struct HttpHealthChecker {
    // Component name reported to the monitor.
    name: String,
    // Endpoint probed with a GET request.
    url: String,
    // Status code treated as Healthy; anything else is Degraded.
    expected_status: u16,
    // Request timeout (chrono duration, converted to std at call time).
    timeout: Duration,
    // Reused HTTP client; only present when the "http" feature is on.
    #[cfg(feature = "http")]
    client: reqwest::Client,
    // Whether failures of this endpoint are critical.
    is_critical: bool,
}
259
260impl HttpHealthChecker {
261    pub fn new(name: impl Into<String>, url: impl Into<String>) -> Self {
262        Self {
263            name: name.into(),
264            url: url.into(),
265            expected_status: 200,
266            timeout: Duration::seconds(10),
267            #[cfg(feature = "http")]
268            client: reqwest::Client::new(),
269            is_critical: false,
270        }
271    }
272
273    pub fn with_expected_status(mut self, status: u16) -> Self {
274        self.expected_status = status;
275        self
276    }
277
278    pub fn with_timeout(mut self, timeout: Duration) -> Self {
279        self.timeout = timeout;
280        self
281    }
282
283    pub fn with_critical(mut self, critical: bool) -> Self {
284        self.is_critical = critical;
285        self
286    }
287}
288
#[async_trait::async_trait]
impl HealthChecker for HttpHealthChecker {
    /// Probes the endpoint with a GET request under the configured timeout.
    /// Matching status code -> Healthy; other status -> Degraded; transport
    /// error or timeout -> Unhealthy. All outcomes return `Ok(..)`.
    async fn check_health(&self) -> RragResult<ServiceHealth> {
        #[cfg(feature = "http")]
        {
            let start_time = std::time::Instant::now();

            // Convert the chrono timeout to a std duration for tokio.
            // NOTE(review): a negative chrono Duration would wrap via `as u64`
            // into an enormous timeout — consider clamping at 0.
            let timeout_duration =
                std::time::Duration::from_millis(self.timeout.num_milliseconds() as u64);

            match tokio::time::timeout(timeout_duration, self.client.get(&self.url).send()).await {
                // Got a response: compare status code against the expectation.
                Ok(Ok(response)) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    let status_code = response.status().as_u16();

                    let status = if status_code == self.expected_status {
                        ComponentStatus::Healthy
                    } else {
                        // Reachable but wrong code: degraded, not unhealthy.
                        ComponentStatus::Degraded
                    };

                    Ok(ServiceHealth::new(&self.name)
                        .with_status(status)
                        .with_response_time(response_time)
                        .with_detail("status_code", serde_json::json!(status_code))
                        .with_detail("url", serde_json::json!(self.url)))
                }
                // Transport-level failure (DNS, connect, TLS, ...).
                Ok(Err(e)) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    Ok(ServiceHealth::new(&self.name)
                        .with_status(ComponentStatus::Unhealthy)
                        .with_response_time(response_time)
                        .with_error(e.to_string())
                        .with_detail("url", serde_json::json!(self.url)))
                }
                // Deadline elapsed before the request completed.
                Err(_) => {
                    let response_time = start_time.elapsed().as_millis() as f64;
                    Ok(ServiceHealth::new(&self.name)
                        .with_status(ComponentStatus::Unhealthy)
                        .with_response_time(response_time)
                        .with_error("Request timeout")
                        .with_detail(
                            "timeout_ms",
                            serde_json::json!(self.timeout.num_milliseconds()),
                        )
                        .with_detail("url", serde_json::json!(self.url)))
                }
            }
        }
        #[cfg(not(feature = "http"))]
        {
            // Without HTTP feature, return a placeholder healthy status
            // NOTE(review): reporting Healthy without probing masks real
            // outages; `ComponentStatus::Unknown` would be more honest here —
            // confirm downstream expectations before changing.
            Ok(ServiceHealth::new(&self.name)
                .with_status(ComponentStatus::Healthy)
                .with_response_time(0.0)
                .with_detail("note", serde_json::json!("HTTP feature disabled"))
                .with_detail("url", serde_json::json!(self.url)))
        }
    }

    fn component_name(&self) -> &str {
        &self.name
    }

    fn is_critical(&self) -> bool {
        self.is_critical
    }
}
357
/// Database connection health checker
///
/// NOTE(review): the current `check_health` is a simulation (see the impl
/// below) — `connection_string` is stored but never used to connect.
pub struct DatabaseHealthChecker {
    // Component name reported to the monitor.
    name: String,
    // Connection string; masked in health details, unused by the simulation.
    connection_string: String,
    // Connection timeout budget.
    timeout: Duration,
    // Databases default to critical (see `new`).
    is_critical: bool,
}
365
366impl DatabaseHealthChecker {
367    pub fn new(name: impl Into<String>, connection_string: impl Into<String>) -> Self {
368        Self {
369            name: name.into(),
370            connection_string: connection_string.into(),
371            timeout: Duration::seconds(5),
372            is_critical: true,
373        }
374    }
375
376    pub fn with_timeout(mut self, timeout: Duration) -> Self {
377        self.timeout = timeout;
378        self
379    }
380
381    pub fn with_critical(mut self, critical: bool) -> Self {
382        self.is_critical = critical;
383        self
384    }
385}
386
#[async_trait::async_trait]
impl HealthChecker for DatabaseHealthChecker {
    /// Simulated probe: sleeps 10 ms, then reports Healthy ~90% of the time
    /// using `rand`. The stored connection string and timeout are NOT used to
    /// make a real connection attempt yet.
    async fn check_health(&self) -> RragResult<ServiceHealth> {
        let start_time = std::time::Instant::now();

        // In a real implementation, this would attempt to connect to the database
        // For now, we'll simulate the check
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;

        let response_time = start_time.elapsed().as_millis() as f64;

        // Simulate occasional connection issues
        // (random draw > 0.1 -> healthy, i.e. ~10% simulated failure rate).
        let status = if rand::random::<f64>() > 0.1 {
            ComponentStatus::Healthy
        } else {
            ComponentStatus::Unhealthy
        };

        // Never expose the raw connection string in health output.
        let mut health = ServiceHealth::new(&self.name)
            .with_status(status)
            .with_response_time(response_time)
            .with_detail("connection_string", serde_json::json!("***masked***"));

        if status == ComponentStatus::Unhealthy {
            health = health.with_error("Connection failed");
        }

        Ok(health)
    }

    fn component_name(&self) -> &str {
        &self.name
    }

    fn is_critical(&self) -> bool {
        self.is_critical
    }
}
425
/// Main health monitoring system
///
/// Owns the registered checkers, runs them on a background tokio task, keeps
/// a bounded per-component history, and raises alerts on repeated failures.
/// All shared state is behind `Arc<RwLock<..>>` so the spawned loop and the
/// public `&self` methods can access it concurrently.
pub struct HealthMonitor {
    config: HealthConfig,
    // Registered checkers, keyed by component name.
    checkers: Arc<RwLock<HashMap<String, Box<dyn HealthChecker>>>>,
    // Per-component history, capped at the last 100 records.
    health_history: Arc<RwLock<HashMap<String, Vec<ServiceHealth>>>>,
    // Raised alerts, capped at the last 1000 entries.
    alerts: Arc<RwLock<Vec<HealthAlert>>>,
    // Used to compute `SystemInfo::uptime_seconds`.
    system_start_time: DateTime<Utc>,
    // Handle of the spawned monitoring loop, if started.
    monitoring_handle: Arc<RwLock<Option<tokio::task::JoinHandle<()>>>>,
    // Flag the loop polls each tick; cleared by `stop`.
    is_running: Arc<RwLock<bool>>,
}
436
impl HealthMonitor {
    /// Creates an idle monitor with no checkers registered; call `start` to
    /// launch the background loop. (Async and `RragResult` for interface
    /// symmetry — the body itself cannot fail.)
    pub async fn new(config: HealthConfig) -> RragResult<Self> {
        Ok(Self {
            config,
            checkers: Arc::new(RwLock::new(HashMap::new())),
            health_history: Arc::new(RwLock::new(HashMap::new())),
            alerts: Arc::new(RwLock::new(Vec::new())),
            system_start_time: Utc::now(),
            monitoring_handle: Arc::new(RwLock::new(None)),
            is_running: Arc::new(RwLock::new(false)),
        })
    }

    /// Starts the background monitoring loop; errors if already running.
    /// The `is_running` write guard is held across the spawn, so the new
    /// task's first `is_running.read()` blocks until the flag is set below.
    pub async fn start(&self) -> RragResult<()> {
        let mut running = self.is_running.write().await;
        if *running {
            return Err(RragError::config(
                "health_monitor",
                "stopped",
                "already running",
            ));
        }

        let handle = self.start_monitoring_loop().await?;
        {
            let mut handle_guard = self.monitoring_handle.write().await;
            *handle_guard = Some(handle);
        }

        *running = true;
        tracing::info!("Health monitor started");
        Ok(())
    }

    /// Stops the loop by aborting its task and clearing the running flag.
    /// Idempotent: stopping an already-stopped monitor is a no-op.
    pub async fn stop(&self) -> RragResult<()> {
        let mut running = self.is_running.write().await;
        if !*running {
            return Ok(());
        }

        {
            let mut handle_guard = self.monitoring_handle.write().await;
            if let Some(handle) = handle_guard.take() {
                // Abort rather than join: a check may be mid-await.
                handle.abort();
            }
        }

        *running = false;
        tracing::info!("Health monitor stopped");
        Ok(())
    }

    /// Reports whether the monitor itself is running.
    /// NOTE(review): despite the name, this is the monitor's run state, not
    /// the aggregate health of monitored services (see `get_health_report`).
    pub async fn is_healthy(&self) -> bool {
        *self.is_running.read().await
    }

    /// Spawns the periodic check loop. Each tick it snapshots the checker
    /// names, re-acquires the map per checker, runs the check under the
    /// configured timeout, updates consecutive-success/failure counters from
    /// the previous record, caps history at 100 entries, and raises an alert
    /// once failures reach `max_consecutive_failures`.
    async fn start_monitoring_loop(&self) -> RragResult<tokio::task::JoinHandle<()>> {
        let config = self.config.clone();
        let checkers = self.checkers.clone();
        let health_history = self.health_history.clone();
        let alerts = self.alerts.clone();
        let is_running = self.is_running.clone();

        let handle = tokio::spawn(async move {
            let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
                config.check_interval_seconds,
            ));

            while *is_running.read().await {
                interval.tick().await;

                // Snapshot names first so the map lock is not held while
                // iterating; checkers added/removed mid-tick are picked up
                // (or skipped) naturally by the per-name lookup below.
                let checker_map = checkers.read().await;
                let checker_names: Vec<String> = checker_map.keys().cloned().collect();
                drop(checker_map);

                for checker_name in checker_names {
                    // NOTE(review): this read lock is held across the check's
                    // await below, so `add_checker`/`remove_checker` can stall
                    // for up to `timeout_seconds` per checker.
                    let checker_map = checkers.read().await;
                    if let Some(checker) = checker_map.get(&checker_name) {
                        let timeout_duration =
                            std::time::Duration::from_secs(config.timeout_seconds);

                        let health_result =
                            tokio::time::timeout(timeout_duration, checker.check_health()).await;

                        // Fold both checker errors and timeouts into an
                        // Unhealthy record so history stays continuous.
                        let mut service_health = match health_result {
                            Ok(Ok(health)) => health,
                            Ok(Err(e)) => ServiceHealth::new(&checker_name)
                                .with_status(ComponentStatus::Unhealthy)
                                .with_error(e.to_string()),
                            Err(_) => ServiceHealth::new(&checker_name)
                                .with_status(ComponentStatus::Unhealthy)
                                .with_error("Health check timeout"),
                        };

                        // Update consecutive counters
                        let mut history = health_history.write().await;
                        let component_history =
                            history.entry(checker_name.clone()).or_insert_with(Vec::new);

                        // Any non-Healthy status (including Degraded/Unknown)
                        // counts as a failure for streak purposes.
                        if let Some(last_health) = component_history.last() {
                            if service_health.status == ComponentStatus::Healthy {
                                if last_health.status == ComponentStatus::Healthy {
                                    service_health.consecutive_successes =
                                        last_health.consecutive_successes + 1;
                                } else {
                                    service_health.consecutive_successes = 1;
                                }
                                service_health.consecutive_failures = 0;
                            } else {
                                if last_health.status != ComponentStatus::Healthy {
                                    service_health.consecutive_failures =
                                        last_health.consecutive_failures + 1;
                                } else {
                                    service_health.consecutive_failures = 1;
                                }
                                service_health.consecutive_successes = 0;
                            }
                        } else {
                            // First-ever record for this component.
                            if service_health.status == ComponentStatus::Healthy {
                                service_health.consecutive_successes = 1;
                            } else {
                                service_health.consecutive_failures = 1;
                            }
                        }

                        component_history.push(service_health.clone());

                        // Keep only recent history (last 100 checks)
                        if component_history.len() > 100 {
                            component_history.drain(0..component_history.len() - 100);
                        }

                        // Generate alerts for significant status changes
                        // NOTE(review): fires on EVERY tick at/above the
                        // threshold, so a persistently failing component adds
                        // one alert per interval until it recovers.
                        if service_health.consecutive_failures >= config.max_consecutive_failures {
                            let alert = HealthAlert {
                                component: checker_name.clone(),
                                severity: service_health.status,
                                message: format!(
                                    "Component {} has failed {} consecutive health checks",
                                    checker_name, service_health.consecutive_failures
                                ),
                                timestamp: Utc::now(),
                                resolved: false,
                            };

                            let mut alert_list = alerts.write().await;
                            alert_list.push(alert);

                            // Keep only recent alerts
                            let alert_list_len = alert_list.len();
                            if alert_list_len > 1000 {
                                alert_list.drain(0..alert_list_len - 1000);
                            }
                        }
                    }
                    drop(checker_map);
                }
            }
        });

        Ok(handle)
    }

    /// Registers a checker, keyed by its `component_name`; replaces any
    /// existing checker with the same name.
    pub async fn add_checker(&self, checker: Box<dyn HealthChecker>) -> RragResult<()> {
        let name = checker.component_name().to_string();
        let mut checkers = self.checkers.write().await;
        checkers.insert(name, checker);
        Ok(())
    }

    /// Unregisters a checker and discards its accumulated history.
    pub async fn remove_checker(&self, name: &str) -> RragResult<()> {
        let mut checkers = self.checkers.write().await;
        checkers.remove(name);

        // Also remove health history for this component
        let mut history = self.health_history.write().await;
        history.remove(name);

        Ok(())
    }

    /// Builds a snapshot report from each component's latest record.
    /// Overall status is the worst individual status per the enum's `Ord`
    /// (so one `Unknown` component dominates everything, including Critical);
    /// with no services it defaults to `Healthy`.
    pub async fn get_health_report(&self) -> HealthReport {
        let health_history = self.health_history.read().await;
        let alerts = self.alerts.read().await;

        let mut services = HashMap::new();
        let mut overall_status = ComponentStatus::Healthy;

        // Collect current health status for each service
        for (component_name, history) in health_history.iter() {
            if let Some(latest_health) = history.last() {
                services.insert(component_name.clone(), latest_health.clone());

                // Determine overall system status
                if latest_health.status > overall_status {
                    overall_status = latest_health.status;
                }
            }
        }

        let system_info = self.get_system_info().await;

        HealthReport {
            overall_status,
            timestamp: Utc::now(),
            services,
            dependencies_status: HashMap::new(), // Would be populated with dependency checks
            system_info,
            alerts: alerts.clone(),
        }
    }

    /// Latest record for one component, or `None` if it has no history.
    pub async fn get_component_health(&self, component_name: &str) -> Option<ServiceHealth> {
        let history = self.health_history.read().await;
        history.get(component_name)?.last().cloned()
    }

    /// Most recent `limit` records for a component (all of them when `limit`
    /// is `None`); empty when the component is unknown.
    pub async fn get_component_history(
        &self,
        component_name: &str,
        limit: Option<usize>,
    ) -> Vec<ServiceHealth> {
        let history = self.health_history.read().await;
        if let Some(component_history) = history.get(component_name) {
            let limit = limit.unwrap_or(component_history.len());
            let start_index = component_history.len().saturating_sub(limit);
            component_history[start_index..].to_vec()
        } else {
            Vec::new()
        }
    }

    /// Alerts, optionally filtered by resolved state (`None` returns all).
    pub async fn get_alerts(&self, resolved: Option<bool>) -> Vec<HealthAlert> {
        let alerts = self.alerts.read().await;
        if let Some(resolved_filter) = resolved {
            alerts
                .iter()
                .filter(|alert| alert.resolved == resolved_filter)
                .cloned()
                .collect()
        } else {
            alerts.clone()
        }
    }

    /// Marks as resolved the first alert matching (component, timestamp).
    /// Silently succeeds when no alert matches.
    pub async fn acknowledge_alert(
        &self,
        component: &str,
        timestamp: DateTime<Utc>,
    ) -> RragResult<()> {
        let mut alerts = self.alerts.write().await;
        if let Some(alert) = alerts
            .iter_mut()
            .find(|a| a.component == component && a.timestamp == timestamp)
        {
            alert.resolved = true;
        }
        Ok(())
    }

    /// Collects process/host info. Uptime is measured from monitor creation;
    /// memory/load fields are left `None` and environment is hard-coded.
    async fn get_system_info(&self) -> SystemInfo {
        let uptime = (Utc::now() - self.system_start_time).num_seconds();

        SystemInfo {
            uptime_seconds: uptime,
            version: env!("CARGO_PKG_VERSION").to_string(),
            environment: "production".to_string(),
            hostname: hostname::get()
                .unwrap_or_else(|_| "unknown".into())
                .to_string_lossy()
                .to_string(),
            total_memory_mb: None, // Would be populated with actual system info
            available_memory_mb: None,
            cpu_count: Some(num_cpus::get() as u32),
            load_average: None,
        }
    }

    /// Runs one named checker immediately (outside the loop cadence) under
    /// the configured timeout. The result is returned to the caller only —
    /// it is NOT recorded into history or counters. Errors if the component
    /// is not registered.
    pub async fn force_health_check(&self, component_name: &str) -> RragResult<ServiceHealth> {
        let checkers = self.checkers.read().await;
        let checker = checkers.get(component_name).ok_or_else(|| {
            RragError::agent(
                "health_monitor",
                format!("Component not found: {}", component_name),
            )
        })?;

        let timeout_duration = std::time::Duration::from_secs(self.config.timeout_seconds);

        let health_result = tokio::time::timeout(timeout_duration, checker.check_health()).await;

        match health_result {
            Ok(Ok(health)) => Ok(health),
            Ok(Err(e)) => Ok(ServiceHealth::new(component_name)
                .with_status(ComponentStatus::Unhealthy)
                .with_error(e.to_string())),
            Err(_) => Ok(ServiceHealth::new(component_name)
                .with_status(ComponentStatus::Unhealthy)
                .with_error("Health check timeout")),
        }
    }

    /// Condenses a full report into per-status counts plus the number of
    /// unresolved alerts.
    pub async fn get_health_summary(&self) -> HealthSummary {
        let report = self.get_health_report().await;

        let total_services = report.services.len();
        let healthy_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Healthy)
            .count();
        let degraded_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Degraded)
            .count();
        let unhealthy_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Unhealthy)
            .count();
        let critical_services = report
            .services
            .values()
            .filter(|s| s.status == ComponentStatus::Critical)
            .count();

        let active_alerts = report.alerts.iter().filter(|a| !a.resolved).count();

        HealthSummary {
            overall_status: report.overall_status,
            total_services,
            healthy_services,
            degraded_services,
            unhealthy_services,
            critical_services,
            active_alerts,
            uptime_seconds: report.system_info.uptime_seconds,
            last_check: report.timestamp,
        }
    }
}
779
/// Condensed view of a `HealthReport`: per-status service counts plus
/// unresolved-alert count. Note the counts omit services whose latest status
/// is `Unknown`, so the per-status fields may not sum to `total_services`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthSummary {
    /// Worst individual status (copied from the report).
    pub overall_status: ComponentStatus,
    /// Number of services with any history.
    pub total_services: usize,
    pub healthy_services: usize,
    pub degraded_services: usize,
    pub unhealthy_services: usize,
    pub critical_services: usize,
    /// Alerts not yet acknowledged.
    pub active_alerts: usize,
    /// Seconds since the monitor was constructed.
    pub uptime_seconds: i64,
    /// Timestamp of the underlying report.
    pub last_check: DateTime<Utc>,
}
792
793#[cfg(test)]
794mod tests {
795    use super::*;
796
797    #[tokio::test]
798    async fn test_service_health_creation() {
799        let health = ServiceHealth::new("test_service")
800            .with_status(ComponentStatus::Healthy)
801            .with_response_time(150.5)
802            .with_detail("version", serde_json::json!("1.0.0"))
803            .with_dependencies(vec!["database".to_string(), "cache".to_string()])
804            .with_uptime(3600);
805
806        assert_eq!(health.component_name, "test_service");
807        assert_eq!(health.status, ComponentStatus::Healthy);
808        assert_eq!(health.response_time_ms.unwrap(), 150.5);
809        assert_eq!(health.dependencies.len(), 2);
810        assert_eq!(health.uptime_seconds.unwrap(), 3600);
811        assert!(health.details.contains_key("version"));
812    }
813
814    #[tokio::test]
815    async fn test_basic_health_checker() {
816        let checker = BasicHealthChecker::new("test_component", || Ok(ComponentStatus::Healthy))
817            .with_critical(true);
818
819        assert_eq!(checker.component_name(), "test_component");
820        assert!(checker.is_critical());
821
822        let health = checker.check_health().await.unwrap();
823        assert_eq!(health.component_name, "test_component");
824        assert_eq!(health.status, ComponentStatus::Healthy);
825        assert!(health.response_time_ms.is_some());
826    }
827
828    #[tokio::test]
829    async fn test_basic_health_checker_failure() {
830        let checker = BasicHealthChecker::new("failing_component", || {
831            Err(RragError::agent("test", "Simulated failure"))
832        });
833
834        let health = checker.check_health().await.unwrap();
835        assert_eq!(health.status, ComponentStatus::Unhealthy);
836        assert!(health.error_message.is_some());
837        assert!(health
838            .error_message
839            .as_ref()
840            .unwrap()
841            .contains("Simulated failure"));
842    }
843
844    #[tokio::test]
845    async fn test_database_health_checker() {
846        let checker = DatabaseHealthChecker::new("test_db", "postgresql://localhost:5432/test")
847            .with_timeout(Duration::seconds(5))
848            .with_critical(true);
849
850        assert_eq!(checker.component_name(), "test_db");
851        assert!(checker.is_critical());
852
853        let health = checker.check_health().await.unwrap();
854        assert_eq!(health.component_name, "test_db");
855        assert!(health.response_time_ms.is_some());
856        assert!(health.details.contains_key("connection_string"));
857    }
858
859    #[tokio::test]
860    async fn test_health_monitor() {
861        let config = HealthConfig {
862            check_interval_seconds: 1, // Fast interval for testing
863            ..Default::default()
864        };
865
866        let mut monitor = HealthMonitor::new(config).await.unwrap();
867
868        // Add a test checker
869        let checker = BasicHealthChecker::new("test_service", || Ok(ComponentStatus::Healthy));
870        monitor.add_checker(Box::new(checker)).await.unwrap();
871
872        assert!(!monitor.is_healthy().await);
873
874        monitor.start().await.unwrap();
875        assert!(monitor.is_healthy().await);
876
877        // Wait for a health check to occur
878        tokio::time::sleep(tokio::time::Duration::from_millis(1100)).await;
879
880        let report = monitor.get_health_report().await;
881        assert_eq!(report.overall_status, ComponentStatus::Healthy);
882        assert!(report.services.contains_key("test_service"));
883
884        let service_health = monitor.get_component_health("test_service").await;
885        assert!(service_health.is_some());
886        assert_eq!(service_health.unwrap().status, ComponentStatus::Healthy);
887
888        monitor.stop().await.unwrap();
889        assert!(!monitor.is_healthy().await);
890    }
891
892    #[tokio::test]
893    async fn test_health_monitor_consecutive_failures() {
894        let config = HealthConfig {
895            check_interval_seconds: 1,
896            max_consecutive_failures: 2,
897            ..Default::default()
898        };
899
900        let mut monitor = HealthMonitor::new(config).await.unwrap();
901
902        // Add a failing checker
903        let checker = BasicHealthChecker::new("failing_service", || {
904            Err(RragError::agent("test", "Always fails"))
905        });
906        monitor.add_checker(Box::new(checker)).await.unwrap();
907
908        monitor.start().await.unwrap();
909
910        // Wait for multiple health checks to occur
911        tokio::time::sleep(tokio::time::Duration::from_millis(2500)).await;
912
913        let alerts = monitor.get_alerts(Some(false)).await; // Get unresolved alerts
914        assert!(!alerts.is_empty());
915
916        let service_health = monitor
917            .get_component_health("failing_service")
918            .await
919            .unwrap();
920        assert!(service_health.consecutive_failures >= 2);
921
922        monitor.stop().await.unwrap();
923    }
924
925    #[tokio::test]
926    async fn test_force_health_check() {
927        let config = HealthConfig::default();
928        let mut monitor = HealthMonitor::new(config).await.unwrap();
929
930        let checker = BasicHealthChecker::new("manual_test", || Ok(ComponentStatus::Degraded));
931        monitor.add_checker(Box::new(checker)).await.unwrap();
932
933        let health = monitor.force_health_check("manual_test").await.unwrap();
934        assert_eq!(health.component_name, "manual_test");
935        assert_eq!(health.status, ComponentStatus::Degraded);
936    }
937
938    #[tokio::test]
939    async fn test_health_summary() {
940        let config = HealthConfig::default();
941        let mut monitor = HealthMonitor::new(config).await.unwrap();
942
943        // Add multiple checkers with different statuses
944        let healthy_checker =
945            BasicHealthChecker::new("healthy_service", || Ok(ComponentStatus::Healthy));
946        let degraded_checker =
947            BasicHealthChecker::new("degraded_service", || Ok(ComponentStatus::Degraded));
948        let unhealthy_checker = BasicHealthChecker::new("unhealthy_service", || {
949            Err(RragError::agent("test", "Service down"))
950        });
951
952        monitor
953            .add_checker(Box::new(healthy_checker))
954            .await
955            .unwrap();
956        monitor
957            .add_checker(Box::new(degraded_checker))
958            .await
959            .unwrap();
960        monitor
961            .add_checker(Box::new(unhealthy_checker))
962            .await
963            .unwrap();
964
965        monitor.start().await.unwrap();
966
967        // Wait for health checks
968        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
969
970        let summary = monitor.get_health_summary().await;
971        assert_eq!(summary.total_services, 3);
972        assert!(summary.uptime_seconds >= 0);
973
974        monitor.stop().await.unwrap();
975    }
976
977    #[test]
978    fn test_component_status_ordering() {
979        assert!(ComponentStatus::Critical > ComponentStatus::Unhealthy);
980        assert!(ComponentStatus::Unhealthy > ComponentStatus::Degraded);
981        assert!(ComponentStatus::Degraded > ComponentStatus::Healthy);
982        assert!(ComponentStatus::Unknown > ComponentStatus::Critical);
983    }
984
985    #[test]
986    fn test_component_status_display() {
987        assert_eq!(ComponentStatus::Healthy.to_string(), "HEALTHY");
988        assert_eq!(ComponentStatus::Degraded.to_string(), "DEGRADED");
989        assert_eq!(ComponentStatus::Unhealthy.to_string(), "UNHEALTHY");
990        assert_eq!(ComponentStatus::Critical.to_string(), "CRITICAL");
991        assert_eq!(ComponentStatus::Unknown.to_string(), "UNKNOWN");
992    }
993}