
// auth_framework/deployment/health.rs

// Health monitoring system for production deployment
// Comprehensive health checks, metrics collection, and service monitoring

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use thiserror::Error;
use tokio::time::interval;

#[derive(Debug, Error)]
pub enum HealthError {
    #[error("Health check failed: {0}")]
    CheckFailed(String),
    #[error("Service unavailable: {0}")]
    ServiceUnavailable(String),
    #[error("Timeout error: {0}")]
    Timeout(String),
    #[error("Network error: {0}")]
    Network(String),
    #[error("Configuration error: {0}")]
    Configuration(String),
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Health check status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum HealthStatus {
    Healthy,
    Degraded,
    Unhealthy,
    Unknown,
}

/// Health check type
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthCheckType {
    Http,
    Database,
    Redis,
    FileSystem,
    Memory,
    Cpu,
    Disk,
    Custom(String),
}

/// Individual health check configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheck {
    pub name: String,
    pub check_type: HealthCheckType,
    pub endpoint: String,
    pub timeout: Duration,
    pub interval: Duration,
    pub retries: u32,
    pub enabled: bool,
    pub critical: bool,
    pub tags: HashMap<String, String>,
}

/// Health check result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    pub name: String,
    pub status: HealthStatus,
    pub message: String,
    pub response_time: Duration,
    pub timestamp: u64,
    pub metadata: HashMap<String, serde_json::Value>,
}

/// Service health status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceHealth {
    pub service_name: String,
    pub overall_status: HealthStatus,
    pub checks: Vec<HealthCheckResult>,
    pub uptime: Duration,
    pub last_updated: u64,
    pub version: String,
}

/// System metrics for health monitoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    pub cpu_usage: f64,
    pub memory_usage: f64,
    pub disk_usage: f64,
    pub network_io: NetworkIoMetrics,
    pub process_count: u32,
    pub load_average: LoadAverage,
    pub timestamp: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkIoMetrics {
    pub bytes_sent: u64,
    pub bytes_received: u64,
    pub packets_sent: u64,
    pub packets_received: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadAverage {
    pub one_minute: f64,
    pub five_minutes: f64,
    pub fifteen_minutes: f64,
}

/// Health monitoring configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMonitorConfig {
    pub enabled: bool,
    pub global_timeout: Duration,
    pub check_interval: Duration,
    pub unhealthy_threshold: u32,
    pub degraded_threshold: u32,
    pub metrics_retention: Duration,
    pub alert_on_failure: bool,
    pub alert_endpoints: Vec<String>,
}

/// Health monitor manager
pub struct HealthMonitor {
    config: HealthMonitorConfig,
    checks: Vec<HealthCheck>,
    results: HashMap<String, HealthCheckResult>,
    service_health: ServiceHealth,
    system_metrics: SystemMetrics,
    failure_counts: HashMap<String, u32>,
    sys: tokio::sync::Mutex<sysinfo::System>,
    /// Monitor creation time, used to compute service uptime.
    start_time: SystemTime,
}

impl HealthMonitor {
    /// Create new health monitor
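    ///
    /// # Example
    ///
    /// A minimal usage sketch; the module path is assumed from this file's
    /// location, and the endpoint value is illustrative.
    ///
    /// ```no_run
    /// use auth_framework::deployment::health::{
    ///     HealthCheck, HealthCheckType, HealthMonitor, HealthMonitorConfig,
    /// };
    ///
    /// let mut monitor = HealthMonitor::new(HealthMonitorConfig::default());
    /// monitor.add_check(HealthCheck {
    ///     name: "api".to_string(),
    ///     check_type: HealthCheckType::Http,
    ///     endpoint: "http://localhost:8080/health".to_string(),
    ///     critical: true,
    ///     ..Default::default()
    /// });
    /// ```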
    pub fn new(config: HealthMonitorConfig) -> Self {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default();

        Self {
            config,
            checks: Vec::new(),
            results: HashMap::new(),
            service_health: ServiceHealth {
                service_name: "authframework".to_string(),
                overall_status: HealthStatus::Unknown,
                checks: Vec::new(),
                uptime: Duration::from_secs(0),
                last_updated: now.as_secs(),
                version: env!("CARGO_PKG_VERSION").to_string(),
            },
            system_metrics: SystemMetrics {
                cpu_usage: 0.0,
                memory_usage: 0.0,
                disk_usage: 0.0,
                network_io: NetworkIoMetrics {
                    bytes_sent: 0,
                    bytes_received: 0,
                    packets_sent: 0,
                    packets_received: 0,
                },
                process_count: 0,
                load_average: LoadAverage {
                    one_minute: 0.0,
                    five_minutes: 0.0,
                    fifteen_minutes: 0.0,
                },
                timestamp: now.as_secs(),
            },
            failure_counts: HashMap::new(),
            sys: tokio::sync::Mutex::new(sysinfo::System::new_all()),
            start_time: SystemTime::now(),
        }
    }

    /// Add health check
    pub fn add_check(&mut self, check: HealthCheck) {
        self.checks.push(check);
    }

    /// Remove health check
    pub fn remove_check(&mut self, name: &str) {
        self.checks.retain(|check| check.name != name);
        self.results.remove(name);
        self.failure_counts.remove(name);
    }

    /// Start health monitoring
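    ///
    /// This loop runs until an error is returned or the task is cancelled (it
    /// returns `Ok` immediately only when monitoring is disabled), so callers
    /// typically spawn it onto a background task. A minimal sketch, assuming a
    /// Tokio runtime and this module's path:
    ///
    /// ```no_run
    /// # use auth_framework::deployment::health::{
    /// #     HealthCheck, HealthMonitor, HealthMonitorConfig,
    /// # };
    /// let mut monitor = HealthMonitor::new(HealthMonitorConfig::default());
    /// monitor.add_check(HealthCheck::default());
    /// tokio::spawn(async move {
    ///     if let Err(e) = monitor.start_monitoring().await {
    ///         tracing::error!("health monitor stopped: {e}");
    ///     }
    /// });
    /// ```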
    pub async fn start_monitoring(&mut self) -> Result<(), HealthError> {
        if !self.config.enabled {
            return Ok(());
        }

        // Start monitoring loop
        let mut interval = interval(self.config.check_interval);

        loop {
            interval.tick().await;

            // Run all health checks
            self.run_health_checks().await?;

            // Update system metrics
            self.update_system_metrics().await?;

            // Update overall service health
            self.update_service_health();

            // Check for alerts
            self.check_alerts().await?;
        }
    }

    /// Run all configured health checks
    async fn run_health_checks(&mut self) -> Result<(), HealthError> {
        for check in &self.checks {
            if !check.enabled {
                continue;
            }

            let result = self.run_single_check(check).await;
            self.results.insert(check.name.clone(), result.clone());

            // Update failure count
            match result.status {
                HealthStatus::Healthy => {
                    self.failure_counts.insert(check.name.clone(), 0);
                }
                _ => {
                    let count = self.failure_counts.get(&check.name).unwrap_or(&0) + 1;
                    self.failure_counts.insert(check.name.clone(), count);
                }
            }
        }

        Ok(())
    }

    /// Run single health check
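    ///
    /// Retries up to `check.retries` additional times with a linear backoff
    /// (100 ms times the attempt number) between attempts. The reported
    /// `response_time` spans all attempts, including backoff sleeps.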
    async fn run_single_check(&self, check: &HealthCheck) -> HealthCheckResult {
        let start_time = SystemTime::now();
        let mut retries = 0;
        let mut last_error = String::new();

        while retries <= check.retries {
            let result = match check.check_type {
                HealthCheckType::Http => self.check_http(&check.endpoint).await,
                HealthCheckType::Database => self.check_database(&check.endpoint).await,
                HealthCheckType::Redis => self.check_redis(&check.endpoint).await,
                HealthCheckType::FileSystem => self.check_filesystem(&check.endpoint).await,
                HealthCheckType::Memory => self.check_memory().await,
                HealthCheckType::Cpu => self.check_cpu().await,
                HealthCheckType::Disk => self.check_disk(&check.endpoint).await,
                HealthCheckType::Custom(ref custom_type) => {
                    self.check_custom(custom_type, &check.endpoint).await
                }
            };

            match result {
                Ok(status) => {
                    let response_time = start_time.elapsed().unwrap_or_default();
                    // Describe the actual outcome: checks can return
                    // Ok(Degraded) or Ok(Unhealthy), which is not a "pass".
                    let message = if status == HealthStatus::Healthy {
                        "Health check passed".to_string()
                    } else {
                        format!("Health check completed with status {:?}", status)
                    };
                    return HealthCheckResult {
                        name: check.name.clone(),
                        status,
                        message,
                        response_time,
                        timestamp: SystemTime::now()
                            .duration_since(UNIX_EPOCH)
                            .unwrap_or_default()
                            .as_secs(),
                        metadata: HashMap::new(),
                    };
                }
                Err(e) => {
                    last_error = e.to_string();
                    retries += 1;

                    if retries <= check.retries {
                        tokio::time::sleep(Duration::from_millis(100 * retries as u64)).await;
                    }
                }
            }
        }

        let response_time = start_time.elapsed().unwrap_or_default();
        HealthCheckResult {
            name: check.name.clone(),
            status: HealthStatus::Unhealthy,
            message: format!(
                "Health check failed after {} retries: {}",
                check.retries, last_error
            ),
            response_time,
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or_default()
                .as_secs(),
            metadata: HashMap::new(),
        }
    }

    /// Check HTTP endpoint health by issuing a HEAD request.
    ///
    /// Any response with a status below 500 counts as healthy: a 4xx still
    /// proves the server is up and answering.
    async fn check_http(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        if !endpoint.starts_with("http") {
            return Err(HealthError::CheckFailed(
                "Invalid HTTP endpoint: must start with http".to_string(),
            ));
        }

        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(5))
            .build()
            .map_err(|e| HealthError::Network(e.to_string()))?;

        match client.head(endpoint).send().await {
            Ok(response) => {
                let status = response.status().as_u16();
                if status < 500 {
                    Ok(HealthStatus::Healthy)
                } else {
                    Ok(HealthStatus::Unhealthy)
                }
            }
            Err(e) if e.is_connect() || e.is_timeout() => Ok(HealthStatus::Unhealthy),
            Err(e) => Err(HealthError::Network(e.to_string())),
        }
    }

    /// Check database connectivity by opening a TCP connection to the endpoint.
    ///
    /// `endpoint` should be `"host:port"` — e.g. `"localhost:5432"` for PostgreSQL
    /// or `"postgres://user:pass@localhost:5432/db"` (the host:port is extracted).
    async fn check_database(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        if endpoint.is_empty() {
            return Err(HealthError::CheckFailed(
                "Database endpoint not configured".to_string(),
            ));
        }
        let addr = extract_host_port(endpoint);
        tcp_connect_check(&addr).await
    }

    /// Check Redis connectivity by opening a TCP connection to the endpoint.
    ///
    /// `endpoint` should be `"host:port"` — e.g. `"localhost:6379"`.
    async fn check_redis(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        if endpoint.is_empty() {
            return Err(HealthError::CheckFailed(
                "Redis endpoint not configured".to_string(),
            ));
        }
        let addr = extract_host_port(endpoint);
        tcp_connect_check(&addr).await
    }

    /// Check filesystem health
    async fn check_filesystem(&self, path: &str) -> Result<HealthStatus, HealthError> {
        use std::path::Path;

        if Path::new(path).exists() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(format!(
                "Path does not exist: {}",
                path
            )))
        }
    }

    /// Check memory usage
    async fn check_memory(&self) -> Result<HealthStatus, HealthError> {
        let memory_usage = self.get_memory_usage().await?;

        if memory_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if memory_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check CPU usage
    async fn check_cpu(&self) -> Result<HealthStatus, HealthError> {
        let cpu_usage = self.get_cpu_usage().await?;

        if cpu_usage < 0.7 {
            Ok(HealthStatus::Healthy)
        } else if cpu_usage < 0.85 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check disk usage
    async fn check_disk(&self, path: &str) -> Result<HealthStatus, HealthError> {
        let disk_usage = self.get_disk_usage(path).await?;

        if disk_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if disk_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check custom health endpoint
    async fn check_custom(
        &self,
        _custom_type: &str,
        _endpoint: &str,
    ) -> Result<HealthStatus, HealthError> {
        // Custom check logic is not implemented yet; report Healthy as a
        // placeholder so unconfigured custom checks do not trip alerts.
        Ok(HealthStatus::Healthy)
    }

    /// Update system metrics
    async fn update_system_metrics(&mut self) -> Result<(), HealthError> {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default();

        self.system_metrics = SystemMetrics {
            cpu_usage: self.get_cpu_usage().await?,
            memory_usage: self.get_memory_usage().await?,
            disk_usage: self.get_disk_usage("/").await?,
            network_io: self.get_network_io().await?,
            process_count: self.get_process_count().await?,
            load_average: self.get_load_average().await?,
            timestamp: now.as_secs(),
        };

        Ok(())
    }

    /// Get CPU usage percentage.
    ///
    /// Note: sysinfo derives CPU usage from the delta between refreshes, so
    /// the first reading after startup may be 0 until a later refresh runs.
    async fn get_cpu_usage(&self) -> Result<f64, HealthError> {
        let mut sys = self.sys.lock().await;
        sys.refresh_cpu_usage();
        Ok(sys.global_cpu_usage() as f64 / 100.0)
    }

    /// Get memory usage percentage.
    async fn get_memory_usage(&self) -> Result<f64, HealthError> {
        let mut sys = self.sys.lock().await;
        sys.refresh_memory();
        let total = sys.total_memory();
        if total > 0 {
            Ok(sys.used_memory() as f64 / total as f64)
        } else {
            Ok(0.0)
        }
    }

    /// Get disk usage percentage for `path`.
    ///
    /// Returns a value in `[0.0, 1.0]` representing the fraction of disk space
    /// used.  Uses `sysinfo::Disks` to enumerate mounted file-systems and finds
    /// the one whose mount-point is the longest prefix of `path` (i.e. the most
    /// specific mount point).  Falls back to the first listed disk if no prefix
    /// match is found, and returns `0.0` only when no disks are available.
    async fn get_disk_usage(&self, path: &str) -> Result<f64, HealthError> {
        use sysinfo::Disks;
        let disks = Disks::new_with_refreshed_list();

        // Find the mount-point that is the longest prefix of `path`.
        let mut best: Option<(usize, f64)> = None; // (prefix_len, usage)
        for disk in disks.list() {
            let mount = disk.mount_point().to_string_lossy();
            let total = disk.total_space();
            if total == 0 {
                continue;
            }
            let available = disk.available_space();
            let usage = 1.0 - (available as f64 / total as f64);

            if path.starts_with(mount.as_ref()) {
                let len = mount.len();
                match best {
                    Some((prev_len, _)) if len > prev_len => best = Some((len, usage)),
                    None => best = Some((len, usage)),
                    _ => {}
                }
            }
        }

        if let Some((_, usage)) = best {
            return Ok(usage.clamp(0.0, 1.0));
        }

        // Fallback: use first available disk.
        if let Some(disk) = disks.list().first() {
            let total = disk.total_space();
            if total > 0 {
                let available = disk.available_space();
                let usage = 1.0 - (available as f64 / total as f64);
                tracing::debug!(
                    "No disk mount point matched '{}'; using first available disk",
                    path
                );
                return Ok(usage.clamp(0.0, 1.0));
            }
        }

        tracing::debug!("No disks found; reporting disk usage 0.0 for '{}'", path);
        Ok(0.0)
    }

    /// Get network I/O metrics, summed across all interfaces.
    async fn get_network_io(&self) -> Result<NetworkIoMetrics, HealthError> {
        let networks = sysinfo::Networks::new_with_refreshed_list();

        let mut bytes_recv = 0u64;
        let mut pkts_recv = 0u64;
        let mut bytes_sent = 0u64;
        let mut pkts_sent = 0u64;

        // Use the cumulative `total_*` counters rather than the per-refresh
        // deltas, which would need two refreshes spaced apart to be non-zero.
        for (_name, data) in &networks {
            bytes_recv += data.total_received();
            pkts_recv += data.total_packets_received();
            bytes_sent += data.total_transmitted();
            pkts_sent += data.total_packets_transmitted();
        }

        Ok(NetworkIoMetrics {
            bytes_sent,
            bytes_received: bytes_recv,
            packets_sent: pkts_sent,
            packets_received: pkts_recv,
        })
    }

    /// Get the number of running processes.
    async fn get_process_count(&self) -> Result<u32, HealthError> {
        let mut sys = self.sys.lock().await;
        sys.refresh_processes(sysinfo::ProcessesToUpdate::All, true);
        Ok(sys.processes().len() as u32)
    }

    /// Get load averages.
    async fn get_load_average(&self) -> Result<LoadAverage, HealthError> {
        let load = sysinfo::System::load_average();
        Ok(LoadAverage {
            one_minute: load.one,
            five_minutes: load.five,
            fifteen_minutes: load.fifteen,
        })
    }

    /// Update overall service health based on individual checks
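    ///
    /// Aggregation policy: any *critical* check that is Unhealthy marks the
    /// whole service Unhealthy; any other Unhealthy or Degraded check marks it
    /// Degraded; otherwise the service is Healthy when at least one check has
    /// reported, and Unknown when none have.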
    fn update_service_health(&mut self) {
        let mut healthy_count = 0;
        let mut degraded_count = 0;
        let mut unhealthy_count = 0;
        let mut critical_unhealthy = false;

        let check_results: Vec<HealthCheckResult> = self.results.values().cloned().collect();

        for result in &check_results {
            // Check if this is a critical check
            let is_critical = self
                .checks
                .iter()
                .find(|check| check.name == result.name)
                .map(|check| check.critical)
                .unwrap_or(false);

            match result.status {
                HealthStatus::Healthy => healthy_count += 1,
                HealthStatus::Degraded => degraded_count += 1,
                HealthStatus::Unhealthy => {
                    unhealthy_count += 1;
                    if is_critical {
                        critical_unhealthy = true;
                    }
                }
                HealthStatus::Unknown => {}
            }
        }

        // Determine overall status
        let overall_status = if critical_unhealthy {
            HealthStatus::Unhealthy
        } else if unhealthy_count > 0 || degraded_count > 0 {
            HealthStatus::Degraded
        } else if healthy_count > 0 {
            HealthStatus::Healthy
        } else {
            HealthStatus::Unknown
        };

        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default();

        self.service_health = ServiceHealth {
            service_name: self.service_health.service_name.clone(),
            overall_status,
            checks: check_results,
            // Measure uptime from monitor creation; deriving it from
            // `last_updated` would reset it on every check interval.
            uptime: self.start_time.elapsed().unwrap_or_default(),
            last_updated: now.as_secs(),
            version: self.service_health.version.clone(),
        };
    }

    /// Check for alert conditions
    async fn check_alerts(&self) -> Result<(), HealthError> {
        if !self.config.alert_on_failure {
            return Ok(());
        }

        // Check for unhealthy services
        if self.service_health.overall_status == HealthStatus::Unhealthy {
            self.send_alert("Service is unhealthy").await?;
        }

        // Check for high failure rates
        for (check_name, failure_count) in &self.failure_counts {
            if *failure_count >= self.config.unhealthy_threshold {
                self.send_alert(&format!(
                    "Health check '{}' has failed {} times",
                    check_name, failure_count
                ))
                .await?;
            }
        }

        Ok(())
    }

    /// Send alert to configured endpoints
    async fn send_alert(&self, message: &str) -> Result<(), HealthError> {
        for endpoint in &self.config.alert_endpoints {
            // Real HTTP dispatch to `endpoint` requires an HTTP client (e.g. reqwest).
            // Until that integration is added, log the alert through the structured
            // logging system so it is visible in production log streams.
            tracing::warn!(
                target: "health_alert",
                alert_endpoint = %endpoint,
                "HEALTH ALERT: {message}"
            );
        }
        Ok(())
    }

    /// Get current service health
    pub fn get_service_health(&self) -> &ServiceHealth {
        &self.service_health
    }

    /// Get current system metrics
    pub fn get_system_metrics(&self) -> &SystemMetrics {
        &self.system_metrics
    }

    /// Get health check results
    pub fn get_check_results(&self) -> &HashMap<String, HealthCheckResult> {
        &self.results
    }

    /// Get specific health check result
    pub fn get_check_result(&self, name: &str) -> Option<&HealthCheckResult> {
        self.results.get(name)
    }
}

impl Default for HealthMonitorConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            global_timeout: Duration::from_secs(30),
            check_interval: Duration::from_secs(30),
            unhealthy_threshold: 3,
            degraded_threshold: 2,
            metrics_retention: Duration::from_secs(24 * 3600), // 24 hours
            alert_on_failure: true,
            alert_endpoints: vec!["http://localhost:9093/api/v1/alerts".to_string()],
        }
    }
}

impl Default for HealthCheck {
    fn default() -> Self {
        Self {
            name: "default".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/health".to_string(),
            timeout: Duration::from_secs(10),
            interval: Duration::from_secs(30),
            retries: 3,
            enabled: true,
            critical: false,
            tags: HashMap::new(),
        }
    }
}

/// Extract a `"host:port"` address from an endpoint string.
///
/// Handles plain `"host:port"` strings as well as full URIs like
/// `"postgres://user:pass@host:port/db"` or `"redis://host:port"`.
fn extract_host_port(endpoint: &str) -> String {
    // If it looks like a URI, extract the authority component.
    if let Some(rest) = endpoint
        .strip_prefix("postgres://")
        .or_else(|| endpoint.strip_prefix("postgresql://"))
        .or_else(|| endpoint.strip_prefix("redis://"))
        .or_else(|| endpoint.strip_prefix("mysql://"))
        .or_else(|| endpoint.strip_prefix("mongodb://"))
        .or_else(|| endpoint.strip_prefix("http://"))
        .or_else(|| endpoint.strip_prefix("https://"))
    {
        // Drop any user-info (user:pass@)
        let after_auth = if let Some(at_pos) = rest.rfind('@') {
            &rest[at_pos + 1..]
        } else {
            rest
        };
        // Keep only the host:port portion (drop /path?query etc.)
        let host_port = after_auth.split('/').next().unwrap_or(after_auth);
        return host_port.to_string();
    }
    endpoint.to_string()
}

/// Attempt a TCP connection to `addr` with a 5-second timeout.
///
/// Returns [`HealthStatus::Healthy`] on success and [`HealthStatus::Unhealthy`]
/// when the connection fails or times out; connection failures are logged at
/// debug level rather than surfaced as errors.
async fn tcp_connect_check(addr: &str) -> Result<HealthStatus, HealthError> {
    match tokio::time::timeout(Duration::from_secs(5), tokio::net::TcpStream::connect(addr)).await {
        Ok(Ok(_stream)) => Ok(HealthStatus::Healthy),
        Ok(Err(e)) => {
            tracing::debug!("TCP health check to {} failed: {}", addr, e);
            Ok(HealthStatus::Unhealthy)
        }
        Err(_timeout) => {
            tracing::debug!("TCP health check to {} timed out", addr);
            Ok(HealthStatus::Unhealthy)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_health_monitor_creation() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        assert_eq!(monitor.service_health.service_name, "authframework");
        assert_eq!(monitor.service_health.overall_status, HealthStatus::Unknown);
    }

    #[test]
    fn test_add_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);
        assert_eq!(monitor.checks[0].name, "test-check");
    }

    #[test]
    fn test_remove_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);

        monitor.remove_check("test-check");
        assert_eq!(monitor.checks.len(), 0);
    }

    #[tokio::test]
    async fn test_http_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        // Invalid scheme: expect an error, not a health status.
        let result = monitor.check_http("/local/path").await;
        assert!(result.is_err(), "non-http URL should return Err");

        // Valid HTTP URL pointing at a server that is not running:
        // the real client returns HealthStatus::Unhealthy (connection refused),
        // not an error.
        let result = monitor.check_http("http://localhost:19999/health").await;
        assert!(
            result.is_ok(),
            "connection-refused should yield Ok(Unhealthy), not Err"
        );
        assert_eq!(
            result.unwrap(),
            HealthStatus::Unhealthy,
            "unreachable host should be Unhealthy"
        );
    }

    #[tokio::test]
    async fn test_filesystem_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_filesystem("/tmp").await;
        // This might fail on Windows, but demonstrates the concept
        let _ = result;
    }

    #[tokio::test]
    async fn test_disk_usage_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        // "/" on Unix, "C:\" on Windows — both are valid paths for the root disk.
        #[cfg(unix)]
        let path = "/";
        #[cfg(windows)]
        let path = "C:\\";

        let usage = monitor.get_disk_usage(path).await;
        assert!(
            usage.is_ok(),
            "get_disk_usage returned Err: {:?}",
            usage.err()
        );
        let value = usage.unwrap();
        assert!(
            (0.0..=1.0).contains(&value),
            "Disk usage {value} is out of [0.0, 1.0]"
        );
    }

    #[tokio::test]
    async fn test_memory_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_memory().await;
        assert!(result.is_ok());

        let status = result.unwrap();
        assert!(matches!(
            status,
            HealthStatus::Healthy | HealthStatus::Degraded | HealthStatus::Unhealthy
        ));
    }

    #[test]
    fn test_extract_host_port() {
        // Plain host:port — returned as-is.
        assert_eq!(extract_host_port("localhost:5432"), "localhost:5432");

        // PostgreSQL URI with credentials.
        assert_eq!(
            extract_host_port("postgres://user:pass@db.example.com:5432/mydb"),
            "db.example.com:5432"
        );

        // Redis URI without credentials.
        assert_eq!(
            extract_host_port("redis://cache.host:6379"),
            "cache.host:6379"
        );

        // Redis URI with path.
        assert_eq!(
            extract_host_port("redis://cache.host:6379/0"),
            "cache.host:6379"
        );

        // MySQL URI.
        assert_eq!(
            extract_host_port("mysql://root@localhost:3306/app"),
            "localhost:3306"
        );

        // Bare endpoint with no scheme.
        assert_eq!(extract_host_port("192.168.1.1:9200"), "192.168.1.1:9200");
    }
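
    #[tokio::test]
    async fn test_tcp_connect_check_refused() {
        // Sketch of a negative-path check, assuming nothing listens on port 1
        // of the loopback interface (true in typical environments): the
        // refused connection should map to Ok(Unhealthy), not an Err.
        let status = tcp_connect_check("127.0.0.1:1").await;
        assert!(matches!(status, Ok(HealthStatus::Unhealthy)));
    }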
}