auth_framework/deployment/health.rs

// Health monitoring system for production deployment
// Comprehensive health checks, metrics collection, and service monitoring

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use thiserror::Error;
use tokio::time::interval;

#[derive(Debug, Error)]
pub enum HealthError {
    #[error("Health check failed: {0}")]
    CheckFailed(String),
    #[error("Service unavailable: {0}")]
    ServiceUnavailable(String),
    #[error("Timeout error: {0}")]
    Timeout(String),
    #[error("Network error: {0}")]
    Network(String),
    #[error("Configuration error: {0}")]
    Configuration(String),
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Health check status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum HealthStatus {
    Healthy,
    Degraded,
    Unhealthy,
    Unknown,
}

/// Health check type
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthCheckType {
    Http,
    Database,
    Redis,
    FileSystem,
    Memory,
    Cpu,
    Disk,
    Custom(String),
}

/// Individual health check configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheck {
    pub name: String,
    pub check_type: HealthCheckType,
    pub endpoint: String,
    pub timeout: Duration,
    pub interval: Duration,
    pub retries: u32,
    pub enabled: bool,
    pub critical: bool,
    pub tags: HashMap<String, String>,
}

/// Health check result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    pub name: String,
    pub status: HealthStatus,
    pub message: String,
    pub response_time: Duration,
    pub timestamp: u64,
    pub metadata: HashMap<String, serde_json::Value>,
}

/// Service health status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceHealth {
    pub service_name: String,
    pub overall_status: HealthStatus,
    pub checks: Vec<HealthCheckResult>,
    pub uptime: Duration,
    pub last_updated: u64,
    pub version: String,
}

/// System metrics for health monitoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    pub cpu_usage: f64,
    pub memory_usage: f64,
    pub disk_usage: f64,
    pub network_io: NetworkIoMetrics,
    pub process_count: u32,
    pub load_average: LoadAverage,
    pub timestamp: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkIoMetrics {
    pub bytes_sent: u64,
    pub bytes_received: u64,
    pub packets_sent: u64,
    pub packets_received: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadAverage {
    pub one_minute: f64,
    pub five_minutes: f64,
    pub fifteen_minutes: f64,
}

/// Health monitoring configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMonitorConfig {
    pub enabled: bool,
    pub global_timeout: Duration,
    pub check_interval: Duration,
    pub unhealthy_threshold: u32,
    pub degraded_threshold: u32,
    pub metrics_retention: Duration,
    pub alert_on_failure: bool,
    pub alert_endpoints: Vec<String>,
}

/// Health monitor manager
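///
/// A minimal usage sketch (illustrative only; the crate path below is assumed
/// from this module's location and the example is not compiled as a doctest):
///
/// ```rust,ignore
/// use auth_framework::deployment::health::{HealthCheck, HealthMonitor, HealthMonitorConfig};
///
/// let mut monitor = HealthMonitor::new(HealthMonitorConfig::default());
/// monitor.add_check(HealthCheck {
///     name: "api".to_string(),
///     endpoint: "http://localhost:8080/health".to_string(),
///     critical: true,
///     ..Default::default()
/// });
/// // Runs the check loop until the surrounding task is cancelled.
/// // monitor.start_monitoring().await?;
/// ```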
pub struct HealthMonitor {
    config: HealthMonitorConfig,
    checks: Vec<HealthCheck>,
    results: HashMap<String, HealthCheckResult>,
    service_health: ServiceHealth,
    system_metrics: SystemMetrics,
    failure_counts: HashMap<String, u32>,
}

impl HealthMonitor {
    /// Create new health monitor
    pub fn new(config: HealthMonitorConfig) -> Self {
        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        Self {
            config,
            checks: Vec::new(),
            results: HashMap::new(),
            service_health: ServiceHealth {
                service_name: "authframework".to_string(),
                overall_status: HealthStatus::Unknown,
                checks: Vec::new(),
                uptime: Duration::from_secs(0),
                last_updated: now.as_secs(),
                version: env!("CARGO_PKG_VERSION").to_string(),
            },
            system_metrics: SystemMetrics {
                cpu_usage: 0.0,
                memory_usage: 0.0,
                disk_usage: 0.0,
                network_io: NetworkIoMetrics {
                    bytes_sent: 0,
                    bytes_received: 0,
                    packets_sent: 0,
                    packets_received: 0,
                },
                process_count: 0,
                load_average: LoadAverage {
                    one_minute: 0.0,
                    five_minutes: 0.0,
                    fifteen_minutes: 0.0,
                },
                timestamp: now.as_secs(),
            },
            failure_counts: HashMap::new(),
        }
    }

    /// Add health check
    pub fn add_check(&mut self, check: HealthCheck) {
        self.checks.push(check);
    }

    /// Remove health check
    pub fn remove_check(&mut self, name: &str) {
        self.checks.retain(|check| check.name != name);
        self.results.remove(name);
        self.failure_counts.remove(name);
    }

    /// Start health monitoring
    pub async fn start_monitoring(&mut self) -> Result<(), HealthError> {
        if !self.config.enabled {
            return Ok(());
        }

        // Start monitoring loop
        let mut interval = interval(self.config.check_interval);

        loop {
            interval.tick().await;

            // Run all health checks
            self.run_health_checks().await?;

            // Update system metrics
            self.update_system_metrics().await?;

            // Update overall service health
            self.update_service_health();

            // Check for alerts
            self.check_alerts().await?;
        }
    }

    /// Run all configured health checks
    async fn run_health_checks(&mut self) -> Result<(), HealthError> {
        for check in &self.checks {
            if !check.enabled {
                continue;
            }

            let result = self.run_single_check(check).await;
            self.results.insert(check.name.clone(), result.clone());

            // Update failure count
            match result.status {
                HealthStatus::Healthy => {
                    self.failure_counts.insert(check.name.clone(), 0);
                }
                _ => {
                    let count = self.failure_counts.get(&check.name).unwrap_or(&0) + 1;
                    self.failure_counts.insert(check.name.clone(), count);
                }
            }
        }

        Ok(())
    }

    /// Run single health check
    async fn run_single_check(&self, check: &HealthCheck) -> HealthCheckResult {
        let start_time = SystemTime::now();
        let mut retries = 0;
        let mut last_error = String::new();

        while retries <= check.retries {
            // Run the probe for this check type, bounded by the per-check timeout
            // so a hung dependency cannot stall the monitoring loop.
            let probe = async {
                match check.check_type {
                    HealthCheckType::Http => self.check_http(&check.endpoint).await,
                    HealthCheckType::Database => self.check_database(&check.endpoint).await,
                    HealthCheckType::Redis => self.check_redis(&check.endpoint).await,
                    HealthCheckType::FileSystem => self.check_filesystem(&check.endpoint).await,
                    HealthCheckType::Memory => self.check_memory().await,
                    HealthCheckType::Cpu => self.check_cpu().await,
                    HealthCheckType::Disk => self.check_disk(&check.endpoint).await,
                    HealthCheckType::Custom(ref custom_type) => {
                        self.check_custom(custom_type, &check.endpoint).await
                    }
                }
            };
            let result = match tokio::time::timeout(check.timeout, probe).await {
                Ok(inner) => inner,
                Err(_) => Err(HealthError::Timeout(format!(
                    "Health check '{}' timed out after {:?}",
                    check.name, check.timeout
                ))),
            };

            match result {
                Ok(status) => {
                    let response_time = start_time.elapsed().unwrap_or_default();
                    return HealthCheckResult {
                        name: check.name.clone(),
                        status,
                        message: "Health check passed".to_string(),
                        response_time,
                        timestamp: SystemTime::now()
                            .duration_since(UNIX_EPOCH)
                            .unwrap()
                            .as_secs(),
                        metadata: HashMap::new(),
                    };
                }
                Err(e) => {
                    last_error = e.to_string();
                    retries += 1;

                    if retries <= check.retries {
                        tokio::time::sleep(Duration::from_millis(100 * retries as u64)).await;
                    }
                }
            }
        }

        let response_time = start_time.elapsed().unwrap_or_default();
        HealthCheckResult {
            name: check.name.clone(),
            status: HealthStatus::Unhealthy,
            message: format!(
                "Health check failed after {} retries: {}",
                check.retries, last_error
            ),
            response_time,
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            metadata: HashMap::new(),
        }
    }

    /// Check HTTP endpoint health
    async fn check_http(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Simulate HTTP health check
        if endpoint.starts_with("http") {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Invalid HTTP endpoint".to_string(),
            ))
        }
    }

    /// Check database connectivity
    async fn check_database(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Simulate database health check
        if !endpoint.is_empty() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Database endpoint not configured".to_string(),
            ))
        }
    }

    /// Check Redis connectivity
    async fn check_redis(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Simulate Redis health check
        if !endpoint.is_empty() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Redis endpoint not configured".to_string(),
            ))
        }
    }

    /// Check filesystem health
    async fn check_filesystem(&self, path: &str) -> Result<HealthStatus, HealthError> {
        use std::path::Path;

        if Path::new(path).exists() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(format!(
                "Path does not exist: {}",
                path
            )))
        }
    }

    /// Check memory usage
    async fn check_memory(&self) -> Result<HealthStatus, HealthError> {
        let memory_usage = self.get_memory_usage().await?;

        if memory_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if memory_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check CPU usage
    async fn check_cpu(&self) -> Result<HealthStatus, HealthError> {
        let cpu_usage = self.get_cpu_usage().await?;

        if cpu_usage < 0.7 {
            Ok(HealthStatus::Healthy)
        } else if cpu_usage < 0.85 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check disk usage
    async fn check_disk(&self, path: &str) -> Result<HealthStatus, HealthError> {
        let disk_usage = self.get_disk_usage(path).await?;

        if disk_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if disk_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    /// Check custom health endpoint
    async fn check_custom(
        &self,
        _custom_type: &str,
        _endpoint: &str,
    ) -> Result<HealthStatus, HealthError> {
        // Placeholder: custom check logic is not implemented yet, so report healthy.
        Ok(HealthStatus::Healthy)
    }

    /// Update system metrics
    async fn update_system_metrics(&mut self) -> Result<(), HealthError> {
        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        self.system_metrics = SystemMetrics {
            cpu_usage: self.get_cpu_usage().await?,
            memory_usage: self.get_memory_usage().await?,
            disk_usage: self.get_disk_usage("/").await?,
            network_io: self.get_network_io().await?,
            process_count: self.get_process_count().await?,
            load_average: self.get_load_average().await?,
            timestamp: now.as_secs(),
        };

        Ok(())
    }

    /// Get CPU usage as a fraction of total capacity (0.0-1.0)
    async fn get_cpu_usage(&self) -> Result<f64, HealthError> {
        // Simulate CPU usage
        Ok(0.45)
    }

    /// Get memory usage as a fraction of total capacity (0.0-1.0)
    async fn get_memory_usage(&self) -> Result<f64, HealthError> {
        // Simulate memory usage
        Ok(0.65)
    }

    /// Get disk usage as a fraction of total capacity (0.0-1.0)
    async fn get_disk_usage(&self, _path: &str) -> Result<f64, HealthError> {
        // Simulate disk usage
        Ok(0.55)
    }

    /// Get network I/O metrics
    async fn get_network_io(&self) -> Result<NetworkIoMetrics, HealthError> {
        // Simulate network I/O metrics
        Ok(NetworkIoMetrics {
            bytes_sent: 1024000,
            bytes_received: 2048000,
            packets_sent: 1000,
            packets_received: 1500,
        })
    }

    /// Get process count
    async fn get_process_count(&self) -> Result<u32, HealthError> {
        // Simulate process count
        Ok(150)
    }

    /// Get load average
    async fn get_load_average(&self) -> Result<LoadAverage, HealthError> {
        // Simulate load average
        Ok(LoadAverage {
            one_minute: 1.2,
            five_minutes: 1.1,
            fifteen_minutes: 0.9,
        })
    }

    /// Update overall service health based on individual checks
    fn update_service_health(&mut self) {
        let mut healthy_count = 0;
        let mut degraded_count = 0;
        let mut unhealthy_count = 0;
        let mut critical_unhealthy = false;

        let check_results: Vec<HealthCheckResult> = self.results.values().cloned().collect();

        for result in &check_results {
            // Check if this is a critical check
            let is_critical = self
                .checks
                .iter()
                .find(|check| check.name == result.name)
                .map(|check| check.critical)
                .unwrap_or(false);

            match result.status {
                HealthStatus::Healthy => healthy_count += 1,
                HealthStatus::Degraded => degraded_count += 1,
                HealthStatus::Unhealthy => {
                    unhealthy_count += 1;
                    if is_critical {
                        critical_unhealthy = true;
                    }
                }
                HealthStatus::Unknown => {}
            }
        }

        // Determine overall status
        let overall_status = if critical_unhealthy {
            HealthStatus::Unhealthy
        } else if unhealthy_count > 0 || degraded_count > 0 {
            HealthStatus::Degraded
        } else if healthy_count > 0 {
            HealthStatus::Healthy
        } else {
            HealthStatus::Unknown
        };

        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        self.service_health = ServiceHealth {
            service_name: self.service_health.service_name.clone(),
            overall_status,
            checks: check_results,
            // Accumulate uptime across refresh cycles rather than resetting it
            // to the interval elapsed since the previous update.
            uptime: self.service_health.uptime
                + Duration::from_secs(now.as_secs().saturating_sub(self.service_health.last_updated)),
            last_updated: now.as_secs(),
            version: self.service_health.version.clone(),
        };
    }

    /// Check for alert conditions
    async fn check_alerts(&self) -> Result<(), HealthError> {
        if !self.config.alert_on_failure {
            return Ok(());
        }

        // Check for unhealthy services
        if self.service_health.overall_status == HealthStatus::Unhealthy {
            self.send_alert("Service is unhealthy").await?;
        }

        // Check for high failure rates
        for (check_name, failure_count) in &self.failure_counts {
            if *failure_count >= self.config.unhealthy_threshold {
                self.send_alert(&format!(
                    "Health check '{}' has failed {} times",
                    check_name, failure_count
                ))
                .await?;
            }
        }

        Ok(())
    }

    /// Send alert to configured endpoints
    async fn send_alert(&self, message: &str) -> Result<(), HealthError> {
        for endpoint in &self.config.alert_endpoints {
            // Simulate sending alert
            println!("ALERT to {}: {}", endpoint, message);
        }
        Ok(())
    }

    /// Get current service health
    pub fn get_service_health(&self) -> &ServiceHealth {
        &self.service_health
    }

    /// Get current system metrics
    pub fn get_system_metrics(&self) -> &SystemMetrics {
        &self.system_metrics
    }

    /// Get health check results
    pub fn get_check_results(&self) -> &HashMap<String, HealthCheckResult> {
        &self.results
    }

    /// Get specific health check result
    pub fn get_check_result(&self, name: &str) -> Option<&HealthCheckResult> {
        self.results.get(name)
    }
}

impl Default for HealthMonitorConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            global_timeout: Duration::from_secs(30),
            check_interval: Duration::from_secs(30),
            unhealthy_threshold: 3,
            degraded_threshold: 2,
            metrics_retention: Duration::from_secs(24 * 3600), // 24 hours
            alert_on_failure: true,
            alert_endpoints: vec!["http://localhost:9093/api/v1/alerts".to_string()],
        }
    }
}

impl Default for HealthCheck {
    fn default() -> Self {
        Self {
            name: "default".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/health".to_string(),
            timeout: Duration::from_secs(10),
            interval: Duration::from_secs(30),
            retries: 3,
            enabled: true,
            critical: false,
            tags: HashMap::new(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_health_monitor_creation() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        assert_eq!(monitor.service_health.service_name, "authframework");
        assert_eq!(monitor.service_health.overall_status, HealthStatus::Unknown);
    }

    #[test]
    fn test_add_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);
        assert_eq!(monitor.checks[0].name, "test-check");
    }

    #[test]
    fn test_remove_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);

        monitor.remove_check("test-check");
        assert_eq!(monitor.checks.len(), 0);
    }

    #[tokio::test]
    async fn test_http_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_http("http://localhost:8080/health").await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), HealthStatus::Healthy);
    }

    #[tokio::test]
    async fn test_filesystem_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_filesystem("/tmp").await;
        // The path may not exist on every platform (e.g. Windows), so only
        // confirm that the call completes; the result itself is ignored.
        let _ = result;
    }

    #[tokio::test]
    async fn test_memory_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_memory().await;
        assert!(result.is_ok());

        let status = result.unwrap();
        assert!(matches!(
            status,
            HealthStatus::Healthy | HealthStatus::Degraded | HealthStatus::Unhealthy
        ));
    }
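
    // End-to-end sketch of one monitoring pass using the simulated probes above:
    // `check_http` treats any endpoint starting with "http" as healthy, so the
    // recorded result and the aggregated service status should both be Healthy.
    #[tokio::test]
    async fn test_run_health_checks_updates_results_and_status() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        monitor.add_check(HealthCheck {
            name: "api".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "http://localhost:8080/health".to_string(),
            critical: true,
            ..Default::default()
        });

        monitor.run_health_checks().await.unwrap();
        monitor.update_service_health();

        let result = monitor
            .get_check_result("api")
            .expect("result should be recorded");
        assert_eq!(result.status, HealthStatus::Healthy);
        assert_eq!(
            monitor.get_service_health().overall_status,
            HealthStatus::Healthy
        );
        assert_eq!(monitor.failure_counts.get("api"), Some(&0));
    }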
}