//! Health monitoring: periodic service checks, system metrics, and alerting.

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use thiserror::Error;
use tokio::time::interval;

#[derive(Debug, Error)]
pub enum HealthError {
    #[error("Health check failed: {0}")]
    CheckFailed(String),
    #[error("Service unavailable: {0}")]
    ServiceUnavailable(String),
    #[error("Timeout error: {0}")]
    Timeout(String),
    #[error("Network error: {0}")]
    Network(String),
    #[error("Configuration error: {0}")]
    Configuration(String),
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum HealthStatus {
    Healthy,
    Degraded,
    Unhealthy,
    Unknown,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthCheckType {
    Http,
    Database,
    Redis,
    FileSystem,
    Memory,
    Cpu,
    Disk,
    Custom(String),
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheck {
    pub name: String,
    pub check_type: HealthCheckType,
    pub endpoint: String,
    pub timeout: Duration,
    pub interval: Duration,
    pub retries: u32,
    pub enabled: bool,
    pub critical: bool,
    pub tags: HashMap<String, String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    pub name: String,
    pub status: HealthStatus,
    pub message: String,
    pub response_time: Duration,
    pub timestamp: u64,
    pub metadata: HashMap<String, serde_json::Value>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceHealth {
    pub service_name: String,
    pub overall_status: HealthStatus,
    pub checks: Vec<HealthCheckResult>,
    pub uptime: Duration,
    pub last_updated: u64,
    pub version: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    pub cpu_usage: f64,
    pub memory_usage: f64,
    pub disk_usage: f64,
    pub network_io: NetworkIoMetrics,
    pub process_count: u32,
    pub load_average: LoadAverage,
    pub timestamp: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkIoMetrics {
    pub bytes_sent: u64,
    pub bytes_received: u64,
    pub packets_sent: u64,
    pub packets_received: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadAverage {
    pub one_minute: f64,
    pub five_minutes: f64,
    pub fifteen_minutes: f64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMonitorConfig {
    pub enabled: bool,
    pub global_timeout: Duration,
    pub check_interval: Duration,
    pub unhealthy_threshold: u32,
    pub degraded_threshold: u32,
    pub metrics_retention: Duration,
    pub alert_on_failure: bool,
    pub alert_endpoints: Vec<String>,
}

pub struct HealthMonitor {
    config: HealthMonitorConfig,
    checks: Vec<HealthCheck>,
    results: HashMap<String, HealthCheckResult>,
    service_health: ServiceHealth,
    system_metrics: SystemMetrics,
    failure_counts: HashMap<String, u32>,
    /// Unix timestamp of monitor creation, used to compute service uptime.
    start_timestamp: u64,
}

impl HealthMonitor {
    pub fn new(config: HealthMonitorConfig) -> Self {
        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        Self {
            config,
            checks: Vec::new(),
            results: HashMap::new(),
            service_health: ServiceHealth {
                service_name: "authframework".to_string(),
                overall_status: HealthStatus::Unknown,
                checks: Vec::new(),
                uptime: Duration::from_secs(0),
                last_updated: now.as_secs(),
                version: env!("CARGO_PKG_VERSION").to_string(),
            },
            system_metrics: SystemMetrics {
                cpu_usage: 0.0,
                memory_usage: 0.0,
                disk_usage: 0.0,
                network_io: NetworkIoMetrics {
                    bytes_sent: 0,
                    bytes_received: 0,
                    packets_sent: 0,
                    packets_received: 0,
                },
                process_count: 0,
                load_average: LoadAverage {
                    one_minute: 0.0,
                    five_minutes: 0.0,
                    fifteen_minutes: 0.0,
                },
                timestamp: now.as_secs(),
            },
            failure_counts: HashMap::new(),
            start_timestamp: now.as_secs(),
        }
    }

    pub fn add_check(&mut self, check: HealthCheck) {
        self.checks.push(check);
    }

    pub fn remove_check(&mut self, name: &str) {
        self.checks.retain(|check| check.name != name);
        self.results.remove(name);
        self.failure_counts.remove(name);
    }

    pub async fn start_monitoring(&mut self) -> Result<(), HealthError> {
        if !self.config.enabled {
            return Ok(());
        }

        let mut ticker = interval(self.config.check_interval);

        loop {
            ticker.tick().await;

            // Run every configured check, refresh system metrics, recompute the
            // overall service status, and fire alerts if thresholds are exceeded.
            self.run_health_checks().await?;
            self.update_system_metrics().await?;
            self.update_service_health();
            self.check_alerts().await?;
        }
    }
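
    // Hedged sketch (not part of the original API): a shutdown-aware variant of the
    // monitoring loop. It assumes the caller owns a `tokio::sync::watch` channel and
    // that tokio's "sync" and "macros" features are enabled; shown only to illustrate
    // how the endless loop above could be made cancellable.
    #[allow(dead_code)]
    pub async fn start_monitoring_until_shutdown(
        &mut self,
        mut shutdown: tokio::sync::watch::Receiver<bool>,
    ) -> Result<(), HealthError> {
        if !self.config.enabled {
            return Ok(());
        }

        let mut ticker = interval(self.config.check_interval);

        loop {
            tokio::select! {
                _ = ticker.tick() => {
                    self.run_health_checks().await?;
                    self.update_system_metrics().await?;
                    self.update_service_health();
                    self.check_alerts().await?;
                }
                // Exit when the shutdown flag flips to `true` (or the sender is dropped).
                changed = shutdown.changed() => {
                    if changed.is_err() || *shutdown.borrow() {
                        return Ok(());
                    }
                }
            }
        }
    }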

    async fn run_health_checks(&mut self) -> Result<(), HealthError> {
        for check in &self.checks {
            if !check.enabled {
                continue;
            }

            let result = self.run_single_check(check).await;
            self.results.insert(check.name.clone(), result.clone());

            match result.status {
                HealthStatus::Healthy => {
                    self.failure_counts.insert(check.name.clone(), 0);
                }
                _ => {
                    *self.failure_counts.entry(check.name.clone()).or_insert(0) += 1;
                }
            }
        }

        Ok(())
    }

    async fn run_single_check(&self, check: &HealthCheck) -> HealthCheckResult {
        let start_time = SystemTime::now();
        let mut retries = 0;
        let mut last_error = String::new();

        while retries <= check.retries {
            let result = match check.check_type {
                HealthCheckType::Http => self.check_http(&check.endpoint).await,
                HealthCheckType::Database => self.check_database(&check.endpoint).await,
                HealthCheckType::Redis => self.check_redis(&check.endpoint).await,
                HealthCheckType::FileSystem => self.check_filesystem(&check.endpoint).await,
                HealthCheckType::Memory => self.check_memory().await,
                HealthCheckType::Cpu => self.check_cpu().await,
                HealthCheckType::Disk => self.check_disk(&check.endpoint).await,
                HealthCheckType::Custom(ref custom_type) => {
                    self.check_custom(custom_type, &check.endpoint).await
                }
            };

            match result {
                Ok(status) => {
                    let response_time = start_time.elapsed().unwrap_or_default();
                    return HealthCheckResult {
                        name: check.name.clone(),
                        status,
                        message: "Health check passed".to_string(),
                        response_time,
                        timestamp: SystemTime::now()
                            .duration_since(UNIX_EPOCH)
                            .unwrap()
                            .as_secs(),
                        metadata: HashMap::new(),
                    };
                }
                Err(e) => {
                    last_error = e.to_string();
                    retries += 1;

                    if retries <= check.retries {
                        // Back off briefly (100 ms per attempt) before retrying.
                        tokio::time::sleep(Duration::from_millis(100 * retries as u64)).await;
                    }
                }
            }
        }

        let response_time = start_time.elapsed().unwrap_or_default();
        HealthCheckResult {
            name: check.name.clone(),
            status: HealthStatus::Unhealthy,
            message: format!(
                "Health check failed after {} retries: {}",
                check.retries, last_error
            ),
            response_time,
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            metadata: HashMap::new(),
        }
    }

    async fn check_http(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Simplified placeholder: only validates that the endpoint looks like an HTTP URL.
        if endpoint.starts_with("http") {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Invalid HTTP endpoint".to_string(),
            ))
        }
    }
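
    // Hedged sketch (assumption): an actual HTTP probe using the `reqwest` crate, which
    // is not referenced elsewhere in this module and would need to be added to Cargo.toml.
    // Not wired into `run_single_check`; it only illustrates how the placeholder above
    // could be replaced with a real request.
    #[allow(dead_code)]
    async fn check_http_via_request(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        let client = reqwest::Client::new();
        let response = client
            .get(endpoint)
            .timeout(self.config.global_timeout)
            .send()
            .await
            .map_err(|e| HealthError::Network(e.to_string()))?;

        if response.status().is_success() {
            Ok(HealthStatus::Healthy)
        } else if response.status().is_server_error() {
            Ok(HealthStatus::Unhealthy)
        } else {
            // Client errors and redirects are treated as degraded rather than hard failures.
            Ok(HealthStatus::Degraded)
        }
    }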

    async fn check_database(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Placeholder: a real check would open a connection or run a lightweight query.
        if !endpoint.is_empty() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Database endpoint not configured".to_string(),
            ))
        }
    }

    async fn check_redis(&self, endpoint: &str) -> Result<HealthStatus, HealthError> {
        // Placeholder: a real check would issue a PING against the configured endpoint.
        if !endpoint.is_empty() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(
                "Redis endpoint not configured".to_string(),
            ))
        }
    }

    async fn check_filesystem(&self, path: &str) -> Result<HealthStatus, HealthError> {
        use std::path::Path;

        if Path::new(path).exists() {
            Ok(HealthStatus::Healthy)
        } else {
            Err(HealthError::CheckFailed(format!(
                "Path does not exist: {}",
                path
            )))
        }
    }

    async fn check_memory(&self) -> Result<HealthStatus, HealthError> {
        let memory_usage = self.get_memory_usage().await?;

        if memory_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if memory_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    async fn check_cpu(&self) -> Result<HealthStatus, HealthError> {
        let cpu_usage = self.get_cpu_usage().await?;

        if cpu_usage < 0.7 {
            Ok(HealthStatus::Healthy)
        } else if cpu_usage < 0.85 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    async fn check_disk(&self, path: &str) -> Result<HealthStatus, HealthError> {
        let disk_usage = self.get_disk_usage(path).await?;

        if disk_usage < 0.8 {
            Ok(HealthStatus::Healthy)
        } else if disk_usage < 0.9 {
            Ok(HealthStatus::Degraded)
        } else {
            Ok(HealthStatus::Unhealthy)
        }
    }

    async fn check_custom(
        &self,
        _custom_type: &str,
        _endpoint: &str,
    ) -> Result<HealthStatus, HealthError> {
        // Placeholder: custom checks currently always report healthy.
        Ok(HealthStatus::Healthy)
    }

    async fn update_system_metrics(&mut self) -> Result<(), HealthError> {
        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        self.system_metrics = SystemMetrics {
            cpu_usage: self.get_cpu_usage().await?,
            memory_usage: self.get_memory_usage().await?,
            disk_usage: self.get_disk_usage("/").await?,
            network_io: self.get_network_io().await?,
            process_count: self.get_process_count().await?,
            load_average: self.get_load_average().await?,
            timestamp: now.as_secs(),
        };

        Ok(())
    }

    async fn get_cpu_usage(&self) -> Result<f64, HealthError> {
        // Placeholder value; a real implementation would sample the operating system.
        Ok(0.45)
    }

    async fn get_memory_usage(&self) -> Result<f64, HealthError> {
        // Placeholder value; see the /proc/meminfo sketch below for one possible source.
        Ok(0.65)
    }
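
    // Hedged sketch (assumption): derive memory usage from /proc/meminfo using only the
    // standard library. This only works on Linux; the MemTotal/MemAvailable field names
    // are standard on modern kernels. Not called by the monitor itself.
    #[allow(dead_code)]
    async fn get_memory_usage_from_proc(&self) -> Result<f64, HealthError> {
        let contents = std::fs::read_to_string("/proc/meminfo")?;
        let mut total_kib = 0.0_f64;
        let mut available_kib = 0.0_f64;

        // Lines look like "MemTotal:       16384000 kB".
        for line in contents.lines() {
            let mut parts = line.split_whitespace();
            match (parts.next(), parts.next()) {
                (Some("MemTotal:"), Some(value)) => total_kib = value.parse().unwrap_or(0.0),
                (Some("MemAvailable:"), Some(value)) => {
                    available_kib = value.parse().unwrap_or(0.0)
                }
                _ => {}
            }
        }

        if total_kib <= 0.0 {
            return Err(HealthError::Configuration(
                "Could not read MemTotal from /proc/meminfo".to_string(),
            ));
        }

        // Fraction of memory in use, in the same 0.0..1.0 range as the placeholder above.
        Ok((total_kib - available_kib) / total_kib)
    }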

    async fn get_disk_usage(&self, _path: &str) -> Result<f64, HealthError> {
        // Placeholder value; a real implementation would query the filesystem at `_path`.
        Ok(0.55)
    }

    async fn get_network_io(&self) -> Result<NetworkIoMetrics, HealthError> {
        // Placeholder values; a real implementation would read interface counters.
        Ok(NetworkIoMetrics {
            bytes_sent: 1024000,
            bytes_received: 2048000,
            packets_sent: 1000,
            packets_received: 1500,
        })
    }

    async fn get_process_count(&self) -> Result<u32, HealthError> {
        // Placeholder value.
        Ok(150)
    }

    async fn get_load_average(&self) -> Result<LoadAverage, HealthError> {
        // Placeholder values; see the /proc/loadavg sketch below for one possible source.
        Ok(LoadAverage {
            one_minute: 1.2,
            five_minutes: 1.1,
            fifteen_minutes: 0.9,
        })
    }
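
    // Hedged sketch (assumption): read the 1/5/15 minute load averages from /proc/loadavg,
    // which exists only on Linux. Not called by the monitor itself.
    #[allow(dead_code)]
    async fn get_load_average_from_proc(&self) -> Result<LoadAverage, HealthError> {
        let contents = std::fs::read_to_string("/proc/loadavg")?;

        // The file starts with three whitespace-separated floats, e.g. "1.20 1.10 0.90 ...".
        let mut values = contents
            .split_whitespace()
            .take(3)
            .map(|v| v.parse::<f64>().unwrap_or(0.0));

        Ok(LoadAverage {
            one_minute: values.next().unwrap_or(0.0),
            five_minutes: values.next().unwrap_or(0.0),
            fifteen_minutes: values.next().unwrap_or(0.0),
        })
    }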

    fn update_service_health(&mut self) {
        let mut healthy_count = 0;
        let mut degraded_count = 0;
        let mut unhealthy_count = 0;
        let mut critical_unhealthy = false;

        let check_results: Vec<HealthCheckResult> = self.results.values().cloned().collect();

        for result in &check_results {
            let is_critical = self
                .checks
                .iter()
                .find(|check| check.name == result.name)
                .map(|check| check.critical)
                .unwrap_or(false);

            match result.status {
                HealthStatus::Healthy => healthy_count += 1,
                HealthStatus::Degraded => degraded_count += 1,
                HealthStatus::Unhealthy => {
                    unhealthy_count += 1;
                    if is_critical {
                        critical_unhealthy = true;
                    }
                }
                HealthStatus::Unknown => {}
            }
        }

        // A failing critical check makes the whole service unhealthy; any other failure
        // or degradation only degrades it.
        let overall_status = if critical_unhealthy {
            HealthStatus::Unhealthy
        } else if unhealthy_count > 0 || degraded_count > 0 {
            HealthStatus::Degraded
        } else if healthy_count > 0 {
            HealthStatus::Healthy
        } else {
            HealthStatus::Unknown
        };

        let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();

        self.service_health = ServiceHealth {
            service_name: self.service_health.service_name.clone(),
            overall_status,
            checks: check_results,
            // Uptime is measured from monitor creation, not from the previous update.
            uptime: Duration::from_secs(now.as_secs().saturating_sub(self.start_timestamp)),
            last_updated: now.as_secs(),
            version: self.service_health.version.clone(),
        };
    }

    async fn check_alerts(&self) -> Result<(), HealthError> {
        if !self.config.alert_on_failure {
            return Ok(());
        }

        if self.service_health.overall_status == HealthStatus::Unhealthy {
            self.send_alert("Service is unhealthy").await?;
        }

        for (check_name, failure_count) in &self.failure_counts {
            if *failure_count >= self.config.unhealthy_threshold {
                self.send_alert(&format!(
                    "Health check '{}' has failed {} times",
                    check_name, failure_count
                ))
                .await?;
            }
        }

        Ok(())
    }

    async fn send_alert(&self, message: &str) -> Result<(), HealthError> {
        // Placeholder delivery: alerts are only written to stdout.
        for endpoint in &self.config.alert_endpoints {
            println!("ALERT to {}: {}", endpoint, message);
        }
        Ok(())
    }
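
    // Hedged sketch (assumption): deliver alerts over HTTP instead of printing them. This
    // assumes the `reqwest` crate with its "json" feature, and the payload shape is only
    // illustrative (loosely modelled on an Alertmanager-style endpoint). Not called by
    // `check_alerts`.
    #[allow(dead_code)]
    async fn send_alert_http(&self, message: &str) -> Result<(), HealthError> {
        let client = reqwest::Client::new();
        let payload = serde_json::json!([{
            "labels": { "alertname": "health_monitor" },
            "annotations": { "summary": message }
        }]);

        for endpoint in &self.config.alert_endpoints {
            client
                .post(endpoint.as_str())
                .json(&payload)
                .send()
                .await
                .map_err(|e| HealthError::Network(e.to_string()))?;
        }

        Ok(())
    }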

    pub fn get_service_health(&self) -> &ServiceHealth {
        &self.service_health
    }

    pub fn get_system_metrics(&self) -> &SystemMetrics {
        &self.system_metrics
    }

    pub fn get_check_results(&self) -> &HashMap<String, HealthCheckResult> {
        &self.results
    }

    pub fn get_check_result(&self, name: &str) -> Option<&HealthCheckResult> {
        self.results.get(name)
    }
}

impl Default for HealthMonitorConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            global_timeout: Duration::from_secs(30),
            check_interval: Duration::from_secs(30),
            unhealthy_threshold: 3,
            degraded_threshold: 2,
            metrics_retention: Duration::from_secs(24 * 3600),
            alert_on_failure: true,
            alert_endpoints: vec!["http://localhost:9093/api/v1/alerts".to_string()],
        }
    }
}

impl Default for HealthCheck {
    fn default() -> Self {
        Self {
            name: "default".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/health".to_string(),
            timeout: Duration::from_secs(10),
            interval: Duration::from_secs(30),
            retries: 3,
            enabled: true,
            critical: false,
            tags: HashMap::new(),
        }
    }
}
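
// Hedged usage sketch: how an application might wire the monitor into a tokio runtime.
// The check name and endpoint below are illustrative values, not part of the crate's API.
#[allow(dead_code)]
async fn example_spawn_health_monitor() {
    let mut monitor = HealthMonitor::new(HealthMonitorConfig::default());
    monitor.add_check(HealthCheck {
        name: "api".to_string(),
        check_type: HealthCheckType::Http,
        endpoint: "http://localhost:8080/health".to_string(),
        critical: true,
        ..Default::default()
    });

    // `start_monitoring` loops until an error occurs, so it normally runs on its own task.
    tokio::spawn(async move {
        if let Err(err) = monitor.start_monitoring().await {
            eprintln!("health monitor stopped: {err}");
        }
    });
}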

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_health_monitor_creation() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        assert_eq!(monitor.service_health.service_name, "authframework");
        assert_eq!(monitor.service_health.overall_status, HealthStatus::Unknown);
    }

    #[test]
    fn test_add_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);
        assert_eq!(monitor.checks[0].name, "test-check");
    }

    #[test]
    fn test_remove_health_check() {
        let config = HealthMonitorConfig::default();
        let mut monitor = HealthMonitor::new(config);

        let check = HealthCheck {
            name: "test-check".to_string(),
            check_type: HealthCheckType::Http,
            endpoint: "/test".to_string(),
            ..Default::default()
        };

        monitor.add_check(check);
        assert_eq!(monitor.checks.len(), 1);

        monitor.remove_check("test-check");
        assert_eq!(monitor.checks.len(), 0);
    }

    #[tokio::test]
    async fn test_http_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_http("http://localhost:8080/health").await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), HealthStatus::Healthy);
    }

    #[tokio::test]
    async fn test_filesystem_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        // The result depends on the host filesystem (/tmp may not exist on every
        // platform), so only verify that the call completes.
        let result = monitor.check_filesystem("/tmp").await;
        let _ = result;
    }

    #[tokio::test]
    async fn test_memory_health_check() {
        let config = HealthMonitorConfig::default();
        let monitor = HealthMonitor::new(config);

        let result = monitor.check_memory().await;
        assert!(result.is_ok());

        let status = result.unwrap();
        assert!(matches!(
            status,
            HealthStatus::Healthy | HealthStatus::Degraded | HealthStatus::Unhealthy
        ));
    }
}