ipfrs_storage/
health.rs

1//! Health check system for storage backends
2//!
3//! Provides standardized health checks for all storage backends including:
4//! - Liveness checks (is the service running?)
5//! - Readiness checks (can the service handle requests?)
6//! - Detailed status reporting
7//! - Aggregate health across multiple backends
8//!
9//! ## Example
10//! ```no_run
11//! use ipfrs_storage::{HealthChecker, HealthStatus};
12//!
13//! #[tokio::main]
14//! async fn main() {
15//!     let checker = HealthChecker::new();
16//!
17//!     let status = checker.check_liveness().await;
18//!     println!("Health: {:?}", status);
19//! }
20//! ```
21
22use async_trait::async_trait;
23use serde::{Deserialize, Serialize};
24use std::collections::HashMap;
25use std::sync::Arc;
26use std::time::{Duration, Instant};
27
28/// Health status of a component
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
30pub enum HealthStatus {
31    /// Component is healthy and operational
32    Healthy,
33    /// Component is degraded but operational
34    Degraded,
35    /// Component is unhealthy and not operational
36    Unhealthy,
37}
38
39impl HealthStatus {
40    /// Check if status is healthy
41    pub fn is_healthy(&self) -> bool {
42        matches!(self, HealthStatus::Healthy)
43    }
44
45    /// Check if status is degraded
46    pub fn is_degraded(&self) -> bool {
47        matches!(self, HealthStatus::Degraded)
48    }
49
50    /// Check if status is unhealthy
51    pub fn is_unhealthy(&self) -> bool {
52        matches!(self, HealthStatus::Unhealthy)
53    }
54
55    /// Check if component can serve requests (healthy or degraded)
56    pub fn is_ready(&self) -> bool {
57        !self.is_unhealthy()
58    }
59}
60
61/// Detailed health check result
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct HealthCheckResult {
64    /// Overall status
65    pub status: HealthStatus,
66    /// Component name
67    pub component: String,
68    /// Human-readable message
69    pub message: String,
70    /// When the check was performed
71    pub checked_at: String,
72    /// Check duration
73    pub duration_ms: u64,
74    /// Additional metadata
75    pub metadata: HashMap<String, String>,
76}
77
78impl HealthCheckResult {
79    /// Create a healthy result
80    pub fn healthy(component: String, message: String, duration: Duration) -> Self {
81        Self {
82            status: HealthStatus::Healthy,
83            component,
84            message,
85            checked_at: chrono::Utc::now().to_rfc3339(),
86            duration_ms: duration.as_millis() as u64,
87            metadata: HashMap::new(),
88        }
89    }
90
91    /// Create a degraded result
92    pub fn degraded(component: String, message: String, duration: Duration) -> Self {
93        Self {
94            status: HealthStatus::Degraded,
95            component,
96            message,
97            checked_at: chrono::Utc::now().to_rfc3339(),
98            duration_ms: duration.as_millis() as u64,
99            metadata: HashMap::new(),
100        }
101    }
102
103    /// Create an unhealthy result
104    pub fn unhealthy(component: String, message: String, duration: Duration) -> Self {
105        Self {
106            status: HealthStatus::Unhealthy,
107            component,
108            message,
109            checked_at: chrono::Utc::now().to_rfc3339(),
110            duration_ms: duration.as_millis() as u64,
111            metadata: HashMap::new(),
112        }
113    }
114
115    /// Add metadata to the result
116    pub fn with_metadata(mut self, key: String, value: String) -> Self {
117        self.metadata.insert(key, value);
118        self
119    }
120}
121
122/// Trait for health-checkable components
123#[async_trait]
124pub trait HealthCheck: Send + Sync {
125    /// Perform a liveness check
126    ///
127    /// Liveness checks verify that the component is running.
128    /// A failed liveness check indicates the component should be restarted.
129    async fn check_liveness(&self) -> HealthCheckResult;
130
131    /// Perform a readiness check
132    ///
133    /// Readiness checks verify that the component can handle requests.
134    /// A failed readiness check means the component should not receive traffic.
135    async fn check_readiness(&self) -> HealthCheckResult;
136
137    /// Get component name
138    fn component_name(&self) -> String;
139}
140
141/// Aggregate health checker for multiple components
142pub struct HealthChecker {
143    /// Registered health checks
144    checks: Arc<parking_lot::RwLock<Vec<Arc<dyn HealthCheck>>>>,
145}
146
147impl HealthChecker {
148    /// Create a new health checker
149    pub fn new() -> Self {
150        Self {
151            checks: Arc::new(parking_lot::RwLock::new(Vec::new())),
152        }
153    }
154
155    /// Register a health check
156    pub fn register<H: HealthCheck + 'static>(&self, check: H) {
157        self.checks.write().push(Arc::new(check));
158    }
159
160    /// Check liveness of all registered components
161    pub async fn check_liveness(&self) -> AggregateHealthResult {
162        let checks = self.checks.read().clone();
163        let mut results = Vec::new();
164
165        for check in checks {
166            results.push(check.check_liveness().await);
167        }
168
169        AggregateHealthResult::from_results(results)
170    }
171
172    /// Check readiness of all registered components
173    pub async fn check_readiness(&self) -> AggregateHealthResult {
174        let checks = self.checks.read().clone();
175        let mut results = Vec::new();
176
177        for check in checks {
178            results.push(check.check_readiness().await);
179        }
180
181        AggregateHealthResult::from_results(results)
182    }
183
184    /// Get detailed status of all components
185    pub async fn detailed_status(&self) -> DetailedHealthStatus {
186        let checks = self.checks.read().clone();
187        let mut liveness_results = Vec::new();
188        let mut readiness_results = Vec::new();
189
190        for check in checks {
191            liveness_results.push(check.check_liveness().await);
192            readiness_results.push(check.check_readiness().await);
193        }
194
195        DetailedHealthStatus {
196            liveness: AggregateHealthResult::from_results(liveness_results),
197            readiness: AggregateHealthResult::from_results(readiness_results),
198        }
199    }
200}
201
202impl Default for HealthChecker {
203    fn default() -> Self {
204        Self::new()
205    }
206}
207
208/// Aggregate health result across multiple components
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct AggregateHealthResult {
211    /// Overall status
212    pub status: HealthStatus,
213    /// Individual component results
214    pub components: Vec<HealthCheckResult>,
215    /// Total number of components
216    pub total_components: usize,
217    /// Number of healthy components
218    pub healthy_count: usize,
219    /// Number of degraded components
220    pub degraded_count: usize,
221    /// Number of unhealthy components
222    pub unhealthy_count: usize,
223}
224
225impl AggregateHealthResult {
226    /// Create aggregate result from individual results
227    pub fn from_results(components: Vec<HealthCheckResult>) -> Self {
228        let total_components = components.len();
229        let mut healthy_count = 0;
230        let mut degraded_count = 0;
231        let mut unhealthy_count = 0;
232
233        for result in &components {
234            match result.status {
235                HealthStatus::Healthy => healthy_count += 1,
236                HealthStatus::Degraded => degraded_count += 1,
237                HealthStatus::Unhealthy => unhealthy_count += 1,
238            }
239        }
240
241        // Determine overall status
242        let status = if unhealthy_count > 0 {
243            HealthStatus::Unhealthy
244        } else if degraded_count > 0 {
245            HealthStatus::Degraded
246        } else {
247            HealthStatus::Healthy
248        };
249
250        Self {
251            status,
252            components,
253            total_components,
254            healthy_count,
255            degraded_count,
256            unhealthy_count,
257        }
258    }
259
260    /// Check if all components are healthy
261    pub fn all_healthy(&self) -> bool {
262        self.status == HealthStatus::Healthy
263    }
264
265    /// Check if any component is unhealthy
266    pub fn any_unhealthy(&self) -> bool {
267        self.unhealthy_count > 0
268    }
269
270    /// Get unhealthy components
271    pub fn unhealthy_components(&self) -> Vec<&HealthCheckResult> {
272        self.components
273            .iter()
274            .filter(|r| r.status == HealthStatus::Unhealthy)
275            .collect()
276    }
277}
278
279/// Detailed health status with liveness and readiness
280#[derive(Debug, Clone, Serialize, Deserialize)]
281pub struct DetailedHealthStatus {
282    /// Liveness check results
283    pub liveness: AggregateHealthResult,
284    /// Readiness check results
285    pub readiness: AggregateHealthResult,
286}
287
288impl DetailedHealthStatus {
289    /// Check if system is alive
290    pub fn is_alive(&self) -> bool {
291        self.liveness.status != HealthStatus::Unhealthy
292    }
293
294    /// Check if system is ready
295    pub fn is_ready(&self) -> bool {
296        self.readiness.status != HealthStatus::Unhealthy
297    }
298}
299
300/// Simple health check implementation for testing
301#[derive(Clone)]
302pub struct SimpleHealthCheck {
303    name: String,
304    is_healthy: Arc<parking_lot::RwLock<bool>>,
305}
306
307impl SimpleHealthCheck {
308    /// Create a new simple health check
309    pub fn new(name: String) -> Self {
310        Self {
311            name,
312            is_healthy: Arc::new(parking_lot::RwLock::new(true)),
313        }
314    }
315
316    /// Set health status
317    pub fn set_healthy(&self, healthy: bool) {
318        *self.is_healthy.write() = healthy;
319    }
320}
321
322#[async_trait]
323impl HealthCheck for SimpleHealthCheck {
324    async fn check_liveness(&self) -> HealthCheckResult {
325        let start = Instant::now();
326        let is_healthy = *self.is_healthy.read();
327        let duration = start.elapsed();
328
329        if is_healthy {
330            HealthCheckResult::healthy(
331                self.name.clone(),
332                "Component is alive".to_string(),
333                duration,
334            )
335        } else {
336            HealthCheckResult::unhealthy(
337                self.name.clone(),
338                "Component is not alive".to_string(),
339                duration,
340            )
341        }
342    }
343
344    async fn check_readiness(&self) -> HealthCheckResult {
345        let start = Instant::now();
346        let is_healthy = *self.is_healthy.read();
347        let duration = start.elapsed();
348
349        if is_healthy {
350            HealthCheckResult::healthy(
351                self.name.clone(),
352                "Component is ready".to_string(),
353                duration,
354            )
355        } else {
356            HealthCheckResult::unhealthy(
357                self.name.clone(),
358                "Component is not ready".to_string(),
359                duration,
360            )
361        }
362    }
363
364    fn component_name(&self) -> String {
365        self.name.clone()
366    }
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    // Purely synchronous assertions: no async runtime is needed.
    #[test]
    fn test_health_status() {
        assert!(HealthStatus::Healthy.is_healthy());
        assert!(!HealthStatus::Degraded.is_healthy());
        assert!(!HealthStatus::Unhealthy.is_healthy());

        assert!(HealthStatus::Degraded.is_degraded());
        assert!(!HealthStatus::Healthy.is_degraded());

        assert!(HealthStatus::Unhealthy.is_unhealthy());
        assert!(!HealthStatus::Healthy.is_unhealthy());

        assert!(HealthStatus::Healthy.is_ready());
        assert!(HealthStatus::Degraded.is_ready());
        assert!(!HealthStatus::Unhealthy.is_ready());
    }

    // Constructors and the metadata builder are synchronous as well.
    #[test]
    fn test_health_check_result_fields() {
        let result = HealthCheckResult::healthy(
            "comp".to_string(),
            "OK".to_string(),
            Duration::from_millis(10),
        )
        .with_metadata("region".to_string(), "eu".to_string());

        assert_eq!(result.status, HealthStatus::Healthy);
        assert_eq!(result.component, "comp");
        assert_eq!(result.message, "OK");
        assert_eq!(result.duration_ms, 10);
        assert_eq!(
            result.metadata.get("region").map(String::as_str),
            Some("eu")
        );
    }

    #[tokio::test]
    async fn test_simple_health_check() {
        let check = SimpleHealthCheck::new("test".to_string());
        assert_eq!(check.component_name(), "test");

        let result = check.check_liveness().await;
        assert!(result.status.is_healthy());
        assert_eq!(result.component, "test");
        assert!(check.check_readiness().await.status.is_healthy());

        // Both probes read the same flag, so both must flip together.
        check.set_healthy(false);
        let result = check.check_liveness().await;
        assert!(result.status.is_unhealthy());
        assert!(check.check_readiness().await.status.is_unhealthy());
    }

    #[tokio::test]
    async fn test_health_checker_aggregate() {
        let checker = HealthChecker::new();

        let check1 = SimpleHealthCheck::new("component1".to_string());
        let check2 = SimpleHealthCheck::new("component2".to_string());

        // Clones share the underlying flag, so the originals keep control.
        checker.register(check1.clone());
        checker.register(check2.clone());

        let result = checker.check_liveness().await;
        assert!(result.all_healthy());
        assert_eq!(result.total_components, 2);
        assert_eq!(result.healthy_count, 2);
        assert!(result.unhealthy_components().is_empty());

        // Make one component unhealthy
        check1.set_healthy(false);

        let result = checker.check_liveness().await;
        assert!(!result.all_healthy());
        assert!(result.any_unhealthy());
        assert_eq!(result.healthy_count, 1);
        assert_eq!(result.unhealthy_count, 1);

        let unhealthy = result.unhealthy_components();
        assert_eq!(unhealthy.len(), 1);
        assert_eq!(unhealthy[0].component, "component1");

        let readiness = checker.check_readiness().await;
        assert_eq!(readiness.status, HealthStatus::Unhealthy);
        assert_eq!(readiness.unhealthy_count, 1);
    }

    #[tokio::test]
    async fn test_detailed_status() {
        let checker = HealthChecker::new();
        let check = SimpleHealthCheck::new("test".to_string());
        checker.register(check.clone());

        let status = checker.detailed_status().await;
        assert!(status.is_alive());
        assert!(status.is_ready());

        check.set_healthy(false);
        let status = checker.detailed_status().await;
        assert!(!status.is_alive());
        assert!(!status.is_ready());
    }

    // Aggregation is synchronous, so a plain test is sufficient.
    #[test]
    fn test_aggregate_health_result() {
        let results = vec![
            HealthCheckResult::healthy(
                "comp1".to_string(),
                "OK".to_string(),
                Duration::from_millis(10),
            ),
            HealthCheckResult::degraded(
                "comp2".to_string(),
                "Slow".to_string(),
                Duration::from_millis(100),
            ),
        ];

        let aggregate = AggregateHealthResult::from_results(results);
        assert_eq!(aggregate.status, HealthStatus::Degraded);
        assert_eq!(aggregate.total_components, 2);
        assert_eq!(aggregate.healthy_count, 1);
        assert_eq!(aggregate.degraded_count, 1);
        assert_eq!(aggregate.unhealthy_count, 0);
        assert!(!aggregate.all_healthy());
        assert!(!aggregate.any_unhealthy());
    }

    // With nothing to aggregate, the summary is healthy with zeroed counters.
    #[test]
    fn test_aggregate_health_result_empty() {
        let aggregate = AggregateHealthResult::from_results(Vec::new());
        assert_eq!(aggregate.status, HealthStatus::Healthy);
        assert_eq!(aggregate.total_components, 0);
        assert!(aggregate.all_healthy());
        assert!(!aggregate.any_unhealthy());
    }
}