halldyll_core/observe/
health.rs

1//! Health Check - Kubernetes/Cloud Run liveness and readiness probes
2//!
3//! Provides health check functionality for cloud deployments.
4//!
5//! ## Usage
6//!
7//! ```rust,ignore
//! let health = HealthChecker::default_config();
//!
//! // For liveness probe (is the process alive?)
//! let liveness = health.liveness();
//! // For readiness probe (can we accept traffic?); `metrics` is a `HealthMetrics`
//! let readiness = health.readiness(&metrics);
//!
//! // JSON output for HTTP endpoint
//! let json = readiness.to_json();
17//! ```
18
19use std::time::Instant;
20use serde::{Deserialize, Serialize};
21
22/// Health check status
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
24#[serde(rename_all = "lowercase")]
25pub enum HealthStatus {
26    /// Service is healthy
27    Healthy,
28    /// Service is degraded but operational
29    Degraded,
30    /// Service is unhealthy
31    Unhealthy,
32}
33
34impl HealthStatus {
35    /// Convert to HTTP status code
36    pub fn http_status_code(&self) -> u16 {
37        match self {
38            HealthStatus::Healthy => 200,
39            HealthStatus::Degraded => 200, // Still accept traffic
40            HealthStatus::Unhealthy => 503,
41        }
42    }
43
44    /// Is the service operational?
45    pub fn is_operational(&self) -> bool {
46        matches!(self, HealthStatus::Healthy | HealthStatus::Degraded)
47    }
48}
49
50/// Component health
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct ComponentHealth {
53    /// Component name
54    pub name: String,
55    /// Status
56    pub status: HealthStatus,
57    /// Optional message
58    #[serde(skip_serializing_if = "Option::is_none")]
59    pub message: Option<String>,
60    /// Response time (ms)
61    #[serde(skip_serializing_if = "Option::is_none")]
62    pub response_time_ms: Option<u64>,
63}
64
65/// Health check response
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct HealthResponse {
68    /// Overall status
69    pub status: HealthStatus,
70    /// Version
71    pub version: String,
72    /// Uptime in seconds
73    pub uptime_secs: u64,
74    /// Component checks
75    pub checks: Vec<ComponentHealth>,
76    /// Timestamp (ISO 8601)
77    pub timestamp: String,
78}
79
/// Thresholds that decide whether a metric counts as healthy, degraded or unhealthy.
///
/// Each metric has two thresholds: one bounding the healthy range and one bounding
/// the degraded range.
#[derive(Clone, Debug)]
pub struct HealthCheckConfig {
    /// Success-rate threshold for healthy, as a fraction (0.0 - 1.0).
    pub success_rate_healthy: f64,
    /// Success-rate threshold for degraded, as a fraction (0.0 - 1.0).
    pub success_rate_degraded: f64,
    /// Maximum average latency for healthy, in milliseconds.
    pub max_latency_healthy_ms: u64,
    /// Maximum average latency for degraded, in milliseconds.
    pub max_latency_degraded_ms: u64,
    /// Maximum number of open circuits for healthy.
    pub max_open_circuits_healthy: usize,
    /// Maximum number of open circuits for degraded.
    pub max_open_circuits_degraded: usize,
}
96
97impl Default for HealthCheckConfig {
98    fn default() -> Self {
99        Self {
100            success_rate_healthy: 0.95,
101            success_rate_degraded: 0.80,
102            max_latency_healthy_ms: 5000,
103            max_latency_degraded_ms: 15000,
104            max_open_circuits_healthy: 2,
105            max_open_circuits_degraded: 10,
106        }
107    }
108}
109
110/// Health checker
111pub struct HealthChecker {
112    config: HealthCheckConfig,
113    started_at: Instant,
114    version: String,
115}
116
117impl HealthChecker {
118    /// Create new health checker
119    pub fn new(config: HealthCheckConfig) -> Self {
120        Self {
121            config,
122            started_at: Instant::now(),
123            version: env!("CARGO_PKG_VERSION").to_string(),
124        }
125    }
126
127    /// Create with default config
128    pub fn default_config() -> Self {
129        Self::new(HealthCheckConfig::default())
130    }
131
132    /// Liveness check - is the process alive?
133    /// This should almost always return healthy unless the process is stuck
134    pub fn liveness(&self) -> HealthResponse {
135        HealthResponse {
136            status: HealthStatus::Healthy,
137            version: self.version.clone(),
138            uptime_secs: self.started_at.elapsed().as_secs(),
139            checks: vec![ComponentHealth {
140                name: "process".to_string(),
141                status: HealthStatus::Healthy,
142                message: Some("Process is running".to_string()),
143                response_time_ms: None,
144            }],
145            timestamp: chrono::Utc::now().to_rfc3339(),
146        }
147    }
148
149    /// Readiness check - can we accept traffic?
150    pub fn readiness(&self, metrics: &HealthMetrics) -> HealthResponse {
151        let mut checks = Vec::new();
152        let mut overall_status = HealthStatus::Healthy;
153
154        // Check success rate
155        let success_check = self.check_success_rate(metrics.success_rate);
156        if success_check.status == HealthStatus::Unhealthy {
157            overall_status = HealthStatus::Unhealthy;
158        } else if success_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
159            overall_status = HealthStatus::Degraded;
160        }
161        checks.push(success_check);
162
163        // Check latency
164        let latency_check = self.check_latency(metrics.avg_latency_ms);
165        if latency_check.status == HealthStatus::Unhealthy {
166            overall_status = HealthStatus::Unhealthy;
167        } else if latency_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
168            overall_status = HealthStatus::Degraded;
169        }
170        checks.push(latency_check);
171
172        // Check circuit breakers
173        let circuit_check = self.check_circuits(metrics.open_circuits);
174        if circuit_check.status == HealthStatus::Unhealthy {
175            overall_status = HealthStatus::Unhealthy;
176        } else if circuit_check.status == HealthStatus::Degraded && overall_status == HealthStatus::Healthy {
177            overall_status = HealthStatus::Degraded;
178        }
179        checks.push(circuit_check);
180
181        // Check memory (if available)
182        if let Some(memory_mb) = metrics.memory_mb {
183            checks.push(ComponentHealth {
184                name: "memory".to_string(),
185                status: HealthStatus::Healthy,
186                message: Some(format!("{} MB used", memory_mb)),
187                response_time_ms: None,
188            });
189        }
190
191        HealthResponse {
192            status: overall_status,
193            version: self.version.clone(),
194            uptime_secs: self.started_at.elapsed().as_secs(),
195            checks,
196            timestamp: chrono::Utc::now().to_rfc3339(),
197        }
198    }
199
200    fn check_success_rate(&self, rate: f64) -> ComponentHealth {
201        let (status, message) = if rate >= self.config.success_rate_healthy {
202            (HealthStatus::Healthy, format!("{:.1}% success rate", rate * 100.0))
203        } else if rate >= self.config.success_rate_degraded {
204            (HealthStatus::Degraded, format!("{:.1}% success rate (degraded)", rate * 100.0))
205        } else {
206            (HealthStatus::Unhealthy, format!("{:.1}% success rate (critical)", rate * 100.0))
207        };
208
209        ComponentHealth {
210            name: "success_rate".to_string(),
211            status,
212            message: Some(message),
213            response_time_ms: None,
214        }
215    }
216
217    fn check_latency(&self, latency_ms: f64) -> ComponentHealth {
218        let (status, message) = if latency_ms <= self.config.max_latency_healthy_ms as f64 {
219            (HealthStatus::Healthy, format!("{:.0}ms avg latency", latency_ms))
220        } else if latency_ms <= self.config.max_latency_degraded_ms as f64 {
221            (HealthStatus::Degraded, format!("{:.0}ms avg latency (high)", latency_ms))
222        } else {
223            (HealthStatus::Unhealthy, format!("{:.0}ms avg latency (critical)", latency_ms))
224        };
225
226        ComponentHealth {
227            name: "latency".to_string(),
228            status,
229            message: Some(message),
230            response_time_ms: Some(latency_ms as u64),
231        }
232    }
233
234    fn check_circuits(&self, open_circuits: usize) -> ComponentHealth {
235        let (status, message) = if open_circuits <= self.config.max_open_circuits_healthy {
236            (HealthStatus::Healthy, format!("{} open circuits", open_circuits))
237        } else if open_circuits <= self.config.max_open_circuits_degraded {
238            (HealthStatus::Degraded, format!("{} open circuits (elevated)", open_circuits))
239        } else {
240            (HealthStatus::Unhealthy, format!("{} open circuits (critical)", open_circuits))
241        };
242
243        ComponentHealth {
244            name: "circuit_breakers".to_string(),
245            status,
246            message: Some(message),
247            response_time_ms: None,
248        }
249    }
250}
251
/// Snapshot of runtime metrics supplied to a health check.
///
/// The derived `Default` zeroes every numeric field and leaves `memory_mb` as `None`.
#[derive(Clone, Debug, Default)]
pub struct HealthMetrics {
    /// Success rate as a fraction (0.0 - 1.0).
    pub success_rate: f64,
    /// Average latency, in milliseconds.
    pub avg_latency_ms: f64,
    /// Number of circuit breakers currently open.
    pub open_circuits: usize,
    /// Memory usage in MB, when available.
    pub memory_mb: Option<u64>,
    /// Number of active requests.
    pub active_requests: usize,
}
266
267impl HealthResponse {
268    /// Convert to JSON
269    pub fn to_json(&self) -> String {
270        serde_json::to_string_pretty(self).unwrap_or_else(|_| "{}".to_string())
271    }
272
273    /// Convert to compact JSON
274    pub fn to_json_compact(&self) -> String {
275        serde_json::to_string(self).unwrap_or_else(|_| "{}".to_string())
276    }
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a readiness probe with default thresholds and returns the overall status.
    fn readiness_status(metrics: HealthMetrics) -> HealthStatus {
        HealthChecker::default_config().readiness(&metrics).status
    }

    #[test]
    fn test_liveness_always_healthy() {
        let status = HealthChecker::default_config().liveness().status;
        assert_eq!(status, HealthStatus::Healthy);
    }

    #[test]
    fn test_readiness_healthy() {
        // Every metric sits comfortably inside the healthy thresholds.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.99,
            avg_latency_ms: 100.0,
            open_circuits: 0,
            memory_mb: Some(256),
            active_requests: 5,
        });
        assert_eq!(status, HealthStatus::Healthy);
    }

    #[test]
    fn test_readiness_degraded() {
        // Every metric falls between the healthy and degraded thresholds.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.85,
            avg_latency_ms: 8000.0,
            open_circuits: 5,
            memory_mb: None,
            active_requests: 10,
        });
        assert_eq!(status, HealthStatus::Degraded);
    }

    #[test]
    fn test_readiness_unhealthy() {
        // Every metric is beyond the degraded thresholds.
        let status = readiness_status(HealthMetrics {
            success_rate: 0.50,
            avg_latency_ms: 20000.0,
            open_circuits: 20,
            memory_mb: None,
            active_requests: 0,
        });
        assert_eq!(status, HealthStatus::Unhealthy);
    }

    #[test]
    fn test_json_output() {
        let json = HealthChecker::default_config().liveness().to_json();

        let expected_keys = ["\"status\"", "\"version\"", "\"uptime_secs\""];
        for key in expected_keys.iter() {
            assert!(json.contains(*key));
        }
    }

    #[test]
    fn test_http_status_codes() {
        let expectations = [
            (HealthStatus::Healthy, 200),
            (HealthStatus::Degraded, 200),
            (HealthStatus::Unhealthy, 503),
        ];
        for (status, code) in expectations.iter() {
            assert_eq!(status.http_status_code(), *code);
        }
    }
}