rust_expect/
health.rs

1//! Health checking and diagnostics.
2//!
3//! This module provides health checking capabilities for sessions
4//! and connections.
5
6use std::time::{Duration, Instant};
7
/// Coarse health classification used throughout this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HealthStatus {
    /// Fully operational.
    Healthy,
    /// Impaired, but still functional.
    Degraded,
    /// Not functional.
    Unhealthy,
    /// The status is not known.
    Unknown,
}

impl HealthStatus {
    /// Returns `true` only for [`HealthStatus::Healthy`].
    #[must_use]
    pub const fn is_healthy(&self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match self {
            Self::Healthy => true,
            Self::Degraded | Self::Unhealthy | Self::Unknown => false,
        }
    }

    /// Returns `true` when the status is [`HealthStatus::Healthy`] or
    /// [`HealthStatus::Degraded`], i.e. work can still be done.
    #[must_use]
    pub const fn is_operational(&self) -> bool {
        match self {
            Self::Healthy | Self::Degraded => true,
            Self::Unhealthy | Self::Unknown => false,
        }
    }
}
34
35/// Health check result.
36#[derive(Debug, Clone)]
37pub struct HealthCheckResult {
38    /// Status.
39    pub status: HealthStatus,
40    /// Message.
41    pub message: Option<String>,
42    /// Check duration.
43    pub duration: Duration,
44    /// Timestamp.
45    pub timestamp: Instant,
46}
47
48impl HealthCheckResult {
49    /// Create a healthy result.
50    #[must_use]
51    pub fn healthy() -> Self {
52        Self {
53            status: HealthStatus::Healthy,
54            message: None,
55            duration: Duration::ZERO,
56            timestamp: Instant::now(),
57        }
58    }
59
60    /// Create an unhealthy result.
61    #[must_use]
62    pub fn unhealthy(message: impl Into<String>) -> Self {
63        Self {
64            status: HealthStatus::Unhealthy,
65            message: Some(message.into()),
66            duration: Duration::ZERO,
67            timestamp: Instant::now(),
68        }
69    }
70
71    /// Create a degraded result.
72    #[must_use]
73    pub fn degraded(message: impl Into<String>) -> Self {
74        Self {
75            status: HealthStatus::Degraded,
76            message: Some(message.into()),
77            duration: Duration::ZERO,
78            timestamp: Instant::now(),
79        }
80    }
81
82    /// Set duration.
83    #[must_use]
84    pub const fn with_duration(mut self, duration: Duration) -> Self {
85        self.duration = duration;
86        self
87    }
88}
89
/// Tunable parameters for health checking.
#[derive(Debug, Clone)]
pub struct HealthCheckConfig {
    /// Time between checks.
    pub interval: Duration,
    /// Timeout for health checks.
    pub timeout: Duration,
    /// Number of failures before the status becomes unhealthy.
    pub failure_threshold: u32,
    /// Number of successes before the status becomes healthy.
    pub success_threshold: u32,
}

impl Default for HealthCheckConfig {
    /// Defaults: 30 s interval, 5 s timeout, 3 failures, 1 success.
    fn default() -> Self {
        Self {
            interval: Self::DEFAULT_INTERVAL,
            timeout: Self::DEFAULT_TIMEOUT,
            failure_threshold: Self::DEFAULT_FAILURE_THRESHOLD,
            success_threshold: Self::DEFAULT_SUCCESS_THRESHOLD,
        }
    }
}

impl HealthCheckConfig {
    /// Default time between checks.
    const DEFAULT_INTERVAL: Duration = Duration::from_secs(30);
    /// Default per-check timeout.
    const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5);
    /// Default failure threshold.
    const DEFAULT_FAILURE_THRESHOLD: u32 = 3;
    /// Default success threshold.
    const DEFAULT_SUCCESS_THRESHOLD: u32 = 1;

    /// Create a configuration populated with the default values.
    #[must_use]
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Return the configuration with `interval` replaced.
    #[must_use]
    pub const fn with_interval(self, interval: Duration) -> Self {
        Self { interval, ..self }
    }

    /// Return the configuration with `timeout` replaced.
    #[must_use]
    pub const fn with_timeout(self, timeout: Duration) -> Self {
        Self { timeout, ..self }
    }

    /// Return the configuration with `failure_threshold` replaced.
    #[must_use]
    pub const fn with_failure_threshold(self, threshold: u32) -> Self {
        Self {
            failure_threshold: threshold,
            ..self
        }
    }

    /// Return the configuration with `success_threshold` replaced.
    #[must_use]
    pub const fn with_success_threshold(self, threshold: u32) -> Self {
        Self {
            success_threshold: threshold,
            ..self
        }
    }
}
149
150/// Health checker state.
151#[derive(Debug)]
152pub struct HealthChecker {
153    /// Configuration.
154    config: HealthCheckConfig,
155    /// Current status.
156    status: HealthStatus,
157    /// Consecutive failures.
158    failures: u32,
159    /// Consecutive successes.
160    successes: u32,
161    /// Last check time.
162    last_check: Option<Instant>,
163    /// Last result.
164    last_result: Option<HealthCheckResult>,
165}
166
167impl HealthChecker {
168    /// Create a new health checker.
169    #[must_use]
170    pub const fn new(config: HealthCheckConfig) -> Self {
171        Self {
172            config,
173            status: HealthStatus::Unknown,
174            failures: 0,
175            successes: 0,
176            last_check: None,
177            last_result: None,
178        }
179    }
180
181    /// Get current status.
182    #[must_use]
183    pub const fn status(&self) -> HealthStatus {
184        self.status
185    }
186
187    /// Get last result.
188    #[must_use]
189    pub const fn last_result(&self) -> Option<&HealthCheckResult> {
190        self.last_result.as_ref()
191    }
192
193    /// Check if a health check is due.
194    #[must_use]
195    pub fn is_check_due(&self) -> bool {
196        match self.last_check {
197            Some(last) => last.elapsed() >= self.config.interval,
198            None => true,
199        }
200    }
201
202    /// Record a successful check.
203    pub fn record_success(&mut self) {
204        self.failures = 0;
205        self.successes += 1;
206        self.last_check = Some(Instant::now());
207
208        if self.successes >= self.config.success_threshold {
209            self.status = HealthStatus::Healthy;
210        }
211
212        self.last_result = Some(HealthCheckResult::healthy());
213    }
214
215    /// Record a failed check.
216    pub fn record_failure(&mut self, message: impl Into<String>) {
217        self.successes = 0;
218        self.failures += 1;
219        self.last_check = Some(Instant::now());
220
221        if self.failures >= self.config.failure_threshold {
222            self.status = HealthStatus::Unhealthy;
223        } else if self.failures > 0 {
224            self.status = HealthStatus::Degraded;
225        }
226
227        self.last_result = Some(HealthCheckResult::unhealthy(message));
228    }
229
230    /// Reset the checker.
231    pub fn reset(&mut self) {
232        self.status = HealthStatus::Unknown;
233        self.failures = 0;
234        self.successes = 0;
235        self.last_check = None;
236        self.last_result = None;
237    }
238}
239
240/// Simple liveness check.
241#[must_use]
242pub fn liveness_check() -> HealthCheckResult {
243    HealthCheckResult::healthy()
244}
245
246/// Check if a process is alive by PID.
247#[must_use]
248#[cfg(unix)]
249#[allow(unsafe_code)]
250pub fn process_alive(pid: i32) -> bool {
251    // Send signal 0 to check if process exists
252    unsafe { libc::kill(pid, 0) == 0 }
253}
254
/// Check if a process is alive by PID.
///
/// Windows implementation: the process is opened with `PROCESS_SYNCHRONIZE`
/// access and polled through `WaitForSingleObject` with a zero timeout.
///
/// Returns `false` for non-positive PIDs and whenever the process cannot be
/// opened, which covers both "no such process" and "access denied".
#[must_use]
#[cfg(windows)]
#[allow(unsafe_code)]
pub fn process_alive(pid: i32) -> bool {
    use windows_sys::Win32::Foundation::{CloseHandle, WAIT_TIMEOUT};
    use windows_sys::Win32::System::Threading::{
        OpenProcess, PROCESS_SYNCHRONIZE, WaitForSingleObject,
    };

    // Zero and negative values are rejected before touching the OS.
    if pid < 1 {
        return false;
    }
    // Lossless: `pid` is strictly positive at this point.
    let process_id = pid as u32;

    // SAFETY: OpenProcess only takes plain integer arguments. PROCESS_SYNCHRONIZE
    // is the minimum access right WaitForSingleObject needs.
    let process = unsafe { OpenProcess(PROCESS_SYNCHRONIZE, 0, process_id) };
    if process.is_null() {
        // Either the process does not exist or we are not allowed to open it.
        return false;
    }

    // SAFETY: `process` is a non-null handle obtained from OpenProcess above.
    // With a zero timeout the call returns immediately: WAIT_TIMEOUT means the
    // process is still running, WAIT_OBJECT_0 (0) means it has terminated.
    let still_running = unsafe { WaitForSingleObject(process, 0) } == WAIT_TIMEOUT;

    // SAFETY: `process` is still the valid handle opened above and is closed
    // exactly once, here.
    unsafe {
        CloseHandle(process);
    }

    still_running
}
293
/// Check if a process is alive by PID.
///
/// Fallback for targets that are neither Unix nor Windows: there is no
/// portable way to check process liveness, so no check is attempted and the
/// answer is always `false`.
#[must_use]
#[cfg(not(any(unix, windows)))]
pub fn process_alive(_pid: i32) -> bool {
    false
}
303
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_healthy` / `is_operational` for the healthy, degraded and unhealthy states.
    #[test]
    fn health_status() {
        let healthy = HealthStatus::Healthy;
        assert!(healthy.is_healthy());
        assert!(healthy.is_operational());

        let degraded = HealthStatus::Degraded;
        assert!(!degraded.is_healthy());
        assert!(degraded.is_operational());

        let unhealthy = HealthStatus::Unhealthy;
        assert!(!unhealthy.is_operational());
    }

    /// Walks a checker (failure threshold 2, success threshold 1) through
    /// success -> failure -> failure -> success and checks the status after each step.
    #[test]
    fn health_checker_transitions() {
        let mut checker = HealthChecker::new(
            HealthCheckConfig::new()
                .with_failure_threshold(2)
                .with_success_threshold(1),
        );

        assert_eq!(checker.status(), HealthStatus::Unknown);

        // (outcome is a success?, status expected afterwards)
        let steps = [
            (true, HealthStatus::Healthy),
            (false, HealthStatus::Degraded),
            (false, HealthStatus::Unhealthy),
            (true, HealthStatus::Healthy),
        ];

        for &(succeeded, expected) in &steps {
            if succeeded {
                checker.record_success();
            } else {
                checker.record_failure("test");
            }
            assert_eq!(checker.status(), expected);
        }
    }
}