Skip to main content

raps_kernel/
api_health.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2024-2025 Dmytro Yemelianov
3
4//! API Health Tracking
5//!
6//! Lock-free latency tracker fed by `send_with_retry`. Computes running average,
7//! jitter (standard deviation), min/max, and health status from actual request data
8//! with zero extra HTTP calls.
9
10use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
11use std::time::Duration;
12
/// Coarse health verdict derived from average latency and jitter.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HealthStatus {
    /// Average latency below 500 ms and jitter below 200 ms.
    Healthy,
    /// Not healthy, but average latency below 2 s and jitter below 500 ms.
    Degraded,
    /// Average latency of 2 s or more, or jitter of 500 ms or more.
    Unhealthy,
    /// Nothing has been recorded yet, so no verdict is possible.
    Unknown,
}

impl std::fmt::Display for HealthStatus {
    /// Writes the lowercase name of the status. Width and padding flags are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            HealthStatus::Healthy => "healthy",
            HealthStatus::Degraded => "degraded",
            HealthStatus::Unhealthy => "unhealthy",
            HealthStatus::Unknown => "unknown",
        };
        f.write_str(label)
    }
}
36
/// Snapshot of API health metrics at a point in time.
///
/// Produced by `snapshot()`. When no samples have been recorded, every duration is
/// `Duration::ZERO` and `health_status` is `HealthStatus::Unknown`.
#[derive(Debug, Clone)]
pub struct HealthSnapshot {
    /// Mean latency: accumulated latency divided by the number of samples.
    pub avg_latency: Duration,
    /// Standard deviation of the recorded latencies, computed with integer arithmetic.
    pub jitter: Duration,
    /// Smallest latency recorded so far.
    pub min_latency: Duration,
    /// Largest latency recorded so far.
    pub max_latency: Duration,
    /// Latency stored by the most recent `record_latency` call.
    pub last_latency: Duration,
    /// Number of latency samples the statistics are based on.
    pub sample_count: usize,
    /// Number of failures reported through `record_failure`.
    pub failure_count: usize,
    /// Classification derived from `avg_latency` and `jitter` (whole milliseconds).
    pub health_status: HealthStatus,
}
49
/// Global lock-free API health tracker.
///
/// Uses atomics (same pattern as `profiler.rs`) to avoid mutex contention
/// during concurrent HTTP requests.
///
/// All latency values are stored as whole microseconds in `u64` counters.
pub struct ApiHealth {
    /// Sum of latencies in microseconds.
    total_latency_us: AtomicU64,
    /// Sum of squared latencies in microseconds (for jitter/stddev computation).
    total_latency_sq_us: AtomicU64,
    /// Number of successful request samples.
    sample_count: AtomicUsize,
    /// Minimum observed latency in microseconds.
    /// Holds `u64::MAX` until the first sample has been recorded.
    min_latency_us: AtomicU64,
    /// Maximum observed latency in microseconds.
    max_latency_us: AtomicU64,
    /// Most recent latency in microseconds.
    last_latency_us: AtomicU64,
    /// Number of terminal failures.
    failure_count: AtomicUsize,
}

impl ApiHealth {
    /// Creates an empty tracker: all sums and counters are zero.
    ///
    /// `const` so the tracker can be used to initialise a `static`.
    const fn new() -> Self {
        Self {
            total_latency_us: AtomicU64::new(0),
            total_latency_sq_us: AtomicU64::new(0),
            sample_count: AtomicUsize::new(0),
            // Start at the largest value so the first `fetch_min` always replaces it.
            min_latency_us: AtomicU64::new(u64::MAX),
            max_latency_us: AtomicU64::new(0),
            last_latency_us: AtomicU64::new(0),
            failure_count: AtomicUsize::new(0),
        }
    }
}
84
/// The single tracker instance written by `record_latency` / `record_failure`
/// and read by `snapshot`.
static GLOBAL_HEALTH: ApiHealth = ApiHealth::new();
86
87/// Record latency from a completed HTTP request.
88/// Called from `send_with_retry` on every successful completion.
89pub fn record_latency(duration: Duration) {
90    let us = duration.as_micros() as u64;
91
92    GLOBAL_HEALTH
93        .total_latency_us
94        .fetch_add(us, Ordering::Relaxed);
95
96    // For jitter: accumulate squared latency.
97    // Cap individual squared value to avoid overflow on extremely slow requests.
98    let us_sq = us.saturating_mul(us);
99    GLOBAL_HEALTH
100        .total_latency_sq_us
101        .fetch_add(us_sq, Ordering::Relaxed);
102
103    GLOBAL_HEALTH.sample_count.fetch_add(1, Ordering::Relaxed);
104
105    // Update min (atomic fetch_min)
106    GLOBAL_HEALTH
107        .min_latency_us
108        .fetch_min(us, Ordering::Relaxed);
109
110    // Update max (atomic fetch_max)
111    GLOBAL_HEALTH
112        .max_latency_us
113        .fetch_max(us, Ordering::Relaxed);
114
115    // Store last latency
116    GLOBAL_HEALTH.last_latency_us.store(us, Ordering::Relaxed);
117}
118
119/// Record a terminal failure (request that exhausted all retries).
120pub fn record_failure() {
121    GLOBAL_HEALTH.failure_count.fetch_add(1, Ordering::Relaxed);
122}
123
124/// Take a snapshot of current API health metrics.
125pub fn snapshot() -> HealthSnapshot {
126    let count = GLOBAL_HEALTH.sample_count.load(Ordering::Relaxed);
127    let failures = GLOBAL_HEALTH.failure_count.load(Ordering::Relaxed);
128
129    if count == 0 {
130        return HealthSnapshot {
131            avg_latency: Duration::ZERO,
132            jitter: Duration::ZERO,
133            min_latency: Duration::ZERO,
134            max_latency: Duration::ZERO,
135            last_latency: Duration::ZERO,
136            sample_count: 0,
137            failure_count: failures,
138            health_status: HealthStatus::Unknown,
139        };
140    }
141
142    let total_us = GLOBAL_HEALTH.total_latency_us.load(Ordering::Relaxed);
143    let total_sq_us = GLOBAL_HEALTH.total_latency_sq_us.load(Ordering::Relaxed);
144    let min_us = GLOBAL_HEALTH.min_latency_us.load(Ordering::Relaxed);
145    let max_us = GLOBAL_HEALTH.max_latency_us.load(Ordering::Relaxed);
146    let last_us = GLOBAL_HEALTH.last_latency_us.load(Ordering::Relaxed);
147
148    let avg_us = total_us / count as u64;
149
150    // Compute standard deviation (jitter) using: stddev = sqrt(E[X^2] - E[X]^2)
151    let mean_sq = total_sq_us / count as u64;
152    let sq_mean = avg_us.saturating_mul(avg_us);
153    let variance_us = mean_sq.saturating_sub(sq_mean);
154    let jitter_us = isqrt(variance_us);
155
156    let avg_ms = avg_us / 1000;
157    let jitter_ms = jitter_us / 1000;
158
159    let health_status = classify_health(avg_ms, jitter_ms);
160
161    HealthSnapshot {
162        avg_latency: Duration::from_micros(avg_us),
163        jitter: Duration::from_micros(jitter_us),
164        min_latency: Duration::from_micros(min_us),
165        max_latency: Duration::from_micros(max_us),
166        last_latency: Duration::from_micros(last_us),
167        sample_count: count,
168        failure_count: failures,
169        health_status,
170    }
171}
172
173/// Format a one-liner status string.
174pub fn status_line() -> String {
175    let snap = snapshot();
176    match snap.health_status {
177        HealthStatus::Unknown => "API: unknown (no samples)".to_string(),
178        _ => {
179            format!(
180                "API: {} (avg: {}, jitter: {})",
181                snap.health_status,
182                format_duration_ms(snap.avg_latency),
183                format_duration_ms(snap.jitter),
184            )
185        }
186    }
187}
188
189/// Classify health status from average latency and jitter (in milliseconds).
190fn classify_health(avg_ms: u64, jitter_ms: u64) -> HealthStatus {
191    if avg_ms < 500 && jitter_ms < 200 {
192        HealthStatus::Healthy
193    } else if avg_ms < 2000 && jitter_ms < 500 {
194        HealthStatus::Degraded
195    } else {
196        HealthStatus::Unhealthy
197    }
198}
199
/// Floor of the square root of `n`, found with Heron's (Newton's) iteration on integers.
fn isqrt(n: u64) -> u64 {
    let mut root = n;
    let mut candidate = n.div_ceil(2);
    loop {
        // The estimates shrink until they stop improving; the last improving one is the
        // answer. For `n == 0` both start at 0, so this returns before any division.
        if candidate >= root {
            return root;
        }
        root = candidate;
        candidate = (root + n / root) / 2;
    }
}
213
/// Render a `Duration` for display: whole milliseconds (`"340ms"`) below one second,
/// seconds with one decimal place (`"2.1s"`) from one second upwards.
pub fn format_duration_ms(d: Duration) -> String {
    match d.as_millis() {
        millis @ 0..=999 => format!("{}ms", millis),
        _ => format!("{:.1}s", d.as_secs_f64()),
    }
}
223
#[cfg(test)]
mod tests {
    use super::*;

    // Every test builds its own `ApiHealth`, so nothing here reads or writes the global
    // tracker and tests cannot influence each other through shared state.

    /// Fresh tracker with no samples and no failures.
    fn make_tracker() -> ApiHealth {
        ApiHealth::new()
    }

    /// Same bookkeeping as `record_latency`, applied to `tracker` instead of the global.
    fn record_on(tracker: &ApiHealth, duration: Duration) {
        let micros = duration.as_micros() as u64;
        let micros_squared = micros.saturating_mul(micros);

        tracker
            .total_latency_us
            .fetch_add(micros, Ordering::Relaxed);
        tracker
            .total_latency_sq_us
            .fetch_add(micros_squared, Ordering::Relaxed);
        tracker.sample_count.fetch_add(1, Ordering::Relaxed);
        tracker.min_latency_us.fetch_min(micros, Ordering::Relaxed);
        tracker.max_latency_us.fetch_max(micros, Ordering::Relaxed);
        tracker.last_latency_us.store(micros, Ordering::Relaxed);
    }

    /// Records each latency, given in milliseconds, in slice order.
    fn record_all_ms(tracker: &ApiHealth, latencies_ms: &[u64]) {
        for &ms in latencies_ms {
            record_on(tracker, Duration::from_millis(ms));
        }
    }

    /// Snapshot computation for an isolated tracker rather than the global instance.
    fn snapshot_of(tracker: &ApiHealth) -> HealthSnapshot {
        let samples = tracker.sample_count.load(Ordering::Relaxed);
        let failure_total = tracker.failure_count.load(Ordering::Relaxed);

        if samples == 0 {
            return HealthSnapshot {
                avg_latency: Duration::ZERO,
                jitter: Duration::ZERO,
                min_latency: Duration::ZERO,
                max_latency: Duration::ZERO,
                last_latency: Duration::ZERO,
                sample_count: 0,
                failure_count: failure_total,
                health_status: HealthStatus::Unknown,
            };
        }

        let sum_us = tracker.total_latency_us.load(Ordering::Relaxed);
        let sum_sq_us = tracker.total_latency_sq_us.load(Ordering::Relaxed);
        let lowest_us = tracker.min_latency_us.load(Ordering::Relaxed);
        let highest_us = tracker.max_latency_us.load(Ordering::Relaxed);
        let latest_us = tracker.last_latency_us.load(Ordering::Relaxed);

        // stddev = sqrt(E[X^2] - E[X]^2), all in integer microseconds.
        let mean_us = sum_us / samples as u64;
        let mean_of_squares = sum_sq_us / samples as u64;
        let square_of_mean = mean_us.saturating_mul(mean_us);
        let stddev_us = isqrt(mean_of_squares.saturating_sub(square_of_mean));

        HealthSnapshot {
            avg_latency: Duration::from_micros(mean_us),
            jitter: Duration::from_micros(stddev_us),
            min_latency: Duration::from_micros(lowest_us),
            max_latency: Duration::from_micros(highest_us),
            last_latency: Duration::from_micros(latest_us),
            sample_count: samples,
            failure_count: failure_total,
            health_status: classify_health(mean_us / 1000, stddev_us / 1000),
        }
    }

    #[test]
    fn test_no_samples_unknown() {
        let snap = snapshot_of(&make_tracker());
        assert_eq!(snap.health_status, HealthStatus::Unknown);
        assert_eq!(snap.sample_count, 0);
        assert_eq!(snap.avg_latency, Duration::ZERO);
    }

    #[test]
    fn test_single_sample() {
        let tracker = make_tracker();
        record_all_ms(&tracker, &[100]);
        let snap = snapshot_of(&tracker);
        assert_eq!(snap.sample_count, 1);
        assert_eq!(snap.avg_latency.as_millis(), 100);
        assert_eq!(snap.min_latency.as_millis(), 100);
        assert_eq!(snap.max_latency.as_millis(), 100);
        assert_eq!(snap.jitter.as_millis(), 0);
        assert_eq!(snap.health_status, HealthStatus::Healthy);
    }

    #[test]
    fn test_average_latency() {
        let tracker = make_tracker();
        record_all_ms(&tracker, &[100, 200, 300]);
        let snap = snapshot_of(&tracker);
        assert_eq!(snap.sample_count, 3);
        assert_eq!(snap.avg_latency.as_millis(), 200);
        assert_eq!(snap.min_latency.as_millis(), 100);
        assert_eq!(snap.max_latency.as_millis(), 300);
    }

    #[test]
    fn test_jitter_calculation() {
        let tracker = make_tracker();
        // Samples of 100ms and 300ms have a mean of 200ms, so
        // variance = ((100-200)^2 + (300-200)^2) / 2 = 10000 ms^2 and stddev = 100ms.
        record_all_ms(&tracker, &[100, 300]);
        let jitter_ms = snapshot_of(&tracker).jitter.as_millis();
        // Integer arithmetic may be off by a hair, so accept 99..=101.
        assert!(
            (99..=101).contains(&jitter_ms),
            "jitter was {}ms",
            jitter_ms
        );
    }

    #[test]
    fn test_healthy_classification() {
        for (avg_ms, jitter_ms) in [(200, 50), (499, 199)] {
            assert_eq!(classify_health(avg_ms, jitter_ms), HealthStatus::Healthy);
        }
    }

    #[test]
    fn test_degraded_classification() {
        for (avg_ms, jitter_ms) in [(500, 50), (1999, 499), (200, 200)] {
            assert_eq!(classify_health(avg_ms, jitter_ms), HealthStatus::Degraded);
        }
    }

    #[test]
    fn test_unhealthy_classification() {
        for (avg_ms, jitter_ms) in [(2000, 50), (200, 500), (5000, 1000)] {
            assert_eq!(classify_health(avg_ms, jitter_ms), HealthStatus::Unhealthy);
        }
    }

    #[test]
    fn test_failure_count() {
        let tracker = make_tracker();
        for _ in 0..2 {
            tracker.failure_count.fetch_add(1, Ordering::Relaxed);
        }
        assert_eq!(snapshot_of(&tracker).failure_count, 2);
    }

    #[test]
    fn test_last_latency_tracks_most_recent() {
        let tracker = make_tracker();
        record_all_ms(&tracker, &[100, 500, 200]);
        assert_eq!(snapshot_of(&tracker).last_latency.as_millis(), 200);
    }

    #[test]
    fn test_min_max_tracking() {
        let tracker = make_tracker();
        record_all_ms(&tracker, &[500, 100, 1000, 200]);
        let snap = snapshot_of(&tracker);
        assert_eq!(snap.min_latency.as_millis(), 100);
        assert_eq!(snap.max_latency.as_millis(), 1000);
    }

    #[test]
    fn test_isqrt() {
        // (input, floor of its square root)
        let cases = [
            (0, 0),
            (1, 1),
            (4, 2),
            (9, 3),
            (10, 3),
            (100, 10),
            (10000, 100),
        ];
        for (input, expected) in cases {
            assert_eq!(isqrt(input), expected);
        }
    }

    #[test]
    fn test_format_duration_ms() {
        let cases = [
            (0, "0ms"),
            (340, "340ms"),
            (999, "999ms"),
            (1000, "1.0s"),
            (2100, "2.1s"),
        ];
        for (millis, expected) in cases {
            assert_eq!(format_duration_ms(Duration::from_millis(millis)), expected);
        }
    }

    #[test]
    fn test_health_status_display() {
        let cases = [
            (HealthStatus::Healthy, "healthy"),
            (HealthStatus::Degraded, "degraded"),
            (HealthStatus::Unhealthy, "unhealthy"),
            (HealthStatus::Unknown, "unknown"),
        ];
        for (status, expected) in cases {
            assert_eq!(status.to_string(), expected);
        }
    }

    #[test]
    fn test_overflow_safety_large_latency() {
        let tracker = make_tracker();
        // A 100-second latency is far beyond normal values; recording it must not panic.
        record_on(&tracker, Duration::from_secs(100));
        let snap = snapshot_of(&tracker);
        assert_eq!(snap.sample_count, 1);
        assert_eq!(snap.avg_latency.as_secs(), 100);
    }
}
433}