Skip to main content

grate_limiter/
health.rs

1use serde::{Deserialize, Serialize};
2
3use crate::clock::Timestamp;
4
/// Health engine configuration.
///
/// Tuning knobs for the per-provider health score: how quickly past penalties
/// fade, how much each kind of outcome moves the score, and when repeated
/// failures put a provider into cooldown. The score itself is kept within
/// `[0.0, 1.0]` by the code that applies these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthConfig {
    /// Half-life for EWMA decay in seconds. Older observations matter less:
    /// after one half-life the gap between the score and 1.0 is halved.
    /// A value of zero or below disables decay.
    pub decay_half_life_seconds: f64,
    /// Penalty subtracted from the health score on a 429 response.
    pub penalty_429: f32,
    /// Penalty subtracted from the health score on a 403 response.
    pub penalty_403: f32,
    /// Penalty subtracted from the health score on a 5xx response.
    pub penalty_5xx: f32,
    /// Penalty subtracted from the health score on a timeout.
    pub penalty_timeout: f32,
    /// Boost added to the health score on a successful response.
    pub boost_success: f32,
    /// Number of consecutive failures before triggering cooldown. Cooldown
    /// starts once the failure streak reaches this count.
    pub cooldown_trigger_count: u32,
    /// Multiplier for exponential cooldown growth. It is raised to the number
    /// of failures beyond `cooldown_trigger_count` and applied to the
    /// caller-supplied base cooldown.
    pub cooldown_multiplier: f64,
    /// Maximum cooldown duration in seconds; computed cooldowns are capped here.
    pub max_cooldown_seconds: u64,
}
27
28impl Default for HealthConfig {
29    fn default() -> Self {
30        Self {
31            decay_half_life_seconds: 300.0,
32            penalty_429: 0.25,
33            penalty_403: 0.50,
34            penalty_5xx: 0.10,
35            penalty_timeout: 0.20,
36            boost_success: 0.02,
37            cooldown_trigger_count: 3,
38            cooldown_multiplier: 2.0,
39            max_cooldown_seconds: 600,
40        }
41    }
42}
43
/// Runtime health state for a single provider.
///
/// Holds the decaying health score, the current failure streak and cooldown
/// bookkeeping, simple counters, and a smoothed latency estimate. All fields
/// are private; they are read and updated through the methods on this type.
pub(crate) struct HealthState {
    /// Current health score [0.0, 1.0]. Starts at 1.0.
    score: f32,
    /// Consecutive failure count (for cooldown triggering). Reset to zero by
    /// a success.
    consecutive_failures: u32,
    /// Current cooldown duration in seconds (grows exponentially). This is the
    /// most recently computed duration; it is zero until a cooldown has been
    /// triggered.
    current_cooldown_secs: u64,
    /// Timestamp when cooldown expires (if active). `None` until the first
    /// cooldown is triggered.
    cooldown_until: Option<Timestamp>,
    /// Last observation timestamp (for EWMA decay).
    last_observation: Timestamp,
    /// Total observations, successes and failures combined.
    total_observations: u64,
    /// Total successes.
    total_successes: u64,
    /// EWMA latency in milliseconds. Only successes contribute samples; the
    /// value is 0.0 before the first one.
    ewma_latency_ms: f64,
}
63
64impl HealthState {
65    pub(crate) fn new(now: Timestamp) -> Self {
66        Self {
67            score: 1.0,
68            consecutive_failures: 0,
69            current_cooldown_secs: 0,
70            cooldown_until: None,
71            last_observation: now,
72            total_observations: 0,
73            total_successes: 0,
74            ewma_latency_ms: 0.0,
75        }
76    }
77
78    /// Current health score.
79    pub(crate) fn score(&self) -> f32 {
80        self.score
81    }
82
83    /// Whether the provider is currently in cooldown.
84    pub(crate) fn is_in_cooldown(&self, now: Timestamp) -> bool {
85        match self.cooldown_until {
86            Some(until) => now < until,
87            None => false,
88        }
89    }
90
91    /// EWMA latency in milliseconds.
92    pub(crate) fn latency_ms(&self) -> f64 {
93        self.ewma_latency_ms
94    }
95
96    /// Apply a successful observation.
97    pub(crate) fn record_success(
98        &mut self,
99        latency_ms: u64,
100        now: Timestamp,
101        config: &HealthConfig,
102    ) {
103        self.apply_decay(now, config);
104        self.score = (self.score + config.boost_success).min(1.0);
105        self.consecutive_failures = 0;
106        self.total_observations += 1;
107        self.total_successes += 1;
108        self.update_latency(latency_ms, config);
109        self.last_observation = now;
110    }
111
112    /// Apply a 429 (rate limited) observation.
113    pub(crate) fn record_rate_limited(
114        &mut self,
115        now: Timestamp,
116        config: &HealthConfig,
117        default_cooldown_secs: u64,
118    ) {
119        self.apply_decay(now, config);
120        self.score = (self.score - config.penalty_429).max(0.0);
121        self.total_observations += 1;
122        self.record_failure(now, config, default_cooldown_secs);
123        self.last_observation = now;
124    }
125
126    /// Apply a 403 (forbidden) observation.
127    pub(crate) fn record_forbidden(
128        &mut self,
129        now: Timestamp,
130        config: &HealthConfig,
131        default_cooldown_secs: u64,
132    ) {
133        self.apply_decay(now, config);
134        self.score = (self.score - config.penalty_403).max(0.0);
135        self.total_observations += 1;
136        self.record_failure(now, config, default_cooldown_secs);
137        self.last_observation = now;
138    }
139
140    /// Apply a 5xx (server error) observation.
141    pub(crate) fn record_server_error(
142        &mut self,
143        now: Timestamp,
144        config: &HealthConfig,
145        default_cooldown_secs: u64,
146    ) {
147        self.apply_decay(now, config);
148        self.score = (self.score - config.penalty_5xx).max(0.0);
149        self.total_observations += 1;
150        self.record_failure(now, config, default_cooldown_secs);
151        self.last_observation = now;
152    }
153
154    /// Apply a timeout observation.
155    pub(crate) fn record_timeout(
156        &mut self,
157        now: Timestamp,
158        config: &HealthConfig,
159        default_cooldown_secs: u64,
160    ) {
161        self.apply_decay(now, config);
162        self.score = (self.score - config.penalty_timeout).max(0.0);
163        self.total_observations += 1;
164        self.record_failure(now, config, default_cooldown_secs);
165        self.last_observation = now;
166    }
167
168    /// Apply EWMA decay — old penalties fade, health recovers naturally.
169    fn apply_decay(&mut self, now: Timestamp, config: &HealthConfig) {
170        let elapsed_secs = now.duration_since(self.last_observation) as f64 / 1_000_000_000.0;
171        if elapsed_secs <= 0.0 || config.decay_half_life_seconds <= 0.0 {
172            return;
173        }
174
175        // EWMA decay: score moves toward 1.0 over time
176        let decay_factor = (0.5_f64).powf(elapsed_secs / config.decay_half_life_seconds);
177        // score = 1.0 - (1.0 - score) * decay_factor
178        let deficit = 1.0 - self.score;
179        self.score = 1.0 - deficit * decay_factor as f32;
180        self.score = self.score.clamp(0.0, 1.0);
181    }
182
183    /// Record a failure and potentially trigger cooldown.
184    fn record_failure(
185        &mut self,
186        now: Timestamp,
187        config: &HealthConfig,
188        default_cooldown_secs: u64,
189    ) {
190        self.consecutive_failures += 1;
191
192        if self.consecutive_failures >= config.cooldown_trigger_count {
193            // Calculate cooldown duration with exponential growth
194            let excess = self.consecutive_failures - config.cooldown_trigger_count;
195            let multiplier = config.cooldown_multiplier.powi(excess as i32);
196            let cooldown_secs = (default_cooldown_secs as f64 * multiplier) as u64;
197            self.current_cooldown_secs = cooldown_secs.min(config.max_cooldown_seconds);
198            self.cooldown_until = Some(now.add_secs(self.current_cooldown_secs));
199        }
200    }
201
202    /// Update EWMA latency with a new sample.
203    fn update_latency(&mut self, latency_ms: u64, _config: &HealthConfig) {
204        const ALPHA: f64 = 0.3; // EWMA smoothing factor
205        if self.total_observations <= 1 {
206            self.ewma_latency_ms = latency_ms as f64;
207        } else {
208            self.ewma_latency_ms = ALPHA * latency_ms as f64 + (1.0 - ALPHA) * self.ewma_latency_ms;
209        }
210    }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Timestamp` from a millisecond count (wrapped value is
    /// `ms * 1_000_000`).
    fn ts(ms: u64) -> Timestamp {
        Timestamp(ms * 1_000_000)
    }

    #[test]
    fn initial_health_is_perfect() {
        let h = HealthState::new(ts(0));
        assert_eq!(h.score(), 1.0);
        assert!(!h.is_in_cooldown(ts(0)));
        assert_eq!(h.latency_ms(), 0.0);
    }

    #[test]
    fn success_maintains_health() {
        let config = HealthConfig::default();
        let mut h = HealthState::new(ts(0));
        h.record_success(100, ts(1_000), &config);
        // Boost is applied on top of a perfect score and capped at 1.0.
        assert_eq!(h.score(), 1.0);
    }

    #[test]
    fn rate_limit_reduces_health() {
        let config = HealthConfig::default();
        let mut h = HealthState::new(ts(0));
        h.record_rate_limited(ts(1_000), &config, 60);
        assert!(h.score() < 1.0);
        assert!((h.score() - (1.0 - config.penalty_429)).abs() < 0.01);
    }

    #[test]
    fn forbidden_is_severe() {
        let config = HealthConfig::default();
        let mut h = HealthState::new(ts(0));
        h.record_forbidden(ts(1_000), &config, 60);
        assert!((h.score() - (1.0 - config.penalty_403)).abs() < 0.01);
    }

    #[test]
    fn server_error_and_timeout_apply_their_penalties() {
        let config = HealthConfig::default();

        let mut h = HealthState::new(ts(0));
        h.record_server_error(ts(1_000), &config, 60);
        assert!((h.score() - (1.0 - config.penalty_5xx)).abs() < 0.01);

        let mut h = HealthState::new(ts(0));
        h.record_timeout(ts(1_000), &config, 60);
        assert!((h.score() - (1.0 - config.penalty_timeout)).abs() < 0.01);
    }

    #[test]
    fn health_decays_toward_full() {
        let config = HealthConfig {
            decay_half_life_seconds: 10.0, // fast decay for testing
            ..Default::default()
        };
        let mut h = HealthState::new(ts(0));
        h.record_rate_limited(ts(0), &config, 60);
        let after_penalty = h.score();

        // After 10s (one half-life) the deficit halves, then the success
        // boost is added on top.
        h.record_success(100, ts(10_000), &config);
        assert!(h.score() > after_penalty);
        let expected = 1.0 - (1.0 - after_penalty) * 0.5 + config.boost_success;
        assert!((h.score() - expected).abs() < 1e-4);
    }

    #[test]
    fn consecutive_failures_trigger_cooldown() {
        let config = HealthConfig {
            cooldown_trigger_count: 3,
            ..Default::default()
        };
        let mut h = HealthState::new(ts(0));

        h.record_rate_limited(ts(1_000), &config, 30);
        assert!(!h.is_in_cooldown(ts(1_000)));

        h.record_rate_limited(ts(2_000), &config, 30);
        assert!(!h.is_in_cooldown(ts(2_000)));

        h.record_rate_limited(ts(3_000), &config, 30);
        assert!(h.is_in_cooldown(ts(3_000)));
        assert!(h.is_in_cooldown(ts(32_000))); // still in cooldown at 32s

        // After cooldown expires (3s + 30s = 33s)
        assert!(!h.is_in_cooldown(ts(34_000)));
    }

    #[test]
    fn success_resets_failure_streak() {
        let config = HealthConfig {
            cooldown_trigger_count: 3,
            ..Default::default()
        };
        let mut h = HealthState::new(ts(0));

        h.record_rate_limited(ts(1_000), &config, 30);
        h.record_rate_limited(ts(2_000), &config, 30);
        h.record_success(100, ts(3_000), &config);

        // Two more failures: the streak restarted, so the trigger of 3 is not
        // reached and no cooldown begins.
        h.record_rate_limited(ts(4_000), &config, 30);
        h.record_rate_limited(ts(5_000), &config, 30);
        assert!(!h.is_in_cooldown(ts(5_000)));
    }

    #[test]
    fn cooldown_grows_exponentially() {
        let config = HealthConfig {
            cooldown_trigger_count: 2,
            cooldown_multiplier: 2.0,
            max_cooldown_seconds: 600,
            ..Default::default()
        };
        let mut h = HealthState::new(ts(0));

        // First cooldown at consecutive_failures=2: base duration
        h.record_rate_limited(ts(1_000), &config, 30);
        h.record_rate_limited(ts(2_000), &config, 30);
        assert!(h.is_in_cooldown(ts(2_000)));

        // Third failure: cooldown should double
        h.record_rate_limited(ts(33_000), &config, 30);
        // excess=1, multiplier=2^1=2, cooldown=60s
        assert!(h.is_in_cooldown(ts(92_000))); // 33s + 60s = 93s
        assert!(!h.is_in_cooldown(ts(94_000))); // and not longer than that
    }

    #[test]
    fn health_score_bounded() {
        let config = HealthConfig::default();
        let mut h = HealthState::new(ts(0));

        // Many failures
        for i in 0..20 {
            h.record_rate_limited(ts(i * 1_000), &config, 60);
            assert!(h.score() >= 0.0 && h.score() <= 1.0);
        }

        // Many successes
        for i in 20..40 {
            h.record_success(100, ts(i * 1_000), &config);
            assert!(h.score() >= 0.0 && h.score() <= 1.0);
        }
    }

    #[test]
    fn ewma_latency_smooths() {
        let config = HealthConfig::default();
        let mut h = HealthState::new(ts(0));

        h.record_success(100, ts(1_000), &config);
        assert_eq!(h.latency_ms(), 100.0);

        h.record_success(200, ts(2_000), &config);
        // EWMA: 0.3 * 200 + 0.7 * 100 = 130
        assert!((h.latency_ms() - 130.0).abs() < 1.0);
    }
}