Skip to main content

rust_supervisor/policy/
failure_window.rs

1//! Failure window tracking for sliding accumulation.
2//!
//! This module implements `FailureWindow`, which supports two modes:
4//! - `time_sliding`: Accumulates failures within a fixed time window (e.g., last 60 seconds)
5//! - `count_sliding`: Accumulates the most recent N failures (e.g., last 10 exits)
6//!
7//! The accumulated results are written to `MeltdownScopeState.quota_counters`
8//! for the `evaluate budget` stage to read.
9
10use serde::{Deserialize, Serialize};
11use std::collections::VecDeque;
12use std::time::{Duration, Instant};
13
14/// Window mode configuration for failure accumulation.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16pub enum WindowMode {
17    /// Time-based sliding window with fixed duration.
18    TimeSliding {
19        /// Window width in seconds.
20        window_secs: u64,
21    },
22    /// Count-based sliding window with fixed failure count.
23    CountSliding {
24        /// Maximum number of failures to retain.
25        max_count: usize,
26    },
27}
28
29impl Default for WindowMode {
30    /// Creates a default time-sliding window with 60-second width.
31    fn default() -> Self {
32        Self::TimeSliding { window_secs: 60 }
33    }
34}
35
36/// Configuration for failure window behavior.
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38pub struct FailureWindowConfig {
39    /// Window mode selection and parameters.
40    pub mode: WindowMode,
41    /// Threshold at which the window is considered exhausted.
42    pub threshold: usize,
43}
44
45impl FailureWindowConfig {
46    /// Creates a time-sliding failure window configuration.
47    ///
48    /// # Arguments
49    ///
50    /// - `window_secs`: Window width in seconds.
51    /// - `threshold`: Failure count threshold.
52    ///
53    /// # Returns
54    ///
55    /// Returns a [`FailureWindowConfig`] with time-sliding mode.
56    ///
57    /// # Examples
58    ///
59    /// ```
60    /// let config = rust_supervisor::policy::failure_window::FailureWindowConfig::time_sliding(60, 5);
61    /// assert_eq!(config.threshold, 5);
62    /// ```
63    pub fn time_sliding(window_secs: u64, threshold: usize) -> Self {
64        Self {
65            mode: WindowMode::TimeSliding { window_secs },
66            threshold,
67        }
68    }
69
70    /// Creates a count-sliding failure window configuration.
71    ///
72    /// # Arguments
73    ///
74    /// - `max_count`: Maximum number of failures to retain.
75    /// - `threshold`: Failure count threshold.
76    ///
77    /// # Returns
78    ///
79    /// Returns a [`FailureWindowConfig`] with count-sliding mode.
80    pub fn count_sliding(max_count: usize, threshold: usize) -> Self {
81        Self {
82            mode: WindowMode::CountSliding { max_count },
83            threshold,
84        }
85    }
86}
87
/// Snapshot of a failure window's metrics at one point in time.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct FailureWindowState {
    /// How many failures the window holds in this snapshot.
    pub current_count: usize,
    /// `true` when `current_count` is at or above the configured threshold.
    pub threshold_reached: bool,
    /// Timestamp of the oldest failure in the snapshot, or `None` when the
    /// window is empty. Populated in both time-sliding and count-sliding
    /// modes.
    pub oldest_timestamp: Option<Instant>,
}
98
99/// Mutable failure window tracker supporting time and count sliding modes.
100#[derive(Debug, Clone)]
101pub struct FailureWindow {
102    /// Configuration that defines window behavior.
103    pub config: FailureWindowConfig,
104    /// Timestamps of recorded failures.
105    failures: VecDeque<Instant>,
106    /// Latest failure timestamp for cleanup logic.
107    last_failure: Option<Instant>,
108}
109
110impl FailureWindow {
111    /// Creates a new failure window with the given configuration.
112    ///
113    /// # Arguments
114    ///
115    /// - `config`: Window configuration defining mode and thresholds.
116    ///
117    /// # Returns
118    ///
119    /// Returns a [`FailureWindow`] with no recorded failures.
120    ///
121    /// # Examples
122    ///
123    /// ```
124    /// use rust_supervisor::policy::failure_window::{FailureWindow, FailureWindowConfig};
125    ///
126    /// let config = FailureWindowConfig::time_sliding(60, 5);
127    /// let window = FailureWindow::new(config);
128    /// assert_eq!(window.current_state().current_count, 0);
129    /// ```
130    pub fn new(config: FailureWindowConfig) -> Self {
131        Self {
132            config,
133            failures: VecDeque::new(),
134            last_failure: None,
135        }
136    }
137
138    /// Records a failure into the window.
139    ///
140    /// # Arguments
141    ///
142    /// - `now`: Current monotonic time supplied by the runtime or test.
143    ///
144    /// # Returns
145    ///
146    /// Returns the updated [`FailureWindowState`] after pruning and recording.
147    pub fn record_failure(&mut self, now: Instant) -> FailureWindowState {
148        self.prune(now);
149        self.failures.push_back(now);
150        self.last_failure = Some(now);
151
152        // For count-sliding mode, enforce max_count limit
153        if let WindowMode::CountSliding { max_count } = self.config.mode {
154            while self.failures.len() > max_count {
155                self.failures.pop_front();
156            }
157        }
158
159        self.current_state()
160    }
161
162    /// Clears all recorded failures.
163    ///
164    /// # Arguments
165    ///
166    /// This function has no arguments.
167    ///
168    /// # Returns
169    ///
170    /// This function returns nothing.
171    pub fn clear(&mut self) {
172        self.failures.clear();
173        self.last_failure = None;
174    }
175
176    /// Returns the current state of the failure window.
177    ///
178    /// # Arguments
179    ///
180    /// - `now`: Current monotonic time for time-sliding calculations.
181    ///
182    /// # Returns
183    ///
184    /// Returns a [`FailureWindowState`] with current metrics.
185    pub fn current_state_at(&self, now: Instant) -> FailureWindowState {
186        // Create a temporary copy to prune without mutating
187        let mut temp_failures = self.failures.clone();
188        if let WindowMode::TimeSliding { window_secs } = self.config.mode {
189            let window = Duration::from_secs(window_secs);
190            while temp_failures
191                .front()
192                .is_some_and(|ts| now.duration_since(*ts) > window)
193            {
194                temp_failures.pop_front();
195            }
196        }
197
198        let current_count = temp_failures.len();
199        let threshold_reached = current_count >= self.config.threshold;
200        let oldest_timestamp = temp_failures.front().copied();
201
202        FailureWindowState {
203            current_count,
204            threshold_reached,
205            oldest_timestamp,
206        }
207    }
208
209    /// Returns the current state without time-based pruning.
210    ///
211    /// # Arguments
212    ///
213    /// This function has no arguments.
214    ///
215    /// # Returns
216    ///
217    /// Returns a [`FailureWindowState`] with raw current metrics.
218    pub fn current_state(&self) -> FailureWindowState {
219        let current_count = self.failures.len();
220        let threshold_reached = current_count >= self.config.threshold;
221        let oldest_timestamp = self.failures.front().copied();
222
223        FailureWindowState {
224            current_count,
225            threshold_reached,
226            oldest_timestamp,
227        }
228    }
229
230    /// Removes expired entries based on window mode.
231    ///
232    /// # Arguments
233    ///
234    /// - `now`: Current monotonic time.
235    ///
236    /// # Returns
237    ///
238    /// This function returns nothing.
239    fn prune(&mut self, now: Instant) {
240        if let WindowMode::TimeSliding { window_secs } = self.config.mode {
241            let window = Duration::from_secs(window_secs);
242            while self
243                .failures
244                .front()
245                .is_some_and(|ts| now.duration_since(*ts) > window)
246            {
247                self.failures.pop_front();
248            }
249        }
250        // Count-sliding mode does not prune by time
251    }
252
253    /// Returns the number of failures currently in the window.
254    ///
255    /// # Arguments
256    ///
257    /// This function has no arguments.
258    ///
259    /// # Returns
260    ///
261    /// Returns the current failure count.
262    pub fn failure_count(&self) -> usize {
263        self.failures.len()
264    }
265}
266
#[cfg(test)]
mod tests {
    use super::{FailureWindow, FailureWindowConfig, WindowMode};
    use std::time::{Duration, Instant};

    /// Returns the instant `secs` whole seconds after `origin`.
    fn at(origin: Instant, secs: u64) -> Instant {
        origin + Duration::from_secs(secs)
    }

    /// A time-sliding window stops counting a failure once it is older than
    /// the window width.
    #[test]
    fn test_time_sliding_window_expiration() {
        let mut window = FailureWindow::new(FailureWindowConfig::time_sliding(10, 3));
        let origin = Instant::now();

        for offset in [0, 5] {
            window.record_failure(at(origin, offset));
        }

        // At +8s both failures are younger than the 10-second width.
        let within = window.current_state_at(at(origin, 8));
        assert_eq!(within.current_count, 2);
        assert!(!within.threshold_reached);

        // At +11s the failure recorded at +0s has aged out.
        let after = window.current_state_at(at(origin, 11));
        assert_eq!(after.current_count, 1);
    }

    /// A count-sliding window keeps only the newest `max_count` failures.
    #[test]
    fn test_count_sliding_window_limit() {
        let mut window = FailureWindow::new(FailureWindowConfig::count_sliding(3, 5));
        let origin = Instant::now();

        // Four failures go in, one more than the cap of three.
        for offset in 0..4 {
            window.record_failure(at(origin, offset));
        }

        assert_eq!(window.failure_count(), 3);
    }

    /// The threshold flag flips exactly when the count reaches the limit.
    #[test]
    fn test_threshold_detection() {
        let mut window = FailureWindow::new(FailureWindowConfig::time_sliding(60, 3));
        let origin = Instant::now();

        window.record_failure(at(origin, 0));
        window.record_failure(at(origin, 1));
        assert!(!window.current_state().threshold_reached);

        window.record_failure(at(origin, 2));
        assert!(window.current_state().threshold_reached);
    }

    /// The default window mode is time-sliding with a 60-second width.
    #[test]
    fn test_default_config() {
        assert_eq!(
            WindowMode::default(),
            WindowMode::TimeSliding { window_secs: 60 },
            "Default should be TimeSliding"
        );
    }
}