rust_supervisor/policy/failure_window.rs
1//! Failure window tracking for sliding accumulation.
2//!
3//! This module implements `FailureWindow` that supports two modes:
4//! - `time_sliding`: Accumulates failures within a fixed time window (e.g., last 60 seconds)
5//! - `count_sliding`: Accumulates the most recent N failures (e.g., last 10 exits)
6//!
7//! The accumulated results are written to `MeltdownScopeState.quota_counters`
8//! for the `evaluate budget` stage to read.
9
10use serde::{Deserialize, Serialize};
11use std::collections::VecDeque;
12use std::time::{Duration, Instant};
13
14/// Window mode configuration for failure accumulation.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16pub enum WindowMode {
17 /// Time-based sliding window with fixed duration.
18 TimeSliding {
19 /// Window width in seconds.
20 window_secs: u64,
21 },
22 /// Count-based sliding window with fixed failure count.
23 CountSliding {
24 /// Maximum number of failures to retain.
25 max_count: usize,
26 },
27}
28
29impl Default for WindowMode {
30 /// Creates a default time-sliding window with 60-second width.
31 fn default() -> Self {
32 Self::TimeSliding { window_secs: 60 }
33 }
34}
35
36/// Configuration for failure window behavior.
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38pub struct FailureWindowConfig {
39 /// Window mode selection and parameters.
40 pub mode: WindowMode,
41 /// Threshold at which the window is considered exhausted.
42 pub threshold: usize,
43}
44
45impl FailureWindowConfig {
46 /// Creates a time-sliding failure window configuration.
47 ///
48 /// # Arguments
49 ///
50 /// - `window_secs`: Window width in seconds.
51 /// - `threshold`: Failure count threshold.
52 ///
53 /// # Returns
54 ///
55 /// Returns a [`FailureWindowConfig`] with time-sliding mode.
56 ///
57 /// # Examples
58 ///
59 /// ```
60 /// let config = rust_supervisor::policy::failure_window::FailureWindowConfig::time_sliding(60, 5);
61 /// assert_eq!(config.threshold, 5);
62 /// ```
63 pub fn time_sliding(window_secs: u64, threshold: usize) -> Self {
64 Self {
65 mode: WindowMode::TimeSliding { window_secs },
66 threshold,
67 }
68 }
69
70 /// Creates a count-sliding failure window configuration.
71 ///
72 /// # Arguments
73 ///
74 /// - `max_count`: Maximum number of failures to retain.
75 /// - `threshold`: Failure count threshold.
76 ///
77 /// # Returns
78 ///
79 /// Returns a [`FailureWindowConfig`] with count-sliding mode.
80 pub fn count_sliding(max_count: usize, threshold: usize) -> Self {
81 Self {
82 mode: WindowMode::CountSliding { max_count },
83 threshold,
84 }
85 }
86}
87
/// Snapshot of a failure window's metrics at one point in time.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FailureWindowState {
    /// How many failures the window held when the snapshot was taken.
    pub current_count: usize,
    /// `true` when the failure count has reached or exceeded the threshold.
    pub threshold_reached: bool,
    /// Timestamp of the oldest failure in the window, or `None` if there is none.
    pub oldest_timestamp: Option<Instant>,
}
98
99/// Mutable failure window tracker supporting time and count sliding modes.
100#[derive(Debug, Clone)]
101pub struct FailureWindow {
102 /// Configuration that defines window behavior.
103 pub config: FailureWindowConfig,
104 /// Timestamps of recorded failures.
105 failures: VecDeque<Instant>,
106 /// Latest failure timestamp for cleanup logic.
107 last_failure: Option<Instant>,
108}
109
110impl FailureWindow {
111 /// Creates a new failure window with the given configuration.
112 ///
113 /// # Arguments
114 ///
115 /// - `config`: Window configuration defining mode and thresholds.
116 ///
117 /// # Returns
118 ///
119 /// Returns a [`FailureWindow`] with no recorded failures.
120 ///
121 /// # Examples
122 ///
123 /// ```
124 /// use rust_supervisor::policy::failure_window::{FailureWindow, FailureWindowConfig};
125 ///
126 /// let config = FailureWindowConfig::time_sliding(60, 5);
127 /// let window = FailureWindow::new(config);
128 /// assert_eq!(window.current_state().current_count, 0);
129 /// ```
130 pub fn new(config: FailureWindowConfig) -> Self {
131 Self {
132 config,
133 failures: VecDeque::new(),
134 last_failure: None,
135 }
136 }
137
138 /// Records a failure into the window.
139 ///
140 /// # Arguments
141 ///
142 /// - `now`: Current monotonic time supplied by the runtime or test.
143 ///
144 /// # Returns
145 ///
146 /// Returns the updated [`FailureWindowState`] after pruning and recording.
147 pub fn record_failure(&mut self, now: Instant) -> FailureWindowState {
148 self.prune(now);
149 self.failures.push_back(now);
150 self.last_failure = Some(now);
151
152 // For count-sliding mode, enforce max_count limit
153 if let WindowMode::CountSliding { max_count } = self.config.mode {
154 while self.failures.len() > max_count {
155 self.failures.pop_front();
156 }
157 }
158
159 self.current_state()
160 }
161
162 /// Clears all recorded failures.
163 ///
164 /// # Arguments
165 ///
166 /// This function has no arguments.
167 ///
168 /// # Returns
169 ///
170 /// This function returns nothing.
171 pub fn clear(&mut self) {
172 self.failures.clear();
173 self.last_failure = None;
174 }
175
176 /// Returns the current state of the failure window.
177 ///
178 /// # Arguments
179 ///
180 /// - `now`: Current monotonic time for time-sliding calculations.
181 ///
182 /// # Returns
183 ///
184 /// Returns a [`FailureWindowState`] with current metrics.
185 pub fn current_state_at(&self, now: Instant) -> FailureWindowState {
186 // Create a temporary copy to prune without mutating
187 let mut temp_failures = self.failures.clone();
188 if let WindowMode::TimeSliding { window_secs } = self.config.mode {
189 let window = Duration::from_secs(window_secs);
190 while temp_failures
191 .front()
192 .is_some_and(|ts| now.duration_since(*ts) > window)
193 {
194 temp_failures.pop_front();
195 }
196 }
197
198 let current_count = temp_failures.len();
199 let threshold_reached = current_count >= self.config.threshold;
200 let oldest_timestamp = temp_failures.front().copied();
201
202 FailureWindowState {
203 current_count,
204 threshold_reached,
205 oldest_timestamp,
206 }
207 }
208
209 /// Returns the current state without time-based pruning.
210 ///
211 /// # Arguments
212 ///
213 /// This function has no arguments.
214 ///
215 /// # Returns
216 ///
217 /// Returns a [`FailureWindowState`] with raw current metrics.
218 pub fn current_state(&self) -> FailureWindowState {
219 let current_count = self.failures.len();
220 let threshold_reached = current_count >= self.config.threshold;
221 let oldest_timestamp = self.failures.front().copied();
222
223 FailureWindowState {
224 current_count,
225 threshold_reached,
226 oldest_timestamp,
227 }
228 }
229
230 /// Removes expired entries based on window mode.
231 ///
232 /// # Arguments
233 ///
234 /// - `now`: Current monotonic time.
235 ///
236 /// # Returns
237 ///
238 /// This function returns nothing.
239 fn prune(&mut self, now: Instant) {
240 if let WindowMode::TimeSliding { window_secs } = self.config.mode {
241 let window = Duration::from_secs(window_secs);
242 while self
243 .failures
244 .front()
245 .is_some_and(|ts| now.duration_since(*ts) > window)
246 {
247 self.failures.pop_front();
248 }
249 }
250 // Count-sliding mode does not prune by time
251 }
252
253 /// Returns the number of failures currently in the window.
254 ///
255 /// # Arguments
256 ///
257 /// This function has no arguments.
258 ///
259 /// # Returns
260 ///
261 /// Returns the current failure count.
262 pub fn failure_count(&self) -> usize {
263 self.failures.len()
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::{FailureWindow, FailureWindowConfig, WindowMode};
    use std::time::{Duration, Instant};

    /// Verifies that a time-sliding window stops counting failures older than its width.
    #[test]
    fn test_time_sliding_window_expiration() {
        let mut window = FailureWindow::new(FailureWindowConfig::time_sliding(10, 3));
        let start = Instant::now();

        window.record_failure(start);
        window.record_failure(start + Duration::from_secs(5));

        // At t = 8s both failures are still inside the 10-second window.
        let at_eight = window.current_state_at(start + Duration::from_secs(8));
        assert_eq!(at_eight.current_count, 2);
        assert!(!at_eight.threshold_reached);

        // At t = 11s the first failure is more than 10 seconds old.
        let at_eleven = window.current_state_at(start + Duration::from_secs(11));
        assert_eq!(at_eleven.current_count, 1);
    }

    /// Verifies that a count-sliding window keeps only the newest N failures.
    #[test]
    fn test_count_sliding_window_limit() {
        let mut window = FailureWindow::new(FailureWindowConfig::count_sliding(3, 5));
        let start = Instant::now();

        window.record_failure(start);
        for offset_secs in 1..=3 {
            window.record_failure(start + Duration::from_secs(offset_secs));
        }

        // Four failures recorded, but only the last three are retained.
        assert_eq!(window.failure_count(), 3);
    }

    /// Verifies that the threshold flag flips once the failure count reaches the limit.
    #[test]
    fn test_threshold_detection() {
        let mut window = FailureWindow::new(FailureWindowConfig::time_sliding(60, 3));
        let start = Instant::now();

        window.record_failure(start);
        window.record_failure(start + Duration::from_secs(1));
        assert!(!window.current_state().threshold_reached);

        window.record_failure(start + Duration::from_secs(2));
        assert!(window.current_state().threshold_reached);
    }

    /// Verifies that the default window mode is time-sliding with a 60-second width.
    #[test]
    fn test_default_config() {
        let WindowMode::TimeSliding { window_secs } = WindowMode::default() else {
            panic!("Default should be TimeSliding");
        };
        assert_eq!(window_secs, 60);
    }
}
337}