
chant/retry.rs

//! Retry logic with exponential backoff for failed specs.
//!
//! Provides retry state tracking and decision logic for determining whether
//! a failed spec should be retried or marked as permanently failed.
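//!
//! # Example
//!
//! A minimal sketch of the intended flow, combining [`RetryState`] with
//! [`decide_retry`]. The `FailureConfig` values mirror this module's tests and
//! the import paths are assumed from this file's location, so the block is
//! marked `ignore` and is illustrative rather than normative.
//!
//! ```ignore
//! use chant::config::{FailureConfig, OnPermanentFailure};
//! use chant::retry::{decide_retry, RetryDecision, RetryState};
//!
//! let config = FailureConfig {
//!     max_retries: 3,
//!     retry_delay_ms: 60_000,
//!     backoff_multiplier: 2.0,
//!     retryable_patterns: vec!["rate_limit".to_string()],
//!     on_permanent_failure: OnPermanentFailure::Skip,
//! };
//!
//! let mut state = RetryState::new();
//! match decide_retry(&state, "Error: API rate_limit exceeded", &config) {
//!     RetryDecision::Retry(delay) => {
//!         // Wait for `delay`, re-run the spec, then record the attempt.
//!         state.record_attempt(delay.as_millis() as u64);
//!     }
//!     RetryDecision::PermanentFailure(reason) => {
//!         eprintln!("spec permanently failed: {reason}");
//!     }
//! }
//! ```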

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use crate::config::FailureConfig;

/// Maximum retry delay capped at 1 hour to prevent overflow
const MAX_RETRY_DELAY_MS: u64 = 3_600_000;

/// Retry state for tracking retry attempts and timing
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryState {
    /// Number of retry attempts made so far
    pub attempts: usize,
    /// Timestamp of last retry attempt (milliseconds since epoch)
    pub last_retry_time: u64,
    /// Timestamp when next retry should occur (milliseconds since epoch)
    pub next_retry_time: u64,
}

impl RetryState {
    /// Create a new retry state with no attempts
    pub fn new() -> Self {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as u64;

        Self {
            attempts: 0,
            last_retry_time: now,
            next_retry_time: now,
        }
    }

    /// Update retry state after a failed attempt
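    ///
    /// # Example
    ///
    /// A small sketch (marked `ignore` since module paths are assumed): the
    /// attempt counter increments and `next_retry_time` becomes
    /// `last_retry_time + next_delay_ms`.
    ///
    /// ```ignore
    /// let mut state = RetryState::new();
    /// state.record_attempt(5_000);
    /// assert_eq!(state.attempts, 1);
    /// assert_eq!(state.next_retry_time, state.last_retry_time + 5_000);
    /// ```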
    pub fn record_attempt(&mut self, next_delay_ms: u64) {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as u64;

        self.attempts += 1;
        self.last_retry_time = now;
        self.next_retry_time = now + next_delay_ms;
    }
}

impl Default for RetryState {
    fn default() -> Self {
        Self::new()
    }
}

/// Decision on whether to retry a failed spec
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RetryDecision {
    /// Retry after the specified delay
    Retry(Duration),
    /// Permanent failure with reason
    PermanentFailure(String),
}

/// Determine whether a failed spec should be retried based on retry state, error log and config.
///
/// # Arguments
/// * `spec_id` - The spec ID (for error messages, currently unused)
/// * `retry_state` - Current retry state with attempt count
/// * `error_log` - The error log content to scan for retryable patterns
/// * `config` - Failure configuration with retry settings and patterns
///
/// # Returns
/// * `Ok(RetryDecision::Retry(delay))` - Should retry after the delay
/// * `Ok(RetryDecision::PermanentFailure(reason))` - Permanent failure, don't retry
/// * `Err(_)` - Configuration error
///
/// # Edge Cases
/// * Empty or missing error log → PermanentFailure
/// * max_retries = 0 → First failure is permanent
/// * No pattern match → PermanentFailure
/// * Backoff overflow → Capped at 1 hour
/// * Multiple pattern matches → Still retryable (OR logic)
/// * Exceeded max_retries → PermanentFailure
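///
/// # Example
///
/// A minimal sketch of the happy path (marked `ignore` because module paths
/// and the surrounding `FailureConfig` setup are assumed, not shown):
///
/// ```ignore
/// // `config` is a FailureConfig whose retryable_patterns include "rate_limit".
/// let state = RetryState::new();
/// let decision = should_retry("some-spec-id", &state, "Error: rate_limit exceeded", &config)
///     .expect("config validates");
/// assert!(matches!(decision, RetryDecision::Retry(_)));
/// ```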
pub fn should_retry(
    _spec_id: &str,
    retry_state: &RetryState,
    error_log: &str,
    config: &FailureConfig,
) -> Result<RetryDecision> {
    // Validate config
    config.validate()?;

    // Delegate to decide_retry, which implements the full decision logic
    Ok(decide_retry(retry_state, error_log, config))
}

/// Calculate exponential backoff delay for a given attempt number.
///
/// Formula: delay = base_delay * (backoff_multiplier ^ attempt)
/// Capped at MAX_RETRY_DELAY_MS (1 hour) to prevent overflow.
///
/// # Arguments
/// * `attempt` - The current attempt number (0-indexed)
/// * `base_delay_ms` - Base delay in milliseconds
/// * `backoff_multiplier` - Multiplier for exponential backoff (must be >= 1.0)
///
/// # Returns
/// Delay in milliseconds, capped at 1 hour
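///
/// # Examples
///
/// The expected values below mirror this module's unit tests (the block is
/// marked `ignore` so it is not compiled as a doctest against assumed paths):
///
/// ```ignore
/// // 60s base delay doubling per attempt: 60s, 120s, 240s, ...
/// assert_eq!(calculate_backoff_delay(0, 60_000, 2.0), 60_000);
/// assert_eq!(calculate_backoff_delay(1, 60_000, 2.0), 120_000);
/// // Very large attempt counts are capped at one hour.
/// assert_eq!(calculate_backoff_delay(100, 60_000, 2.0), 3_600_000); // == MAX_RETRY_DELAY_MS
/// ```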
pub fn calculate_backoff_delay(attempt: usize, base_delay_ms: u64, backoff_multiplier: f64) -> u64 {
    // Compute the delay in floating point; the cap below guards against
    // overflow on large attempt counts
    let delay = (base_delay_ms as f64) * backoff_multiplier.powi(attempt as i32);

    // Cap at maximum delay
    if delay > MAX_RETRY_DELAY_MS as f64 {
        MAX_RETRY_DELAY_MS
    } else {
        delay as u64
    }
}

/// Determine retry decision based on retry state, error log, and config.
///
/// # Arguments
/// * `state` - Current retry state with attempt count
/// * `error_log` - Error log to check for retryable patterns
/// * `config` - Failure configuration
///
/// # Returns
/// * `RetryDecision::Retry(delay)` if the spec should be retried
/// * `RetryDecision::PermanentFailure(reason)` if it should not be retried
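///
/// # Example
///
/// A sketch of the non-retryable path, mirroring the tests below (`ignore`
/// because the `config` setup is assumed):
///
/// ```ignore
/// // `config` only lists "rate_limit" as retryable, so a syntax error is permanent.
/// let decision = decide_retry(&RetryState::new(), "Error: syntax error in code", &config);
/// assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
/// ```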
pub fn decide_retry(state: &RetryState, error_log: &str, config: &FailureConfig) -> RetryDecision {
    // Edge case: Empty or missing error log
    if error_log.trim().is_empty() {
        return RetryDecision::PermanentFailure("Empty error log (no pattern match)".to_string());
    }

    // Edge case: max_retries = 0 means first failure is permanent
    if config.max_retries == 0 {
        return RetryDecision::PermanentFailure("max_retries is 0".to_string());
    }

    // Check if we've exceeded max retries
    if state.attempts >= config.max_retries {
        return RetryDecision::PermanentFailure(format!(
            "Exceeded max retries ({}/{})",
            state.attempts, config.max_retries
        ));
    }

    // Check if error log contains any retryable pattern (OR logic)
    let has_retryable_pattern = config
        .retryable_patterns
        .iter()
        .any(|pattern| error_log.contains(pattern));

    if !has_retryable_pattern {
        return RetryDecision::PermanentFailure(
            "No retryable pattern found in error log".to_string(),
        );
    }

    // Calculate exponential backoff delay
    let delay_ms = calculate_backoff_delay(
        state.attempts,
        config.retry_delay_ms,
        config.backoff_multiplier,
    );

    RetryDecision::Retry(Duration::from_millis(delay_ms))
}

#[cfg(test)]
mod tests {
    use super::*;

    fn test_config() -> FailureConfig {
        FailureConfig {
            max_retries: 3,
            retry_delay_ms: 60_000, // 60 seconds
            backoff_multiplier: 2.0,
            retryable_patterns: vec!["rate_limit".to_string()],
            on_permanent_failure: crate::config::OnPermanentFailure::Skip,
        }
    }

    #[test]
    fn test_retry_state_new() {
        let state = RetryState::new();
        assert_eq!(state.attempts, 0);
        assert!(state.last_retry_time > 0);
        assert_eq!(state.last_retry_time, state.next_retry_time);
    }

    #[test]
    fn test_retry_state_record_attempt() {
        let mut state = RetryState::new();
        let initial_time = state.last_retry_time;

        state.record_attempt(5000);

        assert_eq!(state.attempts, 1);
        assert!(state.last_retry_time >= initial_time);
        assert_eq!(state.next_retry_time, state.last_retry_time + 5000);
    }

    #[test]
    fn test_calculate_backoff_delay() {
        // Base case: attempt 0
        assert_eq!(calculate_backoff_delay(0, 60_000, 2.0), 60_000);

        // Attempt 1: 60s * 2^1 = 120s
        assert_eq!(calculate_backoff_delay(1, 60_000, 2.0), 120_000);

        // Attempt 2: 60s * 2^2 = 240s
        assert_eq!(calculate_backoff_delay(2, 60_000, 2.0), 240_000);

        // Attempt 3: 60s * 2^3 = 480s
        assert_eq!(calculate_backoff_delay(3, 60_000, 2.0), 480_000);
    }

    #[test]
    fn test_calculate_backoff_delay_with_different_multiplier() {
        // Multiplier 1.5
        assert_eq!(calculate_backoff_delay(0, 60_000, 1.5), 60_000);
        assert_eq!(calculate_backoff_delay(1, 60_000, 1.5), 90_000);
        assert_eq!(calculate_backoff_delay(2, 60_000, 1.5), 135_000);
    }

    #[test]
    fn test_calculate_backoff_delay_overflow_cap() {
        // Large attempt number should be capped at 1 hour
        let delay = calculate_backoff_delay(100, 60_000, 2.0);
        assert_eq!(delay, MAX_RETRY_DELAY_MS);
    }

    #[test]
    fn test_decide_retry_with_retryable_error() {
        let mut state = RetryState::new();
        let config = test_config();
        let error_log = "Error: API rate_limit exceeded";

        // First attempt (state.attempts = 0)
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 60_000); // 60s
        }

        // Second attempt
        state.record_attempt(60_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 120_000); // 120s
        }

        // Third attempt
        state.record_attempt(120_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 240_000); // 240s
        }

        // Fourth attempt - exceeds max_retries
        state.record_attempt(240_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_with_non_retryable_error() {
        let state = RetryState::new();
        let config = test_config();
        let error_log = "Error: syntax error in code";

        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_empty_error_log() {
        let state = RetryState::new();
        let config = test_config();

        let decision = decide_retry(&state, "", &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));

        let decision = decide_retry(&state, "   ", &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_max_retries_zero() {
        let state = RetryState::new();
        let mut config = test_config();
        config.max_retries = 0;

        let error_log = "Error: rate_limit exceeded";
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_multiple_patterns() {
        let state = RetryState::new();
        let mut config = test_config();
        config.retryable_patterns = vec![
            "rate_limit".to_string(),
            "timeout".to_string(),
            "connection_refused".to_string(),
        ];

        // Test each pattern matches (OR logic)
        let error_log1 = "Error: rate_limit exceeded";
        assert!(matches!(
            decide_retry(&state, error_log1, &config),
            RetryDecision::Retry(_)
        ));

        let error_log2 = "Error: timeout occurred";
        assert!(matches!(
            decide_retry(&state, error_log2, &config),
            RetryDecision::Retry(_)
        ));

        let error_log3 = "Error: connection_refused";
        assert!(matches!(
            decide_retry(&state, error_log3, &config),
            RetryDecision::Retry(_)
        ));
    }

    #[test]
    fn test_decide_retry_backoff_calculation() {
        let mut state = RetryState::new();
        let config = test_config();
        let error_log = "Error: rate_limit exceeded";

        // Attempt 0: 60s * 2^0 = 60s
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 60);
        }

        // Attempt 1: 60s * 2^1 = 120s
        state.record_attempt(60_000);
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 120);
        }

        // Attempt 2: 60s * 2^2 = 240s
        state.record_attempt(120_000);
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 240);
        }
    }
}