chant/retry.rs

//! Retry logic with exponential backoff for failed specs.
//!
//! Provides retry state tracking and decision logic for determining whether
//! a failed spec should be retried or marked as permanently failed.
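//!
//! A rough sketch of the intended caller-side loop; `run_spec` and the
//! `chant::retry` paths are assumptions (hence the `ignore` fence), and
//! `config` stands for a [`crate::config::FailureConfig`] like the one built
//! in this module's tests:
//!
//! ```ignore
//! use chant::retry::{decide_retry, RetryDecision, RetryState};
//!
//! // The caller owns the retry state and advances it after every failure.
//! let mut state = RetryState::new();
//! loop {
//!     let error_log = run_spec(); // hypothetical runner: empty log on success
//!     if error_log.is_empty() {
//!         break;
//!     }
//!     match decide_retry(&state, &error_log, &config) {
//!         RetryDecision::Retry(delay) => {
//!             state.record_attempt(delay.as_millis() as u64);
//!             std::thread::sleep(delay);
//!         }
//!         RetryDecision::PermanentFailure(reason) => {
//!             eprintln!("giving up: {reason}");
//!             break;
//!         }
//!     }
//! }
//! ```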

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use crate::config::FailureConfig;

/// Maximum retry delay capped at 1 hour to prevent overflow
const MAX_RETRY_DELAY_MS: u64 = 3_600_000;

/// Retry state for tracking retry attempts and timing
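///
/// A minimal usage sketch; the `chant::retry` path is an assumed crate path,
/// so the example is marked `ignore`:
///
/// ```ignore
/// use chant::retry::RetryState;
///
/// let mut state = RetryState::new();
/// assert_eq!(state.attempts, 0);
///
/// // Record a failed attempt together with the delay (in ms) chosen for the
/// // next try; `next_retry_time` moves forward by that delay.
/// state.record_attempt(60_000);
/// assert_eq!(state.attempts, 1);
/// assert_eq!(state.next_retry_time, state.last_retry_time + 60_000);
/// ```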
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryState {
    /// Number of retry attempts made so far
    pub attempts: usize,
    /// Timestamp of last retry attempt (milliseconds since epoch)
    pub last_retry_time: u64,
    /// Timestamp when next retry should occur (milliseconds since epoch)
    pub next_retry_time: u64,
}

impl RetryState {
    /// Create a new retry state with no attempts
    pub fn new() -> Self {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as u64;

        Self {
            attempts: 0,
            last_retry_time: now,
            next_retry_time: now,
        }
    }

    /// Update retry state after a failed attempt
    pub fn record_attempt(&mut self, next_delay_ms: u64) {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as u64;

        self.attempts += 1;
        self.last_retry_time = now;
        self.next_retry_time = now + next_delay_ms;
    }
}

impl Default for RetryState {
    fn default() -> Self {
        Self::new()
    }
}

/// Decision on whether to retry a failed spec
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RetryDecision {
    /// Retry after the specified delay
    Retry(Duration),
    /// Permanent failure with reason
    PermanentFailure(String),
}

/// Determine whether a failed spec should be retried based on error log and config.
///
/// # Arguments
/// * `spec_id` - The spec ID (for error messages)
/// * `error_log` - The error log content to scan for retryable patterns
/// * `config` - Failure configuration with retry settings and patterns
///
/// # Returns
/// * `Ok(RetryDecision::Retry(delay))` - Should retry after the delay
/// * `Ok(RetryDecision::PermanentFailure(reason))` - Permanent failure, don't retry
/// * `Err(_)` - Configuration error
///
/// # Edge Cases
/// * Empty or missing error log → PermanentFailure
/// * max_retries = 0 → First failure is permanent
/// * No pattern match → PermanentFailure
/// * Backoff overflow → Capped at 1 hour
/// * Multiple pattern matches → Still retryable (OR logic)
///
/// Note: attempt tracking is not wired into this function yet, so a retryable
/// error currently still resolves to `PermanentFailure`; see [`decide_retry`]
/// for the decision logic that consumes a caller-maintained [`RetryState`].
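///
/// # Examples
///
/// A sketch of how a caller might consume the result; `config` stands for a
/// [`FailureConfig`] like the one built in this module's tests, and the
/// `chant::retry` path is an assumed crate path, so the fragment is marked
/// `ignore`:
///
/// ```ignore
/// use chant::retry::{should_retry, RetryDecision};
///
/// let decision = should_retry("spec-42", "Error: rate_limit exceeded", &config)
///     .expect("invalid failure config");
/// match decision {
///     RetryDecision::Retry(delay) => println!("retrying in {delay:?}"),
///     RetryDecision::PermanentFailure(reason) => println!("permanent failure: {reason}"),
/// }
/// ```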
pub fn should_retry(
    spec_id: &str,
    error_log: &str,
    config: &FailureConfig,
) -> Result<RetryDecision> {
    // Validate config
    config.validate()?;

    // Edge case: Empty or missing error log
    if error_log.trim().is_empty() {
        return Ok(RetryDecision::PermanentFailure(
            "Empty error log (no pattern match)".to_string(),
        ));
    }

    // Edge case: max_retries = 0 means first failure is permanent
    if config.max_retries == 0 {
        return Ok(RetryDecision::PermanentFailure(
            "max_retries is 0".to_string(),
        ));
    }

    // Check if error log contains any retryable pattern
    let has_retryable_pattern = config
        .retryable_patterns
        .iter()
        .any(|pattern| error_log.contains(pattern));

    if !has_retryable_pattern {
        return Ok(RetryDecision::PermanentFailure(format!(
            "No retryable pattern found in error log for spec {}",
            spec_id
        )));
    }

    // The error is retryable, but picking a delay requires the current attempt
    // count, which this function does not track. Until retry state is threaded
    // through here, callers should maintain a `RetryState` and use
    // `decide_retry` instead.

    Ok(RetryDecision::PermanentFailure(
        "Retry state tracking not yet integrated".to_string(),
    ))
}

/// Calculate exponential backoff delay for a given attempt number.
///
/// Formula: `delay = base_delay_ms * backoff_multiplier ^ attempt`
/// Capped at `MAX_RETRY_DELAY_MS` (1 hour) to prevent overflow.
///
/// # Arguments
/// * `attempt` - The current attempt number (0-indexed)
/// * `base_delay_ms` - Base delay in milliseconds
/// * `backoff_multiplier` - Multiplier for exponential backoff (must be >= 1.0)
///
/// # Returns
/// Delay in milliseconds, capped at 1 hour
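///
/// # Examples
///
/// A small numeric sketch, mirroring the tests below; the `chant::retry` path
/// is an assumed crate path, so the example is marked `ignore`:
///
/// ```ignore
/// use chant::retry::calculate_backoff_delay;
///
/// // 60s base, doubling each attempt: 60s, 120s, 240s, ...
/// assert_eq!(calculate_backoff_delay(0, 60_000, 2.0), 60_000);
/// assert_eq!(calculate_backoff_delay(1, 60_000, 2.0), 120_000);
/// assert_eq!(calculate_backoff_delay(2, 60_000, 2.0), 240_000);
///
/// // Very large attempt numbers are capped at one hour.
/// assert_eq!(calculate_backoff_delay(100, 60_000, 2.0), 3_600_000);
/// ```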
pub fn calculate_backoff_delay(attempt: usize, base_delay_ms: u64, backoff_multiplier: f64) -> u64 {
    // Calculate delay with overflow protection
    let delay = (base_delay_ms as f64) * backoff_multiplier.powi(attempt as i32);

    // Cap at maximum delay
    if delay > MAX_RETRY_DELAY_MS as f64 {
        MAX_RETRY_DELAY_MS
    } else {
        delay as u64
    }
}

/// Determine retry decision based on retry state and config.
///
/// # Arguments
/// * `state` - Current retry state with attempt count
/// * `error_log` - Error log to check for retryable patterns
/// * `config` - Failure configuration
///
/// # Returns
/// * `RetryDecision::Retry(delay)` if should retry
/// * `RetryDecision::PermanentFailure(reason)` if should not retry
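///
/// # Examples
///
/// A minimal sketch; the `chant::` paths and the exact `FailureConfig` fields
/// are assumptions based on the tests below, so the example is marked
/// `ignore`:
///
/// ```ignore
/// use chant::config::{FailureConfig, OnPermanentFailure};
/// use chant::retry::{decide_retry, RetryDecision, RetryState};
///
/// let config = FailureConfig {
///     max_retries: 3,
///     retry_delay_ms: 60_000,
///     backoff_multiplier: 2.0,
///     retryable_patterns: vec!["rate_limit".to_string()],
///     on_permanent_failure: OnPermanentFailure::Skip,
/// };
/// let state = RetryState::new();
///
/// // First failure with a retryable pattern: retry after the base delay.
/// match decide_retry(&state, "Error: API rate_limit exceeded", &config) {
///     RetryDecision::Retry(delay) => assert_eq!(delay.as_secs(), 60),
///     RetryDecision::PermanentFailure(reason) => panic!("unexpected: {reason}"),
/// }
/// ```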
pub fn decide_retry(state: &RetryState, error_log: &str, config: &FailureConfig) -> RetryDecision {
    // Edge case: Empty or missing error log
    if error_log.trim().is_empty() {
        return RetryDecision::PermanentFailure("Empty error log (no pattern match)".to_string());
    }

    // Edge case: max_retries = 0 means first failure is permanent
    if config.max_retries == 0 {
        return RetryDecision::PermanentFailure("max_retries is 0".to_string());
    }

    // Check if we've exceeded max retries
    if state.attempts >= config.max_retries {
        return RetryDecision::PermanentFailure(format!(
            "Exceeded max retries ({}/{})",
            state.attempts, config.max_retries
        ));
    }

    // Check if error log contains any retryable pattern (OR logic)
    let has_retryable_pattern = config
        .retryable_patterns
        .iter()
        .any(|pattern| error_log.contains(pattern));

    if !has_retryable_pattern {
        return RetryDecision::PermanentFailure(
            "No retryable pattern found in error log".to_string(),
        );
    }

    // Calculate exponential backoff delay
    let delay_ms = calculate_backoff_delay(
        state.attempts,
        config.retry_delay_ms,
        config.backoff_multiplier,
    );

    RetryDecision::Retry(Duration::from_millis(delay_ms))
}

#[cfg(test)]
mod tests {
    use super::*;

    fn test_config() -> FailureConfig {
        FailureConfig {
            max_retries: 3,
            retry_delay_ms: 60_000, // 60 seconds
            backoff_multiplier: 2.0,
            retryable_patterns: vec!["rate_limit".to_string()],
            on_permanent_failure: crate::config::OnPermanentFailure::Skip,
        }
    }

    #[test]
    fn test_retry_state_new() {
        let state = RetryState::new();
        assert_eq!(state.attempts, 0);
        assert!(state.last_retry_time > 0);
        assert_eq!(state.last_retry_time, state.next_retry_time);
    }

    #[test]
    fn test_retry_state_record_attempt() {
        let mut state = RetryState::new();
        let initial_time = state.last_retry_time;

        state.record_attempt(5000);

        assert_eq!(state.attempts, 1);
        assert!(state.last_retry_time >= initial_time);
        assert_eq!(state.next_retry_time, state.last_retry_time + 5000);
    }

    #[test]
    fn test_calculate_backoff_delay() {
        // Base case: attempt 0
        assert_eq!(calculate_backoff_delay(0, 60_000, 2.0), 60_000);

        // Attempt 1: 60s * 2^1 = 120s
        assert_eq!(calculate_backoff_delay(1, 60_000, 2.0), 120_000);

        // Attempt 2: 60s * 2^2 = 240s
        assert_eq!(calculate_backoff_delay(2, 60_000, 2.0), 240_000);

        // Attempt 3: 60s * 2^3 = 480s
        assert_eq!(calculate_backoff_delay(3, 60_000, 2.0), 480_000);
    }

    #[test]
    fn test_calculate_backoff_delay_with_different_multiplier() {
        // Multiplier 1.5
        assert_eq!(calculate_backoff_delay(0, 60_000, 1.5), 60_000);
        assert_eq!(calculate_backoff_delay(1, 60_000, 1.5), 90_000);
        assert_eq!(calculate_backoff_delay(2, 60_000, 1.5), 135_000);
    }

    #[test]
    fn test_calculate_backoff_delay_overflow_cap() {
        // Large attempt number should be capped at 1 hour
        let delay = calculate_backoff_delay(100, 60_000, 2.0);
        assert_eq!(delay, MAX_RETRY_DELAY_MS);
    }

    #[test]
    fn test_decide_retry_with_retryable_error() {
        let mut state = RetryState::new();
        let config = test_config();
        let error_log = "Error: API rate_limit exceeded";

        // First attempt (state.attempts = 0)
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 60_000); // 60s
        }

        // Second attempt
        state.record_attempt(60_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 120_000); // 120s
        }

        // Third attempt
        state.record_attempt(120_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::Retry(_)));
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_millis(), 240_000); // 240s
        }

        // Fourth attempt - exceeds max_retries
        state.record_attempt(240_000);
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_with_non_retryable_error() {
        let state = RetryState::new();
        let config = test_config();
        let error_log = "Error: syntax error in code";

        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_empty_error_log() {
        let state = RetryState::new();
        let config = test_config();

        let decision = decide_retry(&state, "", &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));

        let decision = decide_retry(&state, "   ", &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_max_retries_zero() {
        let state = RetryState::new();
        let mut config = test_config();
        config.max_retries = 0;

        let error_log = "Error: rate_limit exceeded";
        let decision = decide_retry(&state, error_log, &config);
        assert!(matches!(decision, RetryDecision::PermanentFailure(_)));
    }

    #[test]
    fn test_decide_retry_multiple_patterns() {
        let state = RetryState::new();
        let mut config = test_config();
        config.retryable_patterns = vec![
            "rate_limit".to_string(),
            "timeout".to_string(),
            "connection_refused".to_string(),
        ];

        // Test each pattern matches (OR logic)
        let error_log1 = "Error: rate_limit exceeded";
        assert!(matches!(
            decide_retry(&state, error_log1, &config),
            RetryDecision::Retry(_)
        ));

        let error_log2 = "Error: timeout occurred";
        assert!(matches!(
            decide_retry(&state, error_log2, &config),
            RetryDecision::Retry(_)
        ));

        let error_log3 = "Error: connection_refused";
        assert!(matches!(
            decide_retry(&state, error_log3, &config),
            RetryDecision::Retry(_)
        ));
    }

    #[test]
    fn test_decide_retry_backoff_calculation() {
        let mut state = RetryState::new();
        let config = test_config();
        let error_log = "Error: rate_limit exceeded";

        // Attempt 0: 60s * 2^0 = 60s
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 60);
        }

        // Attempt 1: 60s * 2^1 = 120s
        state.record_attempt(60_000);
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 120);
        }

        // Attempt 2: 60s * 2^2 = 240s
        state.record_attempt(120_000);
        let decision = decide_retry(&state, error_log, &config);
        if let RetryDecision::Retry(delay) = decision {
            assert_eq!(delay.as_secs(), 240);
        }
    }
}