Skip to main content

batty_cli/team/
retry.rs

1//! Retry helpers for transient failures with configurable exponential backoff.
2
3use std::fmt::Debug;
4use std::thread;
5use std::time::{Duration, SystemTime, UNIX_EPOCH};
6
7use tracing::warn;
8
9use super::board_cmd::BoardError;
10use super::errors::DeliveryError;
11use super::git_cmd::GitError;
12
/// Implemented by error types that can tell the retry helpers whether a failure
/// is worth another attempt.
pub trait Retryable {
    /// Returns `true` when the failure is transient, meaning the operation that
    /// produced it may be attempted again; `false` stops retrying immediately.
    fn is_transient(&self) -> bool;
}
17
18impl Retryable for GitError {
19    fn is_transient(&self) -> bool {
20        self.is_transient()
21    }
22}
23
24impl Retryable for BoardError {
25    fn is_transient(&self) -> bool {
26        self.is_transient()
27    }
28}
29
30impl Retryable for DeliveryError {
31    fn is_transient(&self) -> bool {
32        self.is_transient()
33    }
34}
35
/// Tunable parameters for the exponential-backoff retry helpers in this module.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RetryConfig {
    /// How many times a transient failure is retried after the initial attempt;
    /// `0` means the operation runs exactly once.
    pub max_retries: u32,
    /// Delay before the first retry, in milliseconds; doubled for each later retry.
    pub base_delay_ms: u64,
    /// Upper bound, in milliseconds, applied to the exponential delay before any
    /// jitter is added.
    pub max_delay_ms: u64,
    /// When `true`, each computed delay is perturbed by up to a quarter of its
    /// value in either direction.
    pub jitter: bool,
}
43
44impl Default for RetryConfig {
45    fn default() -> Self {
46        Self {
47            max_retries: 3,
48            base_delay_ms: 100,
49            max_delay_ms: 5_000,
50            jitter: true,
51        }
52    }
53}
54
55impl RetryConfig {
56    #[cfg_attr(not(test), allow(dead_code))]
57    pub fn no_retry() -> Self {
58        Self {
59            max_retries: 0,
60            ..Self::default()
61        }
62    }
63
64    pub fn fast() -> Self {
65        Self {
66            max_retries: 2,
67            base_delay_ms: 50,
68            max_delay_ms: 500,
69            jitter: true,
70        }
71    }
72
73    #[cfg_attr(not(test), allow(dead_code))]
74    pub fn conservative() -> Self {
75        Self {
76            max_retries: 5,
77            base_delay_ms: 200,
78            max_delay_ms: 10_000,
79            jitter: true,
80        }
81    }
82}
83
84pub fn retry_sync<T, E, F>(config: &RetryConfig, operation: F) -> Result<T, E>
85where
86    E: Retryable + Debug,
87    F: Fn() -> Result<T, E>,
88{
89    retry_sync_with_sleep(config, operation, |delay_ms| {
90        thread::sleep(Duration::from_millis(delay_ms));
91    })
92}
93
94fn retry_sync_with_sleep<T, E, F, S>(config: &RetryConfig, operation: F, sleep: S) -> Result<T, E>
95where
96    E: Retryable + Debug,
97    F: Fn() -> Result<T, E>,
98    S: Fn(u64),
99{
100    let mut retries = 0;
101
102    loop {
103        match operation() {
104            Ok(value) => return Ok(value),
105            Err(error) if !error.is_transient() => return Err(error),
106            Err(error) if retries >= config.max_retries => return Err(error),
107            Err(error) => {
108                let delay_ms = next_delay_ms(config, retries);
109                let next_attempt = retries + 2;
110                warn!(
111                    retry = retries + 1,
112                    next_attempt,
113                    delay_ms,
114                    error = ?error,
115                    "transient failure, retrying operation"
116                );
117                sleep(delay_ms);
118                retries += 1;
119            }
120        }
121    }
122}
123
124fn next_delay_ms(config: &RetryConfig, retry_index: u32) -> u64 {
125    let multiplier = 1_u64.checked_shl(retry_index).unwrap_or(u64::MAX);
126    let base_delay = config.base_delay_ms.saturating_mul(multiplier);
127    let capped_delay = base_delay.min(config.max_delay_ms);
128    if config.jitter {
129        jitter_delay_ms(capped_delay)
130    } else {
131        capped_delay
132    }
133}
134
/// Perturbs `delay_ms` by up to a quarter of its value in either direction.
///
/// The result lies in `[delay_ms - delay_ms / 4, delay_ms + delay_ms / 4]`
/// (saturating at `u64::MAX`). Delays below 4 ms are returned unchanged because
/// their quarter rounds down to zero.
fn jitter_delay_ms(delay_ms: u64) -> u64 {
    match delay_ms / 4 {
        0 => delay_ms,
        quarter => {
            // The sub-second nanoseconds of the wall clock act as a cheap source
            // of variation; a clock set before the Unix epoch contributes 0.
            let clock_nanos = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .map_or(0, |elapsed| u64::from(elapsed.subsec_nanos()));

            // Offsets range over 0..=2 * quarter, measured from the lower bound.
            let offset_count = quarter.saturating_mul(2) + 1;
            let lower_bound = delay_ms.saturating_sub(quarter);

            lower_bound.saturating_add(clock_nanos % offset_count)
        }
    }
}
150
#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicU32, Ordering};

    use crate::team::board_cmd::BoardError;

    use super::{RetryConfig, Retryable, next_delay_ms, retry_sync, retry_sync_with_sleep};

    /// Minimal error type whose payload records the attempt that produced it.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum TestError {
        Transient(u32),
        Permanent(u32),
    }

    impl Retryable for TestError {
        fn is_transient(&self) -> bool {
            match self {
                Self::Transient(_) => true,
                Self::Permanent(_) => false,
            }
        }
    }

    /// Jitter-free config with zero-length delays, so retries add no waiting time.
    fn zero_delay_config(max_retries: u32) -> RetryConfig {
        RetryConfig {
            max_retries,
            base_delay_ms: 0,
            max_delay_ms: 0,
            jitter: false,
        }
    }

    /// Increments `counter` and returns the new (one-based) count.
    fn bump(counter: &AtomicU32) -> u32 {
        counter.fetch_add(1, Ordering::SeqCst) + 1
    }

    /// Reads the current value of `counter`.
    fn count(counter: &AtomicU32) -> u32 {
        counter.load(Ordering::SeqCst)
    }

    #[test]
    fn retry_returns_success_on_first_attempt() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&RetryConfig::default(), || {
            bump(&attempts);
            Ok::<_, TestError>("ok")
        });

        assert_eq!(outcome, Ok("ok"));
        assert_eq!(count(&attempts), 1);
    }

    #[test]
    fn retry_retries_transient_errors_until_exhausted() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&zero_delay_config(3), || {
            Err::<(), _>(TestError::Transient(bump(&attempts)))
        });

        // One initial attempt plus three retries; the last error is surfaced.
        assert_eq!(outcome, Err(TestError::Transient(4)));
        assert_eq!(count(&attempts), 4);
    }

    #[test]
    fn retry_returns_permanent_error_without_retrying() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&RetryConfig::default(), || {
            bump(&attempts);
            Err::<(), _>(TestError::Permanent(1))
        });

        assert_eq!(outcome, Err(TestError::Permanent(1)));
        assert_eq!(count(&attempts), 1);
    }

    #[test]
    fn retry_succeeds_after_transient_failures() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&zero_delay_config(4), || match bump(&attempts) {
            attempt if attempt < 3 => Err(TestError::Transient(attempt)),
            _ => Ok("recovered"),
        });

        assert_eq!(outcome, Ok("recovered"));
        assert_eq!(count(&attempts), 3);
    }

    #[test]
    fn backoff_delay_grows_exponentially_and_caps() {
        let config = RetryConfig {
            max_retries: 5,
            base_delay_ms: 100,
            max_delay_ms: 250,
            jitter: false,
        };

        for (retry_index, expected_ms) in [(0, 100), (1, 200), (2, 250), (3, 250)] {
            assert_eq!(
                next_delay_ms(&config, retry_index),
                expected_ms,
                "unexpected delay for retry index {retry_index}"
            );
        }
    }

    #[test]
    fn no_retry_config_never_retries() {
        let attempts = AtomicU32::new(0);
        let sleeps = AtomicU32::new(0);

        let outcome = retry_sync_with_sleep(
            &RetryConfig::no_retry(),
            || {
                bump(&attempts);
                Err::<(), _>(TestError::Transient(1))
            },
            |_| {
                bump(&sleeps);
            },
        );

        assert_eq!(outcome, Err(TestError::Transient(1)));
        assert_eq!(count(&attempts), 1);
        assert_eq!(count(&sleeps), 0);
    }

    #[test]
    fn default_config_matches_expected_values() {
        let RetryConfig {
            max_retries,
            base_delay_ms,
            max_delay_ms,
            jitter,
        } = RetryConfig::default();

        assert_eq!(max_retries, 3);
        assert_eq!(base_delay_ms, 100);
        assert_eq!(max_delay_ms, 5_000);
        assert!(jitter);
    }

    #[test]
    fn preset_configs_match_expected_values() {
        assert_eq!(
            RetryConfig::fast(),
            RetryConfig {
                max_retries: 2,
                base_delay_ms: 50,
                max_delay_ms: 500,
                jitter: true,
            }
        );

        assert_eq!(
            RetryConfig::conservative(),
            RetryConfig {
                max_retries: 5,
                base_delay_ms: 200,
                max_delay_ms: 10_000,
                jitter: true,
            }
        );
    }

    #[test]
    fn board_error_uses_typed_transient_classification() {
        let transient = BoardError::Transient {
            message: String::from("lock held"),
            stderr: String::from("try again"),
        };
        let permanent = BoardError::Permanent {
            message: String::from("bad input"),
            stderr: String::from("usage"),
        };

        assert!(transient.is_transient());
        assert!(!permanent.is_transient());
    }
}
317}