1use std::fmt::Debug;
4use std::thread;
5use std::time::{Duration, SystemTime, UNIX_EPOCH};
6
7use tracing::warn;
8
9use super::board_cmd::BoardError;
10use super::errors::DeliveryError;
11use super::git_cmd::GitError;
12
/// Classifies an error as worth retrying or not.
///
/// [`retry_sync`] only re-runs an operation while the error it returned
/// reports itself as transient.
pub trait Retryable {
    /// Returns `true` when the failure is temporary and the operation may be
    /// attempted again, `false` when another attempt cannot help.
    fn is_transient(&self) -> bool;
}
17
18impl Retryable for GitError {
19 fn is_transient(&self) -> bool {
20 self.is_transient()
21 }
22}
23
24impl Retryable for BoardError {
25 fn is_transient(&self) -> bool {
26 self.is_transient()
27 }
28}
29
30impl Retryable for DeliveryError {
31 fn is_transient(&self) -> bool {
32 self.is_transient()
33 }
34}
35
/// Tuning knobs for the retry loop: how often to retry and how long to wait
/// between attempts.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RetryConfig {
    /// Number of retries allowed after the initial attempt; `0` means the
    /// operation runs exactly once.
    pub max_retries: u32,
    /// Delay before the first retry, in milliseconds. The delay doubles with
    /// every further retry.
    pub base_delay_ms: u64,
    /// Upper bound, in milliseconds, applied to the exponentially grown delay.
    pub max_delay_ms: u64,
    /// When `true`, each delay is randomly perturbed by up to about a quarter
    /// of its value instead of being used as computed.
    pub jitter: bool,
}
43
44impl Default for RetryConfig {
45 fn default() -> Self {
46 Self {
47 max_retries: 3,
48 base_delay_ms: 100,
49 max_delay_ms: 5_000,
50 jitter: true,
51 }
52 }
53}
54
55impl RetryConfig {
56 #[cfg_attr(not(test), allow(dead_code))]
57 pub fn no_retry() -> Self {
58 Self {
59 max_retries: 0,
60 ..Self::default()
61 }
62 }
63
64 pub fn fast() -> Self {
65 Self {
66 max_retries: 2,
67 base_delay_ms: 50,
68 max_delay_ms: 500,
69 jitter: true,
70 }
71 }
72
73 #[cfg_attr(not(test), allow(dead_code))]
74 pub fn conservative() -> Self {
75 Self {
76 max_retries: 5,
77 base_delay_ms: 200,
78 max_delay_ms: 10_000,
79 jitter: true,
80 }
81 }
82}
83
84pub fn retry_sync<T, E, F>(config: &RetryConfig, operation: F) -> Result<T, E>
85where
86 E: Retryable + Debug,
87 F: Fn() -> Result<T, E>,
88{
89 retry_sync_with_sleep(config, operation, |delay_ms| {
90 thread::sleep(Duration::from_millis(delay_ms));
91 })
92}
93
94fn retry_sync_with_sleep<T, E, F, S>(config: &RetryConfig, operation: F, sleep: S) -> Result<T, E>
95where
96 E: Retryable + Debug,
97 F: Fn() -> Result<T, E>,
98 S: Fn(u64),
99{
100 let mut retries = 0;
101
102 loop {
103 match operation() {
104 Ok(value) => return Ok(value),
105 Err(error) if !error.is_transient() => return Err(error),
106 Err(error) if retries >= config.max_retries => return Err(error),
107 Err(error) => {
108 let delay_ms = next_delay_ms(config, retries);
109 let next_attempt = retries + 2;
110 warn!(
111 retry = retries + 1,
112 next_attempt,
113 delay_ms,
114 error = ?error,
115 "transient failure, retrying operation"
116 );
117 sleep(delay_ms);
118 retries += 1;
119 }
120 }
121 }
122}
123
124fn next_delay_ms(config: &RetryConfig, retry_index: u32) -> u64 {
125 let multiplier = 1_u64.checked_shl(retry_index).unwrap_or(u64::MAX);
126 let base_delay = config.base_delay_ms.saturating_mul(multiplier);
127 let capped_delay = base_delay.min(config.max_delay_ms);
128 if config.jitter {
129 jitter_delay_ms(capped_delay)
130 } else {
131 capped_delay
132 }
133}
134
/// Spreads `delay_ms` over the inclusive range
/// `[delay_ms - delay_ms / 4, delay_ms + delay_ms / 4]`.
///
/// The offset is derived from the sub-second nanoseconds of the system clock,
/// a cheap source of variation rather than a real random number generator.
/// Delays below 4 are returned unchanged because a quarter of them rounds
/// down to zero.
fn jitter_delay_ms(delay_ms: u64) -> u64 {
    match delay_ms / 4 {
        0 => delay_ms,
        quarter => {
            // A clock set before the Unix epoch falls back to a zero duration.
            let since_epoch = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or_default();
            let clock_nanos = u64::from(since_epoch.subsec_nanos());

            // Number of distinct offsets: 0 ..= 2 * quarter.
            let window = quarter.saturating_mul(2) + 1;
            let lowest = delay_ms.saturating_sub(quarter);

            lowest.saturating_add(clock_nanos % window)
        }
    }
}
150
#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicU32, Ordering};

    use crate::team::board_cmd::BoardError;

    use super::{RetryConfig, Retryable, next_delay_ms, retry_sync, retry_sync_with_sleep};

    /// Error double: the variant decides whether it is retried, the payload
    /// records which attempt produced it.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum TestError {
        Transient(u32),
        Permanent(u32),
    }

    impl Retryable for TestError {
        fn is_transient(&self) -> bool {
            match self {
                Self::Transient(_) => true,
                Self::Permanent(_) => false,
            }
        }
    }

    /// Configuration with zero delays and no jitter, so retries add no waiting.
    fn instant(max_retries: u32) -> RetryConfig {
        RetryConfig {
            max_retries,
            base_delay_ms: 0,
            max_delay_ms: 0,
            jitter: false,
        }
    }

    /// Increments `counter` and returns the 1-based number of this call.
    fn bump(counter: &AtomicU32) -> u32 {
        counter.fetch_add(1, Ordering::SeqCst) + 1
    }

    /// Reads how many times `counter` has been bumped.
    fn total(counter: &AtomicU32) -> u32 {
        counter.load(Ordering::SeqCst)
    }

    #[test]
    fn retry_returns_success_on_first_attempt() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&RetryConfig::default(), || {
            bump(&attempts);
            Ok::<_, TestError>("ok")
        });

        assert_eq!(outcome, Ok("ok"));
        assert_eq!(total(&attempts), 1);
    }

    #[test]
    fn retry_retries_transient_errors_until_exhausted() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&instant(3), || {
            Err::<(), _>(TestError::Transient(bump(&attempts)))
        });

        // One initial attempt plus three retries.
        assert_eq!(outcome, Err(TestError::Transient(4)));
        assert_eq!(total(&attempts), 4);
    }

    #[test]
    fn retry_returns_permanent_error_without_retrying() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&RetryConfig::default(), || {
            bump(&attempts);
            Err::<(), _>(TestError::Permanent(1))
        });

        assert_eq!(outcome, Err(TestError::Permanent(1)));
        assert_eq!(total(&attempts), 1);
    }

    #[test]
    fn retry_succeeds_after_transient_failures() {
        let attempts = AtomicU32::new(0);

        let outcome = retry_sync(&instant(4), || match bump(&attempts) {
            attempt if attempt < 3 => Err(TestError::Transient(attempt)),
            _ => Ok("recovered"),
        });

        assert_eq!(outcome, Ok("recovered"));
        assert_eq!(total(&attempts), 3);
    }

    #[test]
    fn backoff_delay_grows_exponentially_and_caps() {
        let config = RetryConfig {
            max_retries: 5,
            base_delay_ms: 100,
            max_delay_ms: 250,
            jitter: false,
        };

        let expected_by_retry = [(0, 100), (1, 200), (2, 250), (3, 250)];
        for (retry_index, expected_ms) in expected_by_retry {
            assert_eq!(next_delay_ms(&config, retry_index), expected_ms);
        }
    }

    #[test]
    fn no_retry_config_never_retries() {
        let attempts = AtomicU32::new(0);
        let sleeps = AtomicU32::new(0);

        let outcome = retry_sync_with_sleep(
            &RetryConfig::no_retry(),
            || {
                bump(&attempts);
                Err::<(), _>(TestError::Transient(1))
            },
            |_| {
                bump(&sleeps);
            },
        );

        assert_eq!(outcome, Err(TestError::Transient(1)));
        assert_eq!(total(&attempts), 1);
        assert_eq!(total(&sleeps), 0);
    }

    #[test]
    fn default_config_matches_expected_values() {
        let RetryConfig {
            max_retries,
            base_delay_ms,
            max_delay_ms,
            jitter,
        } = RetryConfig::default();

        assert_eq!(max_retries, 3);
        assert_eq!(base_delay_ms, 100);
        assert_eq!(max_delay_ms, 5_000);
        assert!(jitter);
    }

    #[test]
    fn preset_configs_match_expected_values() {
        let quick = RetryConfig::fast();
        assert_eq!(quick.max_retries, 2);
        assert_eq!(quick.base_delay_ms, 50);
        assert_eq!(quick.max_delay_ms, 500);
        assert!(quick.jitter);

        let patient = RetryConfig::conservative();
        assert_eq!(patient.max_retries, 5);
        assert_eq!(patient.base_delay_ms, 200);
        assert_eq!(patient.max_delay_ms, 10_000);
        assert!(patient.jitter);
    }

    #[test]
    fn board_error_uses_typed_transient_classification() {
        let transient = BoardError::Transient {
            message: "lock held".to_string(),
            stderr: "try again".to_string(),
        };
        let permanent = BoardError::Permanent {
            message: "bad input".to_string(),
            stderr: "usage".to_string(),
        };

        assert!(transient.is_transient());
        assert!(!permanent.is_transient());
    }
}