phi_core/provider/retry.rs
1//! Retry with exponential backoff and jitter for provider calls.
2//!
3//! ARCHITECTURE NOTE: Why retry at the agent loop level (not the provider level)?
4//!
5//! Retrying is a cross-cutting concern — all 7 providers share the same retry logic.
6//! By handling it in stream_assistant_response() (agent_loop.rs), we avoid duplicating
7//! retry logic in every provider. Providers simply return ProviderError::RateLimited
8//! or ProviderError::Network, and this module decides what to do.
9
10use crate::provider::ProviderError;
11use serde::{Deserialize, Serialize};
12use std::time::Duration;
13use tracing::warn;
14
15/// Configuration for automatic retry of transient provider errors.
16///
17/// Defaults: 3 retries, 1s initial delay, 2x backoff, 30s max delay.
18/// Use `RetryConfig::none()` to disable retries entirely.
19/*
20ARCHITECTURE: Exponential backoff + jitter — why both?
21
22Exponential backoff (delay doubles each attempt) prevents thundering herd:
23if 1000 clients all hit a rate limit at the same time and all retry after
24exactly 1s, they'll hit the limit again simultaneously. Doubling adds space.
25
26Jitter (±20% random noise) prevents synchronized retries even with backoff:
27two clients with the same delay would still retry at the same moment.
28With jitter, their windows are offset, reducing server load spikes.
29
30Attempt 1: 1000ms * (0.8–1.2) = 800–1200ms
31Attempt 2: 2000ms * (0.8–1.2) = 1600–2400ms
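Attempt 3: 4000ms * (0.8–1.2) = 3200–4800ms
...
Attempt 6: 32000ms → capped at 30000ms, then * (0.8–1.2) = 24000–36000ms
(the cap is applied BEFORE jitter, so a jittered delay can exceed max_delay_ms by up to 20%)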
33*/
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct RetryConfig {
36 /// Maximum number of retry attempts (0 = no retries, fail immediately).
37 pub max_retries: usize,
38 /// Delay before the first retry in milliseconds (e.g., 1000 = 1 second).
39 pub initial_delay_ms: u64,
40 /// Multiplier applied each attempt: delay[n] = initial * multiplier^(n-1).
41 /// 2.0 = double each time (standard exponential backoff).
42 pub backoff_multiplier: f64,
43 /// Maximum delay cap in milliseconds — backoff stops growing beyond this.
44 pub max_delay_ms: u64,
45}
46
47/*
48RUST QUIRK: `impl Default` — explicitly defining the "zero value"
49
50Rust has no constructor syntax. Instead, the `Default` trait provides `::default()`.
51We implement it manually here because the defaults are non-trivial constants.
52
53If all fields were 0/false/empty-string, we could use `#[derive(Default)]` to get
54it for free. But initial_delay_ms = 1000 and backoff_multiplier = 2.0 are not
55the "zero values" of u64 and f64 (which are 0 and 0.0 respectively).
56
57Usage examples:
58 let cfg = RetryConfig::default(); // 3 retries, 1s delay, 2x backoff
59 let cfg = RetryConfig { max_retries: 5, ..Default::default() }; // override one field
60 let cfg = RetryConfig::none(); // 0 retries (disable retries)
61*/
62impl Default for RetryConfig {
63 fn default() -> Self {
64 Self {
65 max_retries: 3,
66 initial_delay_ms: 1000,
67 backoff_multiplier: 2.0,
68 max_delay_ms: 30_000, // numeric literal underscores: _ is ignored, just a readability separator
69 }
70 }
71}
72
73impl RetryConfig {
74 /// No retries — fail immediately on any error.
75 /*
76 RUST QUIRK: `..Default::default()` — struct update syntax
77
78 `Self { max_retries: 0, ..Default::default() }` means:
79 "construct a Self where max_retries = 0,
80 and all other fields come from Default::default()"
81
82 It's Rust's equivalent of Python's dataclasses.replace():
83 dataclasses.replace(RetryConfig(), max_retries=0)
84
85 This lets RetryConfig::none() reuse the sensible defaults for all other
86 fields, and only override what matters (max_retries = 0).
87 The order matters: named fields come first, `..expr` must be last.
88 */
89 pub fn none() -> Self {
90 Self {
91 max_retries: 0,
92 ..Default::default() // fill remaining fields from the default
93 }
94 }
95
96 /// Calculate the delay for a given attempt (1-indexed).
97 /// Uses exponential backoff with ±20% jitter.
98 /*
99 RUST QUIRK: Mixed numeric types require explicit casting (`as`)
100
101 `self.initial_delay_ms` is u64 (unsigned integer).
102 `self.backoff_multiplier` is f64 (floating point).
103 Rust won't mix them — you must explicitly cast.
104
105 `self.initial_delay_ms as f64` — widening cast: u64 → f64 (safe, no data loss)
106 `(attempt - 1) as i32` — narrowing cast: usize → i32 (safe for small values)
107 `(capped_ms * jitter) as u64` — narrowing cast: f64 → u64 (truncates fraction, no panic)
108
109 `powi(n: i32)` — integer power for f64:
110 2.0_f64.powi(3) = 8.0
111 This is more precise than powf(n as f64) for integer exponents.
112
113 `base_ms.min(max)` — clamp from above:
114 Python analogy: min(base_ms, self.max_delay_ms as f64)
115
116 `rand::random::<f64>()` — generate a random f64 in [0.0, 1.0).
117 The `::<f64>` is a "turbofish" — explicit type parameter at the call site.
118 Needed because random() is generic and Rust can't always infer the type.
119 Python analogy: random.random()
120 */
121 pub fn delay_for_attempt(&self, attempt: usize) -> Duration {
122 // base_ms: initial_delay * multiplier^(attempt-1)
123 // attempt is 1-indexed: attempt 1 → multiplier^0 = 1.0 → no extra delay
124 let base_ms =
125 self.initial_delay_ms as f64 * self.backoff_multiplier.powi((attempt - 1) as i32);
126 let capped_ms = base_ms.min(self.max_delay_ms as f64); // cap at max_delay_ms
127
128 // Jitter: multiply by a random factor in [0.8, 1.2) = ±20% noise
129 let jitter = 0.8 + rand::random::<f64>() * 0.4; // 0.4 range → [0.8, 1.2)
130 Duration::from_millis((capped_ms * jitter) as u64) // f64 → u64: truncates, never panics
131 }
132}
133
134/*
135RUST QUIRK: Adding methods to a type defined in another module — `impl OtherType`
136
137`ProviderError` is defined in provider/traits.rs, but we add retry-related methods
to it HERE in retry.rs. Rust allows an inherent `impl Type { ... }` block only when
the type is defined in THIS crate (ProviderError is — it's in our crate).
(Trait impls follow a looser rule: you must own either the trait or the type.)
141
142This is different from Python where methods must live in the class definition.
143In Rust, you can add methods to your own types from any module in the crate.
144The split is intentional: ProviderError is a pure type in provider/traits.rs;
145retry logic lives here in retry.rs (separation of concerns).
146*/
147impl ProviderError {
148 /// Whether this error is safe to retry.
149 ///
150 /// Retryable: rate limits (429) and network/transient errors.
151 /// Not retryable: auth errors, API errors (bad request), cancellation.
152 /*
153 RUST QUIRK: `matches!` macro — compact pattern matching returning bool
154
155 `matches!(self, Pattern1 | Pattern2)` is shorthand for:
156 match self {
157 Pattern1 | Pattern2 => true,
158 _ => false,
159 }
160
161 The `..` inside `RateLimited { .. }` means "I don't care about the fields,
162 just check that it's this variant." It matches any RateLimited value.
163
164 Python analogy: isinstance(self, (RateLimited, Network))
165 */
166 pub fn is_retryable(&self) -> bool {
167 matches!(self, Self::RateLimited { .. } | Self::Network(_))
168 }
169
170 /// If this is a rate limit with a server-specified retry delay, return it.
171 /*
172 ARCHITECTURE: Respecting server-specified retry delays (Retry-After header)
173
174 When an API returns HTTP 429 with a `Retry-After: 60` header, we should
175 wait exactly that long — not our computed backoff. The server knows its
176 own rate limit windows better than we do.
177
178 `retry_after_ms: Some(ms)` — only matches if the field is Some (not None).
179 `*ms` — dereferences the &u64 to get the u64 value.
180 `Duration::from_millis(*ms)` — wraps it in a Duration for the caller.
181
182 The caller in agent_loop.rs uses:
183 e.retry_after().unwrap_or_else(|| retry.delay_for_attempt(attempt))
184 meaning: "use server's delay if available, else compute our own."
185 */
186 pub fn retry_after(&self) -> Option<Duration> {
187 match self {
188 Self::RateLimited {
189 retry_after_ms: Some(ms), // match guard: only if retry_after_ms is Some
190 } => Some(Duration::from_millis(*ms)),
191 _ => None, // all other cases (including RateLimited { retry_after_ms: None })
192 }
193 }
194}
195
196/// Log a retry attempt.
197/*
198RUST QUIRK: `pub(crate)` visibility — "public within this crate only"
199
200`pub(crate)` is between `pub` (visible everywhere) and private (visible only in this module).
201It means: "any module in this crate can use this function, but external consumers cannot."
202
203Here, log_retry is called from agent_loop.rs (same crate) but shouldn't be part of
204the public API that library users see.
205
206`warn!()` is a logging macro from the `tracing` crate (similar to Python's logging.warning()).
207It uses the same syntax as println! but routes to the tracing subscriber configured by the user.
208`{:.1}` format: float with 1 decimal place → "2.5" not "2.500000"
209*/
210pub(crate) fn log_retry(
211 attempt: usize, // CURRENT — which attempt just failed (1-indexed; printed as "attempt X/Y")
212 max: usize, // TOTAL — RetryConfig::max_retries (the denominator in "attempt X/Y")
213 delay: &Duration, // WAIT — computed backoff delay before the next attempt (shown in seconds)
214 error: &ProviderError, // CAUSE — the error that triggered this retry (shown in the log message)
215) {
216 warn!(
217 "Provider error (attempt {}/{}), retrying in {:.1}s: {}",
218 attempt,
219 max,
220 delay.as_secs_f64(), // Duration → f64 seconds (e.g., 1500ms → 1.5)
221 error // uses ProviderError's Display impl
222 );
223}