Skip to main content

phi_core/provider/
retry.rs

1//! Retry with exponential backoff and jitter for provider calls.
2//!
3//! ARCHITECTURE NOTE: Why retry at the agent loop level (not the provider level)?
4//!
5//! Retrying is a cross-cutting concern — all 7 providers share the same retry logic.
6//! By handling it in stream_assistant_response() (agent_loop.rs), we avoid duplicating
7//! retry logic in every provider. Providers simply return ProviderError::RateLimited
8//! or ProviderError::Network, and this module decides what to do.
9
10use crate::provider::ProviderError;
11use serde::{Deserialize, Serialize};
12use std::time::Duration;
13use tracing::warn;
14
15/// Configuration for automatic retry of transient provider errors.
16///
17/// Defaults: 3 retries, 1s initial delay, 2x backoff, 30s max delay.
18/// Use `RetryConfig::none()` to disable retries entirely.
19/*
20ARCHITECTURE: Exponential backoff + jitter — why both?
21
22Exponential backoff (delay doubles each attempt) prevents thundering herd:
23if 1000 clients all hit a rate limit at the same time and all retry after
24exactly 1s, they'll hit the limit again simultaneously. Doubling adds space.
25
26Jitter (±20% random noise) prevents synchronized retries even with backoff:
27two clients with the same delay would still retry at the same moment.
28With jitter, their windows are offset, reducing server load spikes.
29
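Attempt 1: 1000ms * (0.8–1.2) = 800–1200ms
Attempt 2: 2000ms * (0.8–1.2) = 1600–2400ms
Attempt 3: 4000ms * (0.8–1.2) = 3200–4800ms
(The 30s cap only kicks in once the un-jittered delay would exceed 30000ms —
 attempt 6 and later with the default delay settings.)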
33*/
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryConfig {
    /// Maximum number of retry attempts (0 = no retries, fail immediately).
    pub max_retries: usize,
    /// Delay before the first retry, in milliseconds (e.g., 1000 = 1 second).
    /// This is the base value that `backoff_multiplier` scales on later attempts.
    pub initial_delay_ms: u64,
    /// Multiplier applied each attempt: delay[n] = initial * multiplier^(n-1).
    /// 2.0 = double each time (standard exponential backoff).
    pub backoff_multiplier: f64,
    /// Cap, in milliseconds, on the pre-jitter backoff delay — backoff stops growing beyond this.
    /// NOTE: `delay_for_attempt` multiplies by its jitter factor *after* applying this cap,
    /// so a single computed delay can come out up to ~20% above this value.
    pub max_delay_ms: u64,
}
46
47/*
48RUST QUIRK: `impl Default` — explicitly defining the "zero value"
49
50Rust has no constructor syntax. Instead, the `Default` trait provides `::default()`.
51We implement it manually here because the defaults are non-trivial constants.
52
53If all fields were 0/false/empty-string, we could use `#[derive(Default)]` to get
54it for free. But initial_delay_ms = 1000 and backoff_multiplier = 2.0 are not
55the "zero values" of u64 and f64 (which are 0 and 0.0 respectively).
56
57Usage examples:
58  let cfg = RetryConfig::default();               // 3 retries, 1s delay, 2x backoff
59  let cfg = RetryConfig { max_retries: 5, ..Default::default() }; // override one field
60  let cfg = RetryConfig::none();                  // 0 retries (disable retries)
61*/
62impl Default for RetryConfig {
63    fn default() -> Self {
64        Self {
65            max_retries: 3,
66            initial_delay_ms: 1000,
67            backoff_multiplier: 2.0,
68            max_delay_ms: 30_000, // numeric literal underscores: _ is ignored, just a readability separator
69        }
70    }
71}
72
73impl RetryConfig {
74    /// No retries — fail immediately on any error.
75    /*
76    RUST QUIRK: `..Default::default()` — struct update syntax
77
78    `Self { max_retries: 0, ..Default::default() }` means:
79      "construct a Self where max_retries = 0,
80       and all other fields come from Default::default()"
81
82    It's Rust's equivalent of Python's dataclasses.replace():
83      dataclasses.replace(RetryConfig(), max_retries=0)
84
85    This lets RetryConfig::none() reuse the sensible defaults for all other
86    fields, and only override what matters (max_retries = 0).
87    The order matters: named fields come first, `..expr` must be last.
88    */
89    pub fn none() -> Self {
90        Self {
91            max_retries: 0,
92            ..Default::default() // fill remaining fields from the default
93        }
94    }
95
96    /// Calculate the delay for a given attempt (1-indexed).
97    /// Uses exponential backoff with ±20% jitter.
98    /*
99    RUST QUIRK: Mixed numeric types require explicit casting (`as`)
100
101    `self.initial_delay_ms` is u64 (unsigned integer).
102    `self.backoff_multiplier` is f64 (floating point).
103    Rust won't mix them — you must explicitly cast.
104
105    `self.initial_delay_ms as f64` — widening cast: u64 → f64 (safe, no data loss)
106    `(attempt - 1) as i32` — narrowing cast: usize → i32 (safe for small values)
107    `(capped_ms * jitter) as u64` — narrowing cast: f64 → u64 (truncates fraction, no panic)
108
109    `powi(n: i32)` — integer power for f64:
110      2.0_f64.powi(3) = 8.0
111      This is more precise than powf(n as f64) for integer exponents.
112
113    `base_ms.min(max)` — clamp from above:
114      Python analogy: min(base_ms, self.max_delay_ms as f64)
115
116    `rand::random::<f64>()` — generate a random f64 in [0.0, 1.0).
117      The `::<f64>` is a "turbofish" — explicit type parameter at the call site.
118      Needed because random() is generic and Rust can't always infer the type.
119      Python analogy: random.random()
120    */
121    pub fn delay_for_attempt(&self, attempt: usize) -> Duration {
122        // base_ms: initial_delay * multiplier^(attempt-1)
123        // attempt is 1-indexed: attempt 1 → multiplier^0 = 1.0 → no extra delay
124        let base_ms =
125            self.initial_delay_ms as f64 * self.backoff_multiplier.powi((attempt - 1) as i32);
126        let capped_ms = base_ms.min(self.max_delay_ms as f64); // cap at max_delay_ms
127
128        // Jitter: multiply by a random factor in [0.8, 1.2) = ±20% noise
129        let jitter = 0.8 + rand::random::<f64>() * 0.4; // 0.4 range → [0.8, 1.2)
130        Duration::from_millis((capped_ms * jitter) as u64) // f64 → u64: truncates, never panics
131    }
132}
133
134/*
135RUST QUIRK: Adding methods to a type defined in another module — `impl OtherType`
136
137`ProviderError` is defined in provider/traits.rs, but we add retry-related methods
to it HERE in retry.rs. Rust allows an inherent `impl Type { ... }` block in any
module, as long as the type is defined in THIS crate (ProviderError is — it's in our crate).
(Owning the trait instead only matters for `impl Trait for Type` blocks — the orphan rule.)
141
142This is different from Python where methods must live in the class definition.
143In Rust, you can add methods to your own types from any module in the crate.
144The split is intentional: ProviderError is a pure type in provider/traits.rs;
145retry logic lives here in retry.rs (separation of concerns).
146*/
147impl ProviderError {
148    /// Whether this error is safe to retry.
149    ///
150    /// Retryable: rate limits (429) and network/transient errors.
151    /// Not retryable: auth errors, API errors (bad request), cancellation.
152    /*
153    RUST QUIRK: `matches!` macro — compact pattern matching returning bool
154
155    `matches!(self, Pattern1 | Pattern2)` is shorthand for:
156      match self {
157          Pattern1 | Pattern2 => true,
158          _ => false,
159      }
160
161    The `..` inside `RateLimited { .. }` means "I don't care about the fields,
162    just check that it's this variant." It matches any RateLimited value.
163
164    Python analogy: isinstance(self, (RateLimited, Network))
165    */
166    pub fn is_retryable(&self) -> bool {
167        matches!(self, Self::RateLimited { .. } | Self::Network(_))
168    }
169
170    /// If this is a rate limit with a server-specified retry delay, return it.
171    /*
172    ARCHITECTURE: Respecting server-specified retry delays (Retry-After header)
173
174    When an API returns HTTP 429 with a `Retry-After: 60` header, we should
175    wait exactly that long — not our computed backoff. The server knows its
176    own rate limit windows better than we do.
177
178    `retry_after_ms: Some(ms)` — only matches if the field is Some (not None).
179    `*ms` — dereferences the &u64 to get the u64 value.
180    `Duration::from_millis(*ms)` — wraps it in a Duration for the caller.
181
182    The caller in agent_loop.rs uses:
183      e.retry_after().unwrap_or_else(|| retry.delay_for_attempt(attempt))
184    meaning: "use server's delay if available, else compute our own."
185    */
186    pub fn retry_after(&self) -> Option<Duration> {
187        match self {
188            Self::RateLimited {
189                retry_after_ms: Some(ms), // match guard: only if retry_after_ms is Some
190            } => Some(Duration::from_millis(*ms)),
191            _ => None, // all other cases (including RateLimited { retry_after_ms: None })
192        }
193    }
194}
195
196/// Log a retry attempt.
197/*
198RUST QUIRK: `pub(crate)` visibility — "public within this crate only"
199
200`pub(crate)` is between `pub` (visible everywhere) and private (visible only in this module).
201It means: "any module in this crate can use this function, but external consumers cannot."
202
203Here, log_retry is called from agent_loop.rs (same crate) but shouldn't be part of
204the public API that library users see.
205
206`warn!()` is a logging macro from the `tracing` crate (similar to Python's logging.warning()).
207It uses the same syntax as println! but routes to the tracing subscriber configured by the user.
208`{:.1}` format: float with 1 decimal place → "2.5" not "2.500000"
209*/
210pub(crate) fn log_retry(
211    attempt: usize, // CURRENT — which attempt just failed (1-indexed; printed as "attempt X/Y")
212    max: usize,     // TOTAL   — RetryConfig::max_retries (the denominator in "attempt X/Y")
213    delay: &Duration, // WAIT    — computed backoff delay before the next attempt (shown in seconds)
214    error: &ProviderError, // CAUSE   — the error that triggered this retry (shown in the log message)
215) {
216    warn!(
217        "Provider error (attempt {}/{}), retrying in {:.1}s: {}",
218        attempt,
219        max,
220        delay.as_secs_f64(), // Duration → f64 seconds (e.g., 1500ms → 1.5)
221        error                // uses ProviderError's Display impl
222    );
223}