Skip to main content

reddb_server/runtime/ai/
provider_failover.rs

1//! `ProviderFailover` — pure ordered-list failover kernel for ASK.
2//!
3//! Issue #404 (PRD #391): when a user asks `ASK '...'` the runtime picks
4//! a provider from `ask.providers.fallback = ['groq', 'openai',
5//! 'anthropic']` (or per-query `USING 'a,b,c'`) and walks the list in
6//! order until one succeeds. Failover triggers on **retryable** outcomes
7//! — transport errors, 5xx, and timeouts. Authoritative errors like
8//! 4xx auth failures or content-policy refusals short-circuit: we do
9//! not paper over a bad key by silently switching vendors.
10//!
11//! Deep module: no I/O, no async, no clock. The caller supplies an
12//! attempt function `FnMut(&str) -> Result<R, AttemptError>` and we
13//! drive the loop. This keeps the kernel trivially testable with
14//! synchronous stubs and lets the eventual wiring slice plug in real
15//! HTTP transports without changing the policy logic.
16//!
17//! ## Why "retryable" is a closed set
18//!
19//! Failover is risky: if the second provider produces a different
20//! answer than the first, the user sees nondeterminism for what was
21//! supposed to be a deterministic ASK (#400). We only fail over when
22//! the first provider could not have produced *any* answer:
23//!
24//! - **Transport** — DNS, TCP, TLS, dropped connection. No response
25//!   bytes received, so no answer was committed.
26//! - **5xx** — provider acknowledged the request but admitted failure.
27//!   By HTTP convention, the resource is in an unknown/transient bad
28//!   state; safe to retry on a sibling.
29//! - **Timeout** — request exceeded the deadline. From our side the
30//!   call is over; whether the provider eventually completed is moot.
31//!
32//! Everything else — 4xx, malformed response, content-filter refusal,
33//! non-retryable provider-specific codes — is reported as-is. The
34//! caller turns those into the user-visible error.
35//!
36//! ## Preservation of determinism inputs
37//!
38//! `seed`, `temperature`, and `strict` are part of the request the
39//! caller passes to the attempt fn. The kernel is generic over the
40//! request payload, so by construction every attempt sees the same
41//! parameters. We do not "fix up" requests between attempts.
42//!
43//! ## Outcome shape
44//!
45//! On success: `(provider, response, prior_errors)`. We surface
46//! prior_errors so the audit log can record that, e.g., groq 502'd
47//! before openai answered — that's signal for capacity planning even
48//! when the user got a good answer.
49//!
//! On total failure: `FailoverExhausted { attempts }` where each entry
//! is `(provider, AttemptError)`. The HTTP layer maps this to 503 per
//! the acceptance criteria.
53
54use std::fmt;
55use std::time::Duration;
56
/// A classification of one attempt's failure.
///
/// `Transport`, `Status5xx`, and `Timeout` are retryable — the failover
/// loop moves to the next provider. `NonRetryable` aborts the loop
/// and is returned to the caller wrapped in the outcome.
///
/// Implements [`std::error::Error`] so it can be boxed or propagated
/// with `?` by callers that use trait-object errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AttemptError {
    /// Network-level failure before/while receiving a response.
    /// Examples: DNS resolution failure, connection refused, TLS error,
    /// socket reset mid-stream. `String` is a short human description
    /// suitable for audit.
    Transport(String),
    /// Provider returned a 5xx response. Carries the actual status code
    /// (e.g. 502, 503, 504) and a short body excerpt.
    Status5xx { code: u16, body: String },
    /// Per-request deadline elapsed before completion.
    Timeout(Duration),
    /// Authoritative error that must NOT trigger failover. Examples:
    /// 4xx auth failure (wrong API key), 4xx quota exhausted on the
    /// account level, content-policy refusal, malformed response we
    /// cannot recover from. The kernel returns immediately when it
    /// sees this — there is no value in asking another provider when
    /// the request itself is bad.
    NonRetryable(String),
}

impl AttemptError {
    /// Whether this error should trigger advancement to the next provider.
    ///
    /// Returns `true` for `Transport`, `Status5xx`, and `Timeout`, and
    /// `false` for `NonRetryable`.
    #[must_use]
    pub fn is_retryable(&self) -> bool {
        // Exhaustive on purpose: adding a variant must force an explicit
        // retryable / non-retryable decision instead of silently
        // defaulting to "do not fail over".
        match self {
            AttemptError::Transport(_)
            | AttemptError::Status5xx { .. }
            | AttemptError::Timeout(_) => true,
            AttemptError::NonRetryable(_) => false,
        }
    }
}

impl fmt::Display for AttemptError {
    /// Renders a short, single-line description of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            AttemptError::Transport(msg) => write!(f, "transport: {msg}"),
            AttemptError::Status5xx { code, body } => write!(f, "http {code}: {body}"),
            AttemptError::Timeout(d) => write!(f, "timeout after {}ms", d.as_millis()),
            AttemptError::NonRetryable(msg) => write!(f, "non-retryable: {msg}"),
        }
    }
}

// No underlying `source()`: every variant carries only a description.
impl std::error::Error for AttemptError {}
103
/// Successful failover result. `prior_errors` lists every retryable
/// failure we walked through to get here — useful for audit but not
/// for user output.
///
/// Generic over `R`, the value the caller's attempt function returns
/// on success; the kernel stores it without inspecting it.
#[derive(Debug, Clone, PartialEq)]
pub struct FailoverSuccess<R> {
    /// Name of the provider whose attempt returned `Ok`.
    pub provider: String,
    /// The value that successful attempt returned, passed through as-is.
    pub response: R,
    /// `(provider, error)` pairs for the failures that preceded the
    /// success, in the order the providers were tried. Empty when the
    /// first provider succeeded.
    pub prior_errors: Vec<(String, AttemptError)>,
}
113
114/// All-providers-exhausted result. The HTTP layer maps this to 503 per
115/// the acceptance criteria; `attempts` becomes the visible list of
116/// providers that were tried and how each one failed.
117#[derive(Debug, Clone, PartialEq)]
118pub struct FailoverExhausted {
119    pub attempts: Vec<(String, AttemptError)>,
120}
121
122impl fmt::Display for FailoverExhausted {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        write!(f, "all providers failed:")?;
125        for (provider, err) in &self.attempts {
126            write!(f, " [{provider}: {err}]")?;
127        }
128        Ok(())
129    }
130}
131
132/// Walk `providers` in order. For each, invoke `attempt`. The first
133/// `Ok` short-circuits and is returned with the trail of prior
134/// retryable errors. A `NonRetryable` short-circuits to
135/// `Err(FailoverExhausted)` containing the attempts up to and including
136/// the non-retryable one — we do not pretend more providers were tried.
137/// Retryable failures advance to the next provider.
138///
139/// Empty `providers` returns `Err(FailoverExhausted { attempts: [] })`.
140/// The HTTP layer should treat that as a config error, not a 503; the
141/// kernel does not encode that distinction.
142pub fn run<R, F>(
143    providers: &[&str],
144    mut attempt: F,
145) -> Result<FailoverSuccess<R>, FailoverExhausted>
146where
147    F: FnMut(&str) -> Result<R, AttemptError>,
148{
149    let mut prior: Vec<(String, AttemptError)> = Vec::new();
150
151    for provider in providers {
152        match attempt(provider) {
153            Ok(response) => {
154                return Ok(FailoverSuccess {
155                    provider: (*provider).to_string(),
156                    response,
157                    prior_errors: prior,
158                });
159            }
160            Err(err) => {
161                let retryable = err.is_retryable();
162                prior.push(((*provider).to_string(), err));
163                if !retryable {
164                    return Err(FailoverExhausted { attempts: prior });
165                }
166            }
167        }
168    }
169
170    Err(FailoverExhausted { attempts: prior })
171}
172
/// Turn the text of a `USING 'a,b,c'` override into an ordered list of
/// provider names.
///
/// The input is split on commas. Each segment is trimmed of
/// surrounding whitespace, segments that are empty after trimming are
/// skipped, and a name that already appeared earlier is ignored so the
/// first occurrence fixes its position — the user's ordering is
/// honored, never rearranged.
///
/// Returns `None` when no provider name remains; the caller then falls
/// back to the global `ask.providers.fallback` setting.
pub fn parse_using_clause(raw: &str) -> Option<Vec<String>> {
    let mut providers: Vec<String> = Vec::new();

    let names = raw
        .split(',')
        .map(str::trim)
        .filter(|name| !name.is_empty());

    for name in names {
        let already_listed = providers.iter().any(|seen| seen == name);
        if !already_listed {
            providers.push(name.to_owned());
        }
    }

    if providers.is_empty() {
        return None;
    }
    Some(providers)
}
197
/// Unit tests for the failover kernel: error classification, the
/// `run` loop's success and failure paths, `USING` clause parsing, and
/// the audit-facing `Display` output.
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::RefCell;

    // --- AttemptError classification ------------------------------------

    #[test]
    fn transport_is_retryable() {
        assert!(AttemptError::Transport("dns".into()).is_retryable());
    }

    #[test]
    fn status_5xx_is_retryable() {
        assert!(AttemptError::Status5xx {
            code: 502,
            body: "bad gateway".into()
        }
        .is_retryable());
    }

    #[test]
    fn timeout_is_retryable() {
        assert!(AttemptError::Timeout(Duration::from_secs(30)).is_retryable());
    }

    #[test]
    fn non_retryable_is_not_retryable() {
        assert!(!AttemptError::NonRetryable("401 unauthorized".into()).is_retryable());
    }

    // --- run() success paths --------------------------------------------

    #[test]
    fn first_provider_succeeds_no_prior_errors() {
        let providers = ["groq", "openai", "anthropic"];
        let result = run(&providers, |p| {
            Ok::<_, AttemptError>(format!("answer from {p}"))
        });
        let ok = result.expect("should succeed");
        assert_eq!(ok.provider, "groq");
        assert_eq!(ok.response, "answer from groq");
        assert!(ok.prior_errors.is_empty());
    }

    #[test]
    fn second_provider_succeeds_after_5xx() {
        // Acceptance: integration test with two stub providers where
        // the first errors and the second succeeds.
        let providers = ["groq", "openai"];
        let calls = RefCell::new(0u32);
        let result = run(&providers, |p| {
            *calls.borrow_mut() += 1;
            if p == "groq" {
                Err(AttemptError::Status5xx {
                    code: 502,
                    body: "bad gateway".into(),
                })
            } else {
                Ok(format!("answer from {p}"))
            }
        });
        let ok = result.expect("should succeed");
        assert_eq!(ok.provider, "openai");
        assert_eq!(ok.response, "answer from openai");
        assert_eq!(*calls.borrow(), 2);
        // The audit trail must carry the exact error, not just a count.
        assert_eq!(
            ok.prior_errors,
            vec![(
                "groq".to_string(),
                AttemptError::Status5xx {
                    code: 502,
                    body: "bad gateway".into(),
                }
            )]
        );
    }

    #[test]
    fn third_provider_succeeds_after_transport_and_timeout() {
        let providers = ["groq", "openai", "anthropic"];
        let result = run(&providers, |p| match p {
            "groq" => Err(AttemptError::Transport("connection reset".into())),
            "openai" => Err(AttemptError::Timeout(Duration::from_secs(30))),
            _ => Ok(format!("answer from {p}")),
        });
        let ok = result.expect("should succeed");
        assert_eq!(ok.provider, "anthropic");
        assert_eq!(ok.response, "answer from anthropic");
        assert_eq!(
            ok.prior_errors,
            vec![
                (
                    "groq".to_string(),
                    AttemptError::Transport("connection reset".into())
                ),
                (
                    "openai".to_string(),
                    AttemptError::Timeout(Duration::from_secs(30))
                ),
            ]
        );
    }

    // --- run() failure paths --------------------------------------------

    #[test]
    fn all_retryable_failures_exhausts_with_full_attempt_list() {
        let providers = ["groq", "openai", "anthropic"];
        let result = run::<String, _>(&providers, |p| {
            Err(AttemptError::Status5xx {
                code: 503,
                body: format!("{p} unavailable"),
            })
        });
        let exhausted = result.expect_err("should exhaust");
        assert_eq!(exhausted.attempts.len(), 3);
        // Each entry pairs the provider with the error that provider
        // produced, in attempt order.
        for (entry, expected) in exhausted.attempts.iter().zip(providers) {
            assert_eq!(entry.0, expected);
            assert_eq!(
                entry.1,
                AttemptError::Status5xx {
                    code: 503,
                    body: format!("{expected} unavailable"),
                }
            );
        }
    }

    #[test]
    fn non_retryable_short_circuits_without_trying_remaining() {
        // 401 from the first provider must NOT be papered over by
        // silently switching to the next vendor. The user sees the
        // auth error directly.
        let providers = ["groq", "openai", "anthropic"];
        let calls = RefCell::new(0u32);
        let result = run::<String, _>(&providers, |p| {
            *calls.borrow_mut() += 1;
            if p == "groq" {
                Err(AttemptError::NonRetryable("401 unauthorized".into()))
            } else {
                panic!("must not call sibling providers after non-retryable")
            }
        });
        let exhausted = result.expect_err("should short-circuit");
        assert_eq!(*calls.borrow(), 1);
        assert_eq!(
            exhausted.attempts,
            vec![(
                "groq".to_string(),
                AttemptError::NonRetryable("401 unauthorized".into())
            )]
        );
    }

    #[test]
    fn non_retryable_after_retryable_preserves_full_trail() {
        // 502 from groq, then 401 from openai — the audit log should
        // see both, and anthropic must not be called.
        let providers = ["groq", "openai", "anthropic"];
        let calls = RefCell::new(Vec::<String>::new());
        let result = run::<String, _>(&providers, |p| {
            calls.borrow_mut().push(p.to_string());
            match p {
                "groq" => Err(AttemptError::Status5xx {
                    code: 502,
                    body: "bad".into(),
                }),
                "openai" => Err(AttemptError::NonRetryable("401".into())),
                _ => panic!("anthropic must not be called"),
            }
        });
        let exhausted = result.expect_err("should fail");
        assert_eq!(*calls.borrow(), vec!["groq", "openai"]);
        // Check content and order, not only the length, so a trail that
        // dropped or reordered an entry would be caught.
        assert_eq!(
            exhausted.attempts,
            vec![
                (
                    "groq".to_string(),
                    AttemptError::Status5xx {
                        code: 502,
                        body: "bad".into(),
                    }
                ),
                ("openai".to_string(), AttemptError::NonRetryable("401".into())),
            ]
        );
    }

    #[test]
    fn empty_provider_list_returns_empty_exhausted() {
        let providers: [&str; 0] = [];
        let result = run::<String, _>(&providers, |_| panic!("must not be called"));
        let exhausted = result.expect_err("empty list yields exhausted");
        assert!(exhausted.attempts.is_empty());
    }

    // --- determinism preservation ---------------------------------------

    #[test]
    fn attempt_fn_is_invoked_with_identical_inputs() {
        // The kernel does not modify any per-request state between
        // attempts. We verify by capturing a request payload struct
        // and asserting equality across calls.
        #[derive(Clone, PartialEq, Debug)]
        struct Req {
            seed: u64,
            temperature: f32,
            strict: bool,
        }
        let req = Req {
            seed: 42,
            temperature: 0.0,
            strict: true,
        };
        let providers = ["groq", "openai"];
        let seen = RefCell::new(Vec::<Req>::new());
        let _ = run::<(), _>(&providers, |_| {
            seen.borrow_mut().push(req.clone());
            Err(AttemptError::Transport("retry".into()))
        });
        let seen = seen.borrow();
        assert_eq!(seen.len(), 2);
        assert_eq!(seen[0], seen[1]);
        assert_eq!(seen[0], req);
    }

    // --- USING clause parsing -------------------------------------------

    #[test]
    fn parse_using_simple() {
        assert_eq!(
            parse_using_clause("groq,openai"),
            Some(vec!["groq".into(), "openai".into()])
        );
    }

    #[test]
    fn parse_using_trims_whitespace() {
        assert_eq!(
            parse_using_clause("  groq , openai , anthropic  "),
            Some(vec!["groq".into(), "openai".into(), "anthropic".into()])
        );
    }

    #[test]
    fn parse_using_drops_empty_segments() {
        assert_eq!(
            parse_using_clause("groq,,openai,"),
            Some(vec!["groq".into(), "openai".into()])
        );
    }

    #[test]
    fn parse_using_dedupes_preserving_first_occurrence() {
        assert_eq!(
            parse_using_clause("groq,openai,groq"),
            Some(vec!["groq".into(), "openai".into()])
        );
    }

    #[test]
    fn parse_using_empty_returns_none() {
        assert_eq!(parse_using_clause(""), None);
        assert_eq!(parse_using_clause(" , , "), None);
    }

    #[test]
    fn parse_using_single_provider() {
        assert_eq!(parse_using_clause("groq"), Some(vec!["groq".into()]));
    }

    // --- Display impls (audit-facing) -----------------------------------

    #[test]
    fn attempt_error_display_covers_every_variant() {
        assert_eq!(
            AttemptError::Transport("dns".into()).to_string(),
            "transport: dns"
        );
        assert_eq!(
            AttemptError::Status5xx {
                code: 502,
                body: "bad".into(),
            }
            .to_string(),
            "http 502: bad"
        );
        assert_eq!(
            AttemptError::Timeout(Duration::from_secs(30)).to_string(),
            "timeout after 30000ms"
        );
        assert_eq!(
            AttemptError::NonRetryable("401".into()).to_string(),
            "non-retryable: 401"
        );
    }

    #[test]
    fn exhausted_display_lists_each_attempt() {
        let exhausted = FailoverExhausted {
            attempts: vec![
                ("groq".into(), AttemptError::Transport("dns".into())),
                (
                    "openai".into(),
                    AttemptError::Status5xx {
                        code: 502,
                        body: "bad".into(),
                    },
                ),
            ],
        };
        // Pin the exact audit-facing format, including attempt order.
        assert_eq!(
            exhausted.to_string(),
            "all providers failed: [groq: transport: dns] [openai: http 502: bad]"
        );
    }

    #[test]
    fn exhausted_display_with_no_attempts() {
        let exhausted = FailoverExhausted { attempts: vec![] };
        assert_eq!(exhausted.to_string(), "all providers failed:");
    }
}