revka 2026.6.22

use super::Provider;
use super::traits::{
    ChatMessage, ChatRequest, ChatResponse, StreamChunk, StreamEvent, StreamOptions, StreamResult,
};
use async_trait::async_trait;
use futures_util::{StreamExt, stream};
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;
// Atomics are only used by the test mocks now that key rotation was removed (#426).
#[cfg(test)]
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;

// ── Provider Fallback Notification ──────────────────────────────────────
// When ReliableProvider uses a fallback (different provider or model than
// requested), it records the details here so channel code can notify the user.
// Uses tokio::task_local to avoid cross-request leakage between concurrent
// users (the old global static had a race window).

/// Info about a provider fallback that occurred during a request.
///
/// Pairs what the caller asked for (`requested_*`) with what actually served
/// the request (`actual_*`). A value is stored by `record_provider_fallback`
/// and handed out (once) by `take_last_provider_fallback`.
#[derive(Debug, Clone)]
pub struct ProviderFallbackInfo {
    /// Provider that was originally requested.
    pub requested_provider: String,
    /// Model that was originally requested.
    pub requested_model: String,
    /// Provider that actually served the request.
    pub actual_provider: String,
    /// Model that actually served the request.
    pub actual_model: String,
}

// Task-local slot holding the most recently recorded fallback, if any. The
// slot is installed by `scope_provider_fallback`; the accessors in this module
// use `try_with`, so outside such a scope they do nothing instead of panicking.
tokio::task_local! {
    static PROVIDER_FALLBACK: RefCell<Option<ProviderFallbackInfo>>;
}

/// Consume the fallback recorded for the current task, leaving the slot empty.
///
/// Returns `None` when nothing was recorded, or when called outside a
/// `scope_provider_fallback` scope (the task-local is not set there).
pub fn take_last_provider_fallback() -> Option<ProviderFallbackInfo> {
    match PROVIDER_FALLBACK.try_with(|slot| slot.borrow_mut().take()) {
        Ok(recorded) => recorded,
        // Not inside a fallback scope: there is nothing to hand out.
        Err(_) => None,
    }
}

/// Drive `future` to completion inside a fresh provider-fallback scope.
///
/// The scope starts with an empty slot. `record_provider_fallback` (called
/// from within `ReliableProvider`) and `take_last_provider_fallback` (called
/// by channel code afterwards) only see each other's data when both run
/// inside the same scope.
pub async fn scope_provider_fallback<F: std::future::Future>(future: F) -> F::Output {
    let empty_slot = RefCell::new(None);
    PROVIDER_FALLBACK.scope(empty_slot, future).await
}

/// Store a fallback event in the current task's slot, replacing any earlier one.
///
/// Silently does nothing when called outside a `scope_provider_fallback`
/// scope (the `try_with` error is deliberately ignored).
fn record_provider_fallback(
    requested_provider: &str,
    requested_model: &str,
    actual_provider: &str,
    actual_model: &str,
) {
    let _ = PROVIDER_FALLBACK.try_with(|slot| {
        let info = ProviderFallbackInfo {
            requested_provider: requested_provider.to_owned(),
            requested_model: requested_model.to_owned(),
            actual_provider: actual_provider.to_owned(),
            actual_model: actual_model.to_owned(),
        };
        // Only the most recent fallback is kept; the previous value is dropped.
        slot.replace(Some(info));
    });
}

// ── Error Classification ─────────────────────────────────────────────────
// Errors are split into retryable (transient server/network failures) and
// non-retryable (permanent client errors). This distinction drives whether
// the retry loop continues, falls back to the next provider, or aborts
// immediately — avoiding wasted latency on errors that cannot self-heal.

/// Check if an error is non-retryable (client errors that won't resolve with retries).
pub fn is_non_retryable(err: &anyhow::Error) -> bool {
    // Context window errors are NOT non-retryable — they can be recovered
    // by truncating conversation history, so let the retry loop handle them.
    if is_context_window_exceeded(err) {
        return false;
    }

    // Tool schema validation errors are NOT non-retryable — the provider's
    // built-in fallback in compatible.rs can recover by switching to
    // prompt-guided tool instructions.
    if is_tool_schema_error(err) {
        return false;
    }

    // 4xx errors are generally non-retryable (bad request, auth failure, etc.),
    // except 429 (rate-limit — transient) and 408 (timeout — worth retrying).
    if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
        if let Some(status) = reqwest_err.status() {
            let code = status.as_u16();
            return status.is_client_error() && code != 429 && code != 408;
        }
    }
    // Fallback: parse status codes from stringified errors (some providers
    // embed codes in error messages rather than returning typed HTTP errors).
    // Classify on the *primary* status token only — not the first in-range
    // digit run anywhere in the string — so a transient 5xx whose body embeds
    // a 4xx-looking token or a trace/request-id number in 400–499 is not
    // misclassified as non-retryable.
    let msg = err.to_string();
    if let Some(code) = primary_http_status(&msg) {
        if (400..500).contains(&code) {
            return code != 429 && code != 408;
        }
        // Any other status (1xx/2xx/3xx/5xx) is treated as retryable here and
        // falls through to the keyword heuristics below.
    }

    // Heuristic: detect auth/model failures by keyword when no HTTP status
    // is available (e.g. gRPC or custom transport errors).
    let msg_lower = msg.to_lowercase();
    let auth_failure_hints = [
        "invalid api key",
        "incorrect api key",
        "missing api key",
        "api key not set",
        "authentication failed",
        "auth failed",
        "unauthorized",
        "forbidden",
        "permission denied",
        "access denied",
        "invalid token",
    ];

    if auth_failure_hints
        .iter()
        .any(|hint| msg_lower.contains(hint))
    {
        return true;
    }

    msg_lower.contains("model")
        && (msg_lower.contains("not found")
            || msg_lower.contains("unknown")
            || msg_lower.contains("unsupported")
            || msg_lower.contains("does not exist")
            || msg_lower.contains("invalid"))
}

/// Extract the *primary* HTTP status code from a stringified provider error.
///
/// Providers emit errors in a canonical `"… API error (NNN …): …"` /
/// `"HTTP NNN"` shape where the status code is the first token after the `(`
/// or the `HTTP ` keyword. Both anchor kinds are tried together in *message
/// order*, so the leading status always wins: `HTTP 503 … (404 …)` yields 503,
/// not the 4xx-looking token embedded later in the body.
///
/// When no anchor yields a status, the first *standalone* 3-digit token in
/// 100–599 is used instead (see the comment on the fallback scan below).
///
/// Returns `None` when the message contains no status-looking token.
fn primary_http_status(msg: &str) -> Option<u16> {
    const HTTP_KEYWORD: &str = "HTTP ";

    // Collect the offset just past every `(` and every `HTTP ` keyword (both
    // ASCII, so the offsets are char boundaries), then order them by position.
    // Trying all `(` anchors before any `HTTP ` anchor would let a later
    // parenthesised body token override an earlier `HTTP NNN` status.
    let mut anchors: Vec<usize> = msg
        .match_indices('(')
        .map(|(i, _)| i + 1)
        .chain(
            msg.match_indices(HTTP_KEYWORD)
                .map(|(i, _)| i + HTTP_KEYWORD.len()),
        )
        .collect();
    anchors.sort_unstable();
    if let Some(code) = anchors
        .into_iter()
        .find_map(|start| parse_status_at(&msg[start..]))
    {
        return Some(code);
    }

    // No status-context anchor found: fall back to the first *standalone* 3-digit
    // status token (100–599) so the leading status wins over later digits. A
    // digit run glued to a URL on its left — a non-default port (`:456`), or a
    // host/path segment — is NOT a status and must be skipped, or a transient
    // transport error like `error sending request for url
    // (https://host:456/v1): connection refused` (whose only 4xx-band digits are
    // the port) would be misread as a 4xx and wrongly classified non-retryable.
    // Real status tokens in these messages are delimited by whitespace/`(`,
    // never glued to `:` / `/` / `.` / `-` / an identifier character.
    let bytes = msg.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if !bytes[i].is_ascii_digit() {
            i += 1;
            continue;
        }
        let start = i;
        while i < bytes.len() && bytes[i].is_ascii_digit() {
            i += 1;
        }
        // Only a 3-digit run can be an HTTP status (a longer run is a port,
        // id, or count).
        if i - start != 3 {
            continue;
        }
        // Skip runs glued to a URL/identifier character on the left. `start`
        // sits on an ASCII digit, so slicing there is always a char boundary.
        let glued_left = msg[..start]
            .chars()
            .next_back()
            .is_some_and(|c| c.is_ascii_alphanumeric() || matches!(c, ':' | '/' | '.' | '-' | '_'));
        if glued_left {
            continue;
        }
        if let Ok(code) = msg[start..i].parse::<u16>() {
            if (100..600).contains(&code) {
                return Some(code);
            }
        }
    }
    None
}

/// Read an HTTP status from the very start of `s`.
///
/// Succeeds only when `s` begins with a run of *exactly* three ASCII digits
/// (a longer run is some other number) whose value lies in 100–599.
fn parse_status_at(s: &str) -> Option<u16> {
    // ASCII digits are single bytes, so counting bytes equals counting chars.
    let run_len = s.bytes().take_while(u8::is_ascii_digit).count();
    if run_len != 3 {
        return None;
    }
    let code: u16 = s[..run_len].parse().ok()?;
    if (100..600).contains(&code) {
        Some(code)
    } else {
        None
    }
}

/// Detect a tool-schema validation failure (e.g. Groq returning
/// "tool call validation failed: attempted to call tool '...' which was not in request").
///
/// Matching is a case-insensitive substring search on the error's display
/// text. Such errors must stay retryable because the provider's built-in
/// fallback (`compatible.rs::is_native_tool_schema_unsupported`) can recover
/// by switching to prompt-guided tool instructions.
pub fn is_tool_schema_error(err: &anyhow::Error) -> bool {
    const TOOL_SCHEMA_HINTS: &[&str] = &[
        "tool call validation failed",
        "was not in request",
        "not found in tool list",
        "invalid_tool_call",
    ];

    let text = err.to_string().to_lowercase();
    TOOL_SCHEMA_HINTS.iter().any(|needle| text.contains(*needle))
}

/// Detect a "prompt too large for the model's context window" error.
///
/// Matching is a case-insensitive substring search on the error's display
/// text against a fixed list of known provider phrasings.
pub(crate) fn is_context_window_exceeded(err: &anyhow::Error) -> bool {
    const CONTEXT_WINDOW_HINTS: &[&str] = &[
        "exceeds the context window",
        "exceeds the available context size",
        "context window of this model",
        "maximum context length",
        "context length exceeded",
        "too many tokens",
        "token limit exceeded",
        "prompt is too long",
        "input is too long",
        "prompt exceeds max length",
    ];

    let text = err.to_string().to_lowercase();
    for needle in CONTEXT_WINDOW_HINTS {
        if text.contains(*needle) {
            return true;
        }
    }
    false
}

/// Check if an error is a rate-limit (429) error.
fn is_rate_limited(err: &anyhow::Error) -> bool {
    if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
        if let Some(status) = reqwest_err.status() {
            return status.as_u16() == 429;
        }
    }
    let msg = err.to_string();
    msg.contains("429")
        && (msg.contains("Too Many") || msg.contains("rate") || msg.contains("limit"))
}

/// Report whether a 429 is really a business/quota-plan error that no amount
/// of retrying can fix.
///
/// Examples:
/// - plan does not include requested model
/// - insufficient balance / package not active
/// - known provider business codes (e.g. Z.AI: 1311, 1113)
///
/// Always `false` for errors that are not rate-limited in the first place.
fn is_non_retryable_rate_limit(err: &anyhow::Error) -> bool {
    const BUSINESS_HINTS: &[&str] = &[
        "plan does not include",
        "doesn't include",
        "not include",
        "insufficient balance",
        "insufficient_balance",
        "insufficient quota",
        "insufficient_quota",
        "quota exhausted",
        "out of credits",
        "no available package",
        "package not active",
        "purchase package",
        "model not available for your plan",
    ];

    if !is_rate_limited(err) {
        return false;
    }

    let lowered = err.to_string().to_lowercase();
    let mentions_business_limit = BUSINESS_HINTS.iter().any(|hint| lowered.contains(*hint));

    // Known provider business codes observed for 429 where retry is futile.
    // Only the JSON `"code"` field is consulted (e.g. Z.AI emits
    // `{"code":1311,...}`), never arbitrary digit tokens — a trace/request-id,
    // count, or model fragment that happens to be 1113/1311 must not flag an
    // otherwise-transient 429 as non-retryable.
    mentions_business_limit || matches!(json_code_field(&lowered), Some(1113 | 1311))
}

/// Pull the numeric value of a JSON `"code"` field out of a stringified
/// provider error body, e.g. `{"code":1311,...}` → `Some(1311)`.
///
/// Whitespace around the colon is tolerated. Occurrences of `"code"` whose
/// value is not a bare unsigned integer that fits in `u32` (a string, a
/// negative number, an overflowing number) are skipped and the search moves
/// on to the next occurrence. Numbers elsewhere in the message (ids, counts,
/// model fragments) are never considered.
fn json_code_field(msg: &str) -> Option<u32> {
    const KEY: &str = "\"code\"";

    let mut remaining = msg;
    loop {
        // `?` ends the search once no further `"code"` key exists.
        let key_at = remaining.find(KEY)?;
        let tail = remaining[key_at + KEY.len()..].trim_start();

        if let Some(value) = tail.strip_prefix(':') {
            let value = value.trim_start();
            let digits_end = value
                .find(|c: char| !c.is_ascii_digit())
                .unwrap_or(value.len());
            if let Ok(code) = value[..digits_end].parse::<u32>() {
                return Some(code);
            }
        }

        // Resume just past this key so the next iteration makes progress.
        remaining = tail;
    }
}

/// Try to extract a Retry-After value (in milliseconds) from an error message.
///
/// Recognizes `Retry-After: 5` / `retry_after: 2.5` style fields (with `:` or
/// a space separator), plus the phrasings Gemini uses in 429 bodies:
/// cloudcode-pa says "Your quota will reset after 32s." and
/// generativelanguage says "Please retry in 26.3s." (the digit scan stops at
/// the trailing "s"). Matching is case-insensitive.
///
/// Returns `None` when no recognized prefix is followed by a number that
/// parses as seconds and fits in a `Duration` / `u64` of milliseconds. The
/// message is untrusted provider text, so this function never panics on it.
fn parse_retry_after_ms(err: &anyhow::Error) -> Option<u64> {
    // All searching AND slicing happens on the lowercased copy. Lowercasing
    // can change byte lengths for some non-ASCII characters, so offsets found
    // in `lower` are not valid offsets into the original text; digits and `.`
    // are unaffected by lowercasing, so nothing is lost by parsing from `lower`.
    let lower = err.to_string().to_lowercase();

    for prefix in &[
        "retry-after:",
        "retry_after:",
        "retry-after ",
        "retry_after ",
        "reset after ",
        "retry in ",
    ] {
        if let Some(pos) = lower.find(prefix) {
            let after = &lower[pos + prefix.len()..];
            let num_str: String = after
                .trim()
                .chars()
                .take_while(|c| c.is_ascii_digit() || *c == '.')
                .collect();
            if let Ok(secs) = num_str.parse::<f64>() {
                // `try_from_secs_f64` rejects negative, non-finite, and
                // overflowing values instead of panicking like `from_secs_f64`
                // (a long digit run parses to a finite but enormous f64).
                if let Ok(delay) = Duration::try_from_secs_f64(secs) {
                    if let Ok(value) = u64::try_from(delay.as_millis()) {
                        return Some(value);
                    }
                }
            }
        }
    }
    None
}

/// Map the two classification flags to the short reason label used in
/// failure diagnostics.
fn failure_reason(rate_limited: bool, non_retryable: bool) -> &'static str {
    match (rate_limited, non_retryable) {
        (true, true) => "rate_limited_non_retryable",
        (true, false) => "rate_limited",
        (false, true) => "non_retryable",
        (false, false) => "retryable",
    }
}

/// Render `err` as a single-line, sanitized diagnostic string.
///
/// The full error chain is formatted, passed through
/// `super::sanitize_api_error`, and every run of whitespace (including
/// newlines) is collapsed to one space.
fn compact_error_detail(err: &anyhow::Error) -> String {
    // `{:#}` includes the whole chain (root cause), not just the top message.
    let full_chain = format!("{:#}", err);
    let sanitized = super::sanitize_api_error(&full_chain);
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    words.join(" ")
}

/// Truncate conversation history by dropping the oldest half of the
/// non-system messages, in place.
///
/// Every message whose role is `"system"` is kept, and so is at least the
/// newest non-system message: with fewer than two non-system messages nothing
/// is removed. The relative order of the surviving messages is unchanged.
///
/// Returns the number of messages dropped (`0` when nothing was removed).
fn truncate_for_context(messages: &mut Vec<ChatMessage>) -> usize {
    let non_system_total = messages.iter().filter(|m| m.role != "system").count();

    // Keep at least the last non-system message (most recent user turn).
    if non_system_total <= 1 {
        return 0;
    }

    // Drop the oldest half (rounded down) of the non-system messages.
    let drop_count = non_system_total / 2;

    // `retain` visits each element exactly once, in order, so the first
    // `drop_count` non-system messages it sees are precisely the oldest ones.
    // One pass replaces the index list plus repeated `Vec::remove` shifting.
    let mut dropped = 0usize;
    messages.retain(|m| {
        if dropped < drop_count && m.role != "system" {
            dropped += 1;
            false
        } else {
            true
        }
    });

    drop_count
}

/// Append one formatted attempt record to the `failures` diagnostic trail.
///
/// The entry names the provider, model, attempt counter (`attempt` of
/// `max_attempts`), the classified `reason`, and the error detail text.
fn push_failure(
    failures: &mut Vec<String>,
    provider_name: &str,
    model: &str,
    attempt: u32,
    max_attempts: u32,
    reason: &str,
    error_detail: &str,
) {
    let entry = format!(
        "provider={provider_name} model={model} attempt {attempt}/{max_attempts}: {reason}; error={error_detail}"
    );
    failures.push(entry);
}

/// Randomize a backoff interval with bounded "equal jitter" so concurrent
/// retriers spread out instead of re-firing in lockstep (thundering-herd
/// avoidance).
///
/// The result lies in `[base/2, base]`: the lower half of the interval is
/// fixed (so retries never collapse toward zero) and the upper half is
/// random. A `base` of `0` is returned unchanged.
fn jitter_backoff(base: u64) -> u64 {
    let fixed_part = base / 2;
    // ceil(base / 2): the width of the randomized upper half.
    let random_part = base - fixed_part;
    match random_part {
        // Only reachable for base == 0; nothing to randomize.
        0 => base,
        span => fixed_part + (rand::random::<u64>() % (span + 1)),
    }
}

/// Pick the wait (in milliseconds) before the next attempt after `err`.
///
/// * When the error text carries a server `Retry-After`, that value is used
///   without jitter, capped at 30s and never shorter than `base`.
/// * Otherwise `base` is randomized with bounded equal jitter, giving a wait
///   in `[base/2, base]`.
///
/// This is the single source of truth shared by the non-streaming
/// `compute_backoff` and the streaming failover task (which cannot call a
/// `&self` method from its detached `'static` body).
fn next_backoff(base: u64, err: &anyhow::Error) -> u64 {
    match parse_retry_after_ms(err) {
        // The server said exactly when to retry, so honor it without jitter.
        Some(server_hint_ms) => server_hint_ms.min(30_000).max(base),
        // No hint: desynchronize concurrent retriers so they don't hammer an
        // already-struggling provider in lockstep (retry storm).
        None => jitter_backoff(base),
    }
}

// ── Resilient Provider Wrapper ────────────────────────────────────────────
// Three-level failover strategy: model chain → provider chain → retry loop.
//   Outer loop:  iterate model fallback chain (original model first, then
//                configured alternatives).
//   Middle loop: iterate registered providers in priority order.
//   Inner loop:  retry the same (provider, model) pair with exponential
//                backoff on rate-limit / transient errors.
// Loop invariant: `failures` accumulates every failed attempt so the final
// error message gives operators a complete diagnostic trail.

/// Provider wrapper with retry, fallback, and model failover.
pub struct ReliableProvider {
    /// Named providers, tried in the order given to `new`. Held behind `Arc`
    /// so the streaming failover task can clone the handles.
    providers: Vec<(String, Arc<dyn Provider>)>,
    /// Retries per (provider, model) pair; each pair gets up to
    /// `max_retries + 1` attempts.
    max_retries: u32,
    /// Starting backoff in milliseconds (`new` raises it to at least 50).
    base_backoff_ms: u64,
    /// Per-model fallback chains: model_name → [fallback_model_1, fallback_model_2, ...]
    model_fallbacks: HashMap<String, Vec<String>>,
}

impl ReliableProvider {
    /// Create a wrapper around `providers`, which are tried in the order given.
    ///
    /// `max_retries` is the number of retries per (provider, model) pair and
    /// `base_backoff_ms` the starting backoff, raised to at least 50ms. No
    /// model fallbacks are configured initially.
    pub fn new(
        providers: Vec<(String, Box<dyn Provider>)>,
        max_retries: u32,
        base_backoff_ms: u64,
    ) -> Self {
        // Convert each `Box` into an `Arc` so connect-time streaming failover
        // can clone the handles into its spawned bridge task and re-invoke
        // their stream methods across the model/provider chain.
        let mut shared: Vec<(String, Arc<dyn Provider>)> = Vec::with_capacity(providers.len());
        for (name, boxed) in providers {
            shared.push((name, Arc::<dyn Provider>::from(boxed)));
        }

        Self {
            providers: shared,
            max_retries,
            base_backoff_ms: std::cmp::max(base_backoff_ms, 50),
            model_fallbacks: HashMap::new(),
        }
    }

    /// Builder-style setter: replace the per-model fallback chains and return
    /// the updated wrapper. Any previously configured chains are discarded.
    pub fn with_model_fallbacks(mut self, fallbacks: HashMap<String, Vec<String>>) -> Self {
        let previous = std::mem::replace(&mut self.model_fallbacks, fallbacks);
        drop(previous);
        self
    }

    /// List the models to try, in order: the requested `model` first, then
    /// any fallbacks configured for exactly that model name.
    fn model_chain<'a>(&'a self, model: &'a str) -> Vec<&'a str> {
        let configured_fallbacks = self
            .model_fallbacks
            .get(model)
            .into_iter()
            .flatten()
            .map(String::as_str);
        std::iter::once(model).chain(configured_fallbacks).collect()
    }

    /// Compute backoff duration, respecting Retry-After if present.
    ///
    /// Thin method wrapper that delegates to the free function `next_backoff`;
    /// it reads no state from `self`.
    fn compute_backoff(&self, base: u64, err: &anyhow::Error) -> u64 {
        next_backoff(base, err)
    }

    /// Collect the streaming-capable providers, in priority order, as cloned
    /// `Arc` handles the bridge task can re-invoke across the failover chain.
    ///
    /// The result is empty when `enabled` is false or no provider qualifies
    /// (the caller surfaces a "no provider supports streaming" error in that
    /// case). With `needs_tool_events` set, a provider must additionally be
    /// able to emit structured tool-call events to be eligible.
    fn streaming_candidates(
        &self,
        enabled: bool,
        needs_tool_events: bool,
    ) -> Vec<(String, Arc<dyn Provider>)> {
        let mut eligible = Vec::new();
        if !enabled {
            return eligible;
        }

        for (name, provider) in &self.providers {
            if !provider.supports_streaming() {
                continue;
            }
            if needs_tool_events && !provider.supports_streaming_tool_events() {
                continue;
            }
            eligible.push((name.clone(), Arc::clone(provider)));
        }
        eligible
    }

    /// Spawn a background bridge task that performs connect-time streaming
    /// failover, then return a stream fed by that task.
    ///
    /// For each `(model, provider)` candidate — the model fallback chain crossed
    /// with the streaming-capable providers — the task opens a fresh stream and
    /// waits for the first item:
    ///
    /// * **First item `Ok`** → commit to this stream and forward the remainder.
    ///   A mid-stream error after the first event is *not* recoverable (bytes are
    ///   already in flight), so it is logged and forwarded as-is.
    /// * **First item `Err` (pre-first-chunk)** → classify it. Non-retryable
    ///   errors advance to the next provider; retryable ones retry the same
    ///   provider with backoff (honoring `Retry-After`) up to `max_retries`.
    /// * **Empty stream** → treat as a failed attempt and advance. If *every*
    ///   candidate is empty and none produced a real error, the bridge closes
    ///   cleanly (no synthetic error) so an empty completion still reads as
    ///   success, matching the pre-failover behavior.
    ///
    /// This mirrors the non-streaming `chat` loop as closely as the streaming
    /// shape allows. `make_stream` is invoked once per attempt to obtain a fresh
    /// stream for the given provider/model.
    ///
    /// The task is cancellation-aware: every suspend point (upstream connect,
    /// backoff sleep, mid-stream forward) is raced against `tx.closed()`, so
    /// dropping the returned stream stops the sweep immediately instead of
    /// cascading live connections/backoffs against every fallback.
    ///
    /// Unlike the non-streaming loop, a successful streaming failover does **not**
    /// record `ProviderFallbackInfo` for user notification: the failover runs in a
    /// detached `tokio::spawn` and the `PROVIDER_FALLBACK` task-local does not
    /// propagate across the spawn boundary, so the parent's
    /// `take_last_provider_fallback` could never observe it. This matches the
    /// pre-existing streaming behavior (the old path recorded nothing either).
    ///
    /// # Parameters
    ///
    /// * `candidates` — providers to try, in order; when empty, the returned
    ///   stream yields a single `StreamError::Provider(no_provider_msg)`.
    /// * `model` — requested model; expanded through `model_chain`.
    /// * `no_provider_msg` — error text used only for the empty-candidates case.
    /// * `make_stream` — opens a fresh stream for one provider/model attempt.
    fn spawn_failover_stream<T, F>(
        &self,
        candidates: Vec<(String, Arc<dyn Provider>)>,
        model: &str,
        no_provider_msg: String,
        make_stream: F,
    ) -> stream::BoxStream<'static, StreamResult<T>>
    where
        T: Send + 'static,
        F: Fn(&Arc<dyn Provider>, &str) -> stream::BoxStream<'static, StreamResult<T>>
            + Send
            + 'static,
    {
        if candidates.is_empty() {
            return stream::once(async move {
                Err(super::traits::StreamError::Provider(no_provider_msg))
            })
            .boxed();
        }

        // Copy everything the task needs out of `self`: the spawned future
        // must be `'static` and therefore cannot borrow from the wrapper.
        let models: Vec<String> = self
            .model_chain(model)
            .into_iter()
            .map(str::to_string)
            .collect();
        let max_retries = self.max_retries;
        let base_backoff_ms = self.base_backoff_ms;
        // Bounded channel (capacity 100) bridging the task and the returned stream.
        let (tx, rx) = tokio::sync::mpsc::channel::<StreamResult<T>>(100);

        tokio::spawn(async move {
            // Accumulate every failed attempt for an aggregated diagnostic,
            // mirroring the non-streaming `chat` loop's `failures` trail.
            let mut failures: Vec<String> = Vec::new();
            // Distinguish "all candidates errored" (surface an error) from "all
            // candidates were empty" (close cleanly, preserving the old behavior
            // where an empty stream was a content-free success, not an error).
            let mut saw_connect_error = false;

            for current_model in &models {
                for (provider_name, provider) in &candidates {
                    // Backoff restarts from the base for each (provider, model) pair.
                    let mut backoff_ms = base_backoff_ms;

                    for attempt in 0..=max_retries {
                        // Stop before opening a new upstream connection if the
                        // consumer has already dropped the stream (cancellation).
                        if tx.is_closed() {
                            return;
                        }

                        let mut stream = make_stream(provider, current_model.as_str());

                        // Race the connect against receiver-drop so a cancelled
                        // request aborts the in-flight connect immediately.
                        let first = tokio::select! {
                            biased;
                            () = tx.closed() => return,
                            first = stream.next() => first,
                        };

                        match first {
                            // First event arrived: commit to this stream and
                            // forward the remainder. A mid-stream error after the
                            // first event is non-recoverable.
                            Some(Ok(first_event)) => {
                                if tx.send(Ok(first_event)).await.is_err() {
                                    return;
                                }
                                loop {
                                    let item = tokio::select! {
                                        biased;
                                        () = tx.closed() => return,
                                        item = stream.next() => item,
                                    };
                                    match item {
                                        Some(item) => {
                                            if let Err(ref e) = item {
                                                tracing::warn!(
                                                    provider = provider_name.as_str(),
                                                    model = current_model.as_str(),
                                                    "Streaming error after first event (non-recoverable): {e}"
                                                );
                                            }
                                            if tx.send(item).await.is_err() {
                                                return;
                                            }
                                        }
                                        // Upstream finished normally: the task is done.
                                        None => return,
                                    }
                                }
                            }
                            // Pre-first-chunk error: classify and either retry the
                            // same provider with backoff or fail over to the next.
                            Some(Err(e)) => {
                                // The classifiers take `anyhow::Error`, so re-wrap
                                // the stream error's display text.
                                let as_any = anyhow::anyhow!("{e}");
                                let non_retryable = is_non_retryable(&as_any)
                                    || is_non_retryable_rate_limit(&as_any);
                                let rate_limited = is_rate_limited(&as_any);
                                saw_connect_error = true;
                                failures.push(format!("{provider_name}/{current_model}: {e}"));

                                if non_retryable {
                                    tracing::warn!(
                                        provider = provider_name.as_str(),
                                        model = current_model.as_str(),
                                        "Non-retryable streaming connect error, moving on: {e}"
                                    );
                                    break;
                                }

                                // On the final attempt there is nothing left to
                                // wait for; the retry loop simply ends.
                                if attempt < max_retries {
                                    let wait = next_backoff(backoff_ms, &as_any);
                                    tracing::warn!(
                                        provider = provider_name.as_str(),
                                        model = current_model.as_str(),
                                        attempt = attempt + 1,
                                        backoff_ms = wait,
                                        rate_limited,
                                        "Streaming connect failed, retrying: {e}"
                                    );
                                    // Race the backoff against receiver-drop so a
                                    // cancelled request doesn't sleep then re-connect.
                                    tokio::select! {
                                        biased;
                                        () = tx.closed() => return,
                                        () = tokio::time::sleep(Duration::from_millis(wait)) => {}
                                    }
                                    // Double the base interval for the next attempt, capped at 10s.
                                    backoff_ms = (backoff_ms.saturating_mul(2)).min(10_000);
                                }
                            }
                            // Stream ended before producing any event: record the
                            // attempt and fail over to the next candidate.
                            None => {
                                failures.push(format!(
                                    "{provider_name}/{current_model}: stream produced no events"
                                ));
                                tracing::warn!(
                                    provider = provider_name.as_str(),
                                    model = current_model.as_str(),
                                    "Streaming provider produced no events, moving on"
                                );
                                break;
                            }
                        }
                    }

                    if tx.is_closed() {
                        return;
                    }
                }
            }

            // Reached only if no candidate ever produced a first event. Surface an
            // aggregated error if any attempt actually failed; if every attempt was
            // an empty stream (no real error), drop `tx` so the caller observes a
            // clean, content-free completion as it did before failover existed.
            if saw_connect_error {
                let message = format!(
                    "All streaming providers/models failed. Attempts:\n{}",
                    failures.join("\n")
                );
                let _ = tx
                    .send(Err(super::traits::StreamError::Provider(message)))
                    .await;
            }
        });

        // Adapt the receiver into a stream; it ends when `recv` returns `None`.
        stream::unfold(rx, |mut rx| async move {
            rx.recv().await.map(|item| (item, rx))
        })
        .boxed()
    }
}

#[async_trait]
impl Provider for ReliableProvider {
    async fn warmup(&self) -> anyhow::Result<()> {
        for (name, provider) in &self.providers {
            tracing::info!(provider = name, "Warming up provider connection pool");
            if provider.warmup().await.is_err() {
                tracing::warn!(provider = name, "Warmup failed (non-fatal)");
            }
        }
        Ok(())
    }

    /// Sends a single-turn prompt, walking the failover chain until one call succeeds.
    ///
    /// Candidates are tried model-by-model (`model_chain(model)`), then
    /// provider-by-provider in configured order, with up to `max_retries + 1`
    /// attempts per (provider, model) pair. After a retryable failure the call
    /// sleeps for the delay returned by `compute_backoff`; the base delay doubles
    /// after each retry and is capped at 10 000 ms. A non-retryable failure moves
    /// straight on to the next provider.
    ///
    /// When the successful call was not the first attempt on the first provider
    /// with the requested model, the outcome is recorded via
    /// `record_provider_fallback`.
    ///
    /// # Errors
    ///
    /// Fails immediately when a provider reports that the request exceeds the
    /// model's context window, and fails with an aggregated list of recorded
    /// attempts when every candidate has been tried without success.
    async fn chat_with_system(
        &self,
        system_prompt: Option<&str>,
        message: &str,
        model: &str,
        temperature: f64,
    ) -> anyhow::Result<String> {
        let models = self.model_chain(model);
        let mut failures = Vec::new();
        // The first configured provider is treated as the one originally requested.
        let primary = self.providers.first().map(|(name, _)| name.as_str());

        // Nesting: fallback models → providers in priority order → retry attempts.
        for current_model in &models {
            for (provider_name, provider) in &self.providers {
                let mut backoff_ms = self.base_backoff_ms;

                for attempt in 0..=self.max_retries {
                    let outcome = provider
                        .chat_with_system(system_prompt, message, current_model, temperature)
                        .await;

                    let err = match outcome {
                        Ok(resp) => {
                            let used_fallback = attempt > 0
                                || *current_model != model
                                || primary != Some(provider_name);
                            if used_fallback {
                                tracing::info!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt,
                                    original_model = model,
                                    "Provider recovered (failover/retry)"
                                );
                                record_provider_fallback(
                                    primary.unwrap_or(""),
                                    model,
                                    provider_name,
                                    current_model,
                                );
                            }
                            return Ok(resp);
                        }
                        Err(err) => err,
                    };

                    // There is no history to trim in a single-turn call, so an
                    // oversized request ends the whole operation.
                    if is_context_window_exceeded(&err) {
                        let detail = compact_error_detail(&err);
                        push_failure(
                            &mut failures,
                            provider_name,
                            current_model,
                            attempt + 1,
                            self.max_retries + 1,
                            "non_retryable",
                            &detail,
                        );
                        return Err(anyhow::anyhow!(
                            "Request exceeds model context window. Attempts:\n{}",
                            failures.join("\n")
                        ));
                    }

                    let hard_rate_limit = is_non_retryable_rate_limit(&err);
                    let non_retryable = is_non_retryable(&err) || hard_rate_limit;
                    let rate_limited = is_rate_limited(&err);
                    let reason = failure_reason(rate_limited, non_retryable);
                    let detail = compact_error_detail(&err);

                    push_failure(
                        &mut failures,
                        provider_name,
                        current_model,
                        attempt + 1,
                        self.max_retries + 1,
                        reason,
                        &detail,
                    );

                    if non_retryable {
                        tracing::warn!(
                            provider = provider_name,
                            model = *current_model,
                            error = %detail,
                            "Non-retryable error, moving on"
                        );
                        break;
                    }

                    // Last attempt for this pair: nothing left to wait for.
                    if attempt >= self.max_retries {
                        break;
                    }

                    let wait = self.compute_backoff(backoff_ms, &err);
                    tracing::warn!(
                        provider = provider_name,
                        model = *current_model,
                        attempt = attempt + 1,
                        backoff_ms = wait,
                        reason = reason,
                        error = %detail,
                        "Provider call failed, retrying"
                    );
                    tokio::time::sleep(Duration::from_millis(wait)).await;
                    backoff_ms = backoff_ms.saturating_mul(2).min(10_000);
                }

                tracing::warn!(
                    provider = provider_name,
                    model = *current_model,
                    "Exhausted retries, trying next provider/model"
                );
            }

            if *current_model != model {
                tracing::warn!(
                    original_model = model,
                    fallback_model = *current_model,
                    "Model fallback exhausted all providers, trying next fallback model"
                );
            }
        }

        Err(anyhow::anyhow!(
            "All providers/models failed. Attempts:\n{}",
            failures.join("\n")
        ))
    }

    /// Sends a multi-turn conversation, walking the failover chain until one call succeeds.
    ///
    /// Each model from `model_chain(model)` is tried against each provider in
    /// configured order, with up to `max_retries + 1` attempts per pair and a
    /// backoff sleep between retryable failures (base delay doubles, capped at
    /// 10 000 ms).
    ///
    /// The first context-window-exceeded error triggers a one-time history
    /// truncation via `truncate_for_context`; the truncated history is then used
    /// for every later attempt, including attempts against other providers/models.
    ///
    /// # Errors
    ///
    /// Fails immediately when the context window is exceeded and nothing could be
    /// truncated, and fails with an aggregated list of recorded attempts when every
    /// candidate has been tried without success.
    async fn chat_with_history(
        &self,
        messages: &[ChatMessage],
        model: &str,
        temperature: f64,
    ) -> anyhow::Result<String> {
        let models = self.model_chain(model);
        let mut failures = Vec::new();
        let mut effective_messages = messages.to_vec();
        let mut context_truncated = false;

        // Outer: model fallback chain. Middle: provider priority. Inner: retries.
        for current_model in &models {
            for (provider_name, provider) in &self.providers {
                let mut backoff_ms = self.base_backoff_ms;

                for attempt in 0..=self.max_retries {
                    match provider
                        .chat_with_history(&effective_messages, current_model, temperature)
                        .await
                    {
                        Ok(resp) => {
                            // Anything other than a first-try success on the primary
                            // provider with the requested, untruncated input is
                            // reported as a fallback.
                            if attempt > 0
                                || *current_model != model
                                || context_truncated
                                || self.providers.first().map(|(n, _)| n.as_str())
                                    != Some(provider_name)
                            {
                                tracing::info!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt,
                                    original_model = model,
                                    context_truncated,
                                    "Provider recovered (failover/retry)"
                                );
                                let primary = self
                                    .providers
                                    .first()
                                    .map(|(n, _)| n.as_str())
                                    .unwrap_or("");
                                record_provider_fallback(
                                    primary,
                                    model,
                                    provider_name,
                                    current_model,
                                );
                            }
                            return Ok(resp);
                        }
                        Err(e) => {
                            // Context window exceeded: truncate history and retry
                            if is_context_window_exceeded(&e) && !context_truncated {
                                let dropped = truncate_for_context(&mut effective_messages);
                                if dropped > 0 {
                                    context_truncated = true;
                                    tracing::warn!(
                                        provider = provider_name,
                                        model = *current_model,
                                        dropped,
                                        remaining = effective_messages.len(),
                                        "Context window exceeded; truncated history and retrying"
                                    );
                                    continue; // Retry with truncated messages (counts as an attempt)
                                }
                                // Nothing to truncate (system prompt alone exceeds
                                // the model's context window) — bail immediately
                                // instead of wasting retry attempts.
                                let error_detail = compact_error_detail(&e);
                                push_failure(
                                    &mut failures,
                                    provider_name,
                                    current_model,
                                    attempt + 1,
                                    self.max_retries + 1,
                                    "non_retryable",
                                    &error_detail,
                                );
                                anyhow::bail!(
                                    "Request exceeds model context window and cannot be reduced further. \
                                     Try using a model with a larger context window, reducing the number \
                                     of tools/skills, or enabling compact_context in config. Attempts:\n{}",
                                    failures.join("\n")
                                );
                            }

                            let non_retryable_rate_limit = is_non_retryable_rate_limit(&e);
                            let non_retryable = is_non_retryable(&e) || non_retryable_rate_limit;
                            let rate_limited = is_rate_limited(&e);
                            let failure_reason = failure_reason(rate_limited, non_retryable);
                            let error_detail = compact_error_detail(&e);

                            push_failure(
                                &mut failures,
                                provider_name,
                                current_model,
                                attempt + 1,
                                self.max_retries + 1,
                                failure_reason,
                                &error_detail,
                            );

                            if non_retryable {
                                tracing::warn!(
                                    provider = provider_name,
                                    model = *current_model,
                                    error = %error_detail,
                                    "Non-retryable error, moving on"
                                );
                                break;
                            }

                            // Only sleep when another attempt against this pair follows.
                            if attempt < self.max_retries {
                                let wait = self.compute_backoff(backoff_ms, &e);
                                tracing::warn!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt = attempt + 1,
                                    backoff_ms = wait,
                                    reason = failure_reason,
                                    error = %error_detail,
                                    "Provider call failed, retrying"
                                );
                                tokio::time::sleep(Duration::from_millis(wait)).await;
                                backoff_ms = (backoff_ms.saturating_mul(2)).min(10_000);
                            }
                        }
                    }
                }

                tracing::warn!(
                    provider = provider_name,
                    model = *current_model,
                    "Exhausted retries, trying next provider/model"
                );
            }

            // Same diagnostic the other chat entry points emit when a fallback
            // model has been tried against every provider without success.
            if *current_model != model {
                tracing::warn!(
                    original_model = model,
                    fallback_model = *current_model,
                    "Model fallback exhausted all providers, trying next fallback model"
                );
            }
        }

        anyhow::bail!(
            "All providers/models failed. Attempts:\n{}",
            failures.join("\n")
        )
    }

    /// Reports whether the first configured provider supports native tools.
    ///
    /// Returns `false` when no providers are configured.
    fn supports_native_tools(&self) -> bool {
        self.providers
            .first()
            .is_some_and(|(_, primary)| primary.supports_native_tools())
    }

    /// Reports whether at least one configured provider supports vision input.
    fn supports_vision(&self) -> bool {
        self.providers
            .iter()
            .any(|entry| entry.1.supports_vision())
    }

    /// Sends a conversation together with tool definitions, walking the failover
    /// chain until one call succeeds.
    ///
    /// Each model from `model_chain(model)` is tried against each provider in
    /// configured order, with up to `max_retries + 1` attempts per pair and a
    /// backoff sleep between retryable failures (base delay doubles, capped at
    /// 10 000 ms). `tools` is forwarded unchanged to every attempt.
    ///
    /// The first context-window-exceeded error triggers a one-time history
    /// truncation via `truncate_for_context`; the truncated history is then used
    /// for every later attempt, including attempts against other providers/models.
    ///
    /// # Errors
    ///
    /// Fails immediately when the context window is exceeded and nothing could be
    /// truncated, and fails with an aggregated list of recorded attempts when every
    /// candidate has been tried without success.
    async fn chat_with_tools(
        &self,
        messages: &[ChatMessage],
        tools: &[serde_json::Value],
        model: &str,
        temperature: f64,
    ) -> anyhow::Result<ChatResponse> {
        let models = self.model_chain(model);
        let mut failures = Vec::new();
        let mut effective_messages = messages.to_vec();
        let mut context_truncated = false;

        // Outer: model fallback chain. Middle: provider priority. Inner: retries.
        for current_model in &models {
            for (provider_name, provider) in &self.providers {
                let mut backoff_ms = self.base_backoff_ms;

                for attempt in 0..=self.max_retries {
                    match provider
                        .chat_with_tools(&effective_messages, tools, current_model, temperature)
                        .await
                    {
                        Ok(resp) => {
                            // Anything other than a first-try success on the primary
                            // provider with the requested, untruncated input is
                            // reported as a fallback.
                            if attempt > 0
                                || *current_model != model
                                || context_truncated
                                || self.providers.first().map(|(n, _)| n.as_str())
                                    != Some(provider_name)
                            {
                                tracing::info!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt,
                                    original_model = model,
                                    context_truncated,
                                    "Provider recovered (failover/retry)"
                                );
                                let primary = self
                                    .providers
                                    .first()
                                    .map(|(n, _)| n.as_str())
                                    .unwrap_or("");
                                record_provider_fallback(
                                    primary,
                                    model,
                                    provider_name,
                                    current_model,
                                );
                            }
                            return Ok(resp);
                        }
                        Err(e) => {
                            // Context window exceeded: truncate history and retry
                            if is_context_window_exceeded(&e) && !context_truncated {
                                let dropped = truncate_for_context(&mut effective_messages);
                                if dropped > 0 {
                                    context_truncated = true;
                                    tracing::warn!(
                                        provider = provider_name,
                                        model = *current_model,
                                        dropped,
                                        remaining = effective_messages.len(),
                                        "Context window exceeded; truncated history and retrying"
                                    );
                                    continue; // Retry with truncated messages (counts as an attempt)
                                }
                                // Nothing to truncate (system prompt alone exceeds
                                // the model's context window) — bail immediately
                                // instead of wasting retry attempts.
                                let error_detail = compact_error_detail(&e);
                                push_failure(
                                    &mut failures,
                                    provider_name,
                                    current_model,
                                    attempt + 1,
                                    self.max_retries + 1,
                                    "non_retryable",
                                    &error_detail,
                                );
                                anyhow::bail!(
                                    "Request exceeds model context window and cannot be reduced further. \
                                     Try using a model with a larger context window, reducing the number \
                                     of tools/skills, or enabling compact_context in config. Attempts:\n{}",
                                    failures.join("\n")
                                );
                            }

                            let non_retryable_rate_limit = is_non_retryable_rate_limit(&e);
                            let non_retryable = is_non_retryable(&e) || non_retryable_rate_limit;
                            let rate_limited = is_rate_limited(&e);
                            let failure_reason = failure_reason(rate_limited, non_retryable);
                            let error_detail = compact_error_detail(&e);

                            push_failure(
                                &mut failures,
                                provider_name,
                                current_model,
                                attempt + 1,
                                self.max_retries + 1,
                                failure_reason,
                                &error_detail,
                            );

                            if non_retryable {
                                tracing::warn!(
                                    provider = provider_name,
                                    model = *current_model,
                                    error = %error_detail,
                                    "Non-retryable error, moving on"
                                );
                                break;
                            }

                            // Only sleep when another attempt against this pair follows.
                            if attempt < self.max_retries {
                                let wait = self.compute_backoff(backoff_ms, &e);
                                tracing::warn!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt = attempt + 1,
                                    backoff_ms = wait,
                                    reason = failure_reason,
                                    error = %error_detail,
                                    "Provider call failed, retrying"
                                );
                                tokio::time::sleep(Duration::from_millis(wait)).await;
                                backoff_ms = (backoff_ms.saturating_mul(2)).min(10_000);
                            }
                        }
                    }
                }

                tracing::warn!(
                    provider = provider_name,
                    model = *current_model,
                    "Exhausted retries, trying next provider/model"
                );
            }

            // Same diagnostic the other chat entry points emit when a fallback
            // model has been tried against every provider without success.
            if *current_model != model {
                tracing::warn!(
                    original_model = model,
                    fallback_model = *current_model,
                    "Model fallback exhausted all providers, trying next fallback model"
                );
            }
        }

        anyhow::bail!(
            "All providers/models failed. Attempts:\n{}",
            failures.join("\n")
        )
    }

    /// Sends a structured chat request, walking the failover chain until one call succeeds.
    ///
    /// Each model from `model_chain(model)` is tried against each provider in
    /// configured order, with up to `max_retries + 1` attempts per pair. A fresh
    /// `ChatRequest` is built for every attempt from the current (possibly
    /// truncated) message list and the caller's `tools`.
    ///
    /// The first context-window-exceeded error triggers a one-time history
    /// truncation via `truncate_for_context`; that retry consumes an attempt, and
    /// the truncated history is kept for all later attempts.
    ///
    /// # Errors
    ///
    /// Fails immediately when the context window is exceeded and nothing could be
    /// truncated, and fails with an aggregated list of recorded attempts when every
    /// candidate has been tried without success.
    async fn chat(
        &self,
        request: ChatRequest<'_>,
        model: &str,
        temperature: f64,
    ) -> anyhow::Result<ChatResponse> {
        let models = self.model_chain(model);
        let mut failures = Vec::new();
        let mut effective_messages = request.messages.to_vec();
        let mut context_truncated = false;
        // The first configured provider is treated as the one originally requested.
        let primary = self.providers.first().map(|(name, _)| name.as_str());

        // Nesting: fallback models → providers in priority order → retry attempts.
        for current_model in &models {
            for (provider_name, provider) in &self.providers {
                let mut backoff_ms = self.base_backoff_ms;

                for attempt in 0..=self.max_retries {
                    let attempt_request = ChatRequest {
                        messages: &effective_messages,
                        tools: request.tools,
                    };
                    let outcome = provider
                        .chat(attempt_request, current_model, temperature)
                        .await;

                    let err = match outcome {
                        Ok(resp) => {
                            let used_fallback = attempt > 0
                                || *current_model != model
                                || context_truncated
                                || primary != Some(provider_name);
                            if used_fallback {
                                tracing::info!(
                                    provider = provider_name,
                                    model = *current_model,
                                    attempt,
                                    original_model = model,
                                    context_truncated,
                                    "Provider recovered (failover/retry)"
                                );
                                record_provider_fallback(
                                    primary.unwrap_or(""),
                                    model,
                                    provider_name,
                                    current_model,
                                );
                            }
                            return Ok(resp);
                        }
                        Err(err) => err,
                    };

                    // First oversized-request error: shrink the history once and retry.
                    if is_context_window_exceeded(&err) && !context_truncated {
                        let dropped = truncate_for_context(&mut effective_messages);
                        if dropped > 0 {
                            context_truncated = true;
                            tracing::warn!(
                                provider = provider_name,
                                model = *current_model,
                                dropped,
                                remaining = effective_messages.len(),
                                "Context window exceeded; truncated history and retrying"
                            );
                            // The retry with the shorter history uses up this attempt slot.
                            continue;
                        }

                        // Nothing could be dropped, so further retries cannot help.
                        let detail = compact_error_detail(&err);
                        push_failure(
                            &mut failures,
                            provider_name,
                            current_model,
                            attempt + 1,
                            self.max_retries + 1,
                            "non_retryable",
                            &detail,
                        );
                        return Err(anyhow::anyhow!(
                            "Request exceeds model context window and cannot be reduced further. \
                             Try using a model with a larger context window, reducing the number \
                             of tools/skills, or enabling compact_context in config. Attempts:\n{}",
                            failures.join("\n")
                        ));
                    }

                    let hard_rate_limit = is_non_retryable_rate_limit(&err);
                    let non_retryable = is_non_retryable(&err) || hard_rate_limit;
                    let rate_limited = is_rate_limited(&err);
                    let reason = failure_reason(rate_limited, non_retryable);
                    let detail = compact_error_detail(&err);

                    push_failure(
                        &mut failures,
                        provider_name,
                        current_model,
                        attempt + 1,
                        self.max_retries + 1,
                        reason,
                        &detail,
                    );

                    if non_retryable {
                        tracing::warn!(
                            provider = provider_name,
                            model = *current_model,
                            error = %detail,
                            "Non-retryable error, moving on"
                        );
                        break;
                    }

                    // Last attempt for this pair: nothing left to wait for.
                    if attempt >= self.max_retries {
                        break;
                    }

                    let wait = self.compute_backoff(backoff_ms, &err);
                    tracing::warn!(
                        provider = provider_name,
                        model = *current_model,
                        attempt = attempt + 1,
                        backoff_ms = wait,
                        reason = reason,
                        error = %detail,
                        "Provider call failed, retrying"
                    );
                    tokio::time::sleep(Duration::from_millis(wait)).await;
                    backoff_ms = backoff_ms.saturating_mul(2).min(10_000);
                }

                tracing::warn!(
                    provider = provider_name,
                    model = *current_model,
                    "Exhausted retries, trying next provider/model"
                );
            }

            if *current_model != model {
                tracing::warn!(
                    original_model = model,
                    fallback_model = *current_model,
                    "Model fallback exhausted all providers, trying next fallback model"
                );
            }
        }

        Err(anyhow::anyhow!(
            "All providers/models failed. Attempts:\n{}",
            failures.join("\n")
        ))
    }

    /// Reports whether at least one configured provider supports streaming.
    fn supports_streaming(&self) -> bool {
        self.providers
            .iter()
            .any(|entry| entry.1.supports_streaming())
    }

    /// Reports whether at least one configured provider supports streaming tool events.
    fn supports_streaming_tool_events(&self) -> bool {
        self.providers
            .iter()
            .any(|entry| entry.1.supports_streaming_tool_events())
    }

    /// Streams a structured chat request through the streaming failover chain.
    ///
    /// Candidates come from `streaming_candidates`; tool-event support is required
    /// only when the request carries a non-empty tool list. The request payload is
    /// copied into owned values and moved into the closure handed to
    /// `spawn_failover_stream`, which calls it with a provider and model to open a
    /// stream.
    fn stream_chat(
        &self,
        request: ChatRequest<'_>,
        model: &str,
        temperature: f64,
        options: StreamOptions,
    ) -> stream::BoxStream<'static, StreamResult<StreamEvent>> {
        let needs_tool_events = matches!(request.tools, Some(tools) if !tools.is_empty());

        let candidates = self.streaming_candidates(options.enabled, needs_tool_events);
        let no_provider_msg = String::from(if needs_tool_events {
            "No provider supports streaming tool events"
        } else {
            "No provider supports streaming"
        });

        // The returned stream is `'static`, so the closure must own everything it
        // needs to rebuild the request for each provider/model it is called with.
        let owned_messages = request.messages.to_vec();
        let owned_tools = request.tools.map(|specs| specs.to_vec());

        self.spawn_failover_stream(
            candidates,
            model,
            no_provider_msg,
            move |candidate, candidate_model| {
                let attempt_request = ChatRequest {
                    messages: &owned_messages,
                    tools: owned_tools.as_deref(),
                };
                candidate.stream_chat(attempt_request, candidate_model, temperature, options)
            },
        )
    }

    /// Streams a single-turn prompt through the streaming failover chain.
    ///
    /// Candidates come from `streaming_candidates` without requiring tool-event
    /// support. The prompt strings are copied into owned values and moved into the
    /// closure handed to `spawn_failover_stream`, which calls it with a provider
    /// and model to open a stream.
    fn stream_chat_with_system(
        &self,
        system_prompt: Option<&str>,
        message: &str,
        model: &str,
        temperature: f64,
        options: StreamOptions,
    ) -> stream::BoxStream<'static, StreamResult<StreamChunk>> {
        let candidates = self.streaming_candidates(options.enabled, false);

        // The returned stream is `'static`, so the closure must own the prompt text
        // it passes to each provider/model it is called with.
        let owned_system = system_prompt.map(str::to_owned);
        let owned_message = String::from(message);

        self.spawn_failover_stream(
            candidates,
            model,
            String::from("No provider supports streaming"),
            move |candidate, candidate_model| {
                candidate.stream_chat_with_system(
                    owned_system.as_deref(),
                    &owned_message,
                    candidate_model,
                    temperature,
                    options,
                )
            },
        )
    }

    /// Streams a multi-turn conversation through the streaming failover chain.
    ///
    /// Candidates come from `streaming_candidates` without requiring tool-event
    /// support. The conversation is copied into an owned vector and moved into the
    /// closure handed to `spawn_failover_stream`, which calls it with a provider
    /// and model to open a stream.
    fn stream_chat_with_history(
        &self,
        messages: &[ChatMessage],
        model: &str,
        temperature: f64,
        options: StreamOptions,
    ) -> stream::BoxStream<'static, StreamResult<StreamChunk>> {
        let candidates = self.streaming_candidates(options.enabled, false);

        // The returned stream is `'static`, so the closure must own the history it
        // passes to each provider/model it is called with.
        let owned_history = messages.to_vec();

        self.spawn_failover_stream(
            candidates,
            model,
            String::from("No provider supports streaming"),
            move |candidate, candidate_model| {
                candidate.stream_chat_with_history(
                    &owned_history,
                    candidate_model,
                    temperature,
                    options,
                )
            },
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::providers::traits::StreamError;
    use crate::tools::ToolSpec;
    use futures_util::StreamExt;
    use std::sync::Arc;

    /// Test double that fails with `error` while the call counter is at or below
    /// `fail_until_attempt`, and returns `response` afterwards.
    struct MockProvider {
        /// Shared call counter, incremented on every `chat_with_system` /
        /// `chat_with_history` call.
        calls: Arc<AtomicUsize>,
        /// Highest 1-based call number (as counted by `calls`) that still fails.
        fail_until_attempt: usize,
        /// Text returned by calls that succeed.
        response: &'static str,
        /// Message of the error returned by calls that fail.
        error: &'static str,
    }

    #[async_trait]
    impl Provider for MockProvider {
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            let attempt = self.calls.fetch_add(1, Ordering::SeqCst) + 1;
            if attempt <= self.fail_until_attempt {
                anyhow::bail!(self.error);
            }
            Ok(self.response.to_string())
        }

        async fn chat_with_history(
            &self,
            _messages: &[ChatMessage],
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            let attempt = self.calls.fetch_add(1, Ordering::SeqCst) + 1;
            if attempt <= self.fail_until_attempt {
                anyhow::bail!(self.error);
            }
            Ok(self.response.to_string())
        }
    }

    /// Mock that records which model was used for each call.
    ///
    /// Calls for a model listed in `fail_models` fail with a `500 model … unavailable`
    /// error; every other model gets `response`.
    struct ModelAwareMock {
        /// Shared counter incremented on every call.
        calls: Arc<AtomicUsize>,
        /// Model names in the order they were requested.
        models_seen: parking_lot::Mutex<Vec<String>>,
        /// Models for which the mock returns an error.
        fail_models: Vec<&'static str>,
        /// Text returned for models that are not in `fail_models`.
        response: &'static str,
    }

    #[async_trait]
    impl Provider for ModelAwareMock {
        /// Records the call and the requested model, then fails when that model
        /// is listed in `fail_models` and otherwise returns `response`.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            self.calls.fetch_add(1, Ordering::SeqCst);
            self.models_seen.lock().push(model.to_owned());

            let model_is_down = self.fail_models.iter().any(|down| *down == model);
            if model_is_down {
                return Err(anyhow::anyhow!("500 model {model} unavailable"));
            }
            Ok(self.response.to_owned())
        }
    }

    // ── Existing tests (preserved) ──

    /// A provider that succeeds on its first call is invoked exactly once.
    #[tokio::test]
    async fn succeeds_without_retry() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let healthy = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: 0,
            response: "ok",
            error: "boom",
        };
        let reliable = ReliableProvider::new(vec![("primary".into(), Box::new(healthy))], 2, 1);

        let reply = reliable.simple_chat("hello", "test", 0.0).await.unwrap();

        assert_eq!(reply, "ok");
        assert_eq!(call_count.load(Ordering::SeqCst), 1);
    }

    /// A provider that fails once and then succeeds is called twice, and the
    /// successful response is what the caller receives.
    #[tokio::test]
    async fn retries_then_recovers() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let flaky = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: 1,
            response: "recovered",
            error: "temporary",
        };
        let reliable = ReliableProvider::new(vec![("primary".into(), Box::new(flaky))], 2, 1);

        let reply = reliable.simple_chat("hello", "test", 0.0).await.unwrap();

        assert_eq!(reply, "recovered");
        assert_eq!(call_count.load(Ordering::SeqCst), 2);
    }

    /// With a retry count of 1, a permanently failing primary is called twice
    /// before the fallback provider answers on its first call.
    #[tokio::test]
    async fn falls_back_after_retries_exhausted() {
        let primary_hits = Arc::new(AtomicUsize::new(0));
        let fallback_hits = Arc::new(AtomicUsize::new(0));

        let broken_primary = MockProvider {
            calls: Arc::clone(&primary_hits),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: "primary down",
        };
        let healthy_fallback = MockProvider {
            calls: Arc::clone(&fallback_hits),
            fail_until_attempt: 0,
            response: "from fallback",
            error: "fallback down",
        };

        let reliable = ReliableProvider::new(
            vec![
                ("primary".into(), Box::new(broken_primary)),
                ("fallback".into(), Box::new(healthy_fallback)),
            ],
            1,
            1,
        );

        let reply = reliable.simple_chat("hello", "test", 0.0).await.unwrap();

        assert_eq!(reply, "from fallback");
        assert_eq!(primary_hits.load(Ordering::SeqCst), 2);
        assert_eq!(fallback_hits.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn returns_aggregated_error_when_all_providers_fail() {
        let provider = ReliableProvider::new(
            vec![
                (
                    "p1".into(),
                    Box::new(MockProvider {
                        calls: Arc::new(AtomicUsize::new(0)),
                        fail_until_attempt: usize::MAX,
                        response: "never",
                        error: "p1 error",
                    }),
                ),
                (
                    "p2".into(),
                    Box::new(MockProvider {
                        calls: Arc::new(AtomicUsize::new(0)),
                        fail_until_attempt: usize::MAX,
                        response: "never",
                        error: "p2 error",
                    }),
                ),
            ],
            0,
            1,
        );

        let err = provider
            .simple_chat("hello", "test", 0.0)
            .await
            .expect_err("all providers should fail");
        let msg = err.to_string();
        assert!(msg.contains("All providers/models failed"));
        assert!(msg.contains("provider=p1 model=test"));
        assert!(msg.contains("provider=p2 model=test"));
        assert!(msg.contains("error=p1 error"));
        assert!(msg.contains("error=p2 error"));
        assert!(msg.contains("retryable"));
    }

    /// Bad-request, auth and unknown/unsupported-model errors are classified as
    /// non-retryable; throttling, timeouts, server errors and context-window
    /// overflows are not.
    #[test]
    fn non_retryable_detects_common_patterns() {
        let permanent = [
            "400 Bad Request",
            "401 Unauthorized",
            "403 Forbidden",
            "404 Not Found",
            "invalid api key provided",
            "authentication failed",
            "model glm-4.7 not found",
            "unsupported model: glm-4.7",
        ];
        for message in permanent {
            assert!(
                is_non_retryable(&anyhow::anyhow!(message)),
                "expected non-retryable: {message}"
            );
        }

        let transient = [
            "429 Too Many Requests",
            "408 Request Timeout",
            "500 Internal Server Error",
            "502 Bad Gateway",
            "timeout",
            "connection reset",
            "model overloaded, try again later",
            // Context window errors are now recoverable (not non-retryable)
            "OpenAI Codex stream error: Your input exceeds the context window of this model.",
        ];
        for message in transient {
            assert!(
                !is_non_retryable(&anyhow::anyhow!(message)),
                "expected retryable: {message}"
            );
        }
    }

    #[tokio::test]
    async fn context_window_error_aborts_retries_and_model_fallbacks() {
        let calls = Arc::new(AtomicUsize::new(0));
        let mut model_fallbacks = std::collections::HashMap::new();
        model_fallbacks.insert(
            "gpt-5.3-codex".to_string(),
            vec!["gpt-5.2-codex".to_string()],
        );

        let provider = ReliableProvider::new(
            vec![(
                "openai-codex".into(),
                Box::new(MockProvider {
                    calls: Arc::clone(&calls),
                    fail_until_attempt: usize::MAX,
                    response: "never",
                    error: "OpenAI Codex stream error: Your input exceeds the context window of this model. Please adjust your input and try again.",
                }),
            )],
            4,
            1,
        )
        .with_model_fallbacks(model_fallbacks);

        let err = provider
            .simple_chat("hello", "gpt-5.3-codex", 0.0)
            .await
            .expect_err("context window overflow should fail fast");
        let msg = err.to_string();

        assert!(msg.contains("context window"));
        // chat_with_system has no history to truncate, so it bails immediately
        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }

    /// An "unsupported model" failure is reported as `non_retryable` with its
    /// original text, and the provider is called only once.
    #[tokio::test]
    async fn aggregated_error_marks_non_retryable_model_mismatch_with_details() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let rejecting = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: "unsupported model: glm-4.7",
        };
        let reliable = ReliableProvider::new(vec![("custom".into(), Box::new(rejecting))], 3, 1);

        let failure = reliable
            .simple_chat("hello", "glm-4.7", 0.0)
            .await
            .expect_err("provider should fail");
        let rendered = failure.to_string();

        assert!(rendered.contains("non_retryable"));
        assert!(rendered.contains("error=unsupported model: glm-4.7"));
        // Non-retryable errors should not consume retry budget.
        assert_eq!(call_count.load(Ordering::SeqCst), 1);
    }

    /// A `401` from the primary moves straight on to the fallback provider
    /// without using any of the three configured retries.
    #[tokio::test]
    async fn skips_retries_on_non_retryable_error() {
        let primary_hits = Arc::new(AtomicUsize::new(0));
        let fallback_hits = Arc::new(AtomicUsize::new(0));

        let unauthorized_primary = MockProvider {
            calls: Arc::clone(&primary_hits),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: "401 Unauthorized",
        };
        let healthy_fallback = MockProvider {
            calls: Arc::clone(&fallback_hits),
            fail_until_attempt: 0,
            response: "from fallback",
            error: "fallback err",
        };

        let reliable = ReliableProvider::new(
            vec![
                ("primary".into(), Box::new(unauthorized_primary)),
                ("fallback".into(), Box::new(healthy_fallback)),
            ],
            3,
            1,
        );

        let reply = reliable.simple_chat("hello", "test", 0.0).await.unwrap();

        assert_eq!(reply, "from fallback");
        // Primary should have been called only once (no retries)
        assert_eq!(primary_hits.load(Ordering::SeqCst), 1);
        assert_eq!(fallback_hits.load(Ordering::SeqCst), 1);
    }

    /// `chat_with_history` retries a provider that fails once and returns the
    /// response from the second call.
    #[tokio::test]
    async fn chat_with_history_retries_then_recovers() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let flaky = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: 1,
            response: "history ok",
            error: "temporary",
        };
        let reliable = ReliableProvider::new(vec![("primary".into(), Box::new(flaky))], 2, 1);

        let history = vec![ChatMessage::system("system"), ChatMessage::user("hello")];
        let reply = reliable
            .chat_with_history(&history, "test", 0.0)
            .await
            .unwrap();

        assert_eq!(reply, "history ok");
        assert_eq!(call_count.load(Ordering::SeqCst), 2);
    }

    /// `chat_with_history` calls a permanently failing primary twice (one
    /// retry) and then gets its answer from the fallback provider.
    #[tokio::test]
    async fn chat_with_history_falls_back() {
        let primary_hits = Arc::new(AtomicUsize::new(0));
        let fallback_hits = Arc::new(AtomicUsize::new(0));

        let broken_primary = MockProvider {
            calls: Arc::clone(&primary_hits),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: "primary down",
        };
        let healthy_fallback = MockProvider {
            calls: Arc::clone(&fallback_hits),
            fail_until_attempt: 0,
            response: "fallback ok",
            error: "fallback err",
        };

        let reliable = ReliableProvider::new(
            vec![
                ("primary".into(), Box::new(broken_primary)),
                ("fallback".into(), Box::new(healthy_fallback)),
            ],
            1,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let reply = reliable
            .chat_with_history(&history, "test", 0.0)
            .await
            .unwrap();

        assert_eq!(reply, "fallback ok");
        assert_eq!(primary_hits.load(Ordering::SeqCst), 2);
        assert_eq!(fallback_hits.load(Ordering::SeqCst), 1);
    }

    // ── New tests: model failover ──

    /// When the requested model fails, the configured fallback model is tried
    /// next on the same provider and its response is returned.
    #[tokio::test]
    async fn model_failover_tries_fallback_model() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let recorder = Arc::new(ModelAwareMock {
            calls: Arc::clone(&call_count),
            models_seen: parking_lot::Mutex::new(Vec::new()),
            fail_models: vec!["claude-opus"],
            response: "ok from sonnet",
        });

        let model_chain = HashMap::from([(
            "claude-opus".to_string(),
            vec!["claude-sonnet".to_string()],
        )]);

        let reliable = ReliableProvider::new(
            vec![(
                "anthropic".into(),
                Box::new(Arc::clone(&recorder)) as Box<dyn Provider>,
            )],
            0, // no retries — force immediate model failover
            1,
        )
        .with_model_fallbacks(model_chain);

        let reply = reliable
            .simple_chat("hello", "claude-opus", 0.0)
            .await
            .unwrap();
        assert_eq!(reply, "ok from sonnet");

        let requested = recorder.models_seen.lock();
        assert_eq!(requested.len(), 2);
        assert_eq!(requested[0], "claude-opus");
        assert_eq!(requested[1], "claude-sonnet");
    }

    /// If the requested model and both of its fallbacks fail, the aggregated
    /// error is returned after each of the three models was tried once.
    #[tokio::test]
    async fn model_failover_all_models_fail() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let recorder = Arc::new(ModelAwareMock {
            calls: Arc::clone(&call_count),
            models_seen: parking_lot::Mutex::new(Vec::new()),
            fail_models: vec!["model-a", "model-b", "model-c"],
            response: "never",
        });

        let model_chain = HashMap::from([(
            "model-a".to_string(),
            vec!["model-b".to_string(), "model-c".to_string()],
        )]);

        let reliable = ReliableProvider::new(
            vec![(
                "p1".into(),
                Box::new(Arc::clone(&recorder)) as Box<dyn Provider>,
            )],
            0,
            1,
        )
        .with_model_fallbacks(model_chain);

        let failure = reliable
            .simple_chat("hello", "model-a", 0.0)
            .await
            .expect_err("all models should fail");
        assert!(failure.to_string().contains("All providers/models failed"));

        let requested = recorder.models_seen.lock();
        assert_eq!(requested.len(), 3);
    }

    /// Without `with_model_fallbacks`, a healthy provider still answers on its
    /// first call.
    #[tokio::test]
    async fn no_model_fallbacks_behaves_like_before() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let healthy = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: 0,
            response: "ok",
            error: "boom",
        };
        // No model_fallbacks set — should work exactly as before
        let reliable = ReliableProvider::new(vec![("primary".into(), Box::new(healthy))], 2, 1);

        let reply = reliable.simple_chat("hello", "test", 0.0).await.unwrap();

        assert_eq!(reply, "ok");
        assert_eq!(call_count.load(Ordering::SeqCst), 1);
    }

    // ── New tests: Retry-After parsing ──

    /// An integer `Retry-After: 5` is read as 5 000 ms.
    #[test]
    fn parse_retry_after_integer() {
        let throttled = anyhow::anyhow!("429 Too Many Requests, Retry-After: 5");
        let delay_ms = parse_retry_after_ms(&throttled);
        assert_eq!(delay_ms, Some(5_000));
    }

    /// A fractional `retry_after: 2.5` is read as 2 500 ms.
    #[test]
    fn parse_retry_after_float() {
        let throttled = anyhow::anyhow!("Rate limited. retry_after: 2.5 seconds");
        let delay_ms = parse_retry_after_ms(&throttled);
        assert_eq!(delay_ms, Some(2_500));
    }

    /// An error without any retry hint yields `None`.
    #[test]
    fn parse_retry_after_missing() {
        let server_error = anyhow::anyhow!("500 Internal Server Error");
        let delay_ms = parse_retry_after_ms(&server_error);
        assert_eq!(delay_ms, None);
    }

    /// Messages carrying a `429` status count as rate-limited; `401` and `500`
    /// messages do not.
    #[test]
    fn rate_limited_detection() {
        for message in ["429 Too Many Requests", "HTTP 429 rate limit exceeded"] {
            assert!(
                is_rate_limited(&anyhow::anyhow!(message)),
                "expected rate-limited: {message}"
            );
        }
        for message in ["401 Unauthorized", "500 Internal Server Error"] {
            assert!(
                !is_rate_limited(&anyhow::anyhow!(message)),
                "expected not rate-limited: {message}"
            );
        }
    }

    /// A `429` whose JSON body carries business code `1311` (model not in the
    /// account plan) is a non-retryable rate limit.
    #[test]
    fn non_retryable_rate_limit_detects_plan_restricted_model() {
        let body = r#"API error (429 Too Many Requests): {"code":1311,"message":"the current account plan does not include glm-5"}"#;
        let err = anyhow::anyhow!("{}", body);
        let skip_retries = is_non_retryable_rate_limit(&err);
        assert!(skip_retries, "plan-restricted 429 should skip retries");
    }

    /// A `429` whose JSON body carries business code `1113` (insufficient
    /// balance) is a non-retryable rate limit.
    #[test]
    fn non_retryable_rate_limit_detects_insufficient_balance() {
        let body =
            r#"API error (429 Too Many Requests): {"code":1113,"message":"insufficient balance"}"#;
        let err = anyhow::anyhow!("{}", body);
        let skip_retries = is_non_retryable_rate_limit(&err);
        assert!(skip_retries, "insufficient-balance 429 should skip retries");
    }

    /// A plain `429` with no business code stays retryable.
    #[test]
    fn non_retryable_rate_limit_does_not_flag_generic_429() {
        let generic = anyhow::anyhow!("429 Too Many Requests: rate limit exceeded");
        let skip_retries = is_non_retryable_rate_limit(&generic);
        assert!(
            !skip_retries,
            "generic rate-limit 429 should remain retryable"
        );
    }

    /// The digits `1311`/`1113` appearing outside the JSON `"code"` field do
    /// not turn a transient `429` into a non-retryable one.
    #[test]
    fn non_retryable_rate_limit_ignores_business_code_outside_code_field() {
        // A transient 429 whose body merely *contains* the digits 1311/1113 in
        // an unrelated field (request id, count, model fragment) must NOT be
        // misclassified as non-retryable — only the JSON `"code"` field counts.
        let body = r#"API error (429 Too Many Requests): {"request_id":"req-1311-abc","retries":1113,"message":"rate limit exceeded, slow down"}"#;
        let err = anyhow::anyhow!("{}", body);
        let skip_retries = is_non_retryable_rate_limit(&err);
        assert!(
            !skip_retries,
            "an unrelated 1311/1113 number must not flag a transient 429 as non-retryable"
        );
    }

    /// Whitespace around the colon of the `"code"` field does not prevent the
    /// business code from being recognised.
    #[test]
    fn non_retryable_rate_limit_detects_business_code_with_spacing() {
        // The `"code"` anchor tolerates whitespace around the colon.
        let body = r#"API error (429 Too Many Requests): { "code" : 1113, "message":"insufficient balance" }"#;
        let err = anyhow::anyhow!("{}", body);
        let skip_retries = is_non_retryable_rate_limit(&err);
        assert!(
            skip_retries,
            "spaced `\"code\": 1113` should still skip retries"
        );
    }

    /// A `Retry-After: 3` hint overrides the 500 ms base and yields 3 000 ms.
    #[test]
    fn compute_backoff_uses_retry_after() {
        let reliable = ReliableProvider::new(Vec::new(), 0, 500);
        let throttled = anyhow::anyhow!("429 Retry-After: 3");
        let wait_ms = reliable.compute_backoff(500, &throttled);
        assert_eq!(wait_ms, 3_000);
    }

    /// A `Retry-After: 120` hint is limited to 30 000 ms.
    #[test]
    fn compute_backoff_caps_at_30s() {
        let reliable = ReliableProvider::new(Vec::new(), 0, 500);
        let throttled = anyhow::anyhow!("429 Retry-After: 120");
        let wait_ms = reliable.compute_backoff(500, &throttled);
        assert_eq!(wait_ms, 30_000);
    }

    /// With no `Retry-After` hint, 1 000 sampled backoffs for a 500 ms base all
    /// fall inside `[250, 500]`.
    #[test]
    fn compute_backoff_jitters_when_no_retry_after() {
        // Without a Retry-After, the wait is jittered into [base/2, base] so
        // concurrent retriers desynchronize (thundering-herd avoidance, #428).
        let reliable = ReliableProvider::new(Vec::new(), 0, 500);
        let server_error = anyhow::anyhow!("500 Server Error");

        let samples = std::iter::repeat_with(|| reliable.compute_backoff(500, &server_error));
        for wait in samples.take(1_000) {
            assert!(
                (250..=500).contains(&wait),
                "jittered backoff {wait} out of [base/2, base]"
            );
        }
    }

    /// For several bases (including 0 and odd values), 1 000 samples of
    /// `jitter_backoff(base)` each stay within `[base / 2, base]`.
    #[test]
    fn jitter_backoff_stays_within_equal_jitter_bounds() {
        let bases: [u64; 7] = [0, 1, 2, 3, 50, 999, 10_000];
        for base in bases {
            let lo = base / 2;
            for v in std::iter::repeat_with(|| jitter_backoff(base)).take(1_000) {
                assert!(
                    (lo..=base).contains(&v),
                    "jitter_backoff({base}) = {v} out of [{lo}, {base}]"
                );
            }
        }
    }

    // ── §2.1 API auth error (401/403) tests ──────────────────

    /// A canonical `API error (401 …)` message is non-retryable.
    #[test]
    fn non_retryable_detects_401() {
        let unauthorized = anyhow::anyhow!("API error (401 Unauthorized): invalid api key");
        let permanent = is_non_retryable(&unauthorized);
        assert!(permanent, "401 errors must be detected as non-retryable");
    }

    /// A canonical `API error (403 …)` message is non-retryable.
    #[test]
    fn non_retryable_detects_403() {
        let forbidden = anyhow::anyhow!("API error (403 Forbidden): access denied");
        let permanent = is_non_retryable(&forbidden);
        assert!(permanent, "403 errors must be detected as non-retryable");
    }

    /// A canonical `API error (404 …)` message is non-retryable.
    #[test]
    fn non_retryable_detects_404() {
        let not_found = anyhow::anyhow!("API error (404 Not Found): model not found");
        let permanent = is_non_retryable(&not_found);
        assert!(permanent, "404 errors must be detected as non-retryable");
    }

    /// `429 Too Many Requests` is not classified as non-retryable.
    #[test]
    fn non_retryable_does_not_flag_429() {
        let throttled = anyhow::anyhow!("429 Too Many Requests");
        let permanent = is_non_retryable(&throttled);
        assert!(
            !permanent,
            "429 must NOT be treated as non-retryable (it is retryable with backoff)"
        );
    }

    /// `408 Request Timeout` is not classified as non-retryable.
    #[test]
    fn non_retryable_does_not_flag_408() {
        let timed_out = anyhow::anyhow!("408 Request Timeout");
        let permanent = is_non_retryable(&timed_out);
        assert!(
            !permanent,
            "408 must NOT be treated as non-retryable (it is retryable)"
        );
    }

    /// `500 Internal Server Error` is not classified as non-retryable.
    #[test]
    fn non_retryable_does_not_flag_500() {
        let server_error = anyhow::anyhow!("500 Internal Server Error");
        let permanent = is_non_retryable(&server_error);
        assert!(
            !permanent,
            "500 must NOT be treated as non-retryable (server errors are retryable)"
        );
    }

    /// `502 Bad Gateway` is not classified as non-retryable.
    #[test]
    fn non_retryable_does_not_flag_502() {
        let bad_gateway = anyhow::anyhow!("502 Bad Gateway");
        let permanent = is_non_retryable(&bad_gateway);
        assert!(!permanent, "502 must NOT be treated as non-retryable");
    }

    /// A `500` error that mentions a `404` later in its text stays retryable,
    /// both in the leading-status and in the `API error (…)` shape.
    #[test]
    fn non_retryable_does_not_flag_5xx_with_embedded_4xx_token() {
        // A transient 5xx whose body mentions a 4xx code must stay retryable:
        // classification must anchor to the primary (leading) status, not the
        // first in-range digit run anywhere in the string.
        let leading =
            anyhow::anyhow!("500 Internal Server Error: upstream returned 404 for model X");
        let leading_is_permanent = is_non_retryable(&leading);
        assert!(
            !leading_is_permanent,
            "5xx with an embedded 4xx token must NOT be treated as non-retryable"
        );

        let canonical =
            anyhow::anyhow!("OpenAI API error (500 Internal Server Error): upstream 404");
        let canonical_is_permanent = is_non_retryable(&canonical);
        assert!(
            !canonical_is_permanent,
            "canonical 5xx error with an embedded 4xx token must NOT be non-retryable"
        );
    }

    /// A `503` whose request id contains `412` stays retryable.
    #[test]
    fn non_retryable_does_not_flag_5xx_with_trace_number_in_4xx_range() {
        // A 5xx whose request-id/trace number happens to fall in 400–499 must
        // not be misclassified as non-retryable.
        let unavailable = anyhow::anyhow!("503 Service Unavailable (request id req-412abc)");
        let permanent = is_non_retryable(&unavailable);
        assert!(
            !permanent,
            "5xx with a 400–499 trace number must NOT be treated as non-retryable"
        );
    }

    /// A genuine `404` in the `… API error (NNN …)` shape is non-retryable.
    #[test]
    fn non_retryable_flags_canonical_4xx_error_shape() {
        // The canonical "… API error (NNN …)" shape must still flag genuine 4xx.
        let not_found = anyhow::anyhow!("Anthropic API error (404 Not Found): no such model");
        let permanent = is_non_retryable(&not_found);
        assert!(
            permanent,
            "canonical 4xx error shape must be treated as non-retryable"
        );
    }

    /// Transport failures whose URL carries a port in the 400–499 range are
    /// still retryable.
    #[test]
    fn non_retryable_does_not_flag_transport_error_with_4xx_port() {
        // A connect-time transport failure carries no real HTTP status; its
        // Display is `error sending request for url (https://host:PORT/path):
        // <cause>`. A non-default port in the 4xx band must NOT be mistaken for a
        // status, or the transient error (connection refused / DNS / timeout /
        // TLS) would be classified non-retryable and skip retry/backoff. Default
        // ports 80/443 are stripped by the `url` crate, so only non-default ports
        // ever reproduced this.
        let transport_failures = [
            "error sending request for url (https://host:456/v1): connection refused",
            "error sending request for url (https://api.example.com:404/v1/chat): dns error",
            "error sending request for url (http://10.0.0.5:480/x): operation timed out",
        ];
        for url in transport_failures {
            let permanent = is_non_retryable(&anyhow::anyhow!("{url}"));
            assert!(
                !permanent,
                "transport error with a 4xx-band URL port must be retryable: {url}"
            );
        }
    }

    /// `primary_http_status` ignores digit runs that are part of a URL and
    /// still finds standalone, anchored, keyword-prefixed and leading statuses.
    #[test]
    fn primary_http_status_skips_url_ports_but_keeps_real_status() {
        let cases = [
            // A URL-glued digit run (port / host / path segment) is not a status.
            (
                "error sending request for url (https://host:456/v1): refused",
                None,
            ),
            // …but a genuine standalone status token is still found even when a URL
            // with a 4xx-band port precedes it.
            (
                "url (https://host:456/v1) returned 503 Service Unavailable",
                Some(503),
            ),
            // Canonical anchored / keyword / leading shapes still resolve.
            ("API error (404 Not Found): x", Some(404)),
            ("HTTP 500 upstream", Some(500)),
            ("429 Too Many Requests", Some(429)),
        ];
        for (text, expected) in cases {
            assert_eq!(primary_http_status(text), expected, "input: {text}");
        }
    }

    // ── §2.2 Rate limit Retry-After edge cases ───────────────

    /// `Retry-After: 0` parses to `Some(0)` rather than `None`.
    #[test]
    fn parse_retry_after_zero() {
        let throttled = anyhow::anyhow!("429 Too Many Requests, Retry-After: 0");
        let delay_ms = parse_retry_after_ms(&throttled);
        assert_eq!(delay_ms, Some(0), "Retry-After: 0 should parse as 0ms");
    }

    /// The `retry_after` spelling (underscore) is recognised.
    #[test]
    fn parse_retry_after_with_underscore_separator() {
        let throttled = anyhow::anyhow!("rate limited, retry_after: 10");
        let delay_ms = parse_retry_after_ms(&throttled);
        assert_eq!(
            delay_ms,
            Some(10_000),
            "retry_after with underscore must be parsed"
        );
    }

    /// `Retry-After 7` (space instead of a colon) is recognised.
    #[test]
    fn parse_retry_after_space_separator() {
        let throttled = anyhow::anyhow!("Retry-After 7");
        let delay_ms = parse_retry_after_ms(&throttled);
        assert_eq!(
            delay_ms,
            Some(7_000),
            "Retry-After with space separator must be parsed"
        );
    }

    /// Gemini's "quota will reset after 32s" wording is read as 32 000 ms.
    #[test]
    fn parse_retry_after_gemini_quota_reset_phrasing() {
        // cloudcode-pa (OAuth path) 429 body
        let quota_exhausted = anyhow::anyhow!(
            "Gemini API error (429 Too Many Requests): You have exhausted your capacity \
             on this model. Your quota will reset after 32s."
        );
        let delay_ms = parse_retry_after_ms(&quota_exhausted);
        assert_eq!(delay_ms, Some(32_000));
    }

    /// Gemini's "Please retry in 26.3s" wording is read as 26 300 ms.
    #[test]
    fn parse_retry_after_gemini_retry_in_phrasing() {
        // generativelanguage (API-key path) 429 body
        let exhausted = anyhow::anyhow!("Resource has been exhausted. Please retry in 26.3s.");
        let delay_ms = parse_retry_after_ms(&exhausted);
        assert_eq!(delay_ms, Some(26_300));
    }

    /// An error with no status code at all is not rate-limited.
    #[test]
    fn rate_limited_false_for_generic_error() {
        let refused = anyhow::anyhow!("Connection refused");
        let throttled = is_rate_limited(&refused);
        assert!(
            !throttled,
            "generic errors must not be flagged as rate-limited"
        );
    }

    // ── §2.3 Malformed API response error classification ─────

    /// With five retries configured, a `401` still results in exactly one call
    /// and an error.
    #[tokio::test]
    async fn non_retryable_skips_retries_for_401() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let unauthorized = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: "API error (401 Unauthorized): invalid key",
        };
        let reliable =
            ReliableProvider::new(vec![("primary".into(), Box::new(unauthorized))], 5, 1);

        let outcome = reliable.simple_chat("hello", "test", 0.0).await;

        assert!(outcome.is_err(), "401 should fail without retries");
        assert_eq!(
            call_count.load(Ordering::SeqCst),
            1,
            "must not retry on 401 — should be exactly 1 call"
        );
    }

    /// With five retries configured, a `429` carrying business code `1311`
    /// still results in exactly one call and an error.
    #[tokio::test]
    async fn non_retryable_rate_limit_skips_retries_for_plan_errors() {
        let call_count = Arc::new(AtomicUsize::new(0));
        let plan_restricted = MockProvider {
            calls: Arc::clone(&call_count),
            fail_until_attempt: usize::MAX,
            response: "never",
            error: r#"API error (429 Too Many Requests): {"code":1311,"message":"plan does not include glm-5"}"#,
        };
        let reliable =
            ReliableProvider::new(vec![("primary".into(), Box::new(plan_restricted))], 5, 1);

        let outcome = reliable.simple_chat("hello", "test", 0.0).await;

        assert!(
            outcome.is_err(),
            "plan-restricted 429 should fail quickly without retrying"
        );
        assert_eq!(
            call_count.load(Ordering::SeqCst),
            1,
            "must not retry non-retryable 429 business errors"
        );
    }

    // ── Arc<ModelAwareMock> Provider impl for test ──

    #[async_trait]
    impl Provider for Arc<ModelAwareMock> {
        async fn chat_with_system(
            &self,
            system_prompt: Option<&str>,
            message: &str,
            model: &str,
            temperature: f64,
        ) -> anyhow::Result<String> {
            self.as_ref()
                .chat_with_system(system_prompt, message, model, temperature)
                .await
        }
    }

    /// Mock provider that implements `chat()` with native tool support.
    ///
    /// Only `chat()` counts calls and honours `fail_until_attempt`;
    /// `chat_with_system` always succeeds with `response_text`.
    struct NativeToolMock {
        /// Shared counter incremented on every `chat()` call.
        calls: Arc<AtomicUsize>,
        /// `chat()` calls numbered `1..=fail_until_attempt` fail with `error`.
        fail_until_attempt: usize,
        /// Text returned by successful calls.
        response_text: &'static str,
        /// Tool calls cloned into every successful `chat()` response.
        tool_calls: Vec<super::super::traits::ToolCall>,
        /// Message of the error returned by failing `chat()` calls.
        error: &'static str,
    }

    #[async_trait]
    impl Provider for NativeToolMock {
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(self.response_text.to_string())
        }

        fn supports_native_tools(&self) -> bool {
            true
        }

        async fn chat(
            &self,
            _request: ChatRequest<'_>,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<ChatResponse> {
            let attempt = self.calls.fetch_add(1, Ordering::SeqCst) + 1;
            if attempt <= self.fail_until_attempt {
                anyhow::bail!(self.error);
            }
            Ok(ChatResponse {
                text: Some(self.response_text.to_string()),
                tool_calls: self.tool_calls.clone(),
                usage: None,
                reasoning_content: None,
            })
        }
    }

    /// A successful native `chat()` call reaches the single inner provider once
    /// and its text and tool calls are returned.
    #[tokio::test]
    async fn chat_delegates_to_inner_provider() {
        let call_counter = Arc::new(AtomicUsize::new(0));
        let shell_call = super::super::traits::ToolCall {
            id: "call_1".to_string(),
            name: "shell".to_string(),
            arguments: r#"{"command":"date"}"#.to_string(),
        };
        let inner = NativeToolMock {
            calls: Arc::clone(&call_counter),
            fail_until_attempt: 0,
            response_text: "ok",
            tool_calls: vec![shell_call],
            error: "boom",
        };
        let reliable = ReliableProvider::new(
            vec![("primary".into(), Box::new(inner) as Box<dyn Provider>)],
            2,
            1,
        );

        let history = vec![ChatMessage::user("what time is it?")];
        let response = reliable
            .chat(
                ChatRequest {
                    messages: &history,
                    tools: None,
                },
                "test-model",
                0.0,
            )
            .await
            .unwrap();

        assert_eq!(response.text.as_deref(), Some("ok"));
        assert_eq!(response.tool_calls.len(), 1);
        assert_eq!(response.tool_calls[0].name, "shell");
        assert_eq!(call_counter.load(Ordering::SeqCst), 1);
    }

    /// `chat()` retries a transiently failing provider and returns the eventual
    /// success, including the tool calls attached to it.
    #[tokio::test]
    async fn chat_retries_and_recovers() {
        let calls = Arc::new(AtomicUsize::new(0));
        let tool_call = super::super::traits::ToolCall {
            id: "call_1".to_string(),
            name: "shell".to_string(),
            arguments: r#"{"command":"date"}"#.to_string(),
        };
        let provider = ReliableProvider::new(
            vec![(
                "primary".into(),
                Box::new(NativeToolMock {
                    calls: Arc::clone(&calls),
                    // Calls 1 and 2 fail; call 3 is the first to succeed.
                    fail_until_attempt: 2,
                    response_text: "recovered",
                    tool_calls: vec![tool_call],
                    error: "temporary failure",
                }) as Box<dyn Provider>,
            )],
            3,
            1,
        );

        let messages = vec![ChatMessage::user("test")];
        let request = ChatRequest {
            messages: &messages,
            tools: None,
        };
        let result = provider.chat(request, "test-model", 0.0).await.unwrap();

        assert_eq!(result.text.as_deref(), Some("recovered"));
        // The recovered response must still carry the provider's tool calls.
        assert_eq!(
            result.tool_calls.len(),
            1,
            "tool calls must survive the retry path"
        );
        assert_eq!(result.tool_calls[0].name, "shell");
        // Success is only possible on the mock's third call, so a `> 1` check
        // would be vacuous; pin the exact count to also catch extra calls.
        assert_eq!(
            calls.load(Ordering::SeqCst),
            3,
            "should fail twice, then succeed on the third call"
        );
    }

    /// `ReliableProvider` reports native tool support when wrapping a provider
    /// that advertises it.
    #[tokio::test]
    async fn chat_preserves_native_tools_support() {
        let inner = NativeToolMock {
            calls: Arc::new(AtomicUsize::new(0)),
            fail_until_attempt: 0,
            response_text: "ok",
            tool_calls: Vec::new(),
            error: "boom",
        };
        let reliable = ReliableProvider::new(
            vec![("primary".into(), Box::new(inner) as Box<dyn Provider>)],
            2,
            1,
        );

        let advertises_native_tools = reliable.supports_native_tools();
        assert!(
            advertises_native_tools,
            "ReliableProvider must propagate supports_native_tools from inner provider"
        );
    }

    // ── Gap 2-4: Parity tests for chat() ────────────────────────

    /// Gap 2: when every provider fails, `chat()` reports one aggregated error
    /// naming each provider/model attempt, mirroring
    /// `returns_aggregated_error_when_all_providers_fail`.
    #[tokio::test]
    async fn chat_returns_aggregated_error_when_all_providers_fail() {
        // Builds a mock whose `chat()` fails on every call with `error`.
        let always_failing = |error: &'static str| {
            Box::new(NativeToolMock {
                calls: Arc::new(AtomicUsize::new(0)),
                fail_until_attempt: usize::MAX,
                response_text: "never",
                tool_calls: Vec::new(),
                error,
            }) as Box<dyn Provider>
        };
        let reliable = ReliableProvider::new(
            vec![
                ("p1".into(), always_failing("p1 chat error")),
                ("p2".into(), always_failing("p2 chat error")),
            ],
            0,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let failure = reliable
            .chat(
                ChatRequest {
                    messages: &history,
                    tools: None,
                },
                "test",
                0.0,
            )
            .await
            .expect_err("all providers should fail");

        let msg = failure.to_string();
        let expected_fragments = [
            "All providers/models failed",
            "provider=p1 model=test",
            "provider=p2 model=test",
            "error=p1 chat error",
            "error=p2 chat error",
            "retryable",
        ];
        for fragment in expected_fragments {
            assert!(msg.contains(fragment), "missing {fragment:?} in: {msg}");
        }
    }

    /// Mock that records model names and can fail specific models,
    /// implementing `chat()` for native tool calling parity tests.
    struct NativeModelAwareMock {
        /// Incremented once per `chat()` call.
        calls: Arc<AtomicUsize>,
        /// Model name of every `chat()` call, in call order.
        models_seen: parking_lot::Mutex<Vec<String>>,
        /// Models for which `chat()` returns a "500 model ... unavailable" error.
        fail_models: Vec<&'static str>,
        /// Text returned by `chat_with_system()` and by successful `chat()` calls.
        response_text: &'static str,
    }

    #[async_trait]
    impl Provider for NativeModelAwareMock {
        // Plain-text path: always succeeds and records nothing.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(String::from(self.response_text))
        }

        fn supports_native_tools(&self) -> bool {
            true
        }

        // Native path: records the call and the requested model, then fails if
        // that model is listed in `fail_models`.
        async fn chat(
            &self,
            _request: ChatRequest<'_>,
            model: &str,
            _temperature: f64,
        ) -> anyhow::Result<ChatResponse> {
            self.calls.fetch_add(1, Ordering::SeqCst);
            self.models_seen.lock().push(model.to_owned());

            let rejected = self.fail_models.iter().any(|blocked| *blocked == model);
            if rejected {
                return Err(anyhow::anyhow!("500 model {} unavailable", model));
            }

            let reply = ChatResponse {
                text: Some(String::from(self.response_text)),
                tool_calls: Vec::new(),
                usage: None,
                reasoning_content: None,
            };
            Ok(reply)
        }
    }

    // `Provider` for a shared `Arc<NativeModelAwareMock>`: both chat entry points
    // are forwarded to the wrapped mock.
    #[async_trait]
    impl Provider for Arc<NativeModelAwareMock> {
        async fn chat_with_system(
            &self,
            system_prompt: Option<&str>,
            message: &str,
            model: &str,
            temperature: f64,
        ) -> anyhow::Result<String> {
            let inner: &NativeModelAwareMock = self.as_ref();
            inner
                .chat_with_system(system_prompt, message, model, temperature)
                .await
        }

        fn supports_native_tools(&self) -> bool {
            true
        }

        async fn chat(
            &self,
            request: ChatRequest<'_>,
            model: &str,
            temperature: f64,
        ) -> anyhow::Result<ChatResponse> {
            let inner: &NativeModelAwareMock = self.as_ref();
            inner.chat(request, model, temperature).await
        }
    }

    /// Gap 3: when the requested model fails, `chat()` moves on to the configured
    /// fallback model, mirroring `model_failover_tries_fallback_model`.
    #[tokio::test]
    async fn chat_tries_model_failover_on_failure() {
        let shared_mock = Arc::new(NativeModelAwareMock {
            calls: Arc::new(AtomicUsize::new(0)),
            models_seen: parking_lot::Mutex::new(Vec::new()),
            fail_models: vec!["claude-opus"],
            response_text: "ok from sonnet",
        });

        let fallback_table = HashMap::from([(
            "claude-opus".to_string(),
            vec!["claude-sonnet".to_string()],
        )]);

        // Zero retries, so the failing model goes straight to model failover.
        let reliable = ReliableProvider::new(
            vec![(
                "anthropic".into(),
                Box::new(Arc::clone(&shared_mock)) as Box<dyn Provider>,
            )],
            0,
            1,
        )
        .with_model_fallbacks(fallback_table);

        let history = vec![ChatMessage::user("hello")];
        let response = reliable
            .chat(
                ChatRequest {
                    messages: &history,
                    tools: None,
                },
                "claude-opus",
                0.0,
            )
            .await
            .unwrap();
        assert_eq!(response.text.as_deref(), Some("ok from sonnet"));

        // Requested model first, then its fallback, and nothing else.
        let attempted = shared_mock.models_seen.lock();
        assert_eq!(
            *attempted,
            vec!["claude-opus".to_string(), "claude-sonnet".to_string()]
        );
    }

    /// Gap 4: a non-retryable error (here `401 Unauthorized`) is not retried by
    /// `chat()`; the next provider is used instead, mirroring
    /// `skips_retries_on_non_retryable_error`.
    #[tokio::test]
    async fn chat_skips_non_retryable_errors() {
        let unauthorized_hits = Arc::new(AtomicUsize::new(0));
        let healthy_hits = Arc::new(AtomicUsize::new(0));

        let unauthorized = NativeToolMock {
            calls: Arc::clone(&unauthorized_hits),
            fail_until_attempt: usize::MAX,
            response_text: "never",
            tool_calls: Vec::new(),
            error: "401 Unauthorized",
        };
        let healthy = NativeToolMock {
            calls: Arc::clone(&healthy_hits),
            fail_until_attempt: 0,
            response_text: "from fallback",
            tool_calls: Vec::new(),
            error: "fallback err",
        };
        let reliable = ReliableProvider::new(
            vec![
                ("primary".into(), Box::new(unauthorized) as Box<dyn Provider>),
                ("fallback".into(), Box::new(healthy) as Box<dyn Provider>),
            ],
            3,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let response = reliable
            .chat(
                ChatRequest {
                    messages: &history,
                    tools: None,
                },
                "test",
                0.0,
            )
            .await
            .unwrap();

        assert_eq!(response.text.as_deref(), Some("from fallback"));
        // One call each: the primary is not retried despite three allowed retries.
        assert_eq!(unauthorized_hits.load(Ordering::SeqCst), 1);
        assert_eq!(healthy_hits.load(Ordering::SeqCst), 1);
    }

    // ── Context window truncation tests ─────────────────────────

    /// Context-window errors must stay retryable so they can be recovered via
    /// truncation.
    #[test]
    fn context_window_error_is_not_non_retryable() {
        let context_errors = [
            "exceeds the context window",
            "maximum context length exceeded",
            "too many tokens in the request",
            "token limit exceeded",
        ];
        for text in context_errors {
            let err = anyhow::anyhow!("{}", text);
            assert!(!is_non_retryable(&err));
        }
    }

    /// The llama.cpp-style "exceeds the available context size" message is
    /// recognised as a context-window overflow.
    #[test]
    fn is_context_window_exceeded_detects_llamacpp() {
        let llamacpp_error = anyhow::anyhow!(
            "request (8968 tokens) exceeds the available context size (8448 tokens), try increasing it"
        );
        assert!(is_context_window_exceeded(&llamacpp_error));
    }

    /// Truncation drops the oldest half of the non-system messages and keeps the
    /// system prompt and the most recent message.
    #[test]
    fn truncate_for_context_drops_oldest_non_system() {
        let mut history = Vec::from([
            ChatMessage::system("sys"),
            ChatMessage::user("msg1"),
            ChatMessage::assistant("resp1"),
            ChatMessage::user("msg2"),
            ChatMessage::assistant("resp2"),
            ChatMessage::user("msg3"),
        ]);

        let removed = truncate_for_context(&mut history);

        // Five non-system messages: the oldest half (two) is removed.
        assert_eq!(removed, 2);
        // The system prompt stays at the front.
        assert_eq!(history[0].role, "system");
        // System prompt plus the three newest non-system messages remain.
        assert_eq!(history.len(), 4);
        // The newest user message is still last.
        assert_eq!(history.last().unwrap().content, "msg3");
    }

    /// With a single non-system message there is nothing to drop, with or without
    /// a system prompt.
    #[test]
    fn truncate_for_context_preserves_system_and_last_message() {
        // System prompt plus one user message.
        let mut with_system = vec![ChatMessage::system("sys"), ChatMessage::user("only")];
        assert_eq!(truncate_for_context(&mut with_system), 0);
        assert_eq!(with_system.len(), 2);

        // A lone user message and no system prompt.
        let mut user_only = vec![ChatMessage::user("only")];
        assert_eq!(truncate_for_context(&mut user_only), 0);
        assert_eq!(user_only.len(), 1);
    }

    /// Mock that fails with context error on first N calls, then succeeds.
    /// Tracks the number of messages received on each call.
    ///
    /// Only `chat_with_history()` counts, records and fails; `chat_with_system()`
    /// always succeeds.
    struct ContextOverflowMock {
        /// Incremented once per `chat_with_history()` call.
        calls: Arc<AtomicUsize>,
        /// Number of leading `chat_with_history()` calls that fail.
        fail_until_attempt: usize,
        /// `messages.len()` of every `chat_with_history()` call, in call order.
        message_counts: parking_lot::Mutex<Vec<usize>>,
    }

    #[async_trait]
    impl Provider for ContextOverflowMock {
        // Plain-text path: always succeeds and is not counted.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(String::from("ok"))
        }

        // History path: records the call and its message count, then fails with a
        // context-size error while the 1-based call number is within
        // `fail_until_attempt`.
        async fn chat_with_history(
            &self,
            messages: &[ChatMessage],
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            // `fetch_add` yields the previous value, hence the `1 +`.
            let call_number = 1 + self.calls.fetch_add(1, Ordering::SeqCst);
            self.message_counts.lock().push(messages.len());

            if call_number > self.fail_until_attempt {
                return Ok(String::from("recovered after truncation"));
            }
            Err(anyhow::anyhow!(
                "request (8968 tokens) exceeds the available context size (8448 tokens), try increasing it"
            ))
        }
    }

    #[tokio::test]
    async fn chat_with_history_truncates_on_context_overflow() {
        let calls = Arc::new(AtomicUsize::new(0));
        let mock = ContextOverflowMock {
            calls: Arc::clone(&calls),
            fail_until_attempt: 1, // fail first call, succeed after truncation
            message_counts: parking_lot::Mutex::new(Vec::new()),
        };

        let provider = ReliableProvider::new(
            vec![("local".into(), Box::new(mock) as Box<dyn Provider>)],
            3,
            1,
        );

        let messages = vec![
            ChatMessage::system("system prompt"),
            ChatMessage::user("old message 1"),
            ChatMessage::assistant("old response 1"),
            ChatMessage::user("old message 2"),
            ChatMessage::assistant("old response 2"),
            ChatMessage::user("current question"),
        ];

        let result = provider
            .chat_with_history(&messages, "local-model", 0.0)
            .await
            .unwrap();
        assert_eq!(result, "recovered after truncation");
        // Should have been called twice: once with full messages, once with truncated
        assert_eq!(calls.load(Ordering::SeqCst), 2);
    }

    /// When the history cannot be shortened, a context-window error is reported
    /// right away instead of being retried.
    #[tokio::test]
    async fn context_overflow_with_no_history_to_truncate_bails_immediately() {
        let call_counter = Arc::new(AtomicUsize::new(0));
        let always_overflowing = ContextOverflowMock {
            calls: Arc::clone(&call_counter),
            fail_until_attempt: 999, // every call in this test fails
            message_counts: parking_lot::Mutex::new(Vec::new()),
        };
        let reliable = ReliableProvider::new(
            vec![(
                "local".into(),
                Box::new(always_overflowing) as Box<dyn Provider>,
            )],
            3,
            1,
        );

        // System prompt plus a single user message leaves nothing to drop.
        let history = vec![
            ChatMessage::system("huge system prompt that exceeds context window"),
            ChatMessage::user("hello"),
        ];

        let outcome = reliable
            .chat_with_history(&history, "local-model", 0.0)
            .await;
        // `unwrap_err` panics on `Ok`, which covers the "must fail" check.
        let err_msg = outcome.unwrap_err().to_string();
        assert!(
            err_msg.contains("cannot be reduced further"),
            "Should bail with actionable message, got: {err_msg}"
        );
        // A single call only: retrying the same oversized request is pointless.
        assert_eq!(
            call_counter.load(Ordering::SeqCst),
            1,
            "Should not retry when truncation is impossible"
        );
    }

    // ── Tool schema error detection tests ───────────────────────────────

    /// A Groq 400 response reporting a tool that "was not in request" is
    /// classified as a tool schema error.
    #[test]
    fn tool_schema_error_detects_groq_validation_failure() {
        // Raw string: the payload is JSON with embedded double quotes.
        let groq_error = anyhow::anyhow!(
            "{}",
            r#"Groq API error (400 Bad Request): {"error":{"message":"tool call validation failed: attempted to call tool 'memory_recall' which was not in request"}}"#
        );
        assert!(is_tool_schema_error(&groq_error));
    }

    /// The bare "was not in request" wording is enough to be detected.
    #[test]
    fn tool_schema_error_detects_not_in_request() {
        let missing_tool = anyhow::anyhow!("tool 'search' was not in request");
        let detected = is_tool_schema_error(&missing_tool);
        assert!(detected);
    }

    /// The "not found in tool list" wording is detected as a tool schema error.
    #[test]
    fn tool_schema_error_detects_not_found_in_tool_list() {
        let unknown_function = anyhow::anyhow!("function 'foo' not found in tool list");
        let detected = is_tool_schema_error(&unknown_function);
        assert!(detected);
    }

    /// An `invalid_tool_call` error code is detected as a tool schema error.
    #[test]
    fn tool_schema_error_detects_invalid_tool_call() {
        let invalid_call = anyhow::anyhow!("invalid_tool_call: no matching function");
        let detected = is_tool_schema_error(&invalid_call);
        assert!(detected);
    }

    /// Errors unrelated to tool schemas must not be misclassified.
    #[test]
    fn tool_schema_error_ignores_unrelated_errors() {
        for unrelated in ["invalid api key", "model not found"] {
            let err = anyhow::anyhow!("{}", unrelated);
            assert!(!is_tool_schema_error(&err));
        }
    }

    /// A 400 whose body is a tool schema validation failure must not be treated
    /// as non-retryable.
    #[test]
    fn non_retryable_returns_false_for_tool_schema_400() {
        let tool_schema_400 = anyhow::anyhow!(
            "{}",
            "400 Bad Request: tool call validation failed: attempted to call tool 'x' which was not in request"
        );
        let non_retryable = is_non_retryable(&tool_schema_400);
        assert!(!non_retryable);
    }

    /// An ordinary 400 (for example a rejected API key) is still non-retryable.
    #[test]
    fn non_retryable_returns_true_for_other_400_errors() {
        let plain_400 = anyhow::anyhow!("400 Bad Request: invalid api key provided");
        let non_retryable = is_non_retryable(&plain_400);
        assert!(non_retryable);
    }

    /// Streaming mock whose `stream_chat()` emits one tool-call event followed by
    /// a final event, with configurable streaming-tool-event support.
    struct StreamingToolEventMock {
        /// Incremented once per `stream_chat()` call.
        stream_calls: Arc<AtomicUsize>,
        /// Value reported by `supports_streaming_tool_events()`.
        supports_tool_events: bool,
    }

    impl StreamingToolEventMock {
        /// Creates a mock with a fresh zeroed call counter and the given
        /// streaming-tool-event capability flag.
        fn new(supports_tool_events: bool) -> Self {
            let stream_calls = Arc::new(AtomicUsize::new(0));
            Self {
                stream_calls,
                supports_tool_events,
            }
        }
    }

    #[async_trait]
    impl Provider for StreamingToolEventMock {
        // Non-streaming path: always succeeds.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(String::from("ok"))
        }

        // Streaming itself is always supported.
        fn supports_streaming(&self) -> bool {
            true
        }

        // Tool-event support is whatever the test configured.
        fn supports_streaming_tool_events(&self) -> bool {
            self.supports_tool_events
        }

        // Counts the call, then yields a fixed two-event stream: a `shell` tool
        // call followed by the final marker.
        fn stream_chat(
            &self,
            _request: ChatRequest<'_>,
            _model: &str,
            _temperature: f64,
            _options: StreamOptions,
        ) -> stream::BoxStream<'static, StreamResult<StreamEvent>> {
            self.stream_calls.fetch_add(1, Ordering::SeqCst);

            let shell_call = super::super::traits::ToolCall {
                id: "call_1".to_string(),
                name: "shell".to_string(),
                arguments: r#"{"command":"date"}"#.to_string(),
            };
            let events: Vec<StreamResult<StreamEvent>> =
                vec![Ok(StreamEvent::ToolCall(shell_call)), Ok(StreamEvent::Final)];
            stream::iter(events).boxed()
        }
    }

    // `Provider` for a shared `Arc<StreamingToolEventMock>`: every method is
    // forwarded to the wrapped mock.
    #[async_trait]
    impl Provider for Arc<StreamingToolEventMock> {
        async fn chat_with_system(
            &self,
            system_prompt: Option<&str>,
            message: &str,
            model: &str,
            temperature: f64,
        ) -> anyhow::Result<String> {
            let inner: &StreamingToolEventMock = self.as_ref();
            inner
                .chat_with_system(system_prompt, message, model, temperature)
                .await
        }

        fn supports_streaming(&self) -> bool {
            let inner: &StreamingToolEventMock = self.as_ref();
            inner.supports_streaming()
        }

        fn supports_streaming_tool_events(&self) -> bool {
            let inner: &StreamingToolEventMock = self.as_ref();
            inner.supports_streaming_tool_events()
        }

        fn stream_chat(
            &self,
            request: ChatRequest<'_>,
            model: &str,
            temperature: f64,
            options: StreamOptions,
        ) -> stream::BoxStream<'static, StreamResult<StreamEvent>> {
            let inner: &StreamingToolEventMock = self.as_ref();
            inner.stream_chat(request, model, temperature, options)
        }
    }

    /// With tools in the request, `stream_chat()` skips a provider lacking
    /// streaming tool-event support and streams from one that has it.
    #[tokio::test]
    async fn stream_chat_prefers_provider_with_tool_event_support() {
        let without_events = Arc::new(StreamingToolEventMock::new(false));
        let with_events = Arc::new(StreamingToolEventMock::new(true));
        let reliable = ReliableProvider::new(
            vec![
                (
                    "primary".into(),
                    Box::new(Arc::clone(&without_events)) as Box<dyn Provider>,
                ),
                (
                    "fallback".into(),
                    Box::new(Arc::clone(&with_events)) as Box<dyn Provider>,
                ),
            ],
            0,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let shell_schema = serde_json::json!({
            "type": "object",
            "properties": {
                "command": { "type": "string" }
            }
        });
        let tool_specs = vec![ToolSpec {
            name: "shell".to_string(),
            description: "run shell".to_string(),
            parameters: shell_schema,
        }];
        let request = ChatRequest {
            messages: &history,
            tools: Some(&tool_specs),
        };
        let mut events = reliable.stream_chat(request, "model", 0.0, StreamOptions::new(true));

        // Exactly two events are expected, after which the stream ends.
        let first = events.next().await.unwrap().unwrap();
        let second = events.next().await.unwrap().unwrap();
        assert!(events.next().await.is_none());

        let call = match first {
            StreamEvent::ToolCall(call) => call,
            other => panic!("expected tool-call event, got {other:?}"),
        };
        assert_eq!(call.name, "shell");
        assert!(matches!(second, StreamEvent::Final));

        // Only the tool-event-capable provider was asked to stream.
        assert_eq!(without_events.stream_calls.load(Ordering::SeqCst), 0);
        assert_eq!(with_events.stream_calls.load(Ordering::SeqCst), 1);
    }

    /// With tools in the request and no provider supporting streaming tool
    /// events, `stream_chat()` yields a single error and never calls a provider.
    #[tokio::test]
    async fn stream_chat_errors_when_no_provider_supports_tool_events() {
        let without_events = Arc::new(StreamingToolEventMock::new(false));
        let reliable = ReliableProvider::new(
            vec![(
                "primary".into(),
                Box::new(Arc::clone(&without_events)) as Box<dyn Provider>,
            )],
            0,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let tool_specs = vec![ToolSpec {
            name: "shell".to_string(),
            description: "run shell".to_string(),
            parameters: serde_json::json!({"type": "object"}),
        }];
        let request = ChatRequest {
            messages: &history,
            tools: Some(&tool_specs),
        };
        let mut events = reliable.stream_chat(request, "model", 0.0, StreamOptions::new(true));

        let err = events
            .next()
            .await
            .unwrap()
            .expect_err("stream should fail without tool-event support");
        let rendered = err.to_string();
        assert!(
            rendered.contains("No provider supports streaming tool events"),
            "unexpected stream error: {err}"
        );

        // The error is the only item, and the provider was never invoked.
        assert!(events.next().await.is_none());
        assert_eq!(without_events.stream_calls.load(Ordering::SeqCst), 0);
    }

    // ── stream_chat_with_history failover tests ──────────────────────

    /// Mock provider that supports streaming via stream_chat_with_history.
    ///
    /// Its stream carries the number of messages it was given as the only delta,
    /// followed by a final chunk.
    struct StreamingHistoryMock {
        /// Incremented once per `stream_chat_with_history()` call.
        stream_calls: Arc<AtomicUsize>,
        /// Value reported by `supports_streaming()`.
        supports: bool,
    }

    #[async_trait]
    impl Provider for StreamingHistoryMock {
        // Non-streaming path: always succeeds.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(String::from("ok"))
        }

        // Streaming support is whatever the test configured.
        fn supports_streaming(&self) -> bool {
            self.supports
        }

        // Counts the call, then streams the history length as the only delta,
        // followed by the final chunk, so tests can verify the history was
        // passed through.
        fn stream_chat_with_history(
            &self,
            messages: &[ChatMessage],
            _model: &str,
            _temperature: f64,
            _options: StreamOptions,
        ) -> stream::BoxStream<'static, StreamResult<StreamChunk>> {
            self.stream_calls.fetch_add(1, Ordering::SeqCst);

            let chunks: Vec<StreamResult<StreamChunk>> = vec![
                Ok(StreamChunk::delta(messages.len().to_string())),
                Ok(StreamChunk::final_chunk()),
            ];
            stream::iter(chunks).boxed()
        }
    }

    /// `stream_chat_with_history()` hands the full history to a streaming-capable
    /// provider and relays its chunks unchanged.
    #[tokio::test]
    async fn stream_chat_with_history_delegates_to_streaming_provider() {
        let stream_hits = Arc::new(AtomicUsize::new(0));
        let streaming = StreamingHistoryMock {
            stream_calls: Arc::clone(&stream_hits),
            supports: true,
        };
        let reliable = ReliableProvider::new(
            vec![("primary".into(), Box::new(streaming) as Box<dyn Provider>)],
            0,
            1,
        );

        let history = vec![
            ChatMessage::system("system"),
            ChatMessage::user("msg1"),
            ChatMessage::assistant("resp1"),
            ChatMessage::user("msg2"),
        ];
        let mut chunks =
            reliable.stream_chat_with_history(&history, "model", 0.0, StreamOptions::new(true));

        // The mock echoes the history length as its delta.
        let echoed = chunks.next().await.unwrap().unwrap();
        assert_eq!(echoed.delta, "4", "should pass all 4 messages to provider");
        let closing = chunks.next().await.unwrap().unwrap();
        assert!(closing.is_final);
        assert!(chunks.next().await.is_none());
        assert_eq!(stream_hits.load(Ordering::SeqCst), 1);
    }

    /// A provider without streaming support is passed over in favour of the next
    /// one that can stream.
    #[tokio::test]
    async fn stream_chat_with_history_skips_non_streaming_providers() {
        let skipped_hits = Arc::new(AtomicUsize::new(0));
        let used_hits = Arc::new(AtomicUsize::new(0));

        let cannot_stream = StreamingHistoryMock {
            stream_calls: Arc::clone(&skipped_hits),
            supports: false,
        };
        let can_stream = StreamingHistoryMock {
            stream_calls: Arc::clone(&used_hits),
            supports: true,
        };
        let reliable = ReliableProvider::new(
            vec![
                (
                    "non-streaming".into(),
                    Box::new(cannot_stream) as Box<dyn Provider>,
                ),
                ("streaming".into(), Box::new(can_stream) as Box<dyn Provider>),
            ],
            0,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let mut chunks =
            reliable.stream_chat_with_history(&history, "model", 0.0, StreamOptions::new(true));

        let echoed = chunks.next().await.unwrap().unwrap();
        assert_eq!(echoed.delta, "1");
        assert_eq!(
            skipped_hits.load(Ordering::SeqCst),
            0,
            "non-streaming provider should be skipped"
        );
        assert_eq!(
            used_hits.load(Ordering::SeqCst),
            1,
            "streaming provider should be used"
        );
    }

    /// With no streaming-capable provider the stream consists of a single error.
    #[tokio::test]
    async fn stream_chat_with_history_errors_when_no_provider_supports_streaming() {
        let cannot_stream = StreamingHistoryMock {
            stream_calls: Arc::new(AtomicUsize::new(0)),
            supports: false,
        };
        let reliable = ReliableProvider::new(
            vec![(
                "non-streaming".into(),
                Box::new(cannot_stream) as Box<dyn Provider>,
            )],
            0,
            1,
        );

        let history = vec![ChatMessage::user("hello")];
        let mut chunks =
            reliable.stream_chat_with_history(&history, "model", 0.0, StreamOptions::new(true));

        let err = chunks
            .next()
            .await
            .unwrap()
            .expect_err("should fail when no provider supports streaming");
        let rendered = err.to_string();
        assert!(
            rendered.contains("No provider supports streaming"),
            "unexpected error: {err}"
        );
        assert!(chunks.next().await.is_none());
    }

    #[tokio::test]
    async fn fallback_records_provider_fallback_info() {
        scope_provider_fallback(async {
            let provider = ReliableProvider::new(
                vec![
                    (
                        "broken".into(),
                        Box::new(MockProvider {
                            calls: Arc::new(AtomicUsize::new(0)),
                            fail_until_attempt: 99, // always fail
                            response: "unused",
                            error: "401 Unauthorized",
                        }),
                    ),
                    (
                        "working".into(),
                        Box::new(MockProvider {
                            calls: Arc::new(AtomicUsize::new(0)),
                            fail_until_attempt: 0,
                            response: "hello from working",
                            error: "unused",
                        }),
                    ),
                ],
                2,
                1,
            );

            let resp = provider.simple_chat("hi", "test-model", 0.0).await.unwrap();
            assert_eq!(resp, "hello from working");

            let fb = take_last_provider_fallback();
            assert!(fb.is_some(), "fallback info should be recorded");
            let fb = fb.unwrap();
            assert_eq!(fb.requested_provider, "broken");
            assert_eq!(fb.actual_provider, "working");
            assert_eq!(fb.actual_model, "test-model");

            // Second take should be None.
            assert!(take_last_provider_fallback().is_none());
        })
        .await;
    }

    // ── connect-time streaming failover tests (#410) ─────────────────
    //
    // These drive the new spawn_failover_stream failover/retry/classification
    // branches through stream_chat_with_history (which shares the generic helper
    // with stream_chat / stream_chat_with_system). All fixtures are in-process
    // mocks — no network — so they exercise the async logic deterministically.

    /// Streaming mock whose first-chunk behavior is configurable per attempt so
    /// tests can drive failover, retry, classification, empty-stream, mid-stream
    /// error, and cancellation paths. Build with struct-update from `Default`.
    ///
    /// `stream_chat_with_history` checks the knobs in this order: `empty`, then
    /// `connect_delay_ms`, then `succeed_model` / `fail_until`.
    #[derive(Default)]
    struct FailoverMock {
        /// Incremented on every `stream_chat_with_history` call.
        calls: Arc<AtomicUsize>,
        /// First `fail_until` calls return a first-chunk error, then succeed.
        /// Only consulted when `succeed_model` is `None`.
        fail_until: usize,
        /// Error message for failing attempts (drives classification).
        error: String,
        /// Return an empty (content-free, error-free) stream.
        /// Takes precedence over every other setting.
        empty: bool,
        /// If set, only this `model` succeeds; every other model errors.
        succeed_model: Option<String>,
        /// Success stream is `[Ok(delta), Err]` to exercise mid-stream errors.
        mid_stream_error: bool,
        /// Sleep this long before the first chunk (errors after) — for cancellation.
        /// The chunk that follows the sleep is an error built from `error`;
        /// `0` disables the delay.
        connect_delay_ms: u64,
        /// Delta text on success, used to identify which candidate served.
        label: String,
    }

    #[async_trait]
    impl Provider for FailoverMock {
        /// Non-streaming entry point; ignores every argument and answers `"ok"`.
        async fn chat_with_system(
            &self,
            _system_prompt: Option<&str>,
            _message: &str,
            _model: &str,
            _temperature: f64,
        ) -> anyhow::Result<String> {
            Ok(String::from("ok"))
        }

        /// This mock always advertises streaming support.
        fn supports_streaming(&self) -> bool {
            true
        }

        /// Counts the call, then returns the canned stream selected by the
        /// mock's configuration. The checks run in a fixed order and the first
        /// one that applies wins: empty stream, delayed error, first-chunk
        /// error, and finally the success stream (optionally ending in an error).
        fn stream_chat_with_history(
            &self,
            _messages: &[ChatMessage],
            model: &str,
            _temperature: f64,
            _options: StreamOptions,
        ) -> stream::BoxStream<'static, StreamResult<StreamChunk>> {
            // `fetch_add` returns the value before the increment, i.e. the
            // zero-based index of this call.
            let attempt = self.calls.fetch_add(1, Ordering::SeqCst);

            // Content-free, error-free stream.
            if self.empty {
                return stream::empty::<StreamResult<StreamChunk>>().boxed();
            }

            // Slow connect: wait first, then surface a single error chunk.
            if self.connect_delay_ms > 0 {
                let wait = Duration::from_millis(self.connect_delay_ms);
                let message = self.error.clone();
                return stream::once(async move {
                    tokio::time::sleep(wait).await;
                    Err(StreamError::Provider(message))
                })
                .boxed();
            }

            // A model filter, when present, takes precedence over the
            // attempt-count threshold.
            let rejected = self
                .succeed_model
                .as_deref()
                .map_or(attempt < self.fail_until, |target| model != target);
            if rejected {
                return stream::iter(vec![Err(StreamError::Provider(self.error.clone()))]).boxed();
            }

            // Success: the label delta, followed by either the final chunk or a
            // mid-stream error.
            let opening = Ok(StreamChunk::delta(self.label.clone()));
            let closing = if self.mid_stream_error {
                Err(StreamError::Provider("mid-stream boom".to_string()))
            } else {
                Ok(StreamChunk::final_chunk())
            };
            stream::iter(vec![opening, closing]).boxed()
        }
    }

    /// Moves a `FailoverMock` onto the heap and erases it to `Box<dyn Provider>`,
    /// the form the tests place in their provider lists.
    fn boxed_failover(mock: FailoverMock) -> Box<dyn Provider> {
        // The return type drives the unsizing coercion; no explicit cast needed.
        Box::new(mock)
    }

    /// A provider whose first chunk always errors with a 503 is tried once, then
    /// the stream is served in full by the next provider in the list.
    #[tokio::test]
    async fn stream_failover_advances_to_next_provider_on_connect_error() {
        let broken_calls = Arc::new(AtomicUsize::new(0));
        let healthy_calls = Arc::new(AtomicUsize::new(0));

        // Never gets past its first chunk.
        let broken = FailoverMock {
            calls: Arc::clone(&broken_calls),
            fail_until: usize::MAX,
            error: String::from("503 Service Unavailable"),
            label: String::from("broken"),
            ..Default::default()
        };
        // Succeeds immediately.
        let healthy = FailoverMock {
            calls: Arc::clone(&healthy_calls),
            label: String::from("healthy"),
            ..Default::default()
        };
        let provider = ReliableProvider::new(
            vec![
                ("broken".into(), boxed_failover(broken)),
                ("healthy".into(), boxed_failover(healthy)),
            ],
            0,
            1,
        );

        let history = [ChatMessage::user("hi")];
        let mut chunks =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));

        // Expect exactly: delta from the healthy provider, final chunk, end.
        let opening = chunks.next().await.unwrap().unwrap();
        assert_eq!(
            opening.delta, "healthy",
            "should fail over to the healthy provider"
        );
        let closing = chunks.next().await.unwrap().unwrap();
        assert!(closing.is_final);
        assert!(chunks.next().await.is_none());

        assert_eq!(
            broken_calls.load(Ordering::SeqCst),
            1,
            "broken provider tried exactly once"
        );
        assert_eq!(
            healthy_calls.load(Ordering::SeqCst),
            1,
            "healthy provider served the request"
        );
    }

    /// With a single provider that fails only its first call, the stream still
    /// delivers the provider's delta and the provider is called twice in total.
    #[tokio::test]
    async fn stream_failover_retries_same_provider_then_succeeds() {
        let attempts = Arc::new(AtomicUsize::new(0));

        // First call errors with a 503; the second one streams "recovered".
        let flaky = FailoverMock {
            calls: Arc::clone(&attempts),
            fail_until: 1,
            error: String::from("503 Service Unavailable"),
            label: String::from("recovered"),
            ..Default::default()
        };
        let provider = ReliableProvider::new(vec![("p".into(), boxed_failover(flaky))], 1, 1);

        let history = [ChatMessage::user("hi")];
        let mut chunks =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));

        let opening = chunks.next().await.unwrap().unwrap();
        assert_eq!(opening.delta, "recovered");
        assert_eq!(
            attempts.load(Ordering::SeqCst),
            2,
            "retryable connect error retries the same provider then succeeds"
        );
    }

    /// A provider failing with "401 Unauthorized" is called only once before the
    /// next provider serves the stream, even though the `ReliableProvider` is
    /// built with `3` where the retry test passes `1`.
    #[tokio::test]
    async fn stream_failover_skips_retries_on_non_retryable_error() {
        let broken_calls = Arc::new(AtomicUsize::new(0));
        let healthy_calls = Arc::new(AtomicUsize::new(0));

        // Always rejects with an auth error.
        let broken = FailoverMock {
            calls: Arc::clone(&broken_calls),
            fail_until: usize::MAX,
            error: String::from("401 Unauthorized"),
            label: String::from("broken"),
            ..Default::default()
        };
        let healthy = FailoverMock {
            calls: Arc::clone(&healthy_calls),
            label: String::from("healthy"),
            ..Default::default()
        };
        let provider = ReliableProvider::new(
            vec![
                ("broken".into(), boxed_failover(broken)),
                ("healthy".into(), boxed_failover(healthy)),
            ],
            3,
            1,
        );

        let history = [ChatMessage::user("hi")];
        let mut chunks =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));

        let opening = chunks.next().await.unwrap().unwrap();
        assert_eq!(opening.delta, "healthy");

        assert_eq!(
            broken_calls.load(Ordering::SeqCst),
            1,
            "non-retryable error must not consume retries before failing over"
        );
        assert_eq!(healthy_calls.load(Ordering::SeqCst), 1);
    }

    /// With "primary-model" mapped to "fallback-model", a provider that only
    /// accepts "fallback-model" ends up serving the stream on its second call.
    #[tokio::test]
    async fn stream_failover_walks_model_fallback_chain() {
        let attempts = Arc::new(AtomicUsize::new(0));

        // Errors for every model except "fallback-model".
        let picky = FailoverMock {
            calls: Arc::clone(&attempts),
            succeed_model: Some(String::from("fallback-model")),
            error: String::from("503 Service Unavailable"),
            label: String::from("fallback-served"),
            ..Default::default()
        };
        let model_chain = HashMap::from([(
            "primary-model".to_string(),
            vec!["fallback-model".to_string()],
        )]);
        let provider = ReliableProvider::new(vec![("p".into(), boxed_failover(picky))], 0, 1)
            .with_model_fallbacks(model_chain);

        let history = [ChatMessage::user("hi")];
        let mut chunks = provider.stream_chat_with_history(
            &history,
            "primary-model",
            0.0,
            StreamOptions::new(true),
        );

        let opening = chunks.next().await.unwrap().unwrap();
        assert_eq!(
            opening.delta, "fallback-served",
            "streaming must walk the model fallback chain, not just the primary model"
        );
        assert_eq!(
            attempts.load(Ordering::SeqCst),
            2,
            "tried primary-model then the fallback-model"
        );
    }

    /// Once the primary provider has delivered its first delta, a later error on
    /// that stream is passed through to the caller and the secondary provider is
    /// never called.
    #[tokio::test]
    async fn stream_failover_forwards_midstream_error_without_failover() {
        let primary_calls = Arc::new(AtomicUsize::new(0));
        let secondary_calls = Arc::new(AtomicUsize::new(0));

        // Streams one delta and then an error.
        let primary = FailoverMock {
            calls: Arc::clone(&primary_calls),
            mid_stream_error: true,
            label: String::from("primary"),
            ..Default::default()
        };
        let secondary = FailoverMock {
            calls: Arc::clone(&secondary_calls),
            label: String::from("secondary"),
            ..Default::default()
        };
        let provider = ReliableProvider::new(
            vec![
                ("primary".into(), boxed_failover(primary)),
                ("secondary".into(), boxed_failover(secondary)),
            ],
            2,
            1,
        );

        let history = [ChatMessage::user("hi")];
        let mut chunks =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));

        // Expect exactly: primary delta, forwarded error, end of stream.
        let opening = chunks.next().await.unwrap().unwrap();
        assert_eq!(opening.delta, "primary", "committed to the primary stream");
        let second = chunks.next().await.unwrap();
        assert!(second.is_err(), "mid-stream error is forwarded, not recovered");
        assert!(chunks.next().await.is_none());

        assert_eq!(primary_calls.load(Ordering::SeqCst), 1);
        assert_eq!(
            secondary_calls.load(Ordering::SeqCst),
            0,
            "must NOT fail over after committing to a stream"
        );
    }

    /// A lone provider that returns an item-less stream produces an item-less
    /// result stream, and the provider is called exactly once.
    #[tokio::test]
    async fn stream_failover_empty_stream_closes_cleanly_without_error() {
        // When every candidate comes back empty, the bridge simply closes; it
        // does not invent an error. This keeps the pre-failover meaning of an
        // empty stream as a clean completion.
        let attempts = Arc::new(AtomicUsize::new(0));
        let silent = FailoverMock {
            calls: Arc::clone(&attempts),
            empty: true,
            ..Default::default()
        };
        let provider = ReliableProvider::new(vec![("p".into(), boxed_failover(silent))], 0, 1);

        let history = [ChatMessage::user("hi")];
        let mut chunks =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));

        assert!(
            chunks.next().await.is_none(),
            "empty upstream yields a clean empty stream, not an error"
        );
        assert_eq!(attempts.load(Ordering::SeqCst), 1);
    }

    /// Dropping the returned stream while the first provider is still in its
    /// delayed connect must leave the fallback provider uncalled.
    #[tokio::test]
    async fn stream_failover_stops_after_receiver_dropped() {
        // Cancellation (dropping the stream) has to stop the failover task from
        // opening any further upstream connection, so nothing may reach the
        // fallback provider.
        let slow_calls = Arc::new(AtomicUsize::new(0));
        let fallback_calls = Arc::new(AtomicUsize::new(0));

        // Sleeps 30ms and then reports a 503.
        let slow = FailoverMock {
            calls: Arc::clone(&slow_calls),
            connect_delay_ms: 30,
            error: String::from("503 Service Unavailable"),
            ..Default::default()
        };
        let fallback = FailoverMock {
            calls: Arc::clone(&fallback_calls),
            label: String::from("fallback"),
            ..Default::default()
        };
        let provider = ReliableProvider::new(
            vec![
                ("slow".into(), boxed_failover(slow)),
                ("fallback".into(), boxed_failover(fallback)),
            ],
            0,
            1,
        );

        let history = [ChatMessage::user("hi")];
        let in_flight =
            provider.stream_chat_with_history(&history, "m", 0.0, StreamOptions::new(true));
        // Yield once so the bridge task gets a chance to begin the slow connect.
        tokio::task::yield_now().await;
        // Dropping the stream here is the cancellation, with the connect pending.
        drop(in_flight);
        // 200ms is far beyond the 30ms connect delay: a task that ignored the
        // cancellation would have hit the error and moved on to the fallback.
        tokio::time::sleep(Duration::from_millis(200)).await;

        assert_eq!(
            fallback_calls.load(Ordering::SeqCst),
            0,
            "cancelled request must not cascade to the fallback provider"
        );
    }
}