dirge-agent 0.7.3

use std::sync::LazyLock;
use std::time::Duration;

use regex::Regex;

/// B3-2: matches an HTTP 5xx status code that sits next to a structural
/// HTTP-context marker, so that bare 5xx-shaped numbers in non-HTTP
/// text (e.g. "processed 500 items") are not mistaken for server errors.
///
/// The pattern is written in verbose mode (`(?x)`), contains only
/// lowercase literals and sets no case-insensitive flag;
/// `classify_error` lowercases the message before matching against it.
///
/// Two alternatives are accepted:
///
/// 1. Marker-prefixed: one of `status`, `status_code`, `code`, `error`,
///    `http`, `response`, `request`, `returned` or `returns`, then
///    optional whitespace, an optional `:` / `=` / `-`, optional
///    whitespace, and three digits starting with `5` that are followed
///    by a non-digit or the end of input.
/// 2. Status-then-reason: three digits starting with `5` (at the start
///    of input or after a non-digit), whitespace, and one of `service`,
///    `gateway`, `internal`, `bad` or `server`.
///
/// Shapes observed from real rig/reqwest errors (shown before
/// lowercasing):
///   "503 Service Unavailable"        — leading status + reason
///   "Http status: 500"               — status: prefix
///   "status=502"                     — status= prefix
///   "error 504: ..."                 — error prefix
///   "(status_code=500)"              — status_code= prefix
///   "code: 500"                      — bare code: prefix
///   "received http 500"              — http prefix
///
/// The literal text "5xx server error" is NOT matched here (the pattern
/// requires digits); `classify_error` catches that wording through its
/// separate "server error" substring check.
///
/// NOTE(review): the markers are not word-anchored, so text such as
/// "response 512 bytes" also matches alternative 1 — confirm this is
/// acceptable for the messages callers feed in.
static STATUS_5XX_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?x)
        (?:
            # prefix-anchored: status / code / error / http /
            # response / request / returned, with optional
            # `:`/`=`/`-`/whitespace between marker and number.
            (?:status(?:_code)?|code|error|http|response|request|returned|returns)
            \s*[:=\-]?\s*
            5\d{2}
            (?:\D|$)
        )
        |
        (?:
            # leading status + HTTP reason phrase (5xx Service / 5xx
            # Gateway / 5xx Internal / 5xx Bad / 5xx Server).
            (?:^|\D)
            5\d{2}
            \s+
            (?:service|gateway|internal|bad|server)
        )
        ",
    )
    // The pattern is a fixed literal, so a compile failure is a
    // programming error rather than a runtime condition.
    .expect("static regex compiles")
});

/// Coarse classification of a provider/transport failure, as produced by
/// [`classify_error`] from the text of the error message.
///
/// [`RecoveryPolicy::should_retry`] treats only [`ErrorKind::Network`]
/// and [`ErrorKind::RateLimit`] as retryable; every other kind fails
/// immediately.
///
/// The enum is fieldless, so it derives `Eq` and `Hash` in addition to
/// `PartialEq`; this lets it be used as a map/set key and satisfy
/// `T: Eq` bounds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorKind {
    /// The request did not fit in the model's context window
    /// (e.g. "context_length_exceeded", "prompt is too long").
    ContextLength,
    /// The provider throttled or shed the request (429, "rate limit",
    /// "resource_exhausted", "overloaded"). Retryable.
    RateLimit,
    /// Transport-level failure, HTTP 5xx, or an HTML/proxy error page
    /// in place of the expected response. Retryable.
    Network,
    /// Credentials were rejected (401/403, invalid API key) or the
    /// account's billing/quota is exhausted. Not retryable.
    Auth,
    /// Anything that matched none of the recognised patterns.
    /// Not retryable.
    Other,
}

/// Retry budget and backoff schedule applied to transient provider
/// failures.
#[derive(Debug, Clone)]
pub struct RecoveryPolicy {
    /// How many retries are allowed after the initial attempt.
    max_retries: usize,
    /// Delay unit for the exponential schedule; `backoff_duration`
    /// multiplies it by a power of two derived from the attempt count.
    backoff_base: Duration,
}

impl Default for RecoveryPolicy {
    /// Build the production policy: five retries on a one-second base.
    fn default() -> Self {
        // Three retries (~7s of backoff) proved too few: transient
        // provider blips ("error sending request", 5xx, rate limits)
        // still reached the user as hard failures. Five retries on the
        // exponential schedule wait roughly 1+2+4+8+16 ≈ 31s in total,
        // enough to ride out a typical short outage without stalling
        // the agent indefinitely.
        const DEFAULT_MAX_RETRIES: usize = 5;
        const DEFAULT_BACKOFF_BASE: Duration = Duration::from_secs(1);

        Self {
            max_retries: DEFAULT_MAX_RETRIES,
            backoff_base: DEFAULT_BACKOFF_BASE,
        }
    }
}

impl RecoveryPolicy {
    /// Number of retries permitted after the initial attempt.
    pub fn max_retries(&self) -> usize {
        self.max_retries
    }

    /// Decide whether a failure of `kind` should be retried when
    /// `attempts` retries have already been spent.
    ///
    /// Returns `true` only while budget remains AND the failure is
    /// transient ([`ErrorKind::Network`] or [`ErrorKind::RateLimit`]).
    pub fn should_retry(&self, attempts: usize, kind: ErrorKind) -> bool {
        let budget_left = attempts < self.max_retries;
        let transient = matches!(kind, ErrorKind::Network | ErrorKind::RateLimit);
        budget_left && transient
    }

    /// Exponential backoff for the retry numbered `attempts` (0-based):
    /// `backoff_base * 2^min(attempts, 6)` plus jitter.
    pub fn backoff_duration(&self, attempts: usize) -> Duration {
        // The exponent is clamped to 6, so the multiplier never exceeds
        // 64 (64s with the default one-second base).
        let multiplier = 1u64 << attempts.min(6);
        let base_ms = self.backoff_base.as_millis() as u64;
        let delay_ms = base_ms.saturating_mul(multiplier);
        // Jitter is purely additive and stays below 25% of the delay, so
        // the result is never shorter than the un-jittered schedule while
        // concurrent agents still avoid retrying in lockstep against a
        // rate-limited endpoint. A clock-seeded pseudo-random value is
        // good enough for that purpose. The `.max(1)` keeps the modulus
        // non-zero for delays under 4ms.
        let jitter_span = (delay_ms / 4).max(1);
        let jitter_ms = pseudo_random(attempts as u64) % jitter_span;
        Duration::from_millis(delay_ms.saturating_add(jitter_ms))
    }

    /// F14: backoff that also honours a server-supplied `Retry-After`
    /// found in `error_msg`.
    ///
    /// When the message carries a retry hint, the longer of the hint and
    /// the computed backoff wins (retrying sooner than the server asked
    /// just earns another 429), clamped to five minutes so a malformed
    /// header cannot stall the agent. Without a hint the computed
    /// backoff is returned unchanged (and unclamped).
    pub fn backoff_duration_for_msg(&self, attempts: usize, error_msg: &str) -> Duration {
        const CAP: Duration = Duration::from_secs(300);

        let computed = self.backoff_duration(attempts);
        retry_after_from_error_msg(error_msg)
            .map_or(computed, |server_wants| server_wants.max(computed).min(CAP))
    }

    /// Test-only constructor for a policy with custom limits.
    #[cfg(test)]
    pub(crate) fn with_backoff(max_retries: usize, backoff_base: Duration) -> Self {
        Self { max_retries, backoff_base }
    }
}

/// Drive an async operation under a [`RecoveryPolicy`]: transient
/// (network / rate-limit) failures are retried with the policy's
/// backoff, while auth / context-length / other failures are returned
/// to the caller straight away.
///
/// This is the shared attempt → classify → backoff → sleep loop that
/// `AnyModel::btw_query` and the summarizer used to hand-roll
/// (dirge-6cvc).
///
/// * `attempt` is called afresh for every try.
/// * `label` identifies the operation in the retry log line.
/// * `E` only has to implement `Display`; its rendered message is what
///   `classify_error` and the `Retry-After` parser inspect.
///
/// When the budget is exhausted or the error is not retryable, the
/// error from the most recent attempt is returned.
///
/// NOTE: the backoff sleep is not yet cancellation-aware; threading an
/// abort signal through the (signal-less) call sites is tracked
/// separately. The streaming retry wrapper in `agent_loop::retry` has a
/// different shape (per-event commit tracking) and keeps its own loop.
pub async fn run_with_retry<T, E, F, Fut>(
    policy: &RecoveryPolicy,
    label: &str,
    mut attempt: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Display,
{
    // Counts retries already spent, not total calls.
    let mut retries_spent = 0;
    loop {
        let err = match attempt().await {
            Ok(value) => return Ok(value),
            Err(err) => err,
        };

        let msg = err.to_string();
        let kind = classify_error(&msg);
        if !policy.should_retry(retries_spent, kind) {
            return Err(err);
        }

        let delay = policy.backoff_duration_for_msg(retries_spent, &msg);
        tracing::warn!(
            op = label,
            attempt = retries_spent + 1,
            max = policy.max_retries(),
            delay_ms = delay.as_millis() as u64,
            kind = ?kind,
            error = %msg,
            "retrying after transient failure",
        );
        tokio::time::sleep(delay).await;
        retries_spent += 1;
    }
}

/// Extract a server-requested retry delay from an error message.
///
/// The following forms are tried in order; the first that yields a
/// value wins:
/// 1. Anthropic-style `retry-after-ms: <N>` — milliseconds.
/// 2. Standard `Retry-After: <N>` — seconds.
/// 3. JSON body `"retry_after": <N>` — seconds.
/// 4. `Retry-After: <HTTP-date>` — delegated to
///    `parse_http_date_retry_after`.
///
/// Label matching ignores ASCII case and tolerates a missing `:`
/// (some providers emit `retry-after 30`). Returns `None` when no
/// recognised form is present.
pub(crate) fn retry_after_from_error_msg(msg: &str) -> Option<Duration> {
    /// Locate the first ASCII-case-insensitive occurrence of `label` in
    /// `msg` and parse the unsigned integer that follows it.
    ///
    /// `label` must be non-empty ASCII (all call sites pass fixed
    /// literals). Only the first occurrence is considered.
    fn parse_after_label(msg: &str, label: &str) -> Option<u64> {
        // Search the ORIGINAL bytes rather than a lowercased copy:
        // `to_lowercase` can change byte lengths for some Unicode input
        // (e.g. Turkish `İ`), so an offset found in a lowered copy could
        // land mid-character in `msg` and panic on slicing. Because the
        // label is plain ASCII, a byte-wise case-insensitive comparison
        // against the original is sound.
        let needle = label.as_bytes();
        let start = msg
            .as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))?;

        // `get` yields `None` instead of panicking should the end of the
        // match not be a char boundary (it always is for ASCII labels;
        // this is purely defensive).
        let rest = msg.get(start + needle.len()..)?;
        let rest = rest.trim_start_matches([':', ' ', '\t', '"']).trim_start();

        // Read at most 11 leading digits. That bounds the parsed value
        // below 10^11, so a bogus header such as
        // `Retry-After: 999999999999999999999` can never overflow `u64`;
        // the caller's five-minute cap then takes care of the rest.
        let digit_count = rest
            .bytes()
            .take_while(|byte| byte.is_ascii_digit())
            .take(11)
            .count();
        if digit_count == 0 {
            return None;
        }
        rest[..digit_count].parse().ok()
    }

    parse_after_label(msg, "retry-after-ms")
        .map(Duration::from_millis)
        .or_else(|| parse_after_label(msg, "retry-after").map(Duration::from_secs))
        .or_else(|| parse_after_label(msg, "retry_after").map(Duration::from_secs))
        // RFC 7231 HTTP-date form, e.g.
        // `Retry-After: Wed, 21 Oct 2015 07:28:00 GMT`. Tried last so the
        // far more common numeric forms never pay for a chrono parse.
        // Dates in the past come back as zero, so a stale or epoch-zero
        // header does not suppress retries.
        .or_else(|| parse_http_date_retry_after(msg))
}

/// Scan `msg` for a `Retry-After` header whose value is an RFC 7231
/// HTTP-date and return the time remaining until that date.
///
/// All three date forms from RFC 7231 §7.1.1.1 are attempted:
/// * IMF-fixdate — `Sun, 06 Nov 1994 08:49:37 GMT` (parsed as RFC 2822);
/// * RFC 850     — `Sunday, 06-Nov-94 08:49:37 GMT`;
/// * asctime     — `Sun Nov  6 08:49:37 1994`.
///
/// The RFC 850 and asctime forms carry no usable offset and are read as
/// UTC. NOTE(review): two-digit RFC 850 years are expanded by chrono's
/// `%y` rule, which may differ from RFC 7231's "more than 50 years in
/// the future" rule for far-off dates — confirm if that matters.
///
/// Returns `None` when no `Retry-After` label is present (matched
/// ASCII-case-insensitively, first occurrence only) or when its value is
/// not one of the date forms above; the numeric forms are handled by
/// `retry_after_from_error_msg` before this function is consulted.
/// Dates in the past yield `Duration::ZERO`; the result has whole-second
/// granularity.
fn parse_http_date_retry_after(msg: &str) -> Option<Duration> {
    const LABEL: &[u8] = b"retry-after";
    const RFC_850_FORMAT: &str = "%A, %d-%b-%y %H:%M:%S GMT";
    const ASCTIME_FORMAT: &str = "%a %b %e %H:%M:%S %Y";

    // PROV-10: search the original bytes case-insensitively instead of
    // lowercasing the message and indexing back into the original; a
    // lowered copy of a Unicode-bearing message can have shifted offsets
    // and make the slice below panic.
    let start = msg
        .as_bytes()
        .windows(LABEL.len())
        .position(|window| window.eq_ignore_ascii_case(LABEL))?;
    // `get` returns `None` rather than panicking on a non-boundary index.
    let tail = msg.get(start + LABEL.len()..)?;
    let tail = tail.trim_start_matches([':', ' ', '\t', '"']);

    // The header value runs to the end of the line or to a closing quote
    // (for values embedded in a JSON-ish dump).
    let value_end = tail.find(['\n', '\r', '"']).unwrap_or(tail.len());
    let value = tail[..value_end].trim();
    if value.is_empty() {
        return None;
    }

    // Offset-less forms are interpreted as UTC.
    let parse_naive_utc = |format: &str| {
        chrono::NaiveDateTime::parse_from_str(value, format)
            .ok()
            .map(|naive| naive.and_utc().fixed_offset())
    };

    // IMF-fixdate is RFC 2822-compatible; the two obsolete forms need
    // explicit format strings. Parse errors simply fall through to the
    // next candidate.
    let parsed = chrono::DateTime::parse_from_rfc2822(value)
        .ok()
        .or_else(|| parse_naive_utc(RFC_850_FORMAT))
        .or_else(|| parse_naive_utc(ASCTIME_FORMAT))?;

    let now = chrono::Utc::now().fixed_offset();
    let seconds_until = (parsed - now).num_seconds();
    // Clamp past dates to zero before the (then lossless) widening cast.
    Some(Duration::from_secs(seconds_until.max(0) as u64))
}

/// Cheap, non-cryptographic pseudo-random `u64` used for retry jitter.
///
/// The value is derived from the sub-second part of the wall clock, the
/// caller-supplied `salt`, and a process-wide call counter, then passed
/// through the splitmix64 finalizer for dispersion.
fn pseudo_random(salt: u64) -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Multiplier applied to the call counter before it is mixed in.
    const SEQ_STRIDE: u64 = 0xA240_2A1F_1CE4_E5B9;
    /// Additive constant applied ahead of the splitmix64 finalizer.
    const SPLITMIX_INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;

    // Audit L16: without this counter, two calls landing in the same
    // `subsec_nanos()` slot with the same `salt` produced identical
    // jitter, which defeats the anti-thundering-herd purpose. The
    // counter makes every call within a process distinct even when
    // clock and salt collide.
    static SEQ: AtomicU64 = AtomicU64::new(0);
    let seq = SEQ.fetch_add(1, Ordering::Relaxed);

    // A clock set before the Unix epoch contributes zero instead of
    // failing.
    let nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.subsec_nanos() as u64,
        Err(_) => 0,
    };

    let seed = nanos
        .wrapping_add(salt)
        .wrapping_add(seq.wrapping_mul(SEQ_STRIDE))
        .wrapping_add(SPLITMIX_INCREMENT);

    // splitmix64 finalizer.
    let mixed = (seed ^ (seed >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    let mixed = (mixed ^ (mixed >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    mixed ^ (mixed >> 31)
}

/// Classify a provider error message into an [`ErrorKind`] by matching
/// substrings (and one regex) against its lowercased text.
///
/// Checks run in a fixed order and the first hit wins:
/// 1. `Auth` — 401/403 markers, credential phrases, exhausted billing.
/// 2. `RateLimit` — rate-limit wording, 429 markers, provider overload.
/// 3. `Network` — an HTTP 5xx status recognised by `STATUS_5XX_RE`.
/// 4. `ContextLength` — context-window overflow phrases.
/// 5. `Network` — HTML/proxy error pages and transport failures.
/// 6. `Other` — nothing matched.
pub fn classify_error(msg: &str) -> ErrorKind {
    /// True when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    /// True when `haystack` begins with at least one of `prefixes`.
    fn starts_with_any(haystack: &str, prefixes: &[&str]) -> bool {
        prefixes.iter().any(|&prefix| haystack.starts_with(prefix))
    }

    // --- Auth ---------------------------------------------------------
    // HTTP 401/403 in an error context, plus explicit credential wording.
    //
    // PROV-8: OpenAI's `insufficient_quota` and the related billing
    // signals arrive wrapped in a 429 but are permanent (the billing
    // account is empty or suspended). They are listed here, ahead of the
    // rate-limit checks, so the retry budget is not burned on a request
    // that can never succeed.
    const AUTH_SUBSTRINGS: &[&str] = &[
        " 401 ",
        " 403 ",
        "error 401",
        "error 403",
        "unauthorized",
        "invalid api key",
        "authentication failed",
        "insufficient_quota",
        "billing_not_active",
        "billing_hard_limit_reached",
    ];
    const AUTH_PREFIXES: &[&str] = &["401 ", "403 "];

    // --- RateLimit ----------------------------------------------------
    // PROV-7: Gemini 429 bodies (`{"error":{"status":"RESOURCE_EXHAUSTED",…}}`)
    // often arrive stringified without a literal " 429 " or "rate limit".
    //
    // Anthropic's `overloaded_error` is a transient capacity signal —
    // structurally a rate-limit response without the usual wording — so
    // it is retried with backoff instead of falling through to `Other`.
    const RATE_LIMIT_SUBSTRINGS: &[&str] = &[
        "rate limit",
        "too many requests",
        " 429 ",
        "error 429",
        "resource_exhausted",
        "resource has been exhausted",
        "overloaded",
    ];
    const RATE_LIMIT_PREFIXES: &[&str] = &["429 "];

    // --- ContextLength ------------------------------------------------
    // Substrings collected from real provider responses (Anthropic,
    // OpenAI, Google, GLM, DeepSeek, Mistral, OpenRouter passthroughs).
    // They are kept narrow so unrelated errors that merely mention
    // "tokens" or "long" are not misclassified.
    //
    // PROV-6: Anthropic `max_tokens is too large` (input + max_tokens >
    // window); Cohere/Mistral-via-OpenRouter `too many tokens`; DeepSeek
    // `Range of input length`; OpenRouter `messages.length too large`.
    const CONTEXT_LENGTH_SUBSTRINGS: &[&str] = &[
        "context_length_exceeded",
        "maximum context length",
        "reduce the length of the messages",
        "request too large",
        "prompt is too long",
        "input is too long",
        "input token count exceeds",
        "tokens exceed",
        "exceeds the model's context",
        "max_tokens is too large",
        "too many tokens",
        "range of input length",
        "messages.length too large",
    ];

    // --- Network ------------------------------------------------------
    // First group: responses from intermediaries (Cloudflare 502/503,
    // nginx error pages, captive-portal interception) that never parse
    // as the JSON envelope rig/reqwest expect. Any single entry is
    // sufficient: an HTML marker, a gateway status phrase, or a
    // Cloudflare mention.
    //
    // Second group: specific transport phrases (deliberately not a bare
    // "connection", which false-positives). Includes reqwest
    // connect/send failures — rig wraps them as "Http client error:
    // error sending request for url (…)" — and mid-stream decode
    // failures, where bytes arrived but did not deserialize; both are
    // almost always transient and worth retrying.
    const NETWORK_SUBSTRINGS: &[&str] = &[
        "<!doctype html",
        "<html",
        "bad gateway",
        "service unavailable",
        "gateway timeout",
        "cloudflare",
        "connection refused",
        "connection reset",
        "broken pipe",
        "dns error",
        "tls",
        "ssl",
        "timed out",
        "request timeout",
        "server error",
        "error sending request",
        "connect error",
        "tcp connect",
        "error decoding response body",
        "invalid response body",
        "decode error",
    ];

    let lower = msg.to_lowercase();

    if contains_any(&lower, AUTH_SUBSTRINGS) || starts_with_any(&lower, AUTH_PREFIXES) {
        return ErrorKind::Auth;
    }

    if contains_any(&lower, RATE_LIMIT_SUBSTRINGS) || starts_with_any(&lower, RATE_LIMIT_PREFIXES) {
        return ErrorKind::RateLimit;
    }

    // B3-2 (audit fix): HTTP 5xx server errors. Earlier code caught only
    // space-delimited 502/503/504, so a bare 500 surfaced as a one-shot
    // `Other` failure. The regex accepts a 5xx code only next to an
    // HTTP-context marker ("status: 500", "error 504:", "503 Service …"),
    // which keeps unrelated numbers from matching.
    if STATUS_5XX_RE.is_match(&lower) {
        return ErrorKind::Network;
    }

    if contains_any(&lower, CONTEXT_LENGTH_SUBSTRINGS) {
        return ErrorKind::ContextLength;
    }

    if contains_any(&lower, NETWORK_SUBSTRINGS) {
        return ErrorKind::Network;
    }

    ErrorKind::Other
}

/// Turn a raw error message into a short user-facing explanation that
/// says *what* failed and *what to try next*, instead of dumping a
/// chain like `CompletionError: ProviderError: Http client error: …`
/// on the user.
///
/// The result has three lines: a headline (with an "after N
/// attempt(s)" note when `attempts > 1`), a hint, and the original
/// `msg` as the cause so bug reports keep the underlying details.
///
/// Transitional after the phase 4.5h-6 cutover: there is currently no
/// production caller. The bridge could use this to pretty-format Error
/// events once h-7 testing surfaces real provider error shapes; until
/// then the helper and its tests are kept alive, hence the
/// `dead_code` allowance.
#[allow(dead_code)]
pub fn user_facing_error(msg: &str, attempts: usize) -> String {
    let (headline, hint) = match classify_error(msg) {
        ErrorKind::Auth => (
            "authentication failed talking to the LLM provider",
            "check your API key env var (e.g. OPENROUTER_API_KEY) and provider config",
        ),
        ErrorKind::RateLimit => (
            "provider rate-limited the request",
            "wait a moment and retry, or switch to a different model via /model",
        ),
        ErrorKind::ContextLength => (
            "conversation exceeds the model's context window",
            "run /compress to summarize older turns and try again",
        ),
        ErrorKind::Network => {
            // A truncated/garbled body gets its own wording; every other
            // network failure shares the generic connectivity message.
            if msg.to_lowercase().contains("error decoding response body") {
                (
                    "lost the response stream from the provider (truncated or malformed body)",
                    "usually transient — retry. If it persists the provider may be having issues or returning non-JSON (HTML error pages, plaintext)",
                )
            } else {
                (
                    "network error reaching the LLM provider",
                    "check connectivity / firewall / proxy; the request will retry automatically",
                )
            }
        }
        ErrorKind::Other => (
            "the LLM provider returned an error we didn't recognize",
            "see the cause below; consider /model to try a different provider",
        ),
    };

    // Mention the attempt count only when at least one retry happened.
    let attempts_note = match attempts {
        0 | 1 => String::new(),
        n => format!(" (after {} attempt(s))", n),
    };

    format!(
        "{}{}\n  ↳ hint: {}\n  ↳ cause: {}",
        headline, attempts_note, hint, msg
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[test]
    fn default_budget_retries_transient_failures_up_to_five_times() {
        let p = RecoveryPolicy::default();
        assert_eq!(p.max_retries(), 5);
        // A transient (network) error is retryable up to, but not past,
        // the budget.
        assert!(p.should_retry(0, ErrorKind::Network));
        assert!(p.should_retry(4, ErrorKind::Network));
        assert!(!p.should_retry(5, ErrorKind::Network));
        // Non-retryable kinds never retry, regardless of budget.
        assert!(!p.should_retry(0, ErrorKind::Auth));
    }

    // dirge-6cvc: the shared retry helper — success, immediate bail on a
    // non-retryable error, and retry-then-succeed on a transient one.
    #[tokio::test]
    async fn run_with_retry_returns_first_success() {
        let policy = RecoveryPolicy::default();
        let calls = AtomicUsize::new(0);
        let r: Result<u32, String> = run_with_retry(&policy, "t", || {
            calls.fetch_add(1, Ordering::SeqCst);
            async { Ok(7) }
        })
        .await;
        assert_eq!(r.unwrap(), 7);
        assert_eq!(calls.load(Ordering::SeqCst), 1, "no retry on success");
    }

    #[tokio::test]
    async fn run_with_retry_bails_immediately_on_non_retryable() {
        let policy = RecoveryPolicy::default();
        let calls = AtomicUsize::new(0);
        let r: Result<u32, String> = run_with_retry(&policy, "t", || {
            calls.fetch_add(1, Ordering::SeqCst);
            async { Err("invalid api key".to_string()) }
        })
        .await;
        assert!(r.is_err());
        assert_eq!(
            calls.load(Ordering::SeqCst),
            1,
            "auth error must not be retried"
        );
    }

    #[tokio::test]
    async fn run_with_retry_retries_transient_then_succeeds() {
        // Tiny backoff so the test doesn't actually wait seconds.
        let policy = RecoveryPolicy::with_backoff(3, Duration::from_millis(1));
        let calls = AtomicUsize::new(0);
        let r: Result<u32, String> = run_with_retry(&policy, "t", || {
            let n = calls.fetch_add(1, Ordering::SeqCst);
            async move {
                if n < 2 {
                    Err("rate limit exceeded".to_string())
                } else {
                    Ok(42)
                }
            }
        })
        .await;
        assert_eq!(r.unwrap(), 42);
        assert_eq!(calls.load(Ordering::SeqCst), 3, "two retries then success");
    }

    #[tokio::test]
    async fn run_with_retry_exhausts_then_returns_last_error() {
        let policy = RecoveryPolicy::with_backoff(2, Duration::from_millis(1));
        let calls = AtomicUsize::new(0);
        let r: Result<u32, String> = run_with_retry(&policy, "t", || {
            calls.fetch_add(1, Ordering::SeqCst);
            async { Err("rate limit exceeded".to_string()) }
        })
        .await;
        assert!(r.is_err());
        // initial attempt + 2 retries = 3 calls.
        assert_eq!(calls.load(Ordering::SeqCst), 3);
    }

    // dirge-5ul5: reqwest connect/send failures (the connection couldn't
    // be established or dropped before a response) surface as "error
    // sending request for url …" wrapped in rig's "Http client error".
    // These are transient and MUST be retried, not classified Other.
    #[test]
    fn classify_connect_send_failures_as_network() {
        for msg in [
            "ProviderError: Http client error: error sending request for url (https://api.deepseek.com/v1/chat/completions)",
            "error sending request for url (https://api.openai.com/v1/chat/completions)",
            "reqwest::Error { kind: Connect, ... }: tcp connect error",
            "Http client error: connect error",
        ] {
            assert_eq!(
                classify_error(msg),
                ErrorKind::Network,
                "connect/send failure must be retryable: {msg}"
            );
        }
        let policy = RecoveryPolicy::default();
        assert!(
            policy.should_retry(0, classify_error("error sending request for url (x)")),
            "the DeepSeek connect failure must be retried"
        );
    }

    #[test]
    fn test_classify_context_length() {
        assert_eq!(
            classify_error("context_length_exceeded: prompt too long"),
            ErrorKind::ContextLength
        );
        assert_eq!(
            classify_error("reduce the length of the messages"),
            ErrorKind::ContextLength
        );
        assert_eq!(
            classify_error("request too large for model"),
            ErrorKind::ContextLength
        );
    }

    /// Audit H1: the original `classify_error` recognized only 4
    /// substrings and missed common provider phrasings. Each entry
    /// below corresponds to a real error string a provider can emit.
    #[test]
    fn test_classify_context_length_provider_variants() {
        // Anthropic: hits when input + max_tokens > context window.
        assert_eq!(
            classify_error("prompt is too long: 250000 tokens > 200000 maximum"),
            ErrorKind::ContextLength
        );
        // OpenAI o-series + gpt-4o family.
        assert_eq!(
            classify_error(
                "This model's maximum context length is 128000 tokens. However, your messages resulted in 130000 tokens."
            ),
            ErrorKind::ContextLength
        );
        // Generic "input too long" wording used by several providers.
        assert_eq!(
            classify_error("input is too long for the requested model"),
            ErrorKind::ContextLength
        );
        // Google Gemini 1.x token-limit message.
        assert_eq!(
            classify_error("The input token count exceeds the maximum number of tokens allowed"),
            ErrorKind::ContextLength
        );
        // GLM / DeepSeek / Mistral all surface variants of "tokens exceed".
        assert_eq!(
            classify_error("Total tokens exceed model's context window"),
            ErrorKind::ContextLength
        );
        // OpenAI returns this when chat history exceeds context.
        assert_eq!(
            classify_error("the messages array exceeds the model's context length"),
            ErrorKind::ContextLength
        );
    }

    /// Audit H5: Cloudflare / nginx 502/503 pages and captive-portal
    /// interceptions arrive as HTML, not JSON. Without HTML-aware
    /// detection these fell through to `Other` (no retry); reclassify
    /// as `Network`.
    #[test]
    fn test_classify_html_proxy_response_as_network() {
        // Cloudflare 502 page snippet.
        assert_eq!(
            classify_error("<!DOCTYPE html><html><head><title>502 Bad Gateway</title>"),
            ErrorKind::Network
        );
        // nginx error page.
        assert_eq!(
            classify_error("<html><body><h1>503 Service Unavailable</h1></body></html>"),
            ErrorKind::Network
        );
        // Captive-portal interception (login page returned for the API URL).
        assert_eq!(
            classify_error("ProviderError: <html><head><meta http-equiv=\"refresh\""),
            ErrorKind::Network
        );
    }

    /// Audit H2: `Retry-After` may arrive as an HTTP-date per RFC 7231.
    /// Parser must accept this form and return a Duration in seconds
    /// from now (clamped to 0 if the date is in the past).
    #[test]
    fn retry_after_http_date_parses() {
        // Build a date ~30s in the future, then check we recover ~30s.
        let future = chrono::Utc::now() + chrono::Duration::seconds(30);
        // RFC 7231 IMF-fixdate format.
        let header = future.format("%a, %d %b %Y %H:%M:%S GMT").to_string();
        let msg = format!("429 Too Many Requests\nRetry-After: {}", header);
        let parsed = retry_after_from_error_msg(&msg).expect("HTTP-date should parse");
        let secs = parsed.as_secs();
        assert!(
            (25..=35).contains(&secs),
            "expected ~30s, got {}s (header={})",
            secs,
            header
        );
    }

    /// Past dates must clamp to 0 rather than wrapping. A misconfigured
    /// server occasionally returns `Retry-After: Thu, 01 Jan 1970 00:00:00 GMT`
    /// — we want to retry immediately, not panic or skip retries.
    #[test]
    fn retry_after_http_date_in_past_clamps_to_zero() {
        let msg = "Retry-After: Thu, 01 Jan 1970 00:00:00 GMT";
        let parsed = retry_after_from_error_msg(msg).expect("past HTTP-date should parse");
        assert_eq!(parsed, Duration::from_secs(0));
    }

    /// Every message in the table below must classify as
    /// `ErrorKind::Network`: transport failures, mid-stream decode
    /// failures, and HTTP 5xx responses in their various spellings.
    #[test]
    fn test_classify_network() {
        let network_messages = [
            "connection refused",
            "connection reset by peer",
            "request timed out",
            "503 service unavailable",
            // Reqwest decode failure mid-stream, as rig surfaces it.
            // It should be retried like any other transient network
            // blip rather than surfacing as Other.
            "CompletionError: ProviderError: Http client error: error decoding response body",
            "decode error: EOF",
            // B3-2: 5xx variants beyond the previous strict set.
            // A plain 500 used to fall through to Other.
            "500 Internal Server Error",
            // Prefix-anchored forms.
            "Http status: 500",
            "status=502",
            "status_code=503",
            "code: 504",
            "CompletionError: error 500: backend hiccup",
            "received http 502 from upstream",
        ];

        for raw in network_messages {
            assert_eq!(
                classify_error(raw),
                ErrorKind::Network,
                "expected Network for {:?}",
                raw
            );
        }
    }

    /// The message rendered by `user_facing_error` must carry the
    /// headline, a `hint:` section and a `cause:` section, and must
    /// embed the raw error text verbatim so debug context isn't lost.
    #[test]
    fn user_facing_error_includes_cause() {
        let raw = "CompletionError: ProviderError: Http client error: error decoding response body";
        let rendered = user_facing_error(raw, 1);

        // Headline fragment, both section labels, then the untouched cause.
        for needle in ["lost the response stream", "hint:", "cause:", raw] {
            assert!(
                rendered.contains(needle),
                "rendered error is missing {:?}",
                needle
            );
        }
    }

    /// An auth failure is rendered with its own headline and steers
    /// the user towards the API key.
    #[test]
    fn user_facing_error_classifies_auth() {
        let rendered = user_facing_error("401 unauthorized", 1);
        let mentions = |needle: &str| rendered.contains(needle);

        assert!(mentions("authentication failed"));
        assert!(mentions("API key"));
    }

    /// A context-length failure must mention the `/compress` command
    /// in its rendered message.
    #[test]
    fn user_facing_error_classifies_context_length() {
        let rendered = user_facing_error("maximum context length exceeded", 1);
        let mentions_compress = rendered.contains("/compress");
        assert!(mentions_compress);
    }

    /// Both the textual and the HTTP-429 spelling of a rate limit
    /// classify as `ErrorKind::RateLimit`.
    #[test]
    fn test_classify_rate_limit() {
        for raw in ["rate limit exceeded", "429 too many requests"] {
            assert_eq!(
                classify_error(raw),
                ErrorKind::RateLimit,
                "expected RateLimit for {:?}",
                raw
            );
        }
    }

    /// When Anthropic is at capacity it answers with
    /// `{"type": "overloaded_error", ...}`. That condition is transient
    /// and retryable like a rate limit, yet the body carries none of the
    /// "rate limit" / "too many" / "429" markers. Unless it is handled
    /// explicitly it lands in `Other`, dirge does not retry, and the
    /// user sees a one-shot failure for a transient backend issue.
    #[test]
    fn classify_anthropic_overloaded_error_as_retryable() {
        let structured = classify_error("overloaded_error: Anthropic API is overloaded");
        assert_eq!(structured, ErrorKind::RateLimit);

        // The bare lowercase token has to be sufficient, because the
        // provider stringifies the structured error differently
        // across rig versions.
        let bare_token = classify_error("Provider overloaded; please retry later");
        assert_eq!(bare_token, ErrorKind::RateLimit);
    }

    /// A 401 response and an invalid-key message both classify as
    /// `ErrorKind::Auth`.
    #[test]
    fn test_classify_auth() {
        for raw in ["401 unauthorized", "invalid api key"] {
            assert_eq!(
                classify_error(raw),
                ErrorKind::Auth,
                "expected Auth for {:?}",
                raw
            );
        }
    }

    /// Messages that merely resemble a known category must stay in
    /// `ErrorKind::Other`.
    #[test]
    fn test_classify_other() {
        let unclassified = [
            "something else",
            "file not found",
            // "connection" on its own must not count as a network error.
            "database connection closed",
            // Neither must "reset" on its own.
            "form reset successful",
            // A "500" outside any HTTP context must not count either.
            "processed 500 items",
        ];

        for raw in unclassified {
            assert_eq!(
                classify_error(raw),
                ErrorKind::Other,
                "expected Other for {:?}",
                raw
            );
        }
    }

    /// Checks `should_retry` on the default policy for each error kind,
    /// as a table of `(attempts so far, kind, expected verdict)`.
    #[test]
    fn test_retry_policy() {
        let policy = RecoveryPolicy::default();

        let cases = [
            // Network errors are retryable up to the budget (5).
            (0, ErrorKind::Network, true),
            (2, ErrorKind::Network, true),
            (4, ErrorKind::Network, true),
            (5, ErrorKind::Network, false),
            // Rate limits are retryable.
            (0, ErrorKind::RateLimit, true),
            // Context length is NOT retryable (it needs compaction).
            (0, ErrorKind::ContextLength, false),
            // Auth is not retryable.
            (0, ErrorKind::Auth, false),
            // Other is not retryable.
            (0, ErrorKind::Other, false),
        ];

        for (attempts, kind, retryable) in cases {
            assert_eq!(
                policy.should_retry(attempts, kind),
                retryable,
                "should_retry({}, {:?})",
                attempts,
                kind
            );
        }
    }

    /// The default policy's backoff for attempts 0, 1 and 2 must be
    /// at least 1s, 2s and 4s respectively. Only the lower bound is
    /// checked here.
    #[test]
    fn test_backoff_duration() {
        let policy = RecoveryPolicy::default();

        for (attempt, floor_secs) in [(0, 1), (1, 2), (2, 4)] {
            let delay = policy.backoff_duration(attempt);
            assert!(delay >= Duration::from_secs(floor_secs));
        }
    }

    /// A very large attempt count must not blow up the backoff: the
    /// exponent is capped at attempts=6 via min(), so the result stays
    /// within 1s * 2^6 = 64s plus at most +25% jitter (80s ceiling).
    #[test]
    fn test_backoff_overflow_guard() {
        let capped = RecoveryPolicy::default().backoff_duration(20);

        let floor = Duration::from_secs(64);
        let ceiling_exclusive = Duration::from_secs(81);
        assert!(
            (floor..ceiling_exclusive).contains(&capped),
            "capped backoff outside [64s, 81s): {:?}",
            capped
        );
    }

    /// Proves jitter is wired in: eight samples taken at the same attempt
    /// count should differ most of the time, so at least two distinct
    /// values must show up.
    #[test]
    fn test_backoff_jitter_present() {
        let policy = RecoveryPolicy::default();

        let distinct: std::collections::HashSet<_> = (0..8)
            .map(|_| {
                let sample = policy.backoff_duration(3);
                // NOTE(review): the 1ms pause presumably lets a
                // time-derived jitter source move on between samples.
                // Confirm against `backoff_duration`.
                std::thread::sleep(Duration::from_millis(1));
                sample
            })
            .collect();

        assert!(
            distinct.len() > 1,
            "expected jittered backoff to vary across calls"
        );
    }

    /// F14: the Anthropic-style `retry-after-ms` label carries a value
    /// in milliseconds.
    #[test]
    fn retry_after_parses_anthropic_ms() {
        let parsed = retry_after_from_error_msg("rate limited: retry-after-ms: 5000");
        let expected = Duration::from_millis(5000);
        assert_eq!(parsed, Some(expected));
    }

    /// The standard HTTP header form `Retry-After: <seconds>` carries
    /// a value in whole seconds.
    #[test]
    fn retry_after_parses_standard_seconds() {
        let parsed = retry_after_from_error_msg("HTTP 429 Too Many Requests\nRetry-After: 30");
        let expected = Duration::from_secs(30);
        assert_eq!(parsed, Some(expected));
    }

    /// A JSON error body with a `"retry_after": 12` field parses as
    /// twelve seconds.
    #[test]
    fn retry_after_parses_json_body() {
        let body = r#"{"error":"rate_limit","retry_after":12}"#;
        let parsed = retry_after_from_error_msg(body);
        assert_eq!(parsed, Some(Duration::from_secs(12)));
    }

    /// Some proxies log the label without a colon (`retry-after 30`);
    /// that bare form must parse as seconds too.
    #[test]
    fn retry_after_parses_no_colon() {
        let parsed = retry_after_from_error_msg("got 429, retry-after 7 next time");
        let expected = Duration::from_secs(7);
        assert_eq!(parsed, Some(expected));
    }

    /// A message that carries no retry-after hint at all yields `None`.
    #[test]
    fn retry_after_returns_none_when_absent() {
        let parsed = retry_after_from_error_msg("generic network error: connection reset");
        assert!(parsed.is_none());
    }

    /// Regression: multi-byte UTF-8 placed BEFORE the label used to be
    /// able to panic the parser. The original implementation located
    /// the label in a lowercased copy and then sliced the original
    /// string at that byte offset. `to_lowercase` may change the byte
    /// length: Turkish `İ` is 2 bytes, but its lowercase form `i̇` is
    /// 3 bytes. The offsets therefore disagreed and
    /// `&msg[idx + label.len()..]` could land in the middle of a UTF-8
    /// sequence and panic. The search now runs over byte windows of the
    /// original string with a case-insensitive ASCII comparison.
    #[test]
    fn retry_after_handles_unicode_before_label() {
        // Provider error text with a Turkish capital I ahead of the
        // label; lowercasing it yields a different byte length.
        let parsed = retry_after_from_error_msg("İoError: Retry-After: 8");
        assert_eq!(parsed, Some(Duration::from_secs(8)));
    }

    /// The label name itself is matched without regard to case: an
    /// upper-case `RETRY-AFTER-MS` and a mixed-case `Retry-After-Ms`
    /// must both parse as milliseconds.
    #[test]
    fn retry_after_label_match_is_case_insensitive() {
        let expected = Some(Duration::from_millis(750));

        for raw in ["rate limited: RETRY-AFTER-MS: 750", "Retry-After-Ms: 750"] {
            assert_eq!(
                retry_after_from_error_msg(raw),
                expected,
                "label casing not handled for {:?}",
                raw
            );
        }
    }

    /// A pathologically long digit run is cut to 11 digits before it is
    /// parsed, so `Retry-After: 999999999999999999999...` neither
    /// overflows nor turns into a 100-year wait ahead of the upper cap.
    #[test]
    fn retry_after_caps_pathological_digit_run() {
        let huge = "Retry-After: 99999999999999999999999";

        // 11 digits is at most ~10^11 seconds. The raw parse has to
        // yield SOMETHING (not None, not a panic); its exact value is
        // deliberately left unpinned.
        assert!(
            retry_after_from_error_msg(huge).is_some(),
            "must parse, not return None"
        );

        // What is pinned is the 5-minute ceiling that
        // `backoff_duration_for_msg` applies on top of the parse.
        let five_minutes = Duration::from_secs(300);
        let capped = RecoveryPolicy::default().backoff_duration_for_msg(0, huge);
        assert!(
            capped <= five_minutes,
            "backoff must cap at 5min; got {:?}",
            capped,
        );
    }

    /// Between the computed exponential backoff and the server's
    /// retry-after hint, `backoff_duration_for_msg` takes the longer
    /// one (capped at 5 minutes).
    #[test]
    fn backoff_duration_for_msg_prefers_longer_value() {
        let policy = RecoveryPolicy::default();

        // attempts=0 computes ~1s; the server's 10s hint is longer and wins.
        let server_wins = policy.backoff_duration_for_msg(0, "Retry-After: 10");
        let ten_to_eleven = Duration::from_secs(10)..Duration::from_secs(11);
        assert!(ten_to_eleven.contains(&server_wins));

        // The server asks for 50ms, below the computed 2^3 = 8s, so
        // the computed value wins.
        let computed_wins = policy.backoff_duration_for_msg(3, "retry-after-ms: 50");
        assert!(computed_wins >= Duration::from_secs(8));
    }

    /// A bogus, oversized retry-after hint must not push the wait past
    /// the 5-minute ceiling.
    #[test]
    fn backoff_duration_for_msg_caps_at_5_minutes() {
        let five_minutes = Duration::from_secs(300);
        let capped = RecoveryPolicy::default().backoff_duration_for_msg(0, "Retry-After: 9999");
        assert!(capped <= five_minutes);
    }
}