opencrabs 0.3.36

//! Error types for LLM providers

use thiserror::Error;

/// Errors that an LLM provider call can produce.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute (generated by `thiserror`). `reqwest::Error` and
/// `serde_json::Error` convert into this type through the `#[from]` fields,
/// so `?` can be applied to both directly.
#[derive(Debug, Error)]
pub enum ProviderError {
    /// HTTP request failed (wraps the underlying `reqwest` error).
    #[error("HTTP request failed: {0}")]
    HttpError(#[from] reqwest::Error),

    /// API returned an error.
    ///
    /// Rendered as `API error (<status>) [<error_type>]: <message>`; the
    /// ` [<error_type>]` segment is left out when `error_type` is `None` or
    /// an empty string.
    #[error(
        "API error ({status}){}: {message}",
        error_type
            .as_ref()
            .filter(|t| !t.is_empty())
            .map(|t| format!(" [{}]", t))
            .unwrap_or_default()
    )]
    ApiError {
        /// HTTP status code of the response.
        status: u16,
        /// Error message reported with the response. The retry classifier
        /// also inspects it for HTML error pages and proxy phrases, so it
        /// may hold raw body text.
        message: String,
        /// Provider-specific error type label, when one was supplied.
        error_type: Option<String>,
    },

    /// Invalid API key
    #[error("Invalid API key")]
    InvalidApiKey,

    /// Rate limit exceeded; the payload is the accompanying message, which
    /// may contain a retry-delay hint.
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// Invalid request; the payload describes what was wrong.
    #[error("Invalid request: {0}")]
    InvalidRequest(String),

    /// Model not found; the payload is displayed after "Model not found: ".
    #[error("Model not found: {0}")]
    ModelNotFound(String),

    /// Context length exceeded; the payload is displayed as a token count.
    // NOTE(review): unclear from here whether the count is the limit or the
    // size of the rejected request — confirm at the construction sites.
    #[error("Context length exceeded: {0} tokens")]
    ContextLengthExceeded(u32),

    /// Streaming not supported
    #[error("Streaming not supported by this provider")]
    StreamingNotSupported,

    /// Tools not supported
    #[error("Tools not supported by this provider")]
    ToolsNotSupported,

    /// JSON parsing error (wraps the underlying `serde_json` error).
    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),

    /// Streaming error; the payload is a free-form description.
    #[error("Streaming error: {0}")]
    StreamError(String),

    /// Timeout; the payload is displayed as a number of seconds.
    #[error("Request timed out after {0}s")]
    Timeout(u64),

    /// Internal error; the payload is a free-form description.
    #[error("Internal error: {0}")]
    Internal(String),
}

impl ProviderError {
    /// Check if error is retryable.
    ///
    /// Returns `true` for:
    /// - transport failures, rate limits and timeouts;
    /// - any `ApiError` with a status of 500 or above;
    /// - an `ApiError` with status 429 (rate limited);
    /// - any 4xx `ApiError` whose message is an HTML page;
    /// - a 400 `ApiError` that `is_transient_proxy_400` classifies as a
    ///   proxy passthrough of an upstream failure.
    ///
    /// Every other error is treated as permanent.
    pub fn is_retryable(&self) -> bool {
        match self {
            ProviderError::HttpError(_)
            | ProviderError::RateLimitExceeded(_)
            | ProviderError::Timeout(_) => true,
            ProviderError::ApiError { status, .. } if *status >= 500 => true,
            // HTTP 429 is a rate limit even when it was not converted to
            // `RateLimitExceeded`. `RetryableError::retry_after` already
            // parses a delay hint out of a 429 `ApiError`; that hint is only
            // useful if the error is classified as retryable here too.
            ProviderError::ApiError { status: 429, .. } => true,
            // A 4xx whose body is an HTML page is an infrastructure / CDN /
            // load-balancer error page, NOT a real JSON API client error.
            // These are transient (the next request usually hits a healthy
            // node) and must be retried, not bounced straight to the
            // fallback chain. Canonical case: modelscope intermittently
            // returns HTTP 405 with a Chinese HTML error page for a valid
            // POST to /chat/completions; retrying succeeds, but the old code
            // treated 405 as a hard client error and fell back instantly
            // with zero retries (2026-06-07). Real client errors return
            // JSON, never HTML, so this never masks an invalid_model /
            // validation / auth problem.
            ProviderError::ApiError {
                status, message, ..
            } if (400..500).contains(status) && is_html_error_body(message) => true,
            // HTTP 400 with a generic proxy-style body (empty error_type
            // AND a message that doesn't describe an actionable client
            // problem) is almost always a transient upstream failure
            // forwarded by the proxy. opencode.ai's "Provider returned
            // error" is the canonical case — the user's payload is fine,
            // their upstream is having a moment. Retry before falling
            // back. Real client-side 400s (invalid_model, validation
            // errors, bad JSON) carry specific error_type or message
            // strings and stay non-retryable.
            ProviderError::ApiError {
                status: 400,
                message,
                error_type,
            } => is_transient_proxy_400(message, error_type.as_deref()),
            _ => false,
        }
    }

    /// Get HTTP status code if available.
    ///
    /// Only `ApiError` carries a status; every other variant yields `None`.
    pub fn status_code(&self) -> Option<u16> {
        match self {
            ProviderError::ApiError { status, .. } => Some(*status),
            _ => None,
        }
    }

    /// True when the server rejected the REQUEST's model id (not the
    /// credential). Some OpenAI-compatible proxies — notably
    /// `opencode.ai/zen` — return HTTP 401 with
    /// `{"error":{"type":"ModelError","message":"Model X not supported"}}`
    /// for "this key can't use that model", which collides with real
    /// auth failures. Downstream code uses this to keep the actual
    /// "invalid key" classification meaningful and route model-mismatch
    /// errors to a different UX path.
    ///
    /// Matches `ModelNotFound`, and any `ApiError` (regardless of status)
    /// whose `error_type` is one of the known model-error labels or whose
    /// message mentions "model" together with "not supported", "not found"
    /// or "unsupported". All comparisons ignore ASCII case.
    pub fn is_model_unsupported(&self) -> bool {
        // `error_type` labels that identify a model-level rejection.
        const MODEL_ERROR_TYPES: &[&str] = &[
            "modelerror",
            "model_error",
            "model_not_found",
            "invalid_model",
        ];

        match self {
            ProviderError::ModelNotFound(_) => true,
            ProviderError::ApiError {
                error_type,
                message,
                ..
            } => {
                // Compare in place instead of allocating a lowercase copy.
                let type_hit = error_type.as_deref().is_some_and(|label| {
                    MODEL_ERROR_TYPES
                        .iter()
                        .any(|known| label.eq_ignore_ascii_case(known))
                });
                let msg = message.to_ascii_lowercase();
                let msg_hit = msg.contains("model")
                    && (msg.contains("not supported")
                        || msg.contains("not found")
                        || msg.contains("unsupported"));
                type_hit || msg_hit
            }
            _ => false,
        }
    }
}

/// Detects an error body that is an HTML document instead of a JSON API
/// error.
///
/// HTML on a 4xx means the response was produced by a CDN, load balancer or
/// reverse proxy (an infrastructure error page) rather than by the API
/// itself; such failures are transient and worth retrying. Genuine API
/// client errors are JSON, so invalid_model / validation / auth errors do
/// not match.
///
/// Only the first 256 characters (after leading whitespace) are examined,
/// ASCII-case-insensitively, for `<!doctype`, `<html`, `<head` or `<body`.
pub(crate) fn is_html_error_body(message: &str) -> bool {
    const HTML_MARKERS: &[&str] = &["<!doctype", "<html", "<head", "<body"];
    // Bound the scan so a large page is never lowercased in full.
    const SCAN_CHARS: usize = 256;

    // Walk chars rather than slicing bytes: a byte index could land inside a
    // multi-byte character (the modelscope page contains Chinese text) and
    // panic.
    let mut prefix = String::new();
    for ch in message.trim_start().chars().take(SCAN_CHARS) {
        prefix.push(ch.to_ascii_lowercase());
    }

    HTML_MARKERS.iter().any(|marker| prefix.contains(marker))
}

/// Decides whether an HTTP 400 body reads like a proxy forwarding an
/// upstream hiccup rather than a genuine client-side error.
///
/// `is_retryable` relies on this so that opencode.ai-style
/// "Provider returned error" 400s go through the retry backoff instead of
/// bailing to fallback on the first attempt.
///
/// Returns `false` whenever `error_type` is a non-empty string. Otherwise
/// returns `true` if the trimmed message is empty or contains one of the
/// known passthrough phrases (ASCII-case-insensitive).
pub(crate) fn is_transient_proxy_400(message: &str, error_type: Option<&str>) -> bool {
    // Phrases proxies are known to emit for upstream trouble. Extend this
    // list when a proxy invents a new one.
    const PROXY_PHRASES: &[&str] = &[
        "provider returned error",
        "upstream error",
        "internal error",
        "temporary",
        "try again",
        "bad gateway",
    ];

    // Genuine client errors carry a type ("invalid_request_error",
    // "model_not_found", "validation_error", ...); never retry those.
    if matches!(error_type, Some(kind) if !kind.is_empty()) {
        return false;
    }

    let normalized = message.trim().to_ascii_lowercase();
    normalized.is_empty()
        || PROXY_PHRASES
            .iter()
            .any(|phrase| normalized.contains(phrase))
}

/// Result type for provider operations: `std::result::Result` with the
/// error type fixed to `ProviderError`.
pub type Result<T> = std::result::Result<T, ProviderError>;

impl crate::utils::retry::RetryableError for ProviderError {
    fn is_retryable(&self) -> bool {
        // Go through the free helper so the call lands on the inherent
        // classifier of `ProviderError`, not back on this trait method.
        provider_error_is_retryable(self)
    }

    fn retry_after(&self) -> Option<std::time::Duration> {
        // Ceiling for a server-supplied delay, so a pathological
        // "retry after 300s" can't stall a turn.
        const MAX_HINT_SECS: u64 = 30;

        // Only rate-limit style errors can carry a hint. For anything else
        // the caller falls back to its exponential schedule.
        let hint_text = match self {
            ProviderError::RateLimitExceeded(text) => text.as_str(),
            ProviderError::ApiError {
                status: 429,
                message,
                ..
            } => message.as_str(),
            _ => return None,
        };

        let secs = parse_retry_seconds(hint_text)?;
        Some(std::time::Duration::from_secs(secs.min(MAX_HINT_SECS)))
    }

    fn is_hard_down(&self) -> bool {
        // Only transport errors can indicate a dead host. A refused
        // connection, an unresolvable name or an unreachable network does
        // not recover within a retry window, so the retry loop gives such
        // errors one quick retry and moves on (e.g. to the next provider in
        // the fallback chain). Timeouts are deliberately excluded: a
        // slow-but-alive host deserves the patient schedule.
        match self {
            ProviderError::HttpError(http_err) => {
                // `is_connect()` is the clean signal, but it stays false for
                // DNS-resolution failures: an NXDOMAIN surfaces as a generic
                // "error sending request for url" (dialagram.me went
                // NXDOMAIN 2026-06-07 and got the full 15s patient retry
                // instead of failing fast). So also walk the source chain
                // looking for resolver / socket failure text.
                http_err.is_connect()
                    || std::iter::successors(std::error::Error::source(http_err), |cause| {
                        (*cause).source()
                    })
                    .any(|cause| looks_like_connection_failure(&cause.to_string()))
            }
            _ => false,
        }
    }
}

/// Reports whether an error message reads like a DNS-resolution or
/// connection-establishment failure (host down / unreachable) rather than a
/// transient timeout or a real HTTP error.
///
/// It is applied to each entry of the reqwest error source chain, because
/// reqwest does not flag DNS failures through `is_connect()`. The check is
/// an ASCII-case-insensitive substring search over common
/// libc/getaddrinfo and OS socket error strings from macOS and Linux.
pub(crate) fn looks_like_connection_failure(msg: &str) -> bool {
    const FAILURE_MARKERS: &[&str] = &[
        // Name resolution
        "dns error",
        "failed to lookup address",
        "name or service not known", // Linux getaddrinfo
        "nodename nor servname",     // macOS getaddrinfo (NXDOMAIN)
        "no such host",
        "could not resolve",
        "name resolution",
        // Socket / routing
        "connection refused",
        "network is unreachable",
        "no route to host",
        "connection reset",
    ];

    let lowered = msg.to_ascii_lowercase();
    FAILURE_MARKERS
        .iter()
        .any(|marker| lowered.contains(marker))
}

/// Free wrapper so the `RetryableError` impl can call the inherent
/// `ProviderError::is_retryable` without method-resolution ambiguity.
fn provider_error_is_retryable(e: &ProviderError) -> bool {
    e.is_retryable()
}

/// Parse a retry-delay (seconds) out of a rate-limit error message.
/// Recognizes "60 seconds", "60s", "retry in 60", "wait 60". Moved here
/// from the former `brain::provider::retry` module when retry logic was
/// consolidated onto `utils::retry`.
fn parse_retry_seconds(msg: &str) -> Option<u64> {
    use regex::Regex;
    let patterns = [
        r"(\d+)\s*seconds?",
        r"(\d+)\s*s\b",
        r"retry in (\d+)",
        r"wait (\d+)",
    ];
    for pattern in patterns {
        if let Ok(re) = Regex::new(pattern)
            && let Some(captures) = re.captures(msg)
            && let Some(num_str) = captures.get(1)
            && let Ok(secs) = num_str.as_str().parse::<u64>()
        {
            return Some(secs);
        }
    }
    None
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `ApiError` that carries no `error_type`.
    fn untyped_api_error(status: u16, message: &str) -> ProviderError {
        ProviderError::ApiError {
            status,
            message: message.to_string(),
            error_type: None,
        }
    }

    /// Rate limits and 5xx responses are retryable; a bad key and a plain
    /// 400 are not.
    #[test]
    fn test_error_retryable() {
        assert!(ProviderError::RateLimitExceeded("Try again later".to_string()).is_retryable());
        assert!(!ProviderError::InvalidApiKey.is_retryable());
        assert!(untyped_api_error(500, "Internal Server Error").is_retryable());
        assert!(!untyped_api_error(400, "Bad Request").is_retryable());
    }

    /// Only `ApiError` exposes a status code.
    #[test]
    fn test_status_code() {
        let rate_limited = ProviderError::ApiError {
            status: 429,
            message: "Too many requests".to_string(),
            error_type: Some("rate_limit_error".to_string()),
        };
        assert_eq!(rate_limited.status_code(), Some(429));
        assert_eq!(ProviderError::InvalidApiKey.status_code(), None);
    }
}