// pawan-core 0.5.22

//! OpenAI-compatible LLM backend (NVIDIA NIM, OpenAI, DeepSeek, etc.)

use super::LlmBackend;
use crate::agent::{
    LLMResponse, Message, Role, TokenCallback, TokenUsage, ToolCallRequest, ToolDefinition,
};
use crate::{PawanError, Result};
use async_trait::async_trait;
use serde_json::{json, Value};

/// Cloud fallback endpoint — different API URL/key for hybrid routing.
///
/// When set on `OpenAiCompatConfig::cloud`, its models form a second chain that
/// is queued after the primary endpoint's model chain.
pub struct CloudFallback {
    /// Base URL of the cloud API; `/chat/completions` is appended when the request URL is built.
    pub api_url: String,
    /// Optional API key, sent as an `Authorization: Bearer ...` header when present.
    pub api_key: Option<String>,
    /// First model to try on the cloud endpoint.
    pub model: String,
    /// Additional cloud models listed, in order, after `model`.
    pub fallback_models: Vec<String>,
}

/// Configuration for OpenAI-compatible backend
pub struct OpenAiCompatConfig {
    /// Base URL of the primary API; `/chat/completions` is appended when the request URL is built.
    pub api_url: String,
    /// Optional API key, sent as an `Authorization: Bearer ...` header when present.
    pub api_key: Option<String>,
    /// Primary model name, sent as the `model` field of the request body.
    pub model: String,
    /// Value sent as the `temperature` field of the request body.
    pub temperature: f32,
    /// Value sent as the `top_p` field of the request body.
    pub top_p: f32,
    /// Value sent as the `max_tokens` field of the request body.
    pub max_tokens: usize,
    /// System prompt; always emitted as the first `system` message of every request.
    pub system_prompt: String,
    /// Whether to switch thinking/reasoning mode on for models that expose a request-level toggle.
    pub use_thinking: bool,
    /// Upper bound on retries per model (attempts are numbered `0..=max_retries`).
    pub max_retries: usize,
    /// Additional models on the primary endpoint, listed in order after `model`.
    pub fallback_models: Vec<String>,
    /// Optional cloud fallback for hybrid local+cloud routing
    pub cloud: Option<CloudFallback>,
}
/// Backend for OpenAI-compatible APIs (NVIDIA NIM, OpenAI, DeepSeek)
pub struct OpenAiCompatBackend {
    /// HTTP client used to build every chat-completions request.
    http_client: reqwest::Client,
    /// Endpoint, sampling, and fallback settings supplied at construction.
    cfg: OpenAiCompatConfig,
}

/// Accumulated state while parsing an SSE streaming response.
struct StreamingAccumState {
    /// Concatenation of every `content` delta received so far.
    content: String,
    /// Tool calls assembled from streamed deltas, addressed by each delta's `index`.
    tool_calls: Vec<ToolCallRequest>,
    /// Most recent `finish_reason` reported by the stream; starts as `"stop"`.
    finish_reason: String,
    /// Token usage, captured from a chunk whose `usage` object carries `total_tokens`.
    stream_usage: Option<TokenUsage>,
    /// Concatenation of every `reasoning_content` / `reasoning` delta received so far.
    stream_reasoning: String,
    /// Raw decoded stream text that has not been fully split into lines yet.
    buffer: String,
    /// Byte offset into `buffer` of the first byte that has not been consumed.
    buf_start: usize,
}

impl StreamingAccumState {
    /// Create an empty accumulator.
    ///
    /// Every text field starts empty and `finish_reason` starts as `"stop"`,
    /// which is what the final response reports if the stream never sends one.
    fn new() -> Self {
        let finish_reason = String::from("stop");
        Self {
            buffer: String::new(),
            buf_start: 0,
            content: String::new(),
            stream_reasoning: String::new(),
            tool_calls: vec![],
            stream_usage: None,
            finish_reason,
        }
    }
}

impl OpenAiCompatBackend {
    /// Create a backend for `cfg`, backed by a newly constructed `reqwest::Client`.
    pub fn new(cfg: OpenAiCompatConfig) -> Self {
        let http_client = reqwest::Client::new();
        Self { http_client, cfg }
    }

    /// Report whether `model` belongs to a family that accepts `chat_template_kwargs`.
    ///
    /// The match is a case-insensitive substring test against the Qwen, DeepSeek,
    /// Gemma, and GLM family names. Other families (Mistral, LLaMA, ...) reject
    /// the parameter with HTTP 400, so it must not be sent to them.
    fn supports_chat_template_kwargs(model: &str) -> bool {
        const FAMILIES: [&str; 4] = ["qwen", "deepseek", "gemma", "glm"];
        let lowered = model.to_lowercase();
        FAMILIES.iter().any(|family| lowered.contains(family))
    }

    /// Report whether `model` takes the per-request `reasoning_effort` parameter
    /// (values `none`/`high`) instead of `chat_template_kwargs`.
    ///
    /// Currently this is any model whose name contains `mistral-small-4`,
    /// compared case-insensitively.
    fn supports_reasoning_effort(model: &str) -> bool {
        model.to_lowercase().contains("mistral-small-4")
    }

    /// Build the `chat_template_kwargs` object that toggles thinking mode for `model`.
    ///
    /// The key names depend on the model family (matched case-insensitively, GLM first):
    /// - GLM: `enable_thinking` plus `clear_thinking: false`
    /// - Gemma: `enable_thinking`
    /// - anything else (Qwen/DeepSeek): `thinking`
    fn thinking_kwargs(model: &str, enabled: bool) -> serde_json::Value {
        let lowered = model.to_lowercase();
        match (lowered.contains("glm"), lowered.contains("gemma")) {
            (true, _) => json!({ "enable_thinking": enabled, "clear_thinking": false }),
            (false, true) => json!({ "enable_thinking": enabled }),
            (false, false) => json!({ "thinking": enabled }),
        }
    }

    /// Report whether `model` may be sent a `tools` array (function calling).
    ///
    /// This is a deny list rather than an allow list: only names containing
    /// `mistral-7b` (case-insensitive) are refused, since those are known to
    /// reject tools on NIM. Every other model is assumed to support tool use,
    /// and a wrong guess is left to surface as a request error.
    fn supports_tool_use(model: &str) -> bool {
        let denied = model.to_lowercase().contains("mistral-7b");
        !denied
    }

    /// Calculate exponential backoff delay with jitter.
    ///
    /// The base delay is `2^attempt` seconds (1s, 2s, 4s, ...), multiplied by a
    /// random factor between 0.8 and 1.2 so that concurrent clients do not retry
    /// in lockstep.
    ///
    /// The exponent is clamped to 63: shifting a `u64` by 64 or more overflows
    /// (a panic in debug builds, a silent wrap back to a 1s delay in release
    /// builds), and `2^63 * 1.2` seconds is still representable as a `Duration`.
    fn calculate_backoff_delay(attempt: usize) -> std::time::Duration {
        const MAX_EXPONENT: usize = 63;
        let exponent = attempt.min(MAX_EXPONENT);
        let base_secs = (1u64 << exponent) as f64; // 1, 2, 4, 8, ...
        let jitter = 0.8 + (rand::random::<f64>() * 0.4); // 0.8 to 1.2
        std::time::Duration::from_secs_f64(base_secs * jitter)
    }
    /// Convert the conversation history into OpenAI chat-completions message objects.
    ///
    /// The configured system prompt is always emitted first, followed by one
    /// object per input message in order. Assistant messages that carry tool
    /// calls serialise each call's arguments as a JSON string. Tool messages
    /// without a `tool_result` produce no output entry.
    fn build_messages(&self, messages: &[Message]) -> Vec<Value> {
        let system_prompt = json!({
            "role": "system",
            "content": self.cfg.system_prompt
        });

        let history = messages.iter().filter_map(|msg| {
            let entry = match msg.role {
                Role::System => json!({ "role": "system", "content": msg.content }),
                Role::User => json!({ "role": "user", "content": msg.content }),
                Role::Assistant if msg.tool_calls.is_empty() => {
                    json!({ "role": "assistant", "content": msg.content })
                }
                Role::Assistant => {
                    let mut tool_calls: Vec<Value> = Vec::with_capacity(msg.tool_calls.len());
                    for call in msg.tool_calls.iter() {
                        tool_calls.push(json!({
                            "id": call.id,
                            "type": "function",
                            "function": {
                                "name": call.name,
                                "arguments": serde_json::to_string(&call.arguments).unwrap_or_default()
                            }
                        }));
                    }
                    json!({
                        "role": "assistant",
                        "content": msg.content,
                        "tool_calls": tool_calls
                    })
                }
                Role::Tool => {
                    // A tool message with no result has nothing to report; skip it.
                    let tool_result = msg.tool_result.as_ref()?;
                    json!({
                        "role": "tool",
                        "tool_call_id": tool_result.tool_call_id,
                        "content": serde_json::to_string(&tool_result.content)
                            .unwrap_or_else(|_| tool_result.content.to_string())
                    })
                }
            };
            Some(entry)
        });

        std::iter::once(system_prompt).chain(history).collect()
    }

    /// Wrap each tool definition in the OpenAI `{"type": "function", ...}` envelope.
    ///
    /// The `parameters` schema is whatever `ToolDefinition::to_mcp_input_schema` returns.
    fn build_tools(&self, tools: &[ToolDefinition]) -> Vec<Value> {
        let mut wrapped = Vec::with_capacity(tools.len());
        for tool in tools {
            let function = json!({
                "name": tool.name,
                "description": tool.description,
                "parameters": tool.to_mcp_input_schema()
            });
            wrapped.push(json!({
                "type": "function",
                "function": function
            }));
        }
        wrapped
    }

    /// Pass a successful response through unchanged; turn any other status into an error.
    ///
    /// For a non-2xx status the body is read (an unreadable body is treated as
    /// empty) and combined with the status by `format_api_error`.
    ///
    /// # Errors
    /// Returns `PawanError::Llm` when the response status is not a success.
    async fn handle_error_response(response: reqwest::Response) -> Result<reqwest::Response> {
        let status = response.status();
        if status.is_success() {
            return Ok(response);
        }

        let body = response.text().await.unwrap_or_default();
        Err(PawanError::Llm(Self::format_api_error(status, &body)))
    }

    /// Send `request` and return the response only if it arrived with a success status.
    ///
    /// # Errors
    /// Returns `PawanError::Llm` when the request cannot be sent, or when the
    /// server answers with a non-2xx status.
    async fn send_http_request(request: reqwest::RequestBuilder) -> Result<reqwest::Response> {
        match request.send().await {
            Ok(response) => Self::handle_error_response(response).await,
            Err(e) => Err(PawanError::Llm(format!("HTTP request failed: {}", e))),
        }
    }

    /// Send `request`, read the whole body as JSON, and parse it into an `LLMResponse`.
    ///
    /// # Errors
    /// Returns `PawanError::Llm` on transport failure, a non-2xx status, a body
    /// that is not valid JSON, or a body `parse_response` rejects.
    async fn non_streaming(&self, request: reqwest::RequestBuilder) -> Result<LLMResponse> {
        let response = Self::send_http_request(request).await?;

        match response.json::<Value>().await {
            Ok(body) => self.parse_response(&body),
            Err(e) => Err(PawanError::Llm(format!("Failed to parse response: {}", e))),
        }
    }

    /// Apply one streaming `delta` object to accumulated content, reasoning, and tool calls.
    fn accumulate_stream_delta(
        state: &mut StreamingAccumState,
        delta: &Value,
        on_token: Option<&TokenCallback>,
    ) {
        if let Some(c) = delta.get("content").and_then(|v| v.as_str()) {
            if let Some(callback) = on_token {
                callback(c);
            }
            state.content.push_str(c);
        }

        // Capture reasoning/thinking content from streaming deltas
        if let Some(r) = delta
            .get("reasoning_content")
            .or_else(|| delta.get("reasoning"))
            .and_then(|v| v.as_str())
        {
            state.stream_reasoning.push_str(r);
        }

        if let Some(tc_array) = delta.get("tool_calls").and_then(|v| v.as_array()) {
            for tc in tc_array {
                let index = tc.get("index").and_then(|v| v.as_u64()).unwrap_or(0) as usize;

                while state.tool_calls.len() <= index {
                    state.tool_calls.push(ToolCallRequest {
                        id: String::new(),
                        name: String::new(),
                        arguments: json!(""),
                    });
                }

                if let Some(id) = tc.get("id").and_then(|v| v.as_str()) {
                    state.tool_calls[index].id = id.to_string();
                }
                if let Some(func) = tc.get("function") {
                    if let Some(name) = func.get("name").and_then(|v| v.as_str()) {
                        state.tool_calls[index].name = name.to_string();
                    }
                    if let Some(args) = func.get("arguments").and_then(|v| v.as_str()) {
                        let current = state.tool_calls[index].arguments.as_str().unwrap_or("");
                        state.tool_calls[index].arguments = json!(format!("{}{}", current, args));
                    }
                }
            }
        }
    }

    /// Parse one SSE `data:` payload and fold it into the streaming state.
    ///
    /// Payloads that are not valid JSON are ignored. A `usage` object is
    /// recorded only when it carries `total_tokens` (OpenAI `stream_options`,
    /// vllm-mlx, etc.). Every entry of `choices` contributes its `delta` and,
    /// when present as a string, its `finish_reason`.
    fn parse_stream_sse_data(
        state: &mut StreamingAccumState,
        data: &str,
        on_token: Option<&TokenCallback>,
    ) {
        let Ok(event) = serde_json::from_str::<Value>(data) else {
            return;
        };

        let reports_total = event
            .get("usage")
            .and_then(|u| u.get("total_tokens"))
            .is_some();
        if reports_total {
            state.stream_usage = Self::parse_usage(&event);
        }

        let Some(choices) = event.get("choices").and_then(|v| v.as_array()) else {
            return;
        };
        for choice in choices {
            if let Some(delta) = choice.get("delta") {
                Self::accumulate_stream_delta(state, delta, on_token);
            }

            let reason = choice.get("finish_reason").and_then(|v| v.as_str());
            if let Some(reason) = reason {
                state.finish_reason = reason.to_string();
            }
        }
    }

    /// Drain complete SSE lines from the streaming buffer.
    ///
    /// Every newline-terminated line starting at `state.buf_start` is consumed:
    /// `buf_start` is advanced past it and, when the line is a `data` field with
    /// a payload other than `[DONE]`, the payload is handed to
    /// `parse_stream_sse_data`. A trailing partial line (no newline yet) is left
    /// in the buffer for the next call.
    ///
    /// The space after `data:` is optional in the SSE wire format, so both
    /// `data: {...}` and `data:{...}` are accepted. Lines carrying any other
    /// field (`event:`, `id:`, comments) are skipped.
    fn process_stream_buffer_lines(
        state: &mut StreamingAccumState,
        on_token: Option<&TokenCallback>,
    ) {
        while let Some(rel_pos) = state.buffer[state.buf_start..].find('\n') {
            let newline_pos = state.buf_start + rel_pos;
            // The line is copied out because parsing needs `state` mutably.
            let line = state.buffer[state.buf_start..newline_pos]
                .trim()
                .to_string();
            state.buf_start = newline_pos + 1; // advance past the newline

            let Some(payload) = line.strip_prefix("data:") else {
                continue;
            };
            let payload = payload.trim_start();
            if payload.is_empty() || payload == "[DONE]" {
                continue;
            }

            Self::parse_stream_sse_data(state, payload, on_token);
        }
    }

    /// Drop the already-consumed prefix of the stream buffer and reset `buf_start` to 0.
    ///
    /// The prefix is removed in place, so the buffer keeps its existing
    /// allocation instead of being copied into a new `String` on every call.
    /// Does nothing when no bytes have been consumed.
    fn compact_stream_buffer(state: &mut StreamingAccumState) {
        if state.buf_start > 0 {
            state.buffer.drain(..state.buf_start);
            state.buf_start = 0;
        }
    }

    /// Finalize streamed tool calls (JSON parse, UUIDs, Mistral fallback).
    fn finalize_stream_tool_calls(state: &mut StreamingAccumState) {
        // Parse tool call arguments from JSON strings
        // StepFun/Qwen models may interleave <think>...</think> tokens inside arguments
        for tc in &mut state.tool_calls {
            if let Some(args_str) = tc.arguments.as_str() {
                // Strip think blocks from arguments before JSON parse
                let clean_args = Self::strip_think_from_str(args_str);
                if let Ok(parsed) = serde_json::from_str::<Value>(&clean_args) {
                    tc.arguments = parsed;
                } else if let Ok(parsed) = serde_json::from_str::<Value>(args_str) {
                    // Fallback: try original if stripping broke the JSON
                    tc.arguments = parsed;
                }
            }
            if tc.id.is_empty() {
                tc.id = uuid::Uuid::new_v4().to_string();
            }
        }

        state.tool_calls.retain(|tc| !tc.name.is_empty());

        // Fallback: devstral/Mistral models sometimes stream [TOOL_CALLS] text
        // instead of structured tool_call deltas.
        if state.tool_calls.is_empty() {
            state.tool_calls = Self::parse_mistral_tool_calls(&state.content);
        }

        if !state.tool_calls.is_empty() {
            state.finish_reason = "tool_calls".to_string();
        }
    }

    /// Build the final `LLMResponse` from completed streaming accumulation.
    fn build_streaming_response(state: StreamingAccumState) -> LLMResponse {
        // Strip think blocks from content (StepFun/Qwen interleave reasoning)
        let content = Self::strip_think_from_str(&state.content);

        // Build reasoning from streamed chunks
        let reasoning = if state.stream_reasoning.is_empty() {
            None
        } else {
            Some(state.stream_reasoning)
        };

        // Enrich stream usage with reasoning token estimate
        let usage = state.stream_usage.map(|mut u| {
            if let Some(ref r) = reasoning {
                u.reasoning_tokens = (r.len() as u64) / 4;
                u.action_tokens = u.completion_tokens.saturating_sub(u.reasoning_tokens);
            }
            u
        });

        LLMResponse {
            content,
            reasoning,
            tool_calls: state.tool_calls,
            finish_reason: state.finish_reason,
            usage,
        }
    }

    /// Send `request` and assemble an `LLMResponse` from its SSE stream.
    ///
    /// Bytes are decoded incrementally: a multi-byte UTF-8 character split
    /// across two network chunks is held back until its remaining bytes arrive,
    /// so it is not replaced by U+FFFD. Genuinely invalid bytes are still
    /// replaced lossily. Complete lines are processed as they arrive, and a
    /// final line that the server did not terminate with a newline is processed
    /// when the stream ends.
    ///
    /// # Errors
    /// Returns `PawanError::Llm` on transport failure, a non-2xx status, or an
    /// error while reading the body stream.
    async fn streaming(
        &self,
        request: reqwest::RequestBuilder,
        on_token: Option<&TokenCallback>,
    ) -> Result<LLMResponse> {
        use futures::StreamExt;

        /// Length of the longest prefix of `bytes` that does not end in the
        /// middle of a multi-byte UTF-8 sequence. Invalid (as opposed to
        /// incomplete) sequences are counted as part of the prefix.
        fn complete_utf8_prefix_len(bytes: &[u8]) -> usize {
            let mut offset = 0;
            loop {
                match std::str::from_utf8(&bytes[offset..]) {
                    Ok(_) => return bytes.len(),
                    Err(e) => match e.error_len() {
                        // Input ended mid-character: stop before it.
                        None => return offset + e.valid_up_to(),
                        // Invalid bytes: skip them and keep scanning.
                        Some(bad) => offset += e.valid_up_to() + bad,
                    },
                }
            }
        }

        let response = Self::send_http_request(request).await?;

        let mut state = StreamingAccumState::new();
        let mut stream = response.bytes_stream();
        // Bytes received but not yet decoded (at most a partial character
        // between iterations).
        let mut pending: Vec<u8> = Vec::new();

        while let Some(chunk) = stream.next().await {
            let chunk = chunk.map_err(|e| PawanError::Llm(format!("Stream error: {}", e)))?;
            pending.extend_from_slice(&chunk);

            let ready = complete_utf8_prefix_len(&pending);
            state
                .buffer
                .push_str(&String::from_utf8_lossy(&pending[..ready]));
            pending.drain(..ready);

            Self::process_stream_buffer_lines(&mut state, on_token);
            // Discard the consumed prefix so the buffer does not grow with the stream.
            Self::compact_stream_buffer(&mut state);
        }

        // The stream ended: decode whatever is left, even if truncated.
        if !pending.is_empty() {
            state.buffer.push_str(&String::from_utf8_lossy(&pending));
        }
        // Process a final line that was not newline-terminated.
        if state.buf_start < state.buffer.len() {
            state.buffer.push('\n');
            Self::process_stream_buffer_lines(&mut state, on_token);
        }

        Self::finalize_stream_tool_calls(&mut state);
        Ok(Self::build_streaming_response(state))
    }

    /// Strip `<think>...</think>` blocks from a string.
    ///
    /// Tags are matched ASCII case-insensitively. Blocks are removed one at a
    /// time, each spanning from the first opening tag to the first closing tag
    /// that follows it, until no complete block remains. Blocks interleaved with
    /// content (the StepFun pattern) are therefore all removed. Whitespace next
    /// to a removed block is trimmed, and when text remains on both sides the
    /// two parts are joined by a single space.
    ///
    /// An opening tag with no later closing tag is left untouched, as is a
    /// closing tag that appears before any opening tag.
    fn strip_think_from_str(s: &str) -> String {
        const OPEN: &str = "<think>";
        const CLOSE: &str = "</think>";

        let mut out = s.to_string();
        loop {
            // ASCII-only lowercasing never changes byte lengths, so offsets found
            // in `lower` are valid offsets (and char boundaries) in `out`. Full
            // Unicode lowercasing can grow or shrink characters and misalign them.
            let lower = out.to_ascii_lowercase();
            let Some(open) = lower.find(OPEN) else {
                break;
            };
            // Look for the closer only after the opener, so a stray earlier
            // `</think>` cannot hide a complete block further on.
            let body_start = open + OPEN.len();
            let Some(rel_close) = lower[body_start..].find(CLOSE) else {
                break;
            };
            let block_end = body_start + rel_close + CLOSE.len();

            let before = out[..open].trim_end();
            let after = out[block_end..].trim_start();
            out = match (before.is_empty(), after.is_empty()) {
                (true, true) => String::new(),
                (true, false) => after.to_string(),
                (false, true) => before.to_string(),
                (false, false) => format!("{} {}", before, after),
            };
        }
        out
    }

    fn parse_response(&self, json: &Value) -> Result<LLMResponse> {
        let choices = json
            .get("choices")
            .and_then(|v| v.as_array())
            .ok_or_else(|| PawanError::Llm("No choices in response".into()))?;

        let choice = choices
            .first()
            .ok_or_else(|| PawanError::Llm("Empty choices array".into()))?;

        let message = choice
            .get("message")
            .ok_or_else(|| PawanError::Llm("No message in choice".into()))?;

        let raw_content = message
            .get("content")
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let content = Self::strip_think_from_str(raw_content);

        let mut tool_calls = Vec::new();
        let finish_reason = choice
            .get("finish_reason")
            .and_then(|v| v.as_str())
            .unwrap_or("stop")
            .to_string();

        if let Some(tc_array) = message.get("tool_calls").and_then(|v| v.as_array()) {
            for tc in tc_array {
                let id = tc
                    .get("id")
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();

                if let Some(func) = tc.get("function") {
                    let name = func
                        .get("name")
                        .and_then(|v| v.as_str())
                        .unwrap_or("")
                        .to_string();

                    let arguments =
                        if let Some(args_str) = func.get("arguments").and_then(|v| v.as_str()) {
                            // Strip think blocks from arguments (StepFun interleaves reasoning)
                            let clean = Self::strip_think_from_str(args_str);
                            serde_json::from_str(&clean)
                                .or_else(|_| serde_json::from_str(args_str))
                                .unwrap_or(json!({}))
                        } else {
                            func.get("arguments").cloned().unwrap_or(json!({}))
                        };

                    tool_calls.push(ToolCallRequest {
                        id: if id.is_empty() {
                            uuid::Uuid::new_v4().to_string()
                        } else {
                            id
                        },
                        name,
                        arguments,
                    });
                }
            }
        }

        // Fallback: devstral/Mistral models sometimes embed tool calls in text content
        // instead of the structured tool_calls API field. Parse [TOOL_CALLS] format.
        if tool_calls.is_empty() {
            tool_calls = Self::parse_mistral_tool_calls(&content);
        }

        // Parse reasoning/thinking content (mlx_lm.server, vllm-mlx, DeepSeek)
        let reasoning = message
            .get("reasoning_content")
            .or_else(|| message.get("reasoning"))
            .and_then(|v| v.as_str())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string());

        // Parse usage from response, enriched with reasoning token estimate
        let usage = Self::parse_usage(json).map(|mut u| {
            // Estimate reasoning tokens from the reasoning string length (1 tok ≈ 4 chars)
            if let Some(ref r) = reasoning {
                u.reasoning_tokens = (r.len() as u64) / 4;
                u.action_tokens = u.completion_tokens.saturating_sub(u.reasoning_tokens);
            } else {
                u.reasoning_tokens = 0;
                u.action_tokens = u.completion_tokens;
            }
            u
        });

        Ok(LLMResponse {
            content,
            reasoning,
            tool_calls,
            finish_reason,
            usage,
        })
    }

    /// Recover tool calls that a Mistral/devstral model wrote into its text
    /// content instead of the structured `tool_calls` field.
    ///
    /// Only the text after the first `[TOOL_CALLS]` marker is examined. Two
    /// layouts are recognised:
    ///
    /// Variant 1 (standard):  `[TOOL_CALLS] [{"name":"func","arguments":{...}}]`
    /// Variant 2 (compact):   `[TOOL_CALLS]func_name{"key":"value"}`
    ///
    /// Variant 1 entries without a non-empty string `name` are skipped, and a
    /// missing `arguments` becomes `{}`. If variant 1 yields nothing, variant 2
    /// is attempted on the same text. Every recovered call gets a fresh UUID.
    /// Returns an empty vector when the marker is absent or nothing parses.
    fn parse_mistral_tool_calls(content: &str) -> Vec<ToolCallRequest> {
        const MARKER: &str = "[TOOL_CALLS]";
        let after = match content.find(MARKER) {
            Some(pos) => content[pos + MARKER.len()..].trim_start(),
            None => return Vec::new(),
        };

        // Variant 1: JSON array — [{"name":..., "arguments":...}, ...]
        if after.starts_with('[') {
            let array_end = Self::find_matching_bracket(after, '[', ']');
            let entries = if array_end > 0 {
                serde_json::from_str::<Vec<Value>>(&after[..array_end]).ok()
            } else {
                None
            };

            if let Some(entries) = entries {
                let mut calls: Vec<ToolCallRequest> = Vec::new();
                for entry in &entries {
                    let Some(name) = entry.get("name").and_then(|v| v.as_str()) else {
                        continue;
                    };
                    if name.is_empty() {
                        continue;
                    }
                    calls.push(ToolCallRequest {
                        id: uuid::Uuid::new_v4().to_string(),
                        name: name.to_string(),
                        arguments: entry.get("arguments").cloned().unwrap_or(json!({})),
                    });
                }
                if !calls.is_empty() {
                    return calls;
                }
            }
        }

        // Variant 2: compact — func_name{"key":"value"}
        let Some(brace_pos) = after.find('{') else {
            return Vec::new();
        };
        let name = after[..brace_pos].trim();
        let looks_like_ident =
            !name.is_empty() && name.chars().all(|c| c.is_alphanumeric() || c == '_');
        if !looks_like_ident {
            return Vec::new();
        }

        let json_part = &after[brace_pos..];
        let object_end = Self::find_matching_bracket(json_part, '{', '}');
        if object_end == 0 {
            return Vec::new();
        }

        match serde_json::from_str::<Value>(&json_part[..object_end]) {
            Ok(arguments) => vec![ToolCallRequest {
                id: uuid::Uuid::new_v4().to_string(),
                name: name.to_string(),
                arguments,
            }],
            Err(_) => Vec::new(),
        }
    }

    /// Locate the end of the first balanced `open`/`close` pair in `s`.
    ///
    /// Scans from the start of `s`, tracking nesting depth. Brackets inside
    /// double-quoted strings are ignored, and a backslash inside a string
    /// escapes the character that follows it.
    ///
    /// Returns the byte index just past the closing bracket that brings the
    /// depth back to zero, or 0 when no such bracket exists.
    fn find_matching_bracket(s: &str, open: char, close: char) -> usize {
        let mut nesting: i32 = 0;
        let mut inside_string = false;
        let mut skip_next = false;

        for (offset, c) in s.char_indices() {
            if skip_next {
                // This character was escaped by the preceding backslash.
                skip_next = false;
                continue;
            }
            match c {
                '\\' if inside_string => skip_next = true,
                '"' => inside_string = !inside_string,
                _ if inside_string => {}
                _ if c == open => nesting += 1,
                _ if c == close => {
                    nesting -= 1;
                    if nesting == 0 {
                        return offset + c.len_utf8();
                    }
                }
                _ => {}
            }
        }

        0
    }

    /// Build a user-facing message for a failed API call.
    ///
    /// The message starts with `API error <status>` followed by a short hint for
    /// well-known statuses (401, 403, 404, 429, 5xx). When the body is JSON, the
    /// first string found at `error.message`, `detail`, or `message` is
    /// appended; otherwise the raw body is appended unless it is empty.
    fn format_api_error(status: reqwest::StatusCode, body: &str) -> String {
        let as_text = |value: Option<&Value>| value.and_then(|v| v.as_str()).map(String::from);

        // Common patterns: { "error": { "message": "..." } } or { "detail": "..." } or { "message": "..." }
        let detail = serde_json::from_str::<Value>(body).ok().and_then(|parsed| {
            as_text(parsed.get("error").and_then(|e| e.get("message")))
                .or_else(|| as_text(parsed.get("detail")))
                .or_else(|| as_text(parsed.get("message")))
        });

        let hint = match status.as_u16() {
            401 => " (check your API key)",
            403 => " (forbidden — check API key permissions)",
            404 => " (model not found or endpoint incorrect)",
            429 => " (rate limited — try again shortly)",
            500..=599 => " (server error — retry later)",
            _ => "",
        };

        let headline = format!("API error {}{}", status, hint);
        match detail {
            Some(msg) => format!("{}: {}", headline, msg),
            None if body.is_empty() => headline,
            None => format!("{}: {}", headline, body),
        }
    }

    /// Read the `usage` object of a response (or stream chunk) into a `TokenUsage`.
    ///
    /// Returns `None` when there is no `usage` key. Counters that are missing
    /// or not unsigned integers are read as 0. `reasoning_tokens` starts at 0
    /// and `action_tokens` at the completion count; callers refine both once
    /// the reasoning text is known.
    fn parse_usage(json: &Value) -> Option<TokenUsage> {
        let usage = json.get("usage")?;
        let counter = |key: &str| usage.get(key).and_then(|v| v.as_u64()).unwrap_or(0);

        let completion_tokens = counter("completion_tokens");
        Some(TokenUsage {
            prompt_tokens: counter("prompt_tokens"),
            completion_tokens,
            total_tokens: counter("total_tokens"),
            reasoning_tokens: 0,
            action_tokens: completion_tokens,
        })
    }

    /// Assemble the chat-completions request body for the configured primary model.
    ///
    /// Always present: `model`, `messages`, `temperature`, `top_p`,
    /// `max_tokens`, `stream`, and a fixed `seed` of 42. Added conditionally:
    /// - `stream_options.include_usage` when streaming, so providers that would
    ///   otherwise omit `usage` from the stream (e.g. vLLM/SGLang serving Qwen)
    ///   still report token counts;
    /// - `tools` when `api_tools` is non-empty and the model supports tool use;
    /// - one thinking switch, depending on the model family:
    ///   `reasoning_effort` (Mistral Small 4+), `chat_template_kwargs`
    ///   (Qwen/DeepSeek/Gemma/GLM), or nothing for other models.
    fn build_request_body(
        &self,
        api_messages: &[Value],
        api_tools: &[Value],
        stream: bool,
    ) -> Value {
        let model = &self.cfg.model;
        let thinking = self.cfg.use_thinking;

        let mut body = json!({
            "model": self.cfg.model,
            "messages": api_messages,
            "temperature": self.cfg.temperature,
            "top_p": self.cfg.top_p,
            "max_tokens": self.cfg.max_tokens,
            "stream": stream
        });

        // Pick the thinking mechanism (if any) for this model family.
        let thinking_field = if Self::supports_reasoning_effort(model) {
            let effort = if thinking { "high" } else { "none" };
            Some(("reasoning_effort", json!(effort)))
        } else if Self::supports_chat_template_kwargs(model) {
            Some((
                "chat_template_kwargs",
                Self::thinking_kwargs(model, thinking),
            ))
        } else {
            None
        };

        if stream {
            body["stream_options"] = json!({ "include_usage": true });
        }

        let send_tools = !api_tools.is_empty() && Self::supports_tool_use(model);
        if send_tools {
            body["tools"] = json!(api_tools);
        }

        if let Some((key, value)) = thinking_field {
            body[key] = value;
        }

        body["seed"] = json!(42);
        body
    }

    /// Build local and optional cloud model fallback chains.
    fn build_model_chains(&self) -> Vec<(String, Option<String>, Vec<String>)> {
        // Build model chain: primary model + fallback models (same provider)
        let mut model_chains: Vec<(String, Option<String>, Vec<String>)> =
            vec![(self.cfg.api_url.clone(), self.cfg.api_key.clone(), {
                let mut m = vec![self.cfg.model.clone()];
                m.extend(self.cfg.fallback_models.clone());
                m
            })];

        // Add cloud fallback chain if configured (different provider/URL)
        if let Some(ref cloud) = self.cfg.cloud {
            let mut cloud_models = vec![cloud.model.clone()];
            cloud_models.extend(cloud.fallback_models.clone());
            model_chains.push((cloud.api_url.clone(), cloud.api_key.clone(), cloud_models));
        }

        model_chains
    }

    /// Adjust thinking/tool fields on the request body for a specific model.
    ///
    /// The same body is reused while walking the fallback chain, so fields left
    /// over from the previous model are removed or replaced:
    /// - exactly one of `reasoning_effort` / `chat_template_kwargs` is kept,
    ///   chosen by the model family, or neither for models with no thinking
    ///   switch;
    /// - `chat_template_kwargs` is always rebuilt for `model`, because the key
    ///   names differ between families (e.g. Qwen `thinking` vs GLM
    ///   `enable_thinking`) and stale keys from another family would be wrong;
    /// - `tools` is removed for models without tool support, and added when the
    ///   model supports tools, `api_tools` is non-empty, and it is not already
    ///   present.
    fn apply_model_specific_request_fields(
        &self,
        request_body: &mut Value,
        model: &str,
        api_tools: &[Value],
    ) {
        let use_reasoning_effort = Self::supports_reasoning_effort(model);
        let use_template_kwargs =
            !use_reasoning_effort && Self::supports_chat_template_kwargs(model);
        let use_tools = Self::supports_tool_use(model);

        // Drop whatever this model must not receive.
        if let Some(fields) = request_body.as_object_mut() {
            if !use_reasoning_effort {
                fields.remove("reasoning_effort");
            }
            if !use_template_kwargs {
                fields.remove("chat_template_kwargs");
            }
            if !use_tools {
                fields.remove("tools");
            }
        }

        if use_reasoning_effort {
            let effort = if self.cfg.use_thinking { "high" } else { "none" };
            request_body["reasoning_effort"] = json!(effort);
        } else if use_template_kwargs {
            request_body["chat_template_kwargs"] =
                Self::thinking_kwargs(model, self.cfg.use_thinking);
        }

        if use_tools && !api_tools.is_empty() && request_body.get("tools").is_none() {
            request_body["tools"] = json!(api_tools);
        }
    }

    /// Build a POST to `url` with `request_body` as its JSON payload.
    ///
    /// When `api_key` is set, an `Authorization: Bearer <key>` header is added;
    /// otherwise the request is sent without credentials.
    fn build_authenticated_request(
        &self,
        url: &str,
        api_key: &Option<String>,
        request_body: &Value,
    ) -> reqwest::RequestBuilder {
        let unauthenticated = self.http_client.post(url).json(request_body);
        match api_key {
            Some(key) => unauthenticated.header("Authorization", format!("Bearer {}", key)),
            None => unauthenticated,
        }
    }

    /// Whether an LLM error status should trigger exponential backoff retry.
    ///
    /// Retriable statuses are 429 and 500–504. Only `PawanError::Llm` errors
    /// can be retriable.
    ///
    /// Messages produced by `format_api_error` start with `API error <status>`,
    /// so for those the decision is made on the status alone. Digits inside the
    /// server's error text (e.g. "limit is 15000 tokens" on a 400) no longer
    /// cause a spurious retry. Any other message keeps the previous
    /// best-effort behaviour of searching the whole text for a status code.
    fn is_retriable_llm_error(err: &PawanError) -> bool {
        const RETRIABLE_STATUSES: [&str; 6] = ["429", "500", "501", "502", "503", "504"];

        if let PawanError::Llm(ref msg) = err {
            match msg.strip_prefix("API error ") {
                Some(rest) => RETRIABLE_STATUSES
                    .iter()
                    .any(|code| rest.starts_with(code)),
                None => RETRIABLE_STATUSES.iter().any(|code| msg.contains(code)),
            }
        } else {
            false
        }
    }

    /// Run `request` in streaming mode when a token callback is supplied, and
    /// as a single non-streaming call otherwise.
    async fn dispatch_llm_request(
        &self,
        request: reqwest::RequestBuilder,
        on_token: Option<&TokenCallback>,
    ) -> Result<LLMResponse> {
        match on_token {
            Some(_) => self.streaming(request, on_token).await,
            None => self.non_streaming(request).await,
        }
    }
}

#[async_trait]
impl LlmBackend for OpenAiCompatBackend {
    async fn generate(
        &self,
        messages: &[Message],
        tools: &[ToolDefinition],
        on_token: Option<&TokenCallback>,
    ) -> Result<LLMResponse> {
        let api_messages = self.build_messages(messages);
        let api_tools = self.build_tools(tools);
        let mut request_body =
            self.build_request_body(&api_messages, &api_tools, on_token.is_some());
        let model_chains = self.build_model_chains();

        let mut last_error = None;
        let max_retries = self.cfg.max_retries;

        for (chain_idx, (api_url, api_key, models)) in model_chains.iter().enumerate() {
            let url = format!("{}/chat/completions", api_url);
            let is_cloud = chain_idx > 0;

            for model in models {
                request_body["model"] = json!(model);
                self.apply_model_specific_request_fields(&mut request_body, model, &api_tools);

                for attempt in 0..=max_retries {
                    let request = self.build_authenticated_request(&url, api_key, &request_body);

                    let prompt_len: usize = messages.iter().map(|m| m.content.len()).sum();
                    let tools_count = tools.len();
                    tracing::info!(
                        model = model.as_str(),
                        url = url.as_str(),
                        provider = if is_cloud { "cloud" } else { "local" },
                        prompt_len,
                        tools = tools_count,
                        attempt,
                        streaming = on_token.is_some(),
                        "llm call"
                    );

                    let t0 = std::time::Instant::now();
                    let result = self.dispatch_llm_request(request, on_token).await;
                    let latency_ms = t0.elapsed().as_millis() as u64;

                    match result {
                        Ok(response) => {
                            tracing::info!(
                                model = model.as_str(),
                                provider = if is_cloud { "cloud" } else { "local" },
                                latency_ms,
                                prompt_tokens = response
                                    .usage
                                    .as_ref()
                                    .map(|u| u.prompt_tokens)
                                    .unwrap_or(0),
                                completion_tokens = response
                                    .usage
                                    .as_ref()
                                    .map(|u| u.completion_tokens)
                                    .unwrap_or(0),
                                finish_reason = response.finish_reason.as_str(),
                                response_len = response.content.len(),
                                tool_calls = response.tool_calls.len(),
                                "llm ok"
                            );
                            return Ok(response);
                        }
                        Err(err) => {
                            let err_msg = err.to_string();
                            tracing::warn!(
                                model = model.as_str(),
                                provider = if is_cloud { "cloud" } else { "local" },
                                latency_ms,
                                attempt,
                                error = %err_msg,
                                "llm error"
                            );
                            last_error = Some(err);

                            if let Some(ref err) = last_error {
                                if Self::is_retriable_llm_error(err) && attempt < max_retries {
                                    let delay = Self::calculate_backoff_delay(attempt);
                                    tracing::warn!(
                                        attempt = attempt + 1,
                                        model = model.as_str(),
                                        delay_ms = delay.as_millis() as u64,
                                        "retrying"
                                    );
                                    tokio::time::sleep(delay).await;
                                    continue;
                                }
                            }
                            break;
                        }
                    }
                }

                tracing::warn!(
                    model = model.as_str(),
                    cloud = is_cloud,
                    "Model exhausted retries, trying next"
                );
            }

            if self.cfg.cloud.is_some() && !is_cloud {
                tracing::warn!("Local models exhausted — falling back to cloud");
            }
        }

        Err(last_error.expect("No error recorded in retry loop"))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Backend aimed at a dummy local endpoint; only the system prompt varies
    /// between the tests that need a backend instance.
    fn backend_with_prompt(system_prompt: &str) -> OpenAiCompatBackend {
        OpenAiCompatBackend::new(OpenAiCompatConfig {
            api_url: "http://localhost".into(),
            api_key: None,
            model: "test".into(),
            temperature: 1.0,
            top_p: 0.95,
            max_tokens: 100,
            system_prompt: system_prompt.into(),
            use_thinking: false,
            max_retries: 3,
            fallback_models: vec![],
            cloud: None,
        })
    }

    #[test]
    fn test_parse_response_no_choices() {
        let backend = backend_with_prompt("test");

        // An empty `choices` array must be rejected.
        let payload = json!({"choices": []});
        assert!(backend.parse_response(&payload).is_err());
    }

    #[test]
    fn test_parse_mistral_tool_calls_array_format() {
        let parsed = OpenAiCompatBackend::parse_mistral_tool_calls(
            r#"[TOOL_CALLS] [{"name":"edit_file","arguments":{"path":"/tmp/test.rs","content":"fn main() {}"}}]"#,
        );
        assert_eq!(parsed.len(), 1);
        let call = &parsed[0];
        assert_eq!(call.name, "edit_file");
        assert_eq!(call.arguments["path"], "/tmp/test.rs");
    }

    #[test]
    fn test_parse_mistral_tool_calls_compact_format() {
        let parsed = OpenAiCompatBackend::parse_mistral_tool_calls(
            r#"[TOOL_CALLS]read_file{"path":"/home/user/project/src/main.rs"}"#,
        );
        assert_eq!(parsed.len(), 1);
        let call = &parsed[0];
        assert_eq!(call.name, "read_file");
        assert_eq!(call.arguments["path"], "/home/user/project/src/main.rs");
    }

    #[test]
    fn test_parse_mistral_tool_calls_no_marker() {
        let parsed = OpenAiCompatBackend::parse_mistral_tool_calls(
            "No tool calls here, just regular text.",
        );
        assert!(parsed.is_empty());
    }

    #[test]
    fn test_parse_mistral_tool_calls_multiple() {
        let parsed = OpenAiCompatBackend::parse_mistral_tool_calls(
            r#"[TOOL_CALLS] [{"name":"read_file","arguments":{"path":"a.rs"}},{"name":"read_file","arguments":{"path":"b.rs"}}]"#,
        );
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].arguments["path"], "a.rs");
        assert_eq!(parsed[1].arguments["path"], "b.rs");
    }

    #[test]
    fn test_parse_mistral_tool_calls_with_preamble() {
        // Free text before the marker must not prevent the call from being found.
        let parsed = OpenAiCompatBackend::parse_mistral_tool_calls(
            "I'll edit the file now.\n[TOOL_CALLS]shell_exec{\"command\":\"cargo check\"}",
        );
        assert_eq!(parsed.len(), 1);
        let call = &parsed[0];
        assert_eq!(call.name, "shell_exec");
        assert_eq!(call.arguments["command"], "cargo check");
    }

    #[test]
    fn test_parse_response_falls_back_to_mistral_format() {
        let backend = backend_with_prompt("test");

        // The message carries no structured tool_calls; the call is only
        // present inline in the content behind a [TOOL_CALLS] marker.
        let inline_call =
            r#"[TOOL_CALLS] [{"name":"read_file","arguments":{"path":"/tmp/x.rs"}}]"#;
        let payload = json!({
            "choices": [{
                "message": {
                    "content": inline_call,
                    "role": "assistant"
                },
                "finish_reason": "stop"
            }]
        });

        let parsed = backend.parse_response(&payload).unwrap();
        assert_eq!(parsed.tool_calls.len(), 1);
        let call = &parsed.tool_calls[0];
        assert_eq!(call.name, "read_file");
        assert_eq!(call.arguments["path"], "/tmp/x.rs");
    }

    #[test]
    fn test_build_messages() {
        let backend = backend_with_prompt("You are helpful.");

        let user_turn = Message {
            role: Role::User,
            content: "Hello".into(),
            tool_calls: vec![],
            tool_result: None,
        };

        let built = backend.build_messages(&[user_turn]);

        // The system prompt is emitted first, followed by the user message.
        assert_eq!(built.len(), 2);
        assert_eq!(built[0]["role"], "system");
        assert_eq!(built[1]["role"], "user");
        assert_eq!(built[1]["content"], "Hello");
    }

    #[test]
    fn test_calculate_backoff_delay() {
        // Each row: attempt index, accepted range in milliseconds, nominal delay.
        // The ranges are the nominal 1s / 2s / 4s delays widened by ±20% jitter.
        let expectations = [
            (0, 800, 1200, "1s"),
            (1, 1600, 2400, "2s"),
            (2, 3200, 4800, "4s"),
        ];

        for (attempt, min_ms, max_ms, nominal) in expectations {
            let observed_ms = OpenAiCompatBackend::calculate_backoff_delay(attempt).as_millis();
            assert!(
                observed_ms >= min_ms && observed_ms <= max_ms,
                "Delay {} should be ~{} with jitter: {}ms",
                attempt,
                nominal,
                observed_ms
            );
        }
    }

    #[test]
    fn test_supports_chat_template_kwargs() {
        let supported = [
            "Qwen/Qwen2.5-72B-Instruct",
            "deepseek-ai/deepseek-v3",
            "google/gemma-4-31b-it",
            "z-ai/glm4.7",
            "z-ai/glm5",
        ];
        for model in supported {
            assert!(
                OpenAiCompatBackend::supports_chat_template_kwargs(model),
                "{}",
                model
            );
        }

        // Mistral uses reasoning_effort, not chat_template_kwargs
        let unsupported = [
            "mistralai/mistral-small-4-119b-2603",
            "meta/llama-3.1-70b-instruct",
            "stepfun-ai/step-3.5-flash",
            "minimaxai/minimax-m2.5",
        ];
        for model in unsupported {
            assert!(
                !OpenAiCompatBackend::supports_chat_template_kwargs(model),
                "{}",
                model
            );
        }
    }

    #[test]
    fn test_supports_reasoning_effort() {
        assert!(OpenAiCompatBackend::supports_reasoning_effort(
            "mistralai/mistral-small-4-119b-2603"
        ));

        let unsupported = [
            "stepfun-ai/step-3.5-flash",
            "minimaxai/minimax-m2.5",
            "qwen/qwen3.5-122b-a10b",
        ];
        for model in unsupported {
            assert!(
                !OpenAiCompatBackend::supports_reasoning_effort(model),
                "{}",
                model
            );
        }
    }

    #[test]
    fn test_thinking_kwargs_gemma_uses_enable_thinking() {
        for enabled in [true, false] {
            assert_eq!(
                OpenAiCompatBackend::thinking_kwargs("google/gemma-4-31b-it", enabled),
                json!({ "enable_thinking": enabled })
            );
        }
    }

    #[test]
    fn test_thinking_kwargs_glm_uses_enable_thinking_and_clear_thinking() {
        let glm47_on = OpenAiCompatBackend::thinking_kwargs("z-ai/glm4.7", true);
        assert_eq!(
            glm47_on,
            json!({ "enable_thinking": true, "clear_thinking": false })
        );

        let glm5_off = OpenAiCompatBackend::thinking_kwargs("z-ai/glm5", false);
        assert_eq!(
            glm5_off,
            json!({ "enable_thinking": false, "clear_thinking": false })
        );
    }

    #[test]
    fn test_thinking_kwargs_qwen_uses_thinking() {
        for enabled in [true, false] {
            assert_eq!(
                OpenAiCompatBackend::thinking_kwargs("Qwen/Qwen2.5-72B-Instruct", enabled),
                json!({ "thinking": enabled })
            );
        }
    }

    #[test]
    fn test_thinking_kwargs_deepseek_uses_thinking() {
        let kwargs = OpenAiCompatBackend::thinking_kwargs("deepseek-ai/deepseek-r1", true);
        assert_eq!(kwargs, json!({ "thinking": true }));
    }
}

#[cfg(test)]
mod think_strip_tests {
    use super::OpenAiCompatBackend as Backend;

    #[test]
    fn strip_simple() {
        let cleaned = Backend::strip_think_from_str("Hello <think>internal reasoning</think> world");
        assert_eq!(cleaned, "Hello world");
    }

    #[test]
    fn strip_case_insensitive() {
        // Opening and closing tags use different letter casing.
        let cleaned = Backend::strip_think_from_str("A <Think>stuff</THINK> B");
        assert_eq!(cleaned, "A B");
    }

    #[test]
    fn strip_multiple() {
        let cleaned = Backend::strip_think_from_str("<think>a</think>Hello<think>b</think> there");
        assert_eq!(cleaned, "Hello there");
    }

    #[test]
    fn strip_nested_content() {
        // The think block spans several lines.
        let cleaned =
            Backend::strip_think_from_str("prefix <think>line1\nline2\nline3</think> suffix");
        assert_eq!(cleaned, "prefix suffix");
    }

    #[test]
    fn strip_entire_message() {
        let cleaned = Backend::strip_think_from_str("<think>only thinking</think>");
        assert_eq!(cleaned.trim(), "");
    }

    #[test]
    fn strip_no_blocks() {
        let untouched = "No think blocks here";
        assert_eq!(Backend::strip_think_from_str(untouched), untouched);
    }

    #[test]
    fn strip_from_json_args() {
        let raw =
            r#"<think>let me figure out the path</think>{"path":"src/main.rs","content":"hello"}"#;
        let cleaned = Backend::strip_think_from_str(raw);
        let args: serde_json::Value = serde_json::from_str(&cleaned).unwrap();
        assert_eq!(args["path"], "src/main.rs");
    }

    #[test]
    fn strip_interleaved_json() {
        // StepFun pattern: thinking interleaved with JSON
        let raw = r#"{"path":"test.rs"<think>checking the file</think>,"content":"fn main() {}"}"#;
        let cleaned = Backend::strip_think_from_str(raw);
        // The remainder may or may not parse depending on comma positioning;
        // this test only requires that stripping and parsing do not panic.
        let _ = serde_json::from_str::<serde_json::Value>(&cleaned);
    }

    #[test]
    fn strip_empty_string() {
        let cleaned = Backend::strip_think_from_str("");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn strip_whitespace_only() {
        let cleaned = Backend::strip_think_from_str("   ");
        assert_eq!(cleaned.trim(), "");
    }
}

#[cfg(test)]
mod mistral_tool_call_tests {
    use super::OpenAiCompatBackend as Backend;

    #[test]
    fn parse_json_array_variant() {
        let parsed = Backend::parse_mistral_tool_calls(
            r#"I'll use the tool. [TOOL_CALLS] [{"name":"read_file","arguments":{"path":"src/main.rs"}}]"#,
        );
        assert_eq!(parsed.len(), 1);
        let call = &parsed[0];
        assert_eq!(call.name, "read_file");
        assert_eq!(call.arguments["path"], "src/main.rs");
    }

    #[test]
    fn parse_compact_variant() {
        // Compact form: tool name directly followed by its JSON arguments.
        let parsed = Backend::parse_mistral_tool_calls(r#"[TOOL_CALLS] bash{"command":"ls -la"}"#);
        assert_eq!(parsed.len(), 1);
        let call = &parsed[0];
        assert_eq!(call.name, "bash");
        assert_eq!(call.arguments["command"], "ls -la");
    }

    #[test]
    fn parse_multiple_tools() {
        let parsed = Backend::parse_mistral_tool_calls(
            r#"[TOOL_CALLS] [{"name":"read_file","arguments":{"path":"a.rs"}},{"name":"read_file","arguments":{"path":"b.rs"}}]"#,
        );
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].arguments["path"], "a.rs");
        assert_eq!(parsed[1].arguments["path"], "b.rs");
    }

    #[test]
    fn no_marker_returns_empty() {
        let parsed = Backend::parse_mistral_tool_calls("Just regular text with no tool calls");
        assert!(parsed.is_empty());
    }

    #[test]
    fn marker_with_invalid_json() {
        // Marker present, but what follows is not valid JSON.
        let parsed = Backend::parse_mistral_tool_calls("[TOOL_CALLS] {invalid json here}");
        assert!(parsed.is_empty());
    }

    #[test]
    fn empty_name_filtered_out() {
        let parsed = Backend::parse_mistral_tool_calls(r#"[TOOL_CALLS] [{"name":"","arguments":{}}]"#);
        assert!(parsed.is_empty());
    }
}

#[cfg(test)]
mod streaming_tool_call_tests {
    use serde_json::json;

    /// Replays the argument accumulation pattern used for streamed tool calls.
    /// The accumulator must start as json!("") — starting from json!({}) made
    /// `.as_str()` return None and broke concatenation.
    #[test]
    fn streaming_args_accumulate_across_deltas() {
        let mut arguments = json!("");

        // Three fragments as they would arrive: opening, middle, closing.
        let deltas = [r#"{"pa"#, r#"th":"src/"#, r#"main.rs"}"#];
        for delta in deltas {
            let so_far = arguments.as_str().unwrap_or("");
            arguments = json!(format!("{}{}", so_far, delta));
        }

        // The concatenation of all fragments is the complete argument string...
        let assembled = arguments.as_str().unwrap();
        assert_eq!(assembled, r#"{"path":"src/main.rs"}"#);

        // ...and that string is itself valid JSON.
        let decoded: serde_json::Value = serde_json::from_str(assembled).unwrap();
        assert_eq!(decoded["path"], "src/main.rs");
    }

    #[test]
    fn streaming_args_init_as_empty_string_not_object() {
        // A JSON string value exposes its contents through as_str().
        let good_init = json!("");
        assert!(good_init.as_str().is_some(), "json!(\"\") must be a string");
        assert_eq!(good_init.as_str().unwrap(), "");

        // A JSON object does not, which is why it cannot seed the accumulator.
        let bad_init = json!({});
        assert!(bad_init.as_str().is_none(), "json!({{}}) is not a string");
    }

    #[test]
    fn streaming_args_with_think_blocks_cleaned() {
        use super::OpenAiCompatBackend;

        // Simulate StepFun/Qwen model interleaving <think> in arguments
        let raw_args =
            r#"<think>let me write the path</think>{"path":"test.rs","content":"fn main() {}"}"#;
        let stripped = OpenAiCompatBackend::strip_think_from_str(raw_args);
        let decoded: serde_json::Value = serde_json::from_str(&stripped).unwrap();
        assert_eq!(decoded["path"], "test.rs");
        assert_eq!(decoded["content"], "fn main() {}");
    }
}

#[cfg(test)]
mod bracket_matching_tests {
    use super::OpenAiCompatBackend as Backend;

    #[test]
    fn find_matching_bracket_simple() {
        for (input, open, close) in [("{}", '{', '}'), ("[]", '[', ']')] {
            assert_eq!(Backend::find_matching_bracket(input, open, close), 2);
        }
    }

    #[test]
    fn find_matching_bracket_nested() {
        // 13 bytes long; the outer `}` sits at index 12, so 13 is expected.
        let nested = r#"{"a":{"b":1}}"#;
        assert_eq!(Backend::find_matching_bracket(nested, '{', '}'), 13);
    }

    #[test]
    fn find_matching_bracket_with_strings() {
        // 17 bytes long; braces inside the string value must not be counted,
        // so the outer `}` at index 16 gives 17.
        let with_string = r#"{"key":"val{ue}"}"#;
        assert_eq!(Backend::find_matching_bracket(with_string, '{', '}'), 17);
    }

    #[test]
    fn find_matching_bracket_unmatched() {
        let position = Backend::find_matching_bracket("{unclosed", '{', '}');
        assert_eq!(position, 0);
    }

    #[test]
    fn find_matching_bracket_empty() {
        let position = Backend::find_matching_bracket("", '{', '}');
        assert_eq!(position, 0);
    }
}

#[cfg(test)]
mod api_error_tests {
    use super::OpenAiCompatBackend as Backend;
    use reqwest::StatusCode;

    #[test]
    fn format_error_json_message() {
        let formatted = Backend::format_api_error(
            StatusCode::UNAUTHORIZED,
            r#"{"error":{"message":"Invalid API key"}}"#,
        );
        for expected in ["Invalid API key", "401", "check your API key"] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn format_error_detail_field() {
        let formatted =
            Backend::format_api_error(StatusCode::NOT_FOUND, r#"{"detail":"Model not found"}"#);
        for expected in ["Model not found", "404"] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn format_error_message_field() {
        let formatted = Backend::format_api_error(
            StatusCode::TOO_MANY_REQUESTS,
            r#"{"message":"Rate limit exceeded"}"#,
        );
        for expected in ["Rate limit exceeded", "rate limited"] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn format_error_empty_body() {
        let formatted = Backend::format_api_error(StatusCode::INTERNAL_SERVER_ERROR, "");
        for expected in ["500", "server error"] {
            assert!(formatted.contains(expected));
        }
        // no trailing garbage
        assert!(!formatted.contains(": \n"));
    }

    #[test]
    fn format_error_non_json_body() {
        let formatted =
            Backend::format_api_error(StatusCode::BAD_GATEWAY, "Bad Gateway: upstream timeout");
        for expected in ["502", "upstream timeout"] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn format_error_forbidden() {
        let formatted = Backend::format_api_error(
            StatusCode::FORBIDDEN,
            r#"{"error":{"message":"Forbidden"}}"#,
        );
        for expected in ["403", "permissions"] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn format_error_unknown_status() {
        let formatted = Backend::format_api_error(StatusCode::IM_A_TEAPOT, "teapot");
        for expected in ["418", "teapot"] {
            assert!(formatted.contains(expected));
        }
        // No special hint for unknown codes
        assert!(!formatted.contains("check"));
    }
}

#[cfg(test)]
mod usage_parsing_tests {
    use super::OpenAiCompatBackend as Backend;
    use serde_json::json;

    #[test]
    fn parse_usage_full() {
        let body = json!({
            "usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            }
        });
        let parsed = Backend::parse_usage(&body).unwrap();
        assert_eq!(parsed.prompt_tokens, 100);
        assert_eq!(parsed.completion_tokens, 50);
        assert_eq!(parsed.total_tokens, 150);
    }

    #[test]
    fn parse_usage_missing() {
        // No "usage" key at all.
        let body = json!({"choices": []});
        assert!(Backend::parse_usage(&body).is_none());
    }

    #[test]
    fn parse_usage_partial() {
        // Only prompt_tokens is supplied; completion_tokens is expected to be 0.
        let body = json!({"usage": {"prompt_tokens": 42}});
        let parsed = Backend::parse_usage(&body).unwrap();
        assert_eq!(parsed.prompt_tokens, 42);
        assert_eq!(parsed.completion_tokens, 0);
    }
}