car-inference 0.26.0

//! Remote inference backend — HTTP client for cloud API models.
//!
//! The main dispatch uses the `ProtocolHandler` trait from `protocol.rs`.
//! Legacy per-protocol handlers are kept temporarily for the embedding path.
//!
//! Supports: OpenAI-compatible, Anthropic, Google (Gemini), Vertex AI,
//! Azure OpenAI, AWS Bedrock (Converse), and the Parslee managed assistant.

// Legacy handlers are being phased out in favor of ProtocolHandler trait.

use reqwest::Client;
use serde::Deserialize;
use tracing::{debug, instrument};

use crate::key_pool::{KeyLease, KeyPool};
use crate::protocol::ProtocolHandler;
use crate::schema::{ApiProtocol, ModelSchema, ModelSource, ProprietaryAuth};
use crate::tasks::ContentBlock;
use crate::InferenceError;
use std::collections::HashMap;
use std::sync::OnceLock;

/// Name of the car-secrets key under which `car auth login parslee` stores
/// the OAuth2 access token (default `"car"` service).
///
/// MUST stay in sync with the CLI's storage key — the C4 shared-auth
/// refactor will hoist this into one place; until then the string itself is
/// the contract. `pub(crate)` so the registry can gate Parslee model
/// availability on the same credential the inference path leases.
pub(crate) const PARSLEE_ACCESS_TOKEN_ENV: &str = "PARSLEE_ACCESS_TOKEN";

/// Bounded transient-retry budget for a single remote HTTP call inside
/// [`RemoteBackend::execute_request`]: the initial attempt plus 2 retries.
/// Mirrors the values the car-cli run_task loop used before retries moved
/// into the engine.
const REMOTE_MAX_ATTEMPTS: u32 = 3;
/// Linear backoff base, in seconds. The retry loop sleeps `base * attempt`
/// after a failed attempt — 3s after the first, 6s after the second. The
/// final attempt is never followed by a sleep; its error is returned as-is.
const REMOTE_BACKOFF_BASE_SECS: u64 = 3;

/// Whether an HTTP status code is worth retrying.
///
/// Only transient server-side / rate-limit failures qualify:
/// 429 (rate limited), 500, 502, 503, 504 (gateway timeout) and 529
/// (overloaded). 4xx validation/auth failures (400/401/403/404/422) are NOT
/// retried because re-sending the same request just fails again.
///
/// 504 is included because it is the proxy-side counterpart of the
/// client-side timeouts that the transport-error path already retries.
fn is_transient_http_status(code: u16) -> bool {
    matches!(code, 429 | 500 | 502 | 503 | 504 | 529)
}

/// Whether a transport/timeout error is worth retrying.
///
/// Such errors surface as [`InferenceError::InferenceFailed`] whose message
/// starts with `"HTTP error:"`, `"request timed out"`, `"response body read
/// timed out"`, or `"read body:"`. HTTP 4xx responses arrive via the non-2xx
/// `Ok` arm (status-coded), never here.
///
/// The check is a case-insensitive substring match of the rendered error
/// against a fixed list of markers.
fn is_transient_transport_error(e: &InferenceError) -> bool {
    const TRANSIENT_MARKERS: [&str; 6] = [
        "http error",
        "error sending request",
        "timed out",
        "timeout",
        "connection",
        "reset",
    ];
    let message = e.to_string().to_ascii_lowercase();
    TRANSIENT_MARKERS
        .iter()
        .any(|&marker| message.contains(marker))
}

/// Whether a Parslee-path error is an authentication rejection (401/403),
/// i.e. the bearer is bad or expired.
///
/// These reach us as [`InferenceError::InferenceFailed`] strings of the form
/// `"...: HTTP 401: ..."` (org lookup, `/connect/session`, or the chat
/// endpoint). A match drives the reactive `force_refresh`-and-retry-once
/// path. The match is anchored on the `"HTTP 40x"` prefix rather than the
/// bare number, so a body that merely mentions `401`/`403` does not
/// false-positive.
fn is_auth_rejection(e: &InferenceError) -> bool {
    let rendered = e.to_string();
    ["HTTP 401", "HTTP 403"]
        .iter()
        .any(|&needle| rendered.contains(needle))
}

/// Locate the earliest SSE event boundary in `buf`.
///
/// Returns `Some((offset, separator_len))` for the first blank-line
/// separator, or `None` when the buffer holds no complete boundary yet.
/// Two encodings of the blank line are recognised: `"\n\n"` (2 bytes) and
/// the CRLF form `"\r\n\r\n"` (4 bytes) — matching on `"\n\n"` alone would
/// never frame a CRLF stream.
///
/// The scan works on raw bytes rather than a lossily-decoded `String`, which
/// lets a caller keep a partial multi-byte codepoint buffered across chunk
/// boundaries instead of corrupting it into `�`.
fn find_sse_separator(buf: &[u8]) -> Option<(usize, usize)> {
    // Walk the buffer once; the first offset at which either separator
    // starts is by construction the earliest boundary. The two separators
    // begin with different bytes, so they can never tie at one offset.
    (0..buf.len()).find_map(|offset| {
        let rest = &buf[offset..];
        if rest.starts_with(b"\n\n") {
            Some((offset, 2))
        } else if rest.starts_with(b"\r\n\r\n") {
            Some((offset, 4))
        } else {
            None
        }
    })
}

/// Process-wide cache of `(org_id, user_id)` resolved from a Parslee
/// bearer, keyed by the token so a re-login re-resolves. The Parslee
/// managed-inference path (`ModelSource::Proprietary`) talks to the
/// existing hosted assistant endpoint
/// (`POST /api/v1/orgs/{org}/chat/stream`), which needs the caller's
/// org + a user identifier — neither is in the token in a form we
/// parse, so we resolve them once per token via `/organizations/me`
/// and `/connect/session` (mirrors how Parslee Hydra does it).
///
/// The map is created lazily on first use and guarded by an async mutex.
/// NOTE(review): the code visible here only ever inserts entries — no
/// eviction of superseded tokens was found; confirm that is acceptable for
/// long-running processes that refresh tokens.
static PARSLEE_IDENTITY: OnceLock<tokio::sync::Mutex<HashMap<String, (String, String)>>> =
    OnceLock::new();

/// Remote inference client. Reuses a single HTTP client for connection pooling.
/// Integrates with KeyPool for multi-key load balancing.
pub struct RemoteBackend {
    /// Shared `reqwest` client used for every outbound HTTP call this
    /// backend makes (generic protocol path, Bedrock, and Parslee).
    pub(crate) client: Client,
    /// Key pool that model endpoints register their API-key env vars with,
    /// and that receives per-request success/failure reports.
    pub key_pool: KeyPool,
}

/// Rough token estimate for remote models where no tokenizer is available.
///
/// Applies the common "~4 characters per token" heuristic to the UTF-8 byte
/// length of `text` (integer division), and never returns less than 1 — so
/// an empty string still counts as one token.
pub fn estimate_tokens(text: &str) -> usize {
    let approx = text.len() / 4;
    approx.max(1)
}

/// Collect the assistant text from a buffered Parslee chat SSE body.
///
/// Parslee frames events with blank-line separators. The on-wire
/// payload (per Parslee Hydra) is a leading `{ "conversationId": … }`
/// (start, ignored), zero or more `{ "content": … }` deltas
/// (concatenated), and a final event carrying a `timestamp` (stop).
/// An OpenAI-style `[DONE]` sentinel and unparseable frames are
/// skipped. Pure + side-effect-free so the parsing contract is unit-
/// testable without a live backend.
///
/// Both LF (`"\n\n"`) and CRLF (`"\r\n\r\n"`) event separators are
/// accepted. Without this, a CRLF-framed body would collapse into a single
/// block whose `data:` lines are joined into invalid JSON, silently yielding
/// an empty result.
fn collect_parslee_sse(raw: &str) -> String {
    // Normalise CRLF line endings so one `"\n\n"` split frames both forms.
    // JSON payloads cannot contain raw CR/LF inside strings, so this only
    // ever rewrites line terminators. Allocate only when a CRLF is present.
    let normalized: std::borrow::Cow<'_, str> = if raw.contains("\r\n") {
        std::borrow::Cow::Owned(raw.replace("\r\n", "\n"))
    } else {
        std::borrow::Cow::Borrowed(raw)
    };

    let mut out = String::new();
    for block in normalized.split("\n\n") {
        let block = block.trim();
        if block.is_empty() {
            continue;
        }
        // An event may spread its payload over several `data:` lines; they
        // are re-joined with newlines before parsing.
        let data: String = block
            .lines()
            .filter_map(|l| l.strip_prefix("data:").map(str::trim))
            .collect::<Vec<_>>()
            .join("\n");
        if data.is_empty() || data == "[DONE]" {
            continue;
        }
        let Ok(ev) = serde_json::from_str::<serde_json::Value>(&data) else {
            continue;
        };
        if let Some(c) = ev.get("content").and_then(|v| v.as_str()) {
            out.push_str(c);
        }
        // The terminal event carries a `timestamp`; nothing after it counts.
        if ev.get("timestamp").is_some() {
            break;
        }
    }
    out
}

/// Truncate a prompt to fit within a context window.
/// Keeps the most recent content (suffix) since that's most relevant.
/// Returns the truncated prompt.
/// Truncate a prompt to fit within a context window.
/// This is a last-resort fallback — callers should prefer compaction via
/// car-memgine which preserves semantic content. This function is only
/// used when compaction isn't available (e.g., raw API calls).
fn truncate_prompt_to_fit(
    prompt: &str,
    context: Option<&str>,
    tools_json: Option<&[serde_json::Value]>,
    max_tokens: usize,
    context_window: usize,
) -> String {
    let context_tokens = context.map(|c| estimate_tokens(c)).unwrap_or(0);
    let tools_tokens = tools_json
        .map(|t| estimate_tokens(&serde_json::to_string(t).unwrap_or_default()))
        .unwrap_or(0);
    // Reserve space for: context + tools + max_tokens (output) + overhead
    let overhead = 100; // message framing, special tokens
    let reserved = context_tokens + tools_tokens + max_tokens + overhead;
    let available = context_window.saturating_sub(reserved);

    let prompt_tokens = estimate_tokens(prompt);
    if prompt_tokens <= available {
        return prompt.to_string();
    }

    tracing::warn!(
        prompt_tokens = prompt_tokens,
        available_tokens = available,
        context_window = context_window,
        "truncating prompt to fit context window (prefer compaction via car-memgine)"
    );

    // Truncate from the beginning (keep the end, which is most recent/relevant)
    // Estimate chars to keep based on available tokens
    let chars_to_keep = available * 4;
    if chars_to_keep >= prompt.len() {
        return prompt.to_string();
    }

    let start = prompt.len().saturating_sub(chars_to_keep);
    let safe_start = prompt.ceil_char_boundary(start);
    let truncated = &prompt[safe_start..];
    // Find a clean break point (newline or space)
    let break_point = truncated
        .find('\n')
        .or_else(|| truncated.find(' '))
        .unwrap_or(0);

    format!(
        "[...truncated...]\n{}",
        &truncated[break_point..].trim_start()
    )
}

impl RemoteBackend {
    pub fn new() -> Self {
        let client = Client::builder()
            // A large completion (e.g. a multi-KB file written as one tool
            // argument at a 32K output cap) holds a non-streamed socket silent
            // for minutes while generating. The old 120s/90s ceilings killed
            // those legitimate requests ("error sending request"). These are
            // upper bounds, not waits — raising them never slows a fast call.
            .timeout(std::time::Duration::from_secs(300))
            .connect_timeout(std::time::Duration::from_secs(10))
            .read_timeout(std::time::Duration::from_secs(180))
            .build()
            .unwrap_or_default();
        Self {
            client,
            key_pool: KeyPool::new(),
        }
    }

    /// Register every API-key env var declared by `schema` with the key pool.
    ///
    /// Only `ModelSource::RemoteApi` models are considered; any other source,
    /// or a schema that declares no key env vars, is a no-op.
    pub async fn register_model_keys(&self, schema: &ModelSchema) {
        let ModelSource::RemoteApi { endpoint, .. } = &schema.source else {
            return;
        };
        let key_env_vars = schema.all_api_key_envs();
        if key_env_vars.is_empty() {
            return;
        }
        self.key_pool
            .register_endpoint(endpoint, key_env_vars)
            .await;
    }

    /// Unified request execution using the protocol handler abstraction.
    /// All public generate methods delegate to this.
    #[instrument(
        name = "inference.remote_call",
        skip_all,
        fields(
            model = %schema.name,
            provider = %schema.provider,
        ),
    )]
    async fn execute_request(
        &self,
        schema: &ModelSchema,
        req: crate::protocol::ApiRequest,
    ) -> Result<crate::protocol::ApiResponse, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        let lease = self.lease_key(schema, &endpoint).await?;
        // Parslee managed inference (`parslee/*`) is NOT an OpenAI-
        // compatible gateway — it's the existing hosted assistant
        // endpoint (`/api/v1/orgs/{org}/chat/stream`, SSE). Route it
        // through the dedicated path instead of the generic
        // ProtocolHandler pipeline. No key_pool (single bearer).
        if matches!(schema.source, ModelSource::Proprietary { .. }) {
            // #319 (won't-fix): Proprietary is single-bearer, so key_pool endpoint stats (which exist only for multi-key load-balancing) are intentionally skipped — Parslee health is tracked at the model level via the outcome tracker (outcome.rs).
            // The Parslee managed-assistant endpoint owns its own model,
            // prompt, and decoding — it accepts only `{userId, message}` and
            // has no surface for tool definitions, JSON-schema output, or a
            // token cap. Silently dropping them made a tool-expecting caller
            // see zero tool calls (looks like the model declined) instead of a
            // clear "unsupported". Reject up front, mirroring the audio/video
            // guard in `generate_with_tools_multi`.
            if req.tools.as_ref().is_some_and(|t| !t.is_empty()) {
                return Err(InferenceError::UnsupportedMode {
                    mode: "tool-use",
                    backend: "parslee-assistant",
                    reason: "the Parslee managed-assistant endpoint does not accept tool \
                             definitions — route tool-calling requests to an OpenAI/Anthropic/\
                             Google model, or a local model with ToolUse capability",
                });
            }
            if req.response_format.is_some() {
                return Err(InferenceError::UnsupportedMode {
                    mode: "structured-output",
                    backend: "parslee-assistant",
                    reason: "the Parslee managed-assistant endpoint does not accept a \
                             response_format / JSON schema — route structured-output requests \
                             to a model that supports it",
                });
            }
            match self
                .parslee_assistant_request(&endpoint, &lease.api_key, &req)
                .await
            {
                Ok(resp) => return Ok(resp),
                Err(e) if is_auth_rejection(&e) => {
                    // The bearer was rejected (401/403). Reactively mint a
                    // fresh one and retry ONCE — the proactive expiry window
                    // (car_auth::access_token_refreshing) misses tokens that
                    // are revoked early or stored without an expiry, which is
                    // exactly the 401 burst that poisoned model health (#313).
                    match car_auth::force_refresh().await {
                        Some(fresh) => {
                            return self
                                .parslee_assistant_request(&endpoint, &fresh, &req)
                                .await;
                        }
                        // No refresh token, or refresh failed: surface the
                        // original auth error unchanged.
                        None => return Err(e),
                    }
                }
                Err(e) => return Err(e),
            }
        }

        // AWS Bedrock Converse — SigV4-signed, region-scoped, its own request
        // shape. Handled here (not the generic handler path) because signing
        // needs the assembled body + URL + timestamp. `endpoint` is the region.
        if matches!(protocol, ApiProtocol::Bedrock) {
            return self.bedrock_converse_request(&endpoint, &req).await;
        }

        let handler = crate::protocol::handler_for(protocol);
        let start = std::time::Instant::now();

        // Build URL
        let api_version = match &schema.source {
            ModelSource::RemoteApi { api_version, .. } => api_version.clone(),
            _ => None,
        };
        let url = if matches!(protocol, ApiProtocol::Google) {
            crate::protocol::google_url(&endpoint, &req.model, &lease.api_key)
        } else if matches!(protocol, ApiProtocol::VertexAi) {
            crate::protocol::vertex_url(&endpoint, &req.model)
        } else if matches!(protocol, ApiProtocol::AzureOpenAi) {
            let version = api_version.as_deref().unwrap_or("2024-10-21");
            format!(
                "{}/openai/deployments/{}/chat/completions?api-version={}",
                endpoint.trim_end_matches('/'),
                req.model,
                version
            )
        } else {
            format_endpoint(&endpoint, chat_path_for(schema, handler.as_ref()))
        };

        // Build headers
        let headers = handler.auth_headers(&lease.api_key);

        // Build request body
        let body = handler.build_request_body(&req);

        debug!(url = %url, model = %req.model, "protocol handler request");

        // Execute HTTP request with a bounded transient-retry loop. A genuine
        // blip (transport reset, timeout, 5xx/429/529 overloaded) must not throw
        // away a long-horizon task that has already done real work; client errors
        // (4xx validation/auth) are NOT retried — re-sending the same bad request
        // is pointless. The per-attempt tokio timeout (#26) is the safety net for
        // an OS/kernel-level socket hang reqwest's own timeout can't cancel.
        let mut attempt = 0u32;
        let resp_text = loop {
            attempt += 1;
            let send_outcome: Result<(reqwest::StatusCode, String), InferenceError> = async {
                let mut builder = self.client.post(&url);
                for (name, value) in &headers {
                    builder = builder.header(name.as_str(), value.as_str());
                }
                let send_fut = builder.json(&body).send();
                let resp = tokio::time::timeout(std::time::Duration::from_secs(300), send_fut)
                    .await
                    .map_err(|_| {
                        InferenceError::InferenceFailed(
                            "request timed out after 300s (tokio safety timeout)".to_string(),
                        )
                    })?
                    .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;
                let status = resp.status();
                let body_fut = resp.text();
                let txt = tokio::time::timeout(std::time::Duration::from_secs(300), body_fut)
                    .await
                    .map_err(|_| {
                        InferenceError::InferenceFailed(
                            "response body read timed out after 300s".to_string(),
                        )
                    })?
                    .map_err(|e| InferenceError::InferenceFailed(format!("read body: {e}")))?;
                Ok((status, txt))
            }
            .await;

            match send_outcome {
                Ok((status, txt)) if status.is_success() => break txt,
                Ok((status, txt)) => {
                    // Non-2xx: keep the existing key_pool failure accounting on
                    // EVERY attempt so multi-key load balancing / rate-limit
                    // detection stays accurate.
                    let is_rl = txt.contains("429") || txt.contains("RESOURCE_EXHAUSTED");
                    self.key_pool
                        .report_failure(&endpoint, &lease.env_var, is_rl)
                        .await;
                    let err_msg = format!("API returned {status}: {txt}");
                    if attempt < REMOTE_MAX_ATTEMPTS && is_transient_http_status(status.as_u16()) {
                        tokio::time::sleep(std::time::Duration::from_secs(
                            REMOTE_BACKOFF_BASE_SECS * attempt as u64,
                        ))
                        .await;
                        continue;
                    }
                    return Err(InferenceError::InferenceFailed(err_msg));
                }
                Err(e) => {
                    // Transport / timeout error.
                    if attempt < REMOTE_MAX_ATTEMPTS && is_transient_transport_error(&e) {
                        tokio::time::sleep(std::time::Duration::from_secs(
                            REMOTE_BACKOFF_BASE_SECS * attempt as u64,
                        ))
                        .await;
                        continue;
                    }
                    return Err(e);
                }
            }
        };

        let latency_ms = start.elapsed().as_millis() as u64;

        let est_tokens = req
            .messages
            .iter()
            .filter_map(|m| m.get("content").and_then(|c| c.as_str()))
            .map(|s| s.len() / 4)
            .sum::<usize>() as u64;
        self.key_pool
            .report_success(&endpoint, &lease.env_var, latency_ms, est_tokens, 0)
            .await;

        let mut response = handler.parse_response(&resp_text)?;
        // Fill in context_window from the model schema
        if let Some(ref mut usage) = response.usage {
            usage.context_window = schema.context_length as u64;
        }
        Ok(response)
    }

    /// Send one AWS Bedrock Converse request: a SigV4-signed POST to
    /// `bedrock-runtime.{region}.amazonaws.com/model/{modelId}/converse`.
    ///
    /// `region` is the model's configured `endpoint`; `req.model` is the
    /// Bedrock model id. Credentials come from the standard AWS env vars.
    ///
    /// # Errors
    /// Returns `InferenceError::InferenceFailed` when credentials are
    /// missing, `region` is blank, the body cannot be serialized, the send
    /// fails or exceeds the 300s guard, the body cannot be read, or Bedrock
    /// answers with a non-2xx status. Response parsing errors are passed
    /// through from the handler.
    async fn bedrock_converse_request(
        &self,
        region: &str,
        req: &crate::protocol::ApiRequest,
    ) -> Result<crate::protocol::ApiResponse, InferenceError> {
        // Credentials are checked first, then the region.
        let Some(creds) = crate::aws_sigv4::AwsCredentials::from_env() else {
            return Err(InferenceError::InferenceFailed(
                "Bedrock requires AWS credentials: set AWS_ACCESS_KEY_ID and \
                 AWS_SECRET_ACCESS_KEY (plus AWS_SESSION_TOKEN for temporary creds)"
                    .to_string(),
            ));
        };
        if region.trim().is_empty() {
            return Err(InferenceError::InferenceFailed(
                "Bedrock model endpoint must be the AWS region (e.g. \"us-east-1\")".to_string(),
            ));
        }

        // Serialize once: the very same bytes are hashed for the signature
        // and later sent on the wire.
        let handler = crate::protocol::BedrockHandler;
        let payload = handler.build_request_body(req);
        let payload_bytes = match serde_json::to_vec(&payload) {
            Ok(bytes) => bytes,
            Err(e) => {
                return Err(InferenceError::InferenceFailed(format!(
                    "serialize Bedrock body: {e}"
                )));
            }
        };

        // The model id contains `:` — encode it identically for the URL and the
        // signed canonical path (else the SigV4 signature won't match).
        let host = format!("bedrock-runtime.{region}.amazonaws.com");
        let model_segment = crate::aws_sigv4::uri_encode_segment(&req.model);
        let canonical_path = format!("/model/{model_segment}/converse");
        let url = format!("https://{host}{canonical_path}");

        // One timestamp feeds both the full `amz` date and the short date stamp.
        let signing_time = chrono::Utc::now();
        let amz_date = signing_time.format("%Y%m%dT%H%M%SZ").to_string();
        let date_stamp = signing_time.format("%Y%m%d").to_string();

        let unsigned_headers = vec![
            ("host".to_string(), host.clone()),
            ("content-type".to_string(), "application/json".to_string()),
        ];
        let signed = crate::aws_sigv4::signed_headers(
            &creds,
            region,
            "bedrock",
            "POST",
            &canonical_path,
            "",
            &unsigned_headers,
            &payload_bytes,
            &amz_date,
            &date_stamp,
        );

        let mut request = self.client.post(&url);
        for (header_name, header_value) in &signed {
            // reqwest sets Host itself from the URL (matching what we signed).
            if header_name != "host" {
                request = request.header(header_name, header_value);
            }
        }

        // Send the EXACT bytes that were hashed for the signature (not `.json()`,
        // which would re-serialize and could differ).
        let in_flight = request.body(payload_bytes).send();
        let response = tokio::time::timeout(std::time::Duration::from_secs(300), in_flight)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed("Bedrock request timed out after 300s".to_string())
            })?
            .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;

        let status = response.status();
        let txt = response
            .text()
            .await
            .map_err(|e| InferenceError::InferenceFailed(format!("read Bedrock body: {e}")))?;

        if status.is_success() {
            // `usage.context_window` is left for the caller to fill in: this
            // function has no access to the model schema.
            handler.parse_response(&txt)
        } else {
            Err(InferenceError::InferenceFailed(format!(
                "Bedrock returned {status}: {txt}"
            )))
        }
    }

    /// Resolve the `(org_id, user_id)` pair for a Parslee bearer, caching the
    /// result per token in `PARSLEE_IDENTITY`.
    ///
    /// Mirrors Parslee Hydra: the org comes from `/api/v1/organizations/me`,
    /// and a user identifier (the account email, best-effort) from
    /// `/connect/session`. The backend authorizes off the bearer and the
    /// `userId` in the chat body is informational, so a failed or email-less
    /// session lookup degrades to `"car"` instead of failing the request.
    ///
    /// # Errors
    /// Returns `InferenceError::InferenceFailed` when the org lookup cannot
    /// be sent, answers non-2xx, cannot be parsed as JSON, or carries no
    /// `organizationId` / `OrganizationId` string.
    async fn parslee_identity(
        &self,
        endpoint: &str,
        bearer: &str,
    ) -> Result<(String, String), InferenceError> {
        let cache = PARSLEE_IDENTITY.get_or_init(|| tokio::sync::Mutex::new(HashMap::new()));

        // Fast path: this token has been resolved before.
        let cached = cache.lock().await.get(bearer).cloned();
        if let Some(known) = cached {
            return Ok(known);
        }

        let base = endpoint.trim_end_matches('/');

        // --- Organization (mandatory) ---
        let org_url = format!("{base}/api/v1/organizations/me");
        let org_resp = match self.client.get(&org_url).bearer_auth(bearer).send().await {
            Ok(resp) => resp,
            Err(e) => {
                return Err(InferenceError::InferenceFailed(format!(
                    "Parslee org lookup: {e}"
                )));
            }
        };
        let status = org_resp.status();
        if !status.is_success() {
            let body = org_resp.text().await.unwrap_or_default();
            return Err(InferenceError::InferenceFailed(format!(
                "Parslee org lookup failed: HTTP {status}: {body}"
            )));
        }
        let org_json: serde_json::Value = org_resp
            .json()
            .await
            .map_err(|e| InferenceError::InferenceFailed(format!("parse Parslee org: {e}")))?;

        // Accept either casing of the key; the first one present wins, and
        // it must hold a string.
        let org_field = ["organizationId", "OrganizationId"]
            .iter()
            .find_map(|key| org_json.get(*key))
            .and_then(|value| value.as_str());
        let Some(org_id) = org_field.map(str::to_string) else {
            return Err(InferenceError::InferenceFailed(
                "Parslee org response has no organizationId — the signed-in \
                 account has no workspace yet (sign in via CAR Host.app or \
                 the web to finish onboarding)"
                    .to_string(),
            ));
        };

        // --- User identifier (best-effort) ---
        let sess_url = format!("{base}/connect/session");
        let fallback_user = || "car".to_string();
        let user_id = match self.client.get(&sess_url).bearer_auth(bearer).send().await {
            Ok(resp) if resp.status().is_success() => {
                let session = resp.json::<serde_json::Value>().await.ok();
                session
                    .as_ref()
                    .and_then(|json| json.get("account"))
                    .and_then(|account| account.get("email"))
                    .and_then(|email| email.as_str())
                    .map(String::from)
                    .unwrap_or_else(fallback_user)
            }
            _ => fallback_user(),
        };

        // Remember the pair for subsequent requests with the same token.
        let resolved = (org_id, user_id);
        let mut entries = cache.lock().await;
        entries.insert(bearer.to_string(), resolved.clone());
        drop(entries);
        Ok(resolved)
    }

    /// Parslee managed inference via the existing hosted assistant
    /// endpoint (`POST /api/v1/orgs/{org}/chat/stream`, SSE). This is
    /// a *managed assistant* turn, not raw model inference: Parslee
    /// owns the model/prompt/conversation, so CAR flattens its message
    /// list into the single `message` string the `GenericChatRequest`
    /// contract takes. `infer` is non-streaming, so we buffer the SSE
    /// body and concatenate the `content` deltas. Zero backend deps —
    /// works against today's deployed Parslee.
    ///
    /// The system prompt (if non-blank) and every message whose `content`
    /// is a non-empty string are rendered as `"{role}: {content}"` lines;
    /// messages with any other content shape are skipped.
    ///
    /// # Errors
    /// Returns `InferenceError::InferenceFailed` when identity resolution
    /// fails, the flattened prompt is empty, the request or body read fails
    /// or exceeds its timeout, the endpoint answers non-2xx (rendered as
    /// `"... HTTP {status}: ..."`), or the stream yields no content.
    async fn parslee_assistant_request(
        &self,
        endpoint: &str,
        bearer: &str,
        req: &crate::protocol::ApiRequest,
    ) -> Result<crate::protocol::ApiResponse, InferenceError> {
        // Single source of truth for the tokio guards AND the seconds quoted
        // in the timeout errors, so the two can no longer drift apart (the
        // messages used to claim 150s/120s while the guards were 300s).
        const PARSLEE_TIMEOUT_SECS: u64 = 300;
        let call_timeout = std::time::Duration::from_secs(PARSLEE_TIMEOUT_SECS);

        let (org_id, user_id) = self.parslee_identity(endpoint, bearer).await?;

        // Flatten system prompt + conversation into one role-tagged string.
        let mut parts: Vec<String> = Vec::new();
        if let Some(sys) = &req.system {
            if !sys.trim().is_empty() {
                parts.push(format!("system: {sys}"));
            }
        }
        for m in &req.messages {
            let role = m.get("role").and_then(|v| v.as_str()).unwrap_or("user");
            let content = m.get("content").and_then(|v| v.as_str()).unwrap_or("");
            if !content.is_empty() {
                parts.push(format!("{role}: {content}"));
            }
        }
        let message = parts.join("\n");
        if message.trim().is_empty() {
            return Err(InferenceError::InferenceFailed(
                "Parslee assistant: empty prompt".to_string(),
            ));
        }

        let url = format!(
            "{}/api/v1/orgs/{}/chat/stream",
            endpoint.trim_end_matches('/'),
            org_id
        );
        let body = serde_json::json!({ "userId": user_id, "message": message });
        let send_fut = self
            .client
            .post(&url)
            .bearer_auth(bearer)
            .header("content-type", "application/json")
            .header("accept", "text/event-stream")
            .json(&body)
            .send();
        let resp = tokio::time::timeout(call_timeout, send_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(format!(
                    "Parslee chat request timed out ({PARSLEE_TIMEOUT_SECS}s)"
                ))
            })?
            .map_err(|e| {
                InferenceError::InferenceFailed(format!("Parslee chat HTTP error: {e}"))
            })?;
        let status = resp.status();
        // The whole SSE body is buffered; the same ceiling guards the read.
        let text_fut = resp.text();
        let raw = tokio::time::timeout(call_timeout, text_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(format!(
                    "Parslee chat body read timed out ({PARSLEE_TIMEOUT_SECS}s)"
                ))
            })?
            .map_err(|e| {
                InferenceError::InferenceFailed(format!("Parslee chat read body: {e}"))
            })?;
        if !status.is_success() {
            // The "HTTP {status}" shape is what `is_auth_rejection` keys on.
            return Err(InferenceError::InferenceFailed(format!(
                "Parslee chat failed: HTTP {status}: {raw}"
            )));
        }

        let out = collect_parslee_sse(&raw);
        if out.is_empty() {
            return Err(InferenceError::InferenceFailed(
                "Parslee assistant returned no content".to_string(),
            ));
        }
        // The managed endpoint reports neither tool calls, usage, nor a stop
        // reason.
        Ok(crate::protocol::ApiResponse {
            text: out,
            tool_calls: Vec::new(),
            usage: None,
            stop_reason: None,
        })
    }

    /// Generate plain text via a remote API, using load-balanced key
    /// selection.
    ///
    /// Thin wrapper over [`Self::generate_with_tools_multi`] with no tools,
    /// no conversation history, no tool-choice/parallel settings, a zero
    /// thinking budget, cache control off, and no response format. Only the
    /// text of the response is returned; tool calls, usage and stop reason
    /// are discarded.
    pub async fn generate(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        images: Option<&[ContentBlock]>,
    ) -> Result<String, InferenceError> {
        let (text, _tool_calls, _usage, _stop_reason) = self
            .generate_with_tools_multi(
                schema,
                prompt,
                context,
                temperature,
                max_tokens,
                None, // tools
                images,
                None,  // messages
                None,  // tool_choice
                None,  // parallel_tool_calls
                0,     // budget_tokens
                false, // cache_control
                None,  // response_format
            )
            .await?;
        Ok(text)
    }

    /// Generate with optional tool definitions, returning the response text
    /// together with any tool calls the model made.
    ///
    /// Thin wrapper over [`Self::generate_with_tools_multi`]: it forwards
    /// `tools` and `images`, passes no conversation history, tool-choice or
    /// parallel settings, a zero thinking budget, cache control off and no
    /// response format, then drops the usage and stop-reason parts of the
    /// result.
    ///
    /// NOTE(review): earlier docs on this method describe enrichment of
    /// suspiciously short (< 50 chars) `done` tool-call results with the
    /// model's text output (fixes #10). No such logic lives in this wrapper —
    /// presumably it is in the delegate; confirm.
    pub async fn generate_with_tools(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
    ) -> Result<(String, Vec<crate::tasks::generate::ToolCall>), InferenceError> {
        self.generate_with_tools_multi(
            schema,
            prompt,
            context,
            temperature,
            max_tokens,
            tools,
            images,
            None,  // messages
            None,  // tool_choice
            None,  // parallel_tool_calls
            0,     // budget_tokens
            false, // cache_control
            None,  // response_format
        )
        .await
        .map(|(text, calls, _usage, _stop)| (text, calls))
    }

    /// Multi-turn generation with optional tools, routed through the
    /// protocol handler selected for the schema's API protocol.
    ///
    /// Steps, in order:
    /// 1. Reject video / audio content blocks (in `images`, or inside any
    ///    `UserMultimodal` history message) when the handler reports no
    ///    support for that modality — returned as
    ///    [`InferenceError::UnsupportedMode`].
    /// 2. When the schema declares a non-zero context length, pass the prompt
    ///    through `truncate_prompt_to_fit`.
    /// 3. Build messages and tools with the handler and run the request via
    ///    `execute_request`.
    /// 4. Enrich `done` tool calls (fix #10): when the response carries text
    ///    and a `done` call's `result` argument is shorter than 50 bytes (a
    ///    missing or non-string `result` counts as empty) and shorter than
    ///    the text, the text replaces it.
    ///
    /// When `cache_control` is true, system prompt and tool definitions are
    /// marked with Anthropic prompt caching breakpoints for cache reuse
    /// across calls. `tool_choice`, `parallel_tool_calls`, `budget_tokens`
    /// and `response_format` are forwarded on the request as given.
    ///
    /// Returns `(text, tool_calls, usage, stop_reason)`.
    pub async fn generate_with_tools_multi(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
        messages: Option<&[crate::tasks::generate::Message]>,
        tool_choice: Option<&str>,
        parallel_tool_calls: Option<bool>,
        budget_tokens: usize,
        cache_control: bool,
        response_format: Option<&crate::tasks::generate::ResponseFormat>,
    ) -> Result<
        (
            String,
            Vec<crate::tasks::generate::ToolCall>,
            Option<crate::TokenUsage>,
            Option<String>,
        ),
        InferenceError,
    > {
        use crate::tasks::generate::Message;

        let (_, protocol) = extract_remote_endpoint(schema)?;
        let handler = crate::protocol::handler_for(protocol);

        // Modality pre-check. Letting an unsupported block fall through would
        // stringify it to `[video: <source>]` / `[audio: <source>]`, and the
        // model would answer confidently about content it never saw — so
        // surface an explicit typed error instead. Scoped in a block so the
        // scanning closure is gone before any `.await`.
        {
            let request_carries = |is_kind: fn(&ContentBlock) -> bool| -> bool {
                let in_images = images.is_some_and(|blocks| blocks.iter().any(is_kind));
                let in_history = messages.is_some_and(|turns| {
                    turns.iter().any(|turn| {
                        matches!(
                            turn,
                            Message::UserMultimodal { content } if content.iter().any(is_kind)
                        )
                    })
                });
                in_images || in_history
            };

            if !handler.supports_video() && request_carries(ContentBlock::is_video) {
                return Err(InferenceError::UnsupportedMode {
                    mode: "video-content-block",
                    backend: handler.protocol_name(),
                    reason: "this remote protocol has no native video input path; route to \
                         a provider that implements ProtocolHandler::supports_video() (Gemini)",
                });
            }
            if !handler.supports_audio() && request_carries(ContentBlock::is_audio) {
                return Err(InferenceError::UnsupportedMode {
                    mode: "audio-content-block",
                    backend: handler.protocol_name(),
                    reason: "this remote protocol has no native audio input path; route to \
                         a provider that implements ProtocolHandler::supports_audio() (Gemini)",
                });
            }
        }

        // Shrink the prompt only when the schema actually declares a window.
        let fitted_prompt = if schema.context_length > 0 {
            truncate_prompt_to_fit(prompt, context, tools, max_tokens, schema.context_length)
        } else {
            prompt.to_string()
        };

        // Protocol-specific message and tool encoding.
        let history = messages.unwrap_or(&[]);
        let (api_messages, system) =
            handler.build_messages(history, &fitted_prompt, context, images);
        let api_tools = tools.map(|defs| handler.build_tools(defs));

        let request = crate::protocol::ApiRequest {
            model: request_model_name(schema),
            messages: api_messages,
            system,
            temperature,
            max_tokens,
            tools: api_tools,
            tool_choice: tool_choice.map(str::to_string),
            parallel_tool_calls,
            stream: false,
            budget_tokens,
            cache_control,
            response_format: response_format.cloned(),
        };

        let response = self.execute_request(schema, request).await?;

        let text = response.text;
        let mut calls = response.tool_calls;
        let usage = response.usage;
        let stop_reason = response.stop_reason;

        // Fix #10: models often call done({"result": "completed"}) while the
        // actual findings sit in the text block of the same response.
        if !text.is_empty() {
            for call in calls.iter_mut().filter(|c| c.name == "done") {
                let current_len = call
                    .arguments
                    .get("result")
                    .and_then(|v| v.as_str())
                    .map_or(0, str::len);
                if current_len < 50 && text.len() > current_len {
                    call.arguments.insert(
                        "result".to_string(),
                        serde_json::Value::String(text.clone()),
                    );
                }
            }
        }

        Ok((text, calls, usage, stop_reason))
    }

    /// Generate embeddings via a remote API (OpenAI-compatible only for now).
    pub async fn embed(
        &self,
        schema: &ModelSchema,
        texts: &[String],
    ) -> Result<Vec<Vec<f32>>, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        let lease = self.lease_key(schema, &endpoint).await?;
        let start = std::time::Instant::now();

        let result = match protocol {
            ApiProtocol::OpenAiCompat => {
                self.embed_openai(&endpoint, &lease.api_key, &schema.name, texts)
                    .await
            }
            _ => Err(InferenceError::InferenceFailed(format!(
                "embedding not supported for {:?} protocol",
                protocol
            ))),
        };

        let latency_ms = start.elapsed().as_millis() as u64;
        match &result {
            Ok(_) => {
                let est_tokens = texts
                    .iter()
                    .map(|t| t.split_whitespace().count() as u64)
                    .sum();
                self.key_pool
                    .report_success(&endpoint, &lease.env_var, latency_ms, est_tokens, 0)
                    .await;
            }
            Err(e) => {
                let is_rl =
                    e.to_string().contains("429") || e.to_string().contains("RESOURCE_EXHAUSTED");
                self.key_pool
                    .report_failure(&endpoint, &lease.env_var, is_rl)
                    .await;
            }
        }

        result
    }

    /// Resolve the credential to use for `schema` at `endpoint`.
    ///
    /// * `RemoteApi` — registers the model's keys with the pool (idempotent)
    ///   and leases one, with the schema's `api_key_env` as the env fallback.
    /// * `Ollama` / `VllmMlx` — returns an empty lease.
    /// * `Proprietary` — resolves a single bearer credential directly (no
    ///   pool lease / round-robin) and fails with a hint naming the env var
    ///   and the `car auth login` command when none is found.
    /// * Any other source — fails with "model … is not remote".
    async fn lease_key(
        &self,
        schema: &ModelSchema,
        endpoint: &str,
    ) -> Result<KeyLease, InferenceError> {
        // Lease shape shared by every source that bypasses the key pool.
        let unpooled = |api_key: String| KeyLease {
            api_key,
            env_var: String::new(),
            index: 0,
        };

        let fallback_env = match &schema.source {
            ModelSource::RemoteApi { api_key_env, .. } => api_key_env.as_str(),
            ModelSource::Ollama { .. } | ModelSource::VllmMlx { .. } => {
                return Ok(unpooled(String::new()));
            }
            // Proprietary providers (Parslee) carry one bearer credential
            // rather than a pool of rotatable env keys.
            ModelSource::Proprietary { auth, provider, .. } => {
                let (resolved, source_label) = match auth {
                    // OAuth2 PKCE: the access token is minted by
                    // `car auth login parslee`. Refresh a lapsed token up
                    // front instead of letting the request 401 (which
                    // poisons 30-day model health, #313). The
                    // `PARSLEE_ACCESS_TOKEN` env override still wins and is
                    // never refreshed; without a stored expiry this degrades
                    // to the prior env-first/keychain read.
                    ProprietaryAuth::OAuth2Pkce { .. } => (
                        car_auth::access_token_refreshing().await,
                        PARSLEE_ACCESS_TOKEN_ENV,
                    ),
                    ProprietaryAuth::BearerTokenEnv { env_var }
                    | ProprietaryAuth::ApiKeyEnv { env_var } => {
                        (car_secrets::resolve_env_or_keychain(env_var), env_var.as_str())
                    }
                };
                let Some(token) = resolved else {
                    return Err(InferenceError::InferenceFailed(format!(
                        "no credential for proprietary provider '{provider}': set ${source_label} \
                         or run `car auth login {provider}` (model {})",
                        schema.id
                    )));
                };
                return Ok(unpooled(token));
            }
            _ => {
                return Err(InferenceError::InferenceFailed(format!(
                    "model {} is not remote",
                    schema.id
                )));
            }
        };

        // Register keys on first use (idempotent).
        self.register_model_keys(schema).await;

        match self.key_pool.lease_or_env(endpoint, fallback_env).await {
            Some(lease) => Ok(lease),
            None => Err(InferenceError::InferenceFailed(format!(
                "no API keys available for endpoint {} (checked env vars: {:?})",
                endpoint,
                schema.all_api_key_envs()
            ))),
        }
    }

    // --- Embedding (not yet migrated to ProtocolHandler) ---

    /// POST `texts` to the endpoint's `/v1/embeddings` route (OpenAI wire
    /// shape) and return the vectors in input order.
    ///
    /// Every failure is an `InferenceError::InferenceFailed` whose message
    /// names the stage: `HTTP error`, `read body`, `API returned <status>`
    /// (non-2xx, with the response body appended) or `parse response`.
    async fn embed_openai(
        &self,
        endpoint: &str,
        api_key: &str,
        model: &str,
        texts: &[String],
    ) -> Result<Vec<Vec<f32>>, InferenceError> {
        // Uniform "<stage>: <cause>" error construction.
        fn failed(stage: &str, cause: impl std::fmt::Display) -> InferenceError {
            InferenceError::InferenceFailed(format!("{stage}: {cause}"))
        }

        let url = format_endpoint(endpoint, "/v1/embeddings");
        let payload = serde_json::json!({
            "model": model,
            "input": texts,
        });

        let response = self
            .client
            .post(url.as_str())
            .header("Authorization", format!("Bearer {api_key}"))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await
            .map_err(|e| failed("HTTP error", e))?;

        // Capture the status first: reading the body consumes the response.
        let status = response.status();
        let raw = response
            .text()
            .await
            .map_err(|e| failed("read body", e))?;

        if !status.is_success() {
            return Err(InferenceError::InferenceFailed(format!(
                "API returned {status}: {raw}"
            )));
        }

        serde_json::from_str::<OpenAiEmbedResponse>(&raw)
            .map(|parsed| order_embeddings(parsed.data))
            .map_err(|e| failed("parse response", e))
    }

    /// Stream a response from a remote API using Server-Sent Events.
    /// Returns a channel receiver that yields StreamEvents.
    /// Works with OpenAI-compatible and Anthropic APIs.
    pub async fn generate_stream(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
        tool_choice: Option<&str>,
        parallel_tool_calls: Option<bool>,
        response_format: Option<&crate::tasks::generate::ResponseFormat>,
    ) -> Result<tokio::sync::mpsc::Receiver<crate::stream::StreamEvent>, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        // Parslee managed inference is a non-streaming hosted-assistant
        // turn (see `parslee_assistant_request`). The generic streaming
        // pipeline below would hit an OpenAI-compat path the Parslee
        // backend doesn't expose; fail fast with a clear message
        // instead. Callers use the non-streaming `generate`/`infer`
        // path for `parslee/*` (streaming support is a follow-up).
        if matches!(schema.source, ModelSource::Proprietary { .. }) {
            return Err(InferenceError::InferenceFailed(
                "parslee/* models do not support streaming inference yet; \
                 use the non-streaming path (generate/infer)"
                    .to_string(),
            ));
        }
        let lease = self.lease_key(schema, &endpoint).await?;
        let api_key = lease.api_key;
        let model = request_model_name(schema);
        let handler = crate::protocol::handler_for(protocol);

        // Mirror the non-streaming audio/video guard so streaming
        // callers can't bypass the supports_* check and silently
        // stringify audio/video blocks via the handler fallback arms.
        if !handler.supports_video()
            && images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_video))
        {
            return Err(InferenceError::UnsupportedMode {
                mode: "video-content-block",
                backend: handler.protocol_name(),
                reason: "this remote protocol has no native video input path; route to \
                     a provider that implements ProtocolHandler::supports_video() (Gemini)",
            });
        }
        if !handler.supports_audio()
            && images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_audio))
        {
            return Err(InferenceError::UnsupportedMode {
                mode: "audio-content-block",
                backend: handler.protocol_name(),
                reason: "this remote protocol has no native audio input path; route to \
                     a provider that implements ProtocolHandler::supports_audio() (Gemini)",
            });
        }

        // Streaming path now flows through the same protocol abstraction
        // the non-streaming path uses (#125): build an `ApiRequest`,
        // delegate body construction to `handler.build_request_body`,
        // delegate auth/content-type to `handler.auth_headers`. Anything
        // the protocol abstraction picks up — `response_format`,
        // `tool_choice`, `parallel_tool_calls`, future fields — flows
        // through to streaming for free without re-implementing per
        // provider.
        if matches!(protocol, ApiProtocol::Google) {
            return Err(InferenceError::InferenceFailed(
                "streaming not supported for Google protocol".to_string(),
            ));
        }

        let (messages, system) = handler.build_messages(&[], prompt, context, images);
        let built_tools = tools.map(|t| handler.build_tools(t));
        let req = crate::protocol::ApiRequest {
            model: model.clone(),
            messages,
            system,
            temperature,
            max_tokens,
            tools: built_tools,
            tool_choice: tool_choice.map(str::to_string),
            parallel_tool_calls,
            stream: true,
            budget_tokens: 0,
            cache_control: false,
            response_format: response_format.cloned(),
        };
        let body = handler.build_request_body(&req);

        // Build URL — Google/Vertex stream via :streamGenerateContent?alt=sse,
        // Azure has its deployment shape, everyone else uses the handler path.
        let url = if matches!(protocol, ApiProtocol::Google) {
            crate::protocol::google_stream_url(&endpoint, &model, &api_key)
        } else if matches!(protocol, ApiProtocol::VertexAi) {
            crate::protocol::vertex_stream_url(&endpoint, &model)
        } else if matches!(protocol, ApiProtocol::AzureOpenAi) {
            let api_version = match &schema.source {
                ModelSource::RemoteApi { api_version, .. } => api_version.clone(),
                _ => None,
            };
            let version = api_version.as_deref().unwrap_or("2024-10-21");
            format!(
                "{}/openai/deployments/{}/chat/completions?api-version={}",
                endpoint.trim_end_matches('/'),
                model,
                version
            )
        } else {
            format_endpoint(&endpoint, chat_path_for(schema, handler.as_ref()))
        };

        let mut headers = reqwest::header::HeaderMap::new();
        for (name, value) in handler.auth_headers(&api_key) {
            headers.insert(
                reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
                    InferenceError::InferenceFailed(format!("auth header name: {e}"))
                })?,
                value.parse().map_err(|e| {
                    InferenceError::InferenceFailed(format!("auth header value: {e}"))
                })?,
            );
        }

        let send_fut = self.client.post(&url).headers(headers).json(&body).send();
        let resp = tokio::time::timeout(std::time::Duration::from_secs(300), send_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(
                    "stream request timed out after 300s (tokio safety timeout)".to_string(),
                )
            })?
            .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;

        let status = resp.status();
        if !status.is_success() {
            let err_text = resp.text().await.unwrap_or_default();
            return Err(InferenceError::InferenceFailed(format!(
                "API returned {status}: {err_text}"
            )));
        }

        let (tx, rx) = tokio::sync::mpsc::channel::<crate::stream::StreamEvent>(64);

        // Spawn a task to read the SSE stream and forward events. Parsing is
        // delegated to the protocol handler (`parse_stream_event`) so each
        // provider's SSE shape is decoded by its own handler — OpenAI chat,
        // Anthropic, and the Responses API all differ. (Previously this
        // hardcoded an is_anthropic-vs-OpenAI branch, which silently mis-parsed
        // any third shape, e.g. the Responses API's typed `event:` stream.)
        tokio::spawn(async move {
            use futures::StreamExt;
            let mut byte_stream = resp.bytes_stream();
            // Buffer raw BYTES, not a per-chunk lossily-decoded String: a
            // multi-byte UTF-8 codepoint split across two TCP chunks would
            // otherwise become a `�`, corrupting tool-argument JSON or text.
            let mut buffer: Vec<u8> = Vec::new();

            while let Some(chunk_result) = byte_stream.next().await {
                match chunk_result {
                    Ok(bytes) => buffer.extend_from_slice(&bytes),
                    Err(e) => {
                        // A transport break mid-stream is NOT a clean EOF.
                        // Previously this `break`'d silently, so the partial
                        // text looked like a finished answer. Log it and emit
                        // an abnormal-termination signal so the receiver can
                        // tell truncation from completion.
                        tracing::warn!(error = %e, "remote stream transport error mid-response");
                        let _ = tx
                            .send(crate::stream::StreamEvent::StopReason("error".to_string()))
                            .await;
                        return;
                    }
                };

                // Process complete SSE events. Each event ends at a blank
                // line ("\n\n" or CRLF "\r\n\r\n"). The block always ends on
                // an ASCII boundary, so decoding it as UTF-8 is exact; any
                // trailing partial codepoint stays in `buffer` for next chunk.
                while let Some((pos, sep_len)) = find_sse_separator(&buffer) {
                    let event_block = String::from_utf8_lossy(&buffer[..pos]).into_owned();
                    buffer.drain(..pos + sep_len);

                    let sse_events = crate::stream::parse_sse_lines(&event_block);
                    for (event_type, data) in sse_events {
                        if data == "[DONE]" {
                            continue;
                        }

                        let stream_events = handler.parse_stream_event(&event_type, &data);

                        for evt in stream_events {
                            if tx.send(evt).await.is_err() {
                                return; // receiver dropped
                            }
                        }
                    }
                }
            }
        });

        Ok(rx)
    }
}

impl Default for RemoteBackend {
    fn default() -> Self {
        Self::new()
    }
}

// --- Helpers ---

/// Resolve the base URL and wire protocol for a remotely served model.
///
/// `RemoteApi` sources carry both explicitly. `Ollama` (via its `host`),
/// `VllmMlx` and `Proprietary` sources are all treated as OpenAI-compatible;
/// for proprietary providers (Parslee) the non-default chat path is applied
/// separately through `chat_path_for`. Any other source is rejected as
/// "not remote". The API key is not resolved here — it comes from the key
/// pool.
fn extract_remote_endpoint(schema: &ModelSchema) -> Result<(String, ApiProtocol), InferenceError> {
    let (base_url, protocol) = match &schema.source {
        ModelSource::RemoteApi {
            endpoint, protocol, ..
        } => (endpoint, *protocol),
        ModelSource::Ollama { host, .. } => (host, ApiProtocol::OpenAiCompat),
        ModelSource::VllmMlx { endpoint, .. } | ModelSource::Proprietary { endpoint, .. } => {
            (endpoint, ApiProtocol::OpenAiCompat)
        }
        _ => {
            return Err(InferenceError::InferenceFailed(format!(
                "model {} is not remote",
                schema.id
            )));
        }
    };
    Ok((base_url.clone(), protocol))
}

/// Pick the chat route to append to a model's base endpoint.
///
/// A `Proprietary` source (Parslee) supplies its own `chat_path` in the
/// schema; for every other source the protocol handler's default
/// `endpoint_path()` is used.
fn chat_path_for<'a>(
    schema: &'a ModelSchema,
    handler: &'a dyn crate::protocol::ProtocolHandler,
) -> &'a str {
    if let ModelSource::Proprietary { protocol, .. } = &schema.source {
        protocol.chat_path.as_str()
    } else {
        handler.endpoint_path()
    }
}

/// Model identifier to put in the request body: the `model_name` declared
/// on a `VllmMlx` source, otherwise the schema's `name`.
fn request_model_name(schema: &ModelSchema) -> String {
    if let ModelSource::VllmMlx { model_name, .. } = &schema.source {
        model_name.clone()
    } else {
        schema.name.clone()
    }
}

/// Join a base endpoint and an API path into one URL without duplicating
/// the path.
///
/// * Trailing `/` on `base` and leading `/` on `path` are ignored, so the
///   two are always joined by exactly one `/` — a `path` written without a
///   leading slash no longer gets glued straight onto the host.
/// * If `base` already ends with `path` **at a path-segment boundary** it is
///   returned unchanged. A mere textual suffix (`.../xv1/embeddings` for
///   `/v1/embeddings`) is not treated as "already present".
/// * An empty `path` (or just `/`) returns the trimmed `base`.
fn format_endpoint(base: &str, path: &str) -> String {
    let base = base.trim_end_matches('/');
    let suffix = path.trim_start_matches('/');
    if suffix.is_empty() {
        return base.to_string();
    }

    // `head` is whatever precedes the suffix; a real match leaves either
    // nothing or something that ends on a `/` separator.
    let already_present = base
        .strip_suffix(suffix)
        .is_some_and(|head| head.is_empty() || head.ends_with('/'));

    if already_present {
        base.to_string()
    } else {
        format!("{base}/{suffix}")
    }
}

// --- Response types ---

/// Body of an OpenAI-style embeddings response, as parsed by
/// `embed_openai`. Only the `data` array is modelled here.
#[derive(Debug, Deserialize)]
struct OpenAiEmbedResponse {
    /// Returned embedding entries; reordered by `order_embeddings` before
    /// being handed back to the caller.
    data: Vec<OpenAiEmbedData>,
}

/// One entry of the `data` array in an embeddings response.
#[derive(Debug, Deserialize)]
struct OpenAiEmbedData {
    /// The embedding vector itself.
    embedding: Vec<f32>,
    /// Position in the input batch. The OpenAI embeddings API does **not**
    /// guarantee `data[]` is returned in input order — `index` is authoritative.
    /// `#[serde(default)]` makes a missing `index` deserialize as 0.
    #[serde(default)]
    index: usize,
}

/// Return the embedding vectors arranged by their `index` field.
///
/// Providers may hand back `data[]` in any order, while callers pair
/// `embeddings[i]` with input text `i`; sorting first prevents a silent
/// mis-pairing. The sort is stable, so entries that share an index keep the
/// order in which they arrived.
fn order_embeddings(mut data: Vec<OpenAiEmbedData>) -> Vec<Vec<f32>> {
    data.sort_by(|left, right| left.index.cmp(&right.index));

    let mut ordered = Vec::with_capacity(data.len());
    for entry in data {
        ordered.push(entry.embedding);
    }
    ordered
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::ProprietaryProtocol;

    #[test]
    fn sse_separator_handles_lf_crlf_and_partials() {
        // (input bytes, expected `(position, separator length)`).
        let cases: [(&[u8], Option<(usize, usize)>); 4] = [
            // Plain "\n\n".
            (b"data: a\n\nrest", Some((7, 2))),
            // CRLF "\r\n\r\n" must frame too (was silently never matched).
            (b"data: a\r\n\r\nrest", Some((7, 4))),
            // No complete boundary yet (partial event) -> stays buffered.
            (b"data: a\n", None),
            // Earliest boundary wins when both kinds are present.
            (b"a\n\nb\r\n\r\nc", Some((1, 2))),
        ];

        for (input, expected) in cases {
            assert_eq!(find_sse_separator(input), expected);
        }
    }

    #[test]
    fn auth_rejection_is_anchored_on_http_status() {
        use crate::InferenceError;

        // Classify an `InferenceFailed` error carrying the given message.
        let rejected =
            |message: &str| is_auth_rejection(&InferenceError::InferenceFailed(message.into()));

        // Genuine 401 / 403 statuses are auth rejections.
        assert!(rejected(
            "Parslee chat failed: HTTP 401: {\"error\":\"expired\"}"
        ));
        assert!(rejected("Parslee org lookup failed: HTTP 403: forbidden"));

        // A 400 body that merely mentions 401 must NOT trigger a refresh.
        assert!(!rejected(
            "API returned 400: your last request 401'd upstream"
        ));
        // Neither must a server-side failure.
        assert!(!rejected("HTTP 500: server error"));
    }

    #[test]
    fn embeddings_resorted_to_input_order() {
        // Simulate a provider answering out of order: each entry's vector
        // holds its own index so the expected result is easy to read.
        let shuffled: Vec<OpenAiEmbedData> = [2usize, 0, 1]
            .into_iter()
            .map(|index| OpenAiEmbedData {
                embedding: vec![index as f32],
                index,
            })
            .collect();

        let ordered = order_embeddings(shuffled);

        assert_eq!(ordered, vec![vec![0.0], vec![1.0], vec![2.0]]);
    }

    #[test]
    fn format_endpoint_no_dup() {
        let path = "/v1/chat/completions";
        let expected = "https://api.openai.com/v1/chat/completions";

        // Bare host, host that already includes the path, and host with a
        // trailing slash must all normalise to the same URL.
        let bases = [
            "https://api.openai.com",
            "https://api.openai.com/v1/chat/completions",
            "https://api.openai.com/",
        ];

        for base in bases {
            assert_eq!(format_endpoint(base, path), expected);
        }
    }

    #[test]
    fn extract_endpoint_from_remote() {
        // A `RemoteApi` source must yield its own endpoint and protocol.
        let source = ModelSource::RemoteApi {
            endpoint: "https://api.test.com".into(),
            api_key_env: "NONEXISTENT_TEST_KEY_12345".into(),
            api_key_envs: vec![],
            api_version: None,
            protocol: ApiProtocol::OpenAiCompat,
        };
        let schema = ModelSchema {
            id: "test/model:v1".into(),
            name: "Test".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 4096,
            max_output_tokens: None,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source,
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: crate::schema::TrustTier::Curated,
            deprecated: false,
            available: false,
        };

        let resolved = extract_remote_endpoint(&schema).unwrap();

        assert_eq!(resolved.0, "https://api.test.com");
        assert_eq!(resolved.1, ApiProtocol::OpenAiCompat);
    }

    #[test]
    fn extract_endpoint_non_remote_fails() {
        // A `Local` source has no remote endpoint to extract.
        let source = ModelSource::Local {
            hf_repo: "test".into(),
            hf_filename: "test".into(),
            tokenizer_repo: "test".into(),
        };
        let schema = ModelSchema {
            id: "local/model:v1".into(),
            name: "Local".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 4096,
            max_output_tokens: None,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source,
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: crate::schema::TrustTier::Curated,
            deprecated: false,
            available: false,
        };

        let outcome = extract_remote_endpoint(&schema);

        assert!(outcome.is_err());
    }

    #[test]
    fn proprietary_endpoint_is_openai_compat_with_custom_chat_path() {
        // A proprietary source: OAuth2 PKCE auth and a provider-specific
        // chat route declared in the schema.
        let source = ModelSource::Proprietary {
            provider: "parslee".into(),
            endpoint: "https://api.parslee.ai".into(),
            auth: ProprietaryAuth::OAuth2Pkce {
                authority: "https://api.parslee.ai".into(),
                client_id: "parslee-car".into(),
                scopes: vec!["inference:invoke".into()],
            },
            protocol: ProprietaryProtocol {
                chat_path: "/v1/inference/openai/v1/chat/completions".into(),
                content_type: "application/json".into(),
                streaming: false,
                extra_headers: Default::default(),
            },
        };
        let schema = ModelSchema {
            id: "parslee/advisor".into(),
            name: "Parslee Advisor".into(),
            provider: "parslee".into(),
            family: "parslee".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 128_000,
            max_output_tokens: None,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source,
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: crate::schema::TrustTier::Curated,
            deprecated: false,
            available: true,
        };

        // Endpoint and protocol resolution.
        let (endpoint, protocol) = extract_remote_endpoint(&schema).unwrap();
        assert_eq!(endpoint, "https://api.parslee.ai");
        assert_eq!(protocol, ApiProtocol::OpenAiCompat);

        // The schema's chat path overrides the handler default...
        let handler = crate::protocol::handler_for(protocol);
        let chat_path = chat_path_for(&schema, handler.as_ref());
        assert_eq!(chat_path, "/v1/inference/openai/v1/chat/completions");

        // ...and joins onto the base endpoint unchanged.
        let full_url = format_endpoint(&endpoint, chat_path_for(&schema, handler.as_ref()));
        assert_eq!(
            full_url,
            "https://api.parslee.ai/v1/inference/openai/v1/chat/completions"
        );
    }

    #[tokio::test]
    async fn lease_key_proprietary_oauth2_resolves_env_override() {
        // Setting the `PARSLEE_ACCESS_TOKEN` override exercises the
        // OAuth2Pkce arm of `lease_key` deterministically: this test relies
        // on the override taking precedence over any stored credential.
        //
        // NOTE(review): this mutates process-wide environment state; any
        // other test reading the same variable must not run concurrently.
        std::env::set_var(PARSLEE_ACCESS_TOKEN_ENV, "test-bearer-abc123");
        let schema = ModelSchema {
            id: "parslee/advisor".into(),
            name: "Parslee Advisor".into(),
            provider: "parslee".into(),
            family: "parslee".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 128_000,
            max_output_tokens: None,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: ModelSource::Proprietary {
                provider: "parslee".into(),
                endpoint: "https://api.parslee.ai".into(),
                auth: ProprietaryAuth::OAuth2Pkce {
                    authority: "https://api.parslee.ai".into(),
                    client_id: "parslee-car".into(),
                    scopes: vec!["inference:invoke".into()],
                },
                protocol: ProprietaryProtocol {
                    chat_path: "/v1/inference/openai/v1/chat/completions".into(),
                    content_type: "application/json".into(),
                    streaming: false,
                    extra_headers: Default::default(),
                },
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: crate::schema::TrustTier::Curated,
            deprecated: false,
            available: true,
        };
        let outcome = RemoteBackend::new()
            .lease_key(&schema, "https://api.parslee.ai")
            .await;
        // Clean up BEFORE asserting: a failing `expect` / `assert_eq!` would
        // otherwise unwind past the removal and leak the fake token into the
        // rest of the test process.
        std::env::remove_var(PARSLEE_ACCESS_TOKEN_ENV);

        let lease =
            outcome.expect("proprietary OAuth2 lease should resolve the env-override token");
        assert_eq!(lease.api_key, "test-bearer-abc123");
    }

    #[test]
    fn parse_openai_embed_response() {
        // A minimal embeddings payload: one `data` entry carrying a
        // three-element vector.
        let payload = r#"{"data":[{"embedding":[0.1,0.2,0.3]}]}"#;
        let parsed = serde_json::from_str::<OpenAiEmbedResponse>(payload).unwrap();

        // The first (and only) entry must round-trip the vector unchanged.
        let first = &parsed.data[0];
        assert_eq!(first.embedding, vec![0.1, 0.2, 0.3]);
    }

    #[test]
    fn request_model_name_uses_vllm_server_model() {
        // The expected string appears only in `ModelSource::VllmMlx::model_name`;
        // `id` and `name` hold different values, so the assertion passes only
        // if `request_model_name` picks the server-side model name.
        let served_model = "mlx-community/Actual-Model";
        let source = ModelSource::VllmMlx {
            endpoint: "http://localhost:8000".into(),
            model_name: served_model.into(),
        };

        let schema = ModelSchema {
            // Identity fields.
            id: "vllm-mlx/test".into(),
            name: "Display Name".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            // Capability / sizing fields.
            capabilities: Vec::new(),
            context_length: 4096,
            max_output_tokens: None,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            // Where the model is served from.
            source,
            // Registry metadata.
            tags: Vec::new(),
            supported_params: Vec::new(),
            public_benchmarks: Vec::new(),
            trust_tier: crate::schema::TrustTier::Curated,
            deprecated: false,
            available: true,
        };

        assert_eq!(request_model_name(&schema), served_model);
    }

    #[test]
    fn truncate_prompt_fits_returns_unchanged() {
        // No context, no tools, max_tokens = 16, window = 256: the prompt
        // is expected to come back exactly as it went in.
        let input = "short prompt";
        let output = truncate_prompt_to_fit(input, None, None, 16, 256);
        assert_eq!(output, input);
    }

    #[test]
    fn truncate_prompt_cjk_mid_codepoint_does_not_panic() {
        // 200 CJK chars = 600 bytes. With max_tokens=20 and window=209:
        // reserved = 20 + 100 overhead = 120, available = 89, chars_to_keep = 356.
        // start = 600 - 356 = 244, which is NOT a char boundary for 3-byte chars.
        //
        // `str::repeat` builds the same 200-character string as the manual
        // `iter::repeat(..).take(..).collect()` chain, in one allocation.
        let prompt = "\u{4E16}".repeat(200);
        let result = truncate_prompt_to_fit(&prompt, None, None, 20, 209);

        // Truncation must have happened, and the kept tail must be non-empty
        // text following the marker line.
        assert!(result.starts_with("[...truncated...]"));
        let kept = result.strip_prefix("[...truncated...]\n").unwrap();
        assert!(!kept.is_empty());
    }

    #[test]
    fn truncate_prompt_accounts_for_context_and_tools() {
        // A single tool definition plus a short context string are passed
        // alongside a 50x repeated three-line prompt.
        let tools = vec![serde_json::json!({"name": "demo_tool"})];
        let long_prompt = "line one\nline two\nline three\n".repeat(50);

        let truncated = truncate_prompt_to_fit(&long_prompt, Some("ctx"), Some(&tools), 20, 240);

        // With max_tokens = 20 and window = 240 the output must carry the
        // truncation marker.
        assert!(truncated.starts_with("[...truncated...]"));
    }

    #[test]
    fn parslee_sse_concatenates_content_until_timestamp() {
        // Realistic Parslee frame sequence, one frame per line below, each
        // terminated by a blank line:
        //   start (conversationId only, ignored) → two content deltas →
        //   end (timestamp, stop).
        let raw = concat!(
            r#"data: {"conversationId":"c1"}"#,
            "\n\n",
            r#"data: {"content":"Hello"}"#,
            "\n\n",
            r#"data: {"content":", world"}"#,
            "\n\n",
            r#"data: {"conversationId":"c1","timestamp":"2026-05-15T00:00:00Z"}"#,
            "\n\n",
        );
        let collected = collect_parslee_sse(raw);
        assert_eq!(collected, "Hello, world");
    }

    #[test]
    fn parslee_sse_skips_done_and_unparseable_frames() {
        // A non-JSON frame, one valid content frame, then a `[DONE]`
        // sentinel: only the content frame should contribute text.
        let raw = concat!(
            "data: not-json",
            "\n\n",
            r#"data: {"content":"ok"}"#,
            "\n\n",
            "data: [DONE]",
            "\n\n",
        );
        let collected = collect_parslee_sse(raw);
        assert_eq!(collected, "ok");
    }

    #[test]
    fn parslee_sse_stops_at_timestamp_ignoring_later_content() {
        // Content that arrives after a timestamp frame must not appear in
        // the collected text.
        let raw = concat!(
            r#"data: {"content":"kept"}"#,
            "\n\n",
            r#"data: {"timestamp":"t"}"#,
            "\n\n",
            r#"data: {"content":"dropped"}"#,
            "\n\n",
        );
        let collected = collect_parslee_sse(raw);
        assert_eq!(collected, "kept");
    }

    #[test]
    fn parslee_sse_empty_when_no_content() {
        // A lone start frame (conversationId only) carries no content, so
        // nothing should be collected.
        let raw = concat!(r#"data: {"conversationId":"c1"}"#, "\n\n");
        let collected = collect_parslee_sse(raw);
        assert!(collected.is_empty());
    }

    // --- Parslee identity resolution over real HTTP (wiremock). This is the
    // org-lookup endpoint that returned `HTTP 401` in the live #313 incident;
    // it had no in-CI coverage. Unique bearers per test so the process-wide
    // `PARSLEE_IDENTITY` cache can't cross-contaminate. ---

    #[tokio::test]
    async fn parslee_identity_resolves_org_and_user() {
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        // Canned JSON bodies for the two endpoints mocked below.
        let org_body = serde_json::json!({ "organizationId": "org_test" });
        let session_body = serde_json::json!({ "account": { "email": "user@example.com" } });

        let server = MockServer::start().await;

        // Organization lookup → 200 with the org id.
        let org_route = Mock::given(method("GET"))
            .and(path("/api/v1/organizations/me"))
            .respond_with(ResponseTemplate::new(200).set_body_json(org_body));
        org_route.mount(&server).await;

        // Session lookup → 200 with the account email.
        let session_route = Mock::given(method("GET"))
            .and(path("/connect/session"))
            .respond_with(ResponseTemplate::new(200).set_body_json(session_body));
        session_route.mount(&server).await;

        // Resolve identity against the mock server with a bearer unique to
        // this test.
        let remote = RemoteBackend::new();
        let base_url = server.uri();
        let resolved = remote.parslee_identity(&base_url, "ident-ok-bearer").await;
        let (org, user) = resolved.expect("identity should resolve");

        assert_eq!(org, "org_test");
        assert_eq!(user, "user@example.com");
    }

    #[tokio::test]
    async fn parslee_identity_surfaces_org_lookup_401() {
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;

        // Organization lookup → 401 with a plain-text body.
        let rejection = ResponseTemplate::new(401).set_body_string("Authentication required");
        let org_route = Mock::given(method("GET"))
            .and(path("/api/v1/organizations/me"))
            .respond_with(rejection);
        org_route.mount(&server).await;

        // Resolve identity with a bearer unique to this test; it must fail.
        let remote = RemoteBackend::new();
        let base_url = server.uri();
        let outcome = remote.parslee_identity(&base_url, "ident-401-bearer").await;
        let err = outcome.expect_err("a 401 org lookup must error");

        // The exact message shape the live incident surfaced — and the string
        // `is_auth_rejection` keys on to trigger the reactive refresh.
        let msg = err.to_string();
        for needle in ["org lookup failed", "HTTP 401"] {
            assert!(msg.contains(needle), "got: {msg}");
        }
        assert!(is_auth_rejection(&err), "401 must be classified as auth rejection");
    }
}