car-inference 0.13.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Remote inference backend — HTTP client for cloud API models.
//!
//! The main dispatch uses the `ProtocolHandler` trait from `protocol.rs`.
//! Legacy per-protocol handlers are kept temporarily for the embedding path.
//!
//! Supports: OpenAI-compatible, Anthropic, Google (Gemini).
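//!
//! Typical flow (illustrative sketch; error handling and `ModelSchema` construction elided):
//! ```ignore
//! let backend = RemoteBackend::new();
//! backend.register_model_keys(&schema).await;
//! let text = backend
//!     .generate(&schema, "Summarize the incident report.", None, 0.7, 512, None)
//!     .await?;
//! ```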

// Legacy handlers are being phased out in favor of ProtocolHandler trait.

use reqwest::Client;
use serde::Deserialize;
use tracing::{debug, instrument};

use crate::key_pool::{KeyLease, KeyPool};
use crate::schema::{ApiProtocol, ModelSchema, ModelSource};
use crate::tasks::ContentBlock;
use crate::InferenceError;

/// Remote inference client. Reuses a single HTTP client for connection pooling.
/// Integrates with KeyPool for multi-key load balancing.
pub struct RemoteBackend {
    pub(crate) client: Client,
    pub key_pool: KeyPool,
}

/// Estimate token count from a string. Uses the standard ~4 chars/token heuristic
/// for remote models where we don't have a tokenizer.
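///
/// Illustrative behavior (the 4-chars-per-token ratio is only a rough heuristic):
/// ```ignore
/// assert_eq!(estimate_tokens(""), 1);          // never returns 0
/// assert_eq!(estimate_tokens("12345678"), 2);  // 8 chars / 4
/// ```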
pub fn estimate_tokens(text: &str) -> usize {
    std::cmp::max(1, text.len() / 4)
}

/// Truncate a prompt to fit within a context window, keeping the most recent
/// content (the suffix), since that is usually the most relevant part.
/// Returns the truncated prompt.
///
/// This is a last-resort fallback — callers should prefer compaction via
/// car-memgine, which preserves semantic content. This function is only
/// used when compaction isn't available (e.g., raw API calls).
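///
/// Budget sketch with illustrative numbers: for `context_window = 8192` and
/// `max_tokens = 1024`, with ~200 tokens of context and ~300 tokens of tool JSON,
/// the prompt budget is `8192 - (200 + 300 + 1024 + 100) = 6568` estimated tokens
/// (roughly 26 KB of text at 4 chars/token); anything beyond that is cut from the front.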
fn truncate_prompt_to_fit(
    prompt: &str,
    context: Option<&str>,
    tools_json: Option<&[serde_json::Value]>,
    max_tokens: usize,
    context_window: usize,
) -> String {
    let context_tokens = context.map(|c| estimate_tokens(c)).unwrap_or(0);
    let tools_tokens = tools_json
        .map(|t| estimate_tokens(&serde_json::to_string(t).unwrap_or_default()))
        .unwrap_or(0);
    // Reserve space for: context + tools + max_tokens (output) + overhead
    let overhead = 100; // message framing, special tokens
    let reserved = context_tokens + tools_tokens + max_tokens + overhead;
    let available = context_window.saturating_sub(reserved);

    let prompt_tokens = estimate_tokens(prompt);
    if prompt_tokens <= available {
        return prompt.to_string();
    }

    tracing::warn!(
        prompt_tokens = prompt_tokens,
        available_tokens = available,
        context_window = context_window,
        "truncating prompt to fit context window (prefer compaction via car-memgine)"
    );

    // Truncate from the beginning (keep the end, which is most recent/relevant)
    // Estimate chars to keep based on available tokens
    let chars_to_keep = available * 4;
    if chars_to_keep >= prompt.len() {
        return prompt.to_string();
    }

    let start = prompt.len().saturating_sub(chars_to_keep);
    let safe_start = prompt.ceil_char_boundary(start);
    let truncated = &prompt[safe_start..];
    // Find a clean break point (newline or space)
    let break_point = truncated
        .find('\n')
        .or_else(|| truncated.find(' '))
        .unwrap_or(0);

    format!(
        "[...truncated...]\n{}",
        &truncated[break_point..].trim_start()
    )
}

impl RemoteBackend {
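    /// Construct a backend with a shared HTTP client (120s total / 10s connect /
    /// 90s read timeouts) and an empty key pool.
    ///
    /// Illustrative sketch:
    /// ```ignore
    /// let backend = RemoteBackend::new();
    /// // Keys are leased lazily per endpoint on first use; `register_model_keys`
    /// // may also be called up front if the schema is already known.
    /// ```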
    pub fn new() -> Self {
        let client = Client::builder()
            .timeout(std::time::Duration::from_secs(120))
            .connect_timeout(std::time::Duration::from_secs(10))
            .read_timeout(std::time::Duration::from_secs(90))
            .build()
            .unwrap_or_default();
        Self {
            client,
            key_pool: KeyPool::new(),
        }
    }

    /// Register all keys from a model schema into the key pool.
    pub async fn register_model_keys(&self, schema: &ModelSchema) {
        if let ModelSource::RemoteApi { ref endpoint, .. } = schema.source {
            let env_vars = schema.all_api_key_envs();
            if !env_vars.is_empty() {
                self.key_pool.register_endpoint(endpoint, env_vars).await;
            }
        }
    }

    /// Unified request execution using the protocol handler abstraction.
    /// All public generate methods delegate to this.
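    ///
    /// URL construction, by protocol (see the match below): Google goes through
    /// `crate::protocol::google_url` (model and key are embedded in the URL),
    /// Azure uses the `/openai/deployments/{model}/chat/completions?api-version=...`
    /// shape, and everything else appends the handler's `endpoint_path()` to the
    /// configured endpoint.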
    #[instrument(
        name = "inference.remote_call",
        skip_all,
        fields(
            model = %schema.name,
            provider = %schema.provider,
        ),
    )]
    async fn execute_request(
        &self,
        schema: &ModelSchema,
        req: crate::protocol::ApiRequest,
    ) -> Result<crate::protocol::ApiResponse, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        let lease = self.lease_key(schema, &endpoint).await?;
        let handler = crate::protocol::handler_for(protocol);
        let start = std::time::Instant::now();

        // Build URL
        let api_version = match &schema.source {
            ModelSource::RemoteApi { api_version, .. } => api_version.clone(),
            _ => None,
        };
        let url = if matches!(protocol, ApiProtocol::Google) {
            crate::protocol::google_url(&endpoint, &req.model, &lease.api_key)
        } else if matches!(protocol, ApiProtocol::AzureOpenAi) {
            let version = api_version.as_deref().unwrap_or("2024-10-21");
            format!(
                "{}/openai/deployments/{}/chat/completions?api-version={}",
                endpoint.trim_end_matches('/'),
                req.model,
                version
            )
        } else {
            format_endpoint(&endpoint, handler.endpoint_path())
        };

        // Build headers
        let headers = handler.auth_headers(&lease.api_key);

        // Build request body
        let body = handler.build_request_body(&req);

        debug!(url = %url, model = %req.model, "protocol handler request");

        // Execute HTTP request with tokio-level timeout as safety net (#26).
        // reqwest's built-in timeouts cover most cases, but can fail to cancel
        // when the OS/kernel blocks at the socket level. This outer timeout
        // ensures generate_tracked() never hangs indefinitely.
        let mut builder = self.client.post(&url);
        for (name, value) in &headers {
            builder = builder.header(name.as_str(), value.as_str());
        }
        let send_fut = builder.json(&body).send();
        let resp = tokio::time::timeout(std::time::Duration::from_secs(150), send_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(
                    "request timed out after 150s (tokio safety timeout)".to_string(),
                )
            })?
            .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;

        let status = resp.status();
        let body_fut = resp.text();
        let resp_text = tokio::time::timeout(std::time::Duration::from_secs(120), body_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(
                    "response body read timed out after 120s".to_string(),
                )
            })?
            .map_err(|e| InferenceError::InferenceFailed(format!("read body: {e}")))?;

        let latency_ms = start.elapsed().as_millis() as u64;

        if !status.is_success() {
            let is_rl = status.as_u16() == 429
                || resp_text.contains("429")
                || resp_text.contains("RESOURCE_EXHAUSTED");
            self.key_pool
                .report_failure(&endpoint, &lease.env_var, is_rl)
                .await;
            return Err(InferenceError::InferenceFailed(format!(
                "API returned {status}: {resp_text}"
            )));
        }

        let est_tokens = req
            .messages
            .iter()
            .filter_map(|m| m.get("content").and_then(|c| c.as_str()))
            .map(|s| s.len() / 4)
            .sum::<usize>() as u64;
        self.key_pool
            .report_success(&endpoint, &lease.env_var, latency_ms, est_tokens, 0)
            .await;

        let mut response = handler.parse_response(&resp_text)?;
        // Fill in context_window from the model schema
        if let Some(ref mut usage) = response.usage {
            usage.context_window = schema.context_length as u64;
        }
        Ok(response)
    }

    /// Generate text via a remote API, using load-balanced key selection.
    /// Auto-truncates the prompt if it exceeds the model's context window.
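    ///
    /// Illustrative call (schema construction elided):
    /// ```ignore
    /// let text = backend
    ///     .generate(&schema, "Explain the diff below.", Some("You are a code reviewer."), 0.2, 512, None)
    ///     .await?;
    /// ```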
    pub async fn generate(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        images: Option<&[ContentBlock]>,
    ) -> Result<String, InferenceError> {
        let resp = self
            .generate_with_tools_multi(
                schema,
                prompt,
                context,
                temperature,
                max_tokens,
                None,
                images,
                None,
                None,
                None,
                0,
                false,
                None,
            )
            .await?;
        Ok(resp.0)
    }

    /// Generate with optional tool definitions and multi-turn conversation history.
    /// Auto-truncates the prompt if it exceeds the model's context window.
    ///
    /// When `messages` is provided, builds a proper multi-turn conversation
    /// instead of a single user message. This enables tool_use → tool_result flows.
    ///
    /// Post-processes `done` tool calls: if the result argument is suspiciously
    /// short (< 50 chars) and the model also produced text output, enriches
    /// the done result with the text (fixes #10).
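    ///
    /// Illustrative sketch; the flat tool-JSON shape shown here is an assumption,
    /// and the shape actually accepted is whatever `ProtocolHandler::build_tools`
    /// expects for the target protocol:
    /// ```ignore
    /// let tools = vec![serde_json::json!({
    ///     "name": "done",
    ///     "description": "Signal task completion",
    ///     "parameters": { "type": "object", "properties": { "result": { "type": "string" } } }
    /// })];
    /// let (text, calls) = backend
    ///     .generate_with_tools(&schema, prompt, None, 0.0, 1024, Some(&tools), None)
    ///     .await?;
    /// ```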
    pub async fn generate_with_tools(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
    ) -> Result<(String, Vec<crate::tasks::generate::ToolCall>), InferenceError> {
        let (text, calls, _usage) = self
            .generate_with_tools_multi(
                schema,
                prompt,
                context,
                temperature,
                max_tokens,
                tools,
                images,
                None,
                None,
                None,
                0,
                false,
                None,
            )
            .await?;
        Ok((text, calls))
    }

    /// Generate with multi-turn conversation support and optional extended thinking.
    /// Uses the unified protocol handler abstraction.
    ///
    /// When `cache_control` is true, system prompt and tool definitions are marked
    /// with Anthropic prompt caching breakpoints for cache reuse across calls.
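    ///
    /// Illustrative sketch (argument order matches the declaration below; `tools`
    /// and `history` are assumed to exist in the caller):
    /// ```ignore
    /// let (text, calls, usage) = backend
    ///     .generate_with_tools_multi(
    ///         &schema, prompt, None, 0.0, 1024,   // schema, prompt, context, temperature, max_tokens
    ///         Some(&tools), None, Some(&history), // tools, images, messages
    ///         None, None, 0,                      // tool_choice, parallel_tool_calls, budget_tokens
    ///         true, None,                         // cache_control, response_format
    ///     )
    ///     .await?;
    /// ```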
    pub async fn generate_with_tools_multi(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
        messages: Option<&[crate::tasks::generate::Message]>,
        tool_choice: Option<&str>,
        parallel_tool_calls: Option<bool>,
        budget_tokens: usize,
        cache_control: bool,
        response_format: Option<&crate::tasks::generate::ResponseFormat>,
    ) -> Result<
        (
            String,
            Vec<crate::tasks::generate::ToolCall>,
            Option<crate::TokenUsage>,
        ),
        InferenceError,
    > {
        let (_, protocol) = extract_remote_endpoint(schema)?;
        let handler = crate::protocol::handler_for(protocol);

        // Pre-check: reject video / audio content blocks on protocols
        // that don't accept them natively. Silent stringification to
        // `[video: <source>]` or `[audio: <source>]` is a correctness
        // trap — the model would answer confidently about content it
        // never saw. Surface as an explicit typed error instead.
        if !handler.supports_video() {
            let has_video_in_images =
                images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_video));
            let has_video_in_messages = messages.is_some_and(|msgs| {
                msgs.iter().any(|msg| match msg {
                    crate::tasks::generate::Message::UserMultimodal { content } => {
                        content.iter().any(ContentBlock::is_video)
                    }
                    _ => false,
                })
            });
            if has_video_in_images || has_video_in_messages {
                return Err(InferenceError::UnsupportedMode {
                    mode: "video-content-block",
                    backend: handler.protocol_name(),
                    reason: "this remote protocol has no native video input path; route to \
                         a provider that implements ProtocolHandler::supports_video() (Gemini)",
                });
            }
        }
        if !handler.supports_audio() {
            let has_audio_in_images =
                images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_audio));
            let has_audio_in_messages = messages.is_some_and(|msgs| {
                msgs.iter().any(|msg| match msg {
                    crate::tasks::generate::Message::UserMultimodal { content } => {
                        content.iter().any(ContentBlock::is_audio)
                    }
                    _ => false,
                })
            });
            if has_audio_in_images || has_audio_in_messages {
                return Err(InferenceError::UnsupportedMode {
                    mode: "audio-content-block",
                    backend: handler.protocol_name(),
                    reason: "this remote protocol has no native audio input path; route to \
                         a provider that implements ProtocolHandler::supports_audio() (Gemini)",
                });
            }
        }

        // Auto-truncate if prompt exceeds context window
        let prompt = if schema.context_length > 0 {
            truncate_prompt_to_fit(prompt, context, tools, max_tokens, schema.context_length)
        } else {
            prompt.to_string()
        };

        // Build messages using protocol handler
        let (api_messages, system) =
            handler.build_messages(messages.unwrap_or(&[]), &prompt, context, images);

        // Build tools using protocol handler
        let api_tools = tools.map(|t| handler.build_tools(t));

        // Build and execute request
        let req = crate::protocol::ApiRequest {
            model: request_model_name(schema),
            messages: api_messages,
            system,
            temperature,
            max_tokens,
            tools: api_tools,
            tool_choice: tool_choice.map(str::to_string),
            parallel_tool_calls,
            stream: false,
            budget_tokens,
            cache_control,
            response_format: response_format.cloned(),
        };

        let response = self.execute_request(schema, req).await?;

        let text = response.text;
        let mut calls = response.tool_calls;
        let usage = response.usage;

        // Fix #10: enrich empty "done" results with text output from the same response.
        // Models often call done({"result": "completed"}) while putting actual findings
        // in the text output block.
        if !text.is_empty() {
            for call in &mut calls {
                if call.name == "done" {
                    let result_val = call
                        .arguments
                        .get("result")
                        .and_then(|v| v.as_str())
                        .unwrap_or("");
                    if result_val.len() < 50 && text.len() > result_val.len() {
                        call.arguments.insert(
                            "result".to_string(),
                            serde_json::Value::String(text.clone()),
                        );
                    }
                }
            }
        }

        Ok((text, calls, usage))
    }

    /// Generate embeddings via a remote API (OpenAI-compatible only for now).
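    ///
    /// Illustrative sketch:
    /// ```ignore
    /// let texts = vec!["first chunk".to_string(), "second chunk".to_string()];
    /// let vectors = backend.embed(&schema, &texts).await?;
    /// assert_eq!(vectors.len(), texts.len());
    /// ```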
    pub async fn embed(
        &self,
        schema: &ModelSchema,
        texts: &[String],
    ) -> Result<Vec<Vec<f32>>, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        let lease = self.lease_key(schema, &endpoint).await?;
        let start = std::time::Instant::now();

        let result = match protocol {
            ApiProtocol::OpenAiCompat => {
                self.embed_openai(&endpoint, &lease.api_key, &schema.name, texts)
                    .await
            }
            _ => Err(InferenceError::InferenceFailed(format!(
                "embedding not supported for {:?} protocol",
                protocol
            ))),
        };

        let latency_ms = start.elapsed().as_millis() as u64;
        match &result {
            Ok(_) => {
                let est_tokens = texts
                    .iter()
                    .map(|t| t.split_whitespace().count() as u64)
                    .sum();
                self.key_pool
                    .report_success(&endpoint, &lease.env_var, latency_ms, est_tokens, 0)
                    .await;
            }
            Err(e) => {
                let is_rl =
                    e.to_string().contains("429") || e.to_string().contains("RESOURCE_EXHAUSTED");
                self.key_pool
                    .report_failure(&endpoint, &lease.env_var, is_rl)
                    .await;
            }
        }

        result
    }

    /// Lease a key from the pool, falling back to env var extraction.
    async fn lease_key(
        &self,
        schema: &ModelSchema,
        endpoint: &str,
    ) -> Result<KeyLease, InferenceError> {
        // Try to get the fallback env var name
        let fallback_env = match &schema.source {
            ModelSource::RemoteApi { api_key_env, .. } => api_key_env.as_str(),
            ModelSource::Ollama { .. } | ModelSource::VllmMlx { .. } => {
                return Ok(KeyLease {
                    api_key: String::new(),
                    env_var: String::new(),
                    index: 0,
                })
            }
            _ => {
                return Err(InferenceError::InferenceFailed(format!(
                    "model {} is not remote",
                    schema.id
                )))
            }
        };

        // Register keys on first use (idempotent)
        self.register_model_keys(schema).await;

        self.key_pool
            .lease_or_env(endpoint, fallback_env)
            .await
            .ok_or_else(|| {
                InferenceError::InferenceFailed(format!(
                    "no API keys available for endpoint {} (checked env vars: {:?})",
                    endpoint,
                    schema.all_api_key_envs()
                ))
            })
    }

    // --- Embedding (not yet migrated to ProtocolHandler) ---

    async fn embed_openai(
        &self,
        endpoint: &str,
        api_key: &str,
        model: &str,
        texts: &[String],
    ) -> Result<Vec<Vec<f32>>, InferenceError> {
        let url = format_endpoint(endpoint, "/v1/embeddings");

        let body = serde_json::json!({
            "model": model,
            "input": texts,
        });

        let resp = self
            .client
            .post(&url)
            .header("Authorization", format!("Bearer {api_key}"))
            .header("Content-Type", "application/json")
            .json(&body)
            .send()
            .await
            .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;

        let status = resp.status();
        let text = resp
            .text()
            .await
            .map_err(|e| InferenceError::InferenceFailed(format!("read body: {e}")))?;

        if !status.is_success() {
            return Err(InferenceError::InferenceFailed(format!(
                "API returned {status}: {text}"
            )));
        }

        let parsed: OpenAiEmbedResponse = serde_json::from_str(&text)
            .map_err(|e| InferenceError::InferenceFailed(format!("parse response: {e}")))?;

        Ok(parsed.data.into_iter().map(|d| d.embedding).collect())
    }

    /// Stream a response from a remote API using Server-Sent Events.
    /// Returns a channel receiver that yields StreamEvents.
    /// Works with OpenAI-compatible and Anthropic APIs.
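    ///
    /// Illustrative consumption sketch (the exact `StreamEvent` variants are defined
    /// in `crate::stream` and not assumed here):
    /// ```ignore
    /// let mut rx = backend
    ///     .generate_stream(&schema, prompt, None, 0.7, 1024, None, None, None, None, None)
    ///     .await?;
    /// while let Some(event) = rx.recv().await {
    ///     // forward text deltas / tool-call fragments to the caller
    /// }
    /// ```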
    pub async fn generate_stream(
        &self,
        schema: &ModelSchema,
        prompt: &str,
        context: Option<&str>,
        temperature: f64,
        max_tokens: usize,
        tools: Option<&[serde_json::Value]>,
        images: Option<&[ContentBlock]>,
        tool_choice: Option<&str>,
        parallel_tool_calls: Option<bool>,
        response_format: Option<&crate::tasks::generate::ResponseFormat>,
    ) -> Result<tokio::sync::mpsc::Receiver<crate::stream::StreamEvent>, InferenceError> {
        let (endpoint, protocol) = extract_remote_endpoint(schema)?;
        let lease = self.lease_key(schema, &endpoint).await?;
        let api_key = lease.api_key;
        let model = request_model_name(schema);
        let handler = crate::protocol::handler_for(protocol);

        // Mirror the non-streaming audio/video guard so streaming
        // callers can't bypass the supports_* check and silently
        // stringify audio/video blocks via the handler fallback arms.
        if !handler.supports_video()
            && images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_video))
        {
            return Err(InferenceError::UnsupportedMode {
                mode: "video-content-block",
                backend: handler.protocol_name(),
                reason: "this remote protocol has no native video input path; route to \
                     a provider that implements ProtocolHandler::supports_video() (Gemini)",
            });
        }
        if !handler.supports_audio()
            && images.is_some_and(|blocks| blocks.iter().any(ContentBlock::is_audio))
        {
            return Err(InferenceError::UnsupportedMode {
                mode: "audio-content-block",
                backend: handler.protocol_name(),
                reason: "this remote protocol has no native audio input path; route to \
                     a provider that implements ProtocolHandler::supports_audio() (Gemini)",
            });
        }

        // Streaming path now flows through the same protocol abstraction
        // the non-streaming path uses (#125): build an `ApiRequest`,
        // delegate body construction to `handler.build_request_body`,
        // delegate auth/content-type to `handler.auth_headers`. Anything
        // the protocol abstraction picks up — `response_format`,
        // `tool_choice`, `parallel_tool_calls`, future fields — flows
        // through to streaming for free without re-implementing per
        // provider.
        if matches!(protocol, ApiProtocol::Google) {
            return Err(InferenceError::InferenceFailed(
                "streaming not supported for Google protocol".to_string(),
            ));
        }

        let (messages, system) = handler.build_messages(&[], prompt, context, images);
        let built_tools = tools.map(|t| handler.build_tools(t));
        let req = crate::protocol::ApiRequest {
            model: model.clone(),
            messages,
            system,
            temperature,
            max_tokens,
            tools: built_tools,
            tool_choice: tool_choice.map(str::to_string),
            parallel_tool_calls,
            stream: true,
            budget_tokens: 0,
            cache_control: false,
            response_format: response_format.cloned(),
        };
        let body = handler.build_request_body(&req);

        // Build URL — Azure has its own deployment-based shape; everyone
        // else uses the handler's declared endpoint path.
        let url = if matches!(protocol, ApiProtocol::AzureOpenAi) {
            let api_version = match &schema.source {
                ModelSource::RemoteApi { api_version, .. } => api_version.clone(),
                _ => None,
            };
            let version = api_version.as_deref().unwrap_or("2024-10-21");
            format!(
                "{}/openai/deployments/{}/chat/completions?api-version={}",
                endpoint.trim_end_matches('/'),
                model,
                version
            )
        } else {
            format_endpoint(&endpoint, handler.endpoint_path())
        };

        let mut headers = reqwest::header::HeaderMap::new();
        for (name, value) in handler.auth_headers(&api_key) {
            headers.insert(
                reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
                    InferenceError::InferenceFailed(format!("auth header name: {e}"))
                })?,
                value.parse().map_err(|e| {
                    InferenceError::InferenceFailed(format!("auth header value: {e}"))
                })?,
            );
        }

        let send_fut = self.client.post(&url).headers(headers).json(&body).send();
        let resp = tokio::time::timeout(std::time::Duration::from_secs(150), send_fut)
            .await
            .map_err(|_| {
                InferenceError::InferenceFailed(
                    "stream request timed out after 150s (tokio safety timeout)".to_string(),
                )
            })?
            .map_err(|e| InferenceError::InferenceFailed(format!("HTTP error: {e}")))?;

        let status = resp.status();
        if !status.is_success() {
            let err_text = resp.text().await.unwrap_or_default();
            return Err(InferenceError::InferenceFailed(format!(
                "API returned {status}: {err_text}"
            )));
        }

        let is_anthropic = matches!(protocol, ApiProtocol::Anthropic);
        let (tx, rx) = tokio::sync::mpsc::channel::<crate::stream::StreamEvent>(64);

        // Spawn a task to read the SSE stream and forward events
        tokio::spawn(async move {
            use futures::StreamExt;
            let mut byte_stream = resp.bytes_stream();
            let mut buffer = String::new();

            while let Some(chunk_result) = byte_stream.next().await {
                let chunk = match chunk_result {
                    Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(),
                    Err(_) => break,
                };
                buffer.push_str(&chunk);

                // Process complete SSE events (separated by double newlines)
                while let Some(pos) = buffer.find("\n\n") {
                    let event_block = buffer[..pos].to_string();
                    buffer = buffer[pos + 2..].to_string();

                    let sse_events = crate::stream::parse_sse_lines(&event_block);
                    for (event_type, data) in sse_events {
                        if data == "[DONE]" {
                            continue;
                        }

                        let stream_events = if is_anthropic {
                            crate::stream::parse_anthropic_sse_line(&event_type, &data)
                        } else {
                            crate::stream::parse_openai_sse_line(&format!("data: {data}"))
                        };

                        for evt in stream_events {
                            if tx.send(evt).await.is_err() {
                                return; // receiver dropped
                            }
                        }
                    }
                }
            }
        });

        Ok(rx)
    }
}

impl Default for RemoteBackend {
    fn default() -> Self {
        Self::new()
    }
}

// --- Helpers ---

/// Extract endpoint and protocol from a model schema (key comes from KeyPool now).
fn extract_remote_endpoint(schema: &ModelSchema) -> Result<(String, ApiProtocol), InferenceError> {
    match &schema.source {
        ModelSource::RemoteApi {
            endpoint, protocol, ..
        } => Ok((endpoint.clone(), *protocol)),
        ModelSource::Ollama { host, .. } => Ok((host.clone(), ApiProtocol::OpenAiCompat)),
        ModelSource::VllmMlx { endpoint, .. } => Ok((endpoint.clone(), ApiProtocol::OpenAiCompat)),
        _ => Err(InferenceError::InferenceFailed(format!(
            "model {} is not remote",
            schema.id
        ))),
    }
}

fn request_model_name(schema: &ModelSchema) -> String {
    match &schema.source {
        ModelSource::VllmMlx { model_name, .. } => model_name.clone(),
        _ => schema.name.clone(),
    }
}

/// Normalize endpoint URL for a given path.
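/// For example, `format_endpoint("https://api.openai.com/", "/v1/chat/completions")`
/// yields `"https://api.openai.com/v1/chat/completions"`, while a base that already
/// ends with the path is returned unchanged (see the tests below).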
fn format_endpoint(base: &str, path: &str) -> String {
    let base = base.trim_end_matches('/');
    // If the base already ends with the path, use it as-is
    if base.ends_with(path.trim_start_matches('/')) {
        base.to_string()
    } else {
        format!("{}{}", base, path)
    }
}

// --- Response types ---

#[derive(Debug, Deserialize)]
struct OpenAiEmbedResponse {
    data: Vec<OpenAiEmbedData>,
}

#[derive(Debug, Deserialize)]
struct OpenAiEmbedData {
    embedding: Vec<f32>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn format_endpoint_no_dup() {
        assert_eq!(
            format_endpoint("https://api.openai.com", "/v1/chat/completions"),
            "https://api.openai.com/v1/chat/completions"
        );
        assert_eq!(
            format_endpoint(
                "https://api.openai.com/v1/chat/completions",
                "/v1/chat/completions"
            ),
            "https://api.openai.com/v1/chat/completions"
        );
        assert_eq!(
            format_endpoint("https://api.openai.com/", "/v1/chat/completions"),
            "https://api.openai.com/v1/chat/completions"
        );
    }

    #[test]
    fn extract_endpoint_from_remote() {
        let schema = ModelSchema {
            id: "test/model:v1".into(),
            name: "Test".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 4096,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: ModelSource::RemoteApi {
                endpoint: "https://api.test.com".into(),
                api_key_env: "NONEXISTENT_TEST_KEY_12345".into(),
                api_key_envs: vec![],
                api_version: None,
                protocol: ApiProtocol::OpenAiCompat,
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        };
        let (endpoint, protocol) = extract_remote_endpoint(&schema).unwrap();
        assert_eq!(endpoint, "https://api.test.com");
        assert_eq!(protocol, ApiProtocol::OpenAiCompat);
    }

    #[test]
    fn extract_endpoint_non_remote_fails() {
        let schema = ModelSchema {
            id: "local/model:v1".into(),
            name: "Local".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 4096,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: ModelSource::Local {
                hf_repo: "test".into(),
                hf_filename: "test".into(),
                tokenizer_repo: "test".into(),
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        };
        assert!(extract_remote_endpoint(&schema).is_err());
    }

    #[test]
    fn parse_openai_embed_response() {
        let json = r#"{"data":[{"embedding":[0.1,0.2,0.3]}]}"#;
        let resp: OpenAiEmbedResponse = serde_json::from_str(json).unwrap();
        assert_eq!(resp.data[0].embedding, vec![0.1, 0.2, 0.3]);
    }

    #[test]
    fn request_model_name_uses_vllm_server_model() {
        let schema = ModelSchema {
            id: "vllm-mlx/test".into(),
            name: "Display Name".into(),
            provider: "test".into(),
            family: "test".into(),
            version: "1".into(),
            capabilities: vec![],
            context_length: 4096,
            param_count: String::new(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: ModelSource::VllmMlx {
                endpoint: "http://localhost:8000".into(),
                model_name: "mlx-community/Actual-Model".into(),
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: true,
        };
        assert_eq!(request_model_name(&schema), "mlx-community/Actual-Model");
    }

    #[test]
    fn truncate_prompt_fits_returns_unchanged() {
        let prompt = "short prompt";
        let result = truncate_prompt_to_fit(prompt, None, None, 16, 256);
        assert_eq!(result, prompt);
    }

    #[test]
    fn truncate_prompt_cjk_mid_codepoint_does_not_panic() {
        // 200 CJK chars = 600 bytes. With max_tokens=20 and window=209:
        // reserved = 20 + 100 overhead = 120, available = 89, chars_to_keep = 356.
        // start = 600 - 356 = 244, which is NOT a char boundary for 3-byte chars.
        let prompt: String = std::iter::repeat('\u{4E16}').take(200).collect();
        let result = truncate_prompt_to_fit(&prompt, None, None, 20, 209);
        assert!(result.starts_with("[...truncated...]"));
        let kept = result.strip_prefix("[...truncated...]\n").unwrap();
        assert!(!kept.is_empty());
    }

    #[test]
    fn truncate_prompt_accounts_for_context_and_tools() {
        let prompt = "line one\nline two\nline three\n".repeat(50);
        let tools = vec![serde_json::json!({"name": "demo_tool"})];
        let result = truncate_prompt_to_fit(&prompt, Some("ctx"), Some(&tools), 20, 240);
        assert!(result.starts_with("[...truncated...]"));
    }
}