// polyc-llm-vertex 0.1.3

//! A concrete [`LlmProvider`] backed by the GCP AI platform REST API,
//! authenticated via Workload Identity Federation / Application Default
//! Credentials (handled by `gcp_auth`: in-cluster metadata, WIF, or a local
//! `gcloud` login, transparently).
//!
//! Uses the SSE-streaming `streamGenerateContent` endpoint
//! (`?alt=sse`) and adapts each partial response into the streaming
//! [`Chunk`] vocabulary the trait expects (text → [`Chunk::TextDelta`] per
//! token batch, function calls → tool-call chunks, usage + finish reason
//! → [`Chunk::Usage`]/[`Chunk::Stop`]). Chunks are yielded as bytes arrive
//! — the harness pushes them down to the client without buffering the
//! whole response, so user-visible latency starts at first-token time.
//! The [`LlmProvider`] trait boundary keeps a change of backend a
//! single-file change.

use std::sync::Arc;

use async_trait::async_trait;
use futures::stream::{BoxStream, StreamExt};
use gcp_auth::TokenProvider;
use polyc_llm::{
    Chunk, CompletionRequest, Content, LlmProvider, Message, Role, StopReason, Usage,
    sse::next_event_boundary,
};
use serde::Deserialize;

/// OAuth 2.0 scope passed to the token provider whenever `complete` requests
/// a bearer token.
const SCOPE: &str = "https://www.googleapis.com/auth/cloud-platform";

/// How long to wait for the TCP/TLS connection to establish.
const CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
/// Per-read idle timeout — bounds a *stalled* SSE stream without capping a long
/// healthy generation (resets on each successful read; NOT `Client::timeout`).
const READ_TIMEOUT: std::time::Duration = std::time::Duration::from_mins(2);

/// Which model/project/region to call.
///
/// `project` and `location` are interpolated into the request URL; `model` is
/// only a default, used when a request does not name a model of its own.
#[derive(Debug, Clone)]
pub struct VertexConfig {
    /// GCP project id.
    pub project: String,
    /// Region, e.g. `"us-central1"`. The special value `"global"` selects the
    /// un-prefixed `aiplatform.googleapis.com` host.
    pub location: String,
    /// Publisher model id, e.g. `"<family>-<size>-<version>"`. Acts as the
    /// fallback when the request's own model is empty.
    pub model: String,
}

/// Errors from the provider.
///
/// Each variant is classified into a `polyc_llm::LlmErrorKind` by the
/// `polyc_llm::LlmError` impl on this type.
#[derive(Debug, thiserror::Error)]
pub enum VertexError {
    /// Credential acquisition failed.
    #[error("auth: {0}")]
    Auth(#[from] gcp_auth::Error),
    /// Transport/HTTP failure.
    #[error("http: {0}")]
    Http(#[from] reqwest::Error),
    /// The API returned a non-2xx status.
    ///
    /// The streaming decoder also emits this variant, with `status: 0`, when
    /// an SSE event cannot be decoded (e.g. its JSON fails to parse).
    #[error("provider returned status {status}: {body}")]
    Provider {
        /// HTTP status code (`0` when the error did not come from an HTTP
        /// status line).
        status: u16,
        /// Response body, or a description of the decode failure.
        body: String,
    },
}

impl polyc_llm::LlmError for VertexError {
    /// Classify this error into the provider-agnostic
    /// [`polyc_llm::LlmErrorKind`].
    fn kind(&self) -> polyc_llm::LlmErrorKind {
        match self {
            // Failing to obtain credentials maps to the auth kind.
            Self::Auth(_) => polyc_llm::LlmErrorKind::Auth,
            // Transport failures: a timeout is reported as such, anything
            // else as the backend being unavailable.
            Self::Http(err) => {
                if err.is_timeout() {
                    polyc_llm::LlmErrorKind::Timeout
                } else {
                    polyc_llm::LlmErrorKind::Unavailable
                }
            }
            // Non-2xx responses are classified from the status code.
            Self::Provider { status, .. } => polyc_llm::kind_from_http_status(*status),
        }
    }
}

/// `LlmProvider` over the GCP AI platform REST API.
pub struct VertexProvider {
    /// HTTP client used for every request.
    http: reqwest::Client,
    /// Source of bearer tokens; a token is requested on each `complete` call.
    tokens: Arc<dyn TokenProvider>,
    /// Project, location and the default model.
    config: VertexConfig,
}

impl VertexProvider {
    /// Build a provider, resolving ambient credentials (WIF/ADC/metadata).
    ///
    /// # Errors
    ///
    /// Returns [`VertexError::Auth`] if no credentials can be resolved, or
    /// [`VertexError::Http`] if the HTTP client cannot be constructed.
    pub async fn new(config: VertexConfig) -> Result<Self, VertexError> {
        let tokens = gcp_auth::provider().await?;
        // Bound a stalled stream so a detached harness turn task can't hang
        // forever waiting on a read that never returns (the control plane's
        // TURN_DEADLINE can't reclaim it).
        //
        // A build failure is propagated rather than papered over with
        // `reqwest::Client::new()`: that fallback would carry none of the
        // timeouts above, and it panics in the situations where `build()`
        // reports an error.
        let http = reqwest::Client::builder()
            .connect_timeout(CONNECT_TIMEOUT)
            .read_timeout(READ_TIMEOUT)
            .build()?;
        Ok(Self {
            http,
            tokens,
            config,
        })
    }

    /// Build the `streamGenerateContent` URL for `model`.
    ///
    /// `model` is the per-request model id (`CompletionRequest::model`); the
    /// caller falls back to [`VertexConfig::model`] when the request leaves it
    /// empty, so the configured model is only a default and the active model can
    /// be switched per turn without rebuilding the provider.
    ///
    /// `project`, `location` and `model` are interpolated into the URL as-is,
    /// without percent-encoding.
    fn endpoint(&self, model: &str) -> String {
        let VertexConfig {
            project, location, ..
        } = &self.config;
        // The `global` location is served from the un-prefixed host
        // (`aiplatform.googleapis.com`), NOT `global-aiplatform.googleapis.com`
        // — the path still carries `locations/global`. The newest models are
        // global-only, so this branch is required to reach them; regional
        // locations keep the `{location}-` host prefix.
        let host = if location == "global" {
            "aiplatform.googleapis.com".to_owned()
        } else {
            format!("{location}-aiplatform.googleapis.com")
        };
        format!(
            "https://{host}/v1/projects/{project}/locations/{location}/publishers/google/models/{model}:streamGenerateContent?alt=sse"
        )
    }
}

#[async_trait]
impl LlmProvider for VertexProvider {
    type Error = VertexError;

    /// Send `req` to `streamGenerateContent` and stream the reply as
    /// [`Chunk`]s.
    ///
    /// # Errors
    ///
    /// Returns [`VertexError::Auth`] if a token cannot be obtained,
    /// [`VertexError::Http`] if the request fails in transit, and
    /// [`VertexError::Provider`] for a non-2xx response. Once the stream is
    /// open, a transport error ends it with a final `Err` item, while an
    /// undecodable event (invalid UTF-8 or malformed JSON) is reported as an
    /// `Err` item with `status: 0` and the stream keeps going.
    async fn complete(
        &self,
        req: CompletionRequest,
    ) -> Result<BoxStream<'static, Result<Chunk, Self::Error>>, Self::Error> {
        // Per-request model wins; the configured model is the fallback default,
        // so an empty request model (or a switch to a different one) needs no
        // provider rebuild.
        let model = if req.model.is_empty() {
            self.config.model.as_str()
        } else {
            req.model.as_str()
        };
        let body = build_request(&req);
        tracing::debug!(
            model = %model,
            messages = req.messages.len(),
            tools = req.tools.len(),
            max_tokens = ?req.max_tokens,
            temperature = ?req.temperature,
            body = %body,
            "vertex request"
        );
        let token = self.tokens.token(&[SCOPE]).await?;
        let resp = self
            .http
            .post(self.endpoint(model))
            .bearer_auth(token.as_str())
            .json(&body)
            .send()
            .await?;

        let status = resp.status();
        if !status.is_success() {
            // Best effort: an unreadable error body is reported as empty.
            let body = resp.text().await.unwrap_or_default();
            return Err(VertexError::Provider {
                status: status.as_u16(),
                body,
            });
        }

        // Yield Chunks as SSE events arrive. Each event is `data: <json>\n\n`
        // where `<json>` is a partial `GenerateContentResponse` (a slice of
        // the full reply); `map_response` produces one or more Chunks per
        // event. Buffering only the smallest amount needed to find the next
        // `\n\n` boundary keeps end-to-end latency at first-token time.
        let byte_stream = resp.bytes_stream();
        let chunks = async_stream::stream! {
            use futures::StreamExt as _;
            let mut byte_stream = byte_stream;
            let mut buf: Vec<u8> = Vec::new();
            // Stream-scoped counter so synthesized tool-call ids stay unique
            // across SSE events (see `map_response`).
            let mut tool_seq = 0usize;
            while let Some(item) = byte_stream.next().await {
                let bytes = match item {
                    Ok(b) => b,
                    Err(e) => { yield Err(VertexError::from(e)); return; }
                };
                buf.extend_from_slice(&bytes);
                while let Some((pos, sep_len)) = next_event_boundary(&buf) {
                    let event_bytes: Vec<u8> = buf.drain(..pos + sep_len).collect();
                    // The payload is the first `pos` bytes; the rest is the
                    // trailing separator. An event that is not valid UTF-8 is
                    // surfaced like a malformed-JSON event instead of being
                    // silently dropped.
                    let event = match std::str::from_utf8(&event_bytes[..pos]) {
                        Ok(text) => text,
                        Err(err) => {
                            yield Err(VertexError::Provider {
                                status: 0,
                                body: format!("invalid UTF-8 in SSE event: {err}"),
                            });
                            continue;
                        }
                    };
                    for line in event.lines() {
                        // Only `data:` lines carry a payload; other SSE
                        // fields and comments are skipped.
                        let Some(json) = line.strip_prefix("data: ").or_else(|| line.strip_prefix("data:")) else {
                            continue;
                        };
                        tracing::debug!(event = %json, "vertex sse event");
                        match serde_json::from_str::<GenerateContentResponse>(json) {
                            Ok(resp) => {
                                for chunk in map_response(resp, &mut tool_seq) {
                                    yield chunk;
                                }
                            }
                            Err(err) => {
                                // Don't kill the stream over one malformed
                                // event — surface it and continue.
                                yield Err(VertexError::Provider {
                                    status: 0,
                                    body: format!("malformed SSE JSON: {err}; line: {json}"),
                                });
                            }
                        }
                    }
                }
            }
        };
        Ok(chunks.boxed())
    }
}

/// Map a [`CompletionRequest`] to the GCP `generateContent` request body.
fn build_request(req: &CompletionRequest) -> serde_json::Value {
    let mut contents = Vec::new();
    let mut system_parts = Vec::new();

    for msg in &req.messages {
        if msg.role == Role::System {
            for c in &msg.content {
                if let Content::Text(t) = c {
                    system_parts.push(serde_json::json!({ "text": t }));
                }
            }
        } else {
            let role = if msg.role == Role::Assistant {
                "model"
            } else {
                "user"
            };
            let parts = message_parts(msg);
            if !parts.is_empty() {
                contents.push(serde_json::json!({ "role": role, "parts": parts }));
            }
        }
    }

    let mut body = serde_json::json!({ "contents": contents });
    if !system_parts.is_empty() {
        body["systemInstruction"] = serde_json::json!({ "parts": system_parts });
    }
    let mut gen_config = serde_json::Map::new();
    if let Some(max) = req.max_tokens {
        gen_config.insert("maxOutputTokens".into(), max.into());
    }
    if let Some(temp) = req.temperature {
        gen_config.insert("temperature".into(), temp.into());
    }
    if !req.stop.is_empty() {
        gen_config.insert("stopSequences".into(), serde_json::json!(req.stop));
    }
    if !gen_config.is_empty() {
        body["generationConfig"] = serde_json::Value::Object(gen_config);
    }
    // Gemini accepts a single `tools` array carrying multiple entries: one
    // `functionDeclarations` block for our MCP/native tools, plus built-in tools
    // like `googleSearch`. Gemini 3 supports combining them in one request, so
    // the model can pivot between searching the public web and calling our tools
    // within a turn. `web_search` is what `run_turn` sets for answering turns;
    // auxiliary calls (summarize/classify) leave it false and get no search.
    let mut tool_entries: Vec<serde_json::Value> = Vec::new();
    if !req.tools.is_empty() {
        let decls: Vec<_> = req
            .tools
            .iter()
            .map(|t| {
                serde_json::json!({
                    "name": t.name,
                    "description": t.description,
                    "parameters": sanitize_schema_for_gemini(&t.schema_json),
                })
            })
            .collect();
        tool_entries.push(serde_json::json!({ "functionDeclarations": decls }));
    }
    if req.web_search {
        tool_entries.push(serde_json::json!({ "googleSearch": {} }));
    }
    if !tool_entries.is_empty() {
        body["tools"] = serde_json::Value::Array(tool_entries);
    }
    body
}

/// JSON-Schema keywords Gemini's function-declaration parser rejects with a 400.
///
/// MCP tool schemas are frequently JSON-Schema draft-07 (e.g. Zod output) and
/// carry these; Vertex's function-calling `Schema` is a restricted `OpenAPI` 3.0
/// subset. We drop them recursively before sending. The kept keywords (`type`,
/// `properties`, `required`, `enum`, `items`, `description`, `minimum`,
/// `maximum`, `default`, `format`, …) cover what the model needs to call the
/// tool; the dropped constraints are re-validated by the tool server anyway.
///
/// Verified still required on `gemini-3.1-pro-preview` (2026-06-05): sending a
/// raw draft-07 `parameters` 400s with `Unknown name "$schema"` /
/// `"exclusiveMinimum"` — the function-declaration parser is OpenAPI-subset, not
/// model-version-gated, so this is NOT a 2.5-only workaround.
///
/// Entries are compared against object keys by exact, case-sensitive match.
const GEMINI_UNSUPPORTED_SCHEMA_KEYS: &[&str] = &[
    // Schema identification, references and shared definitions.
    "$schema",
    "$id",
    "$ref",
    "$defs",
    "$comment",
    "definitions",
    // Constraints on properties beyond the declared ones.
    "additionalProperties",
    "unevaluatedProperties",
    "patternProperties",
    // Exclusive numeric bounds.
    "exclusiveMinimum",
    "exclusiveMaximum",
];

/// Recursively strip [`GEMINI_UNSUPPORTED_SCHEMA_KEYS`] from a JSON-Schema value
/// so a rich MCP tool schema becomes a valid Vertex function-declaration
/// `parameters` object. Non-object/array values pass through unchanged.
fn sanitize_schema_for_gemini(value: &serde_json::Value) -> serde_json::Value {
    match value {
        serde_json::Value::Object(map) => serde_json::Value::Object(
            map.iter()
                .filter(|(k, _)| !GEMINI_UNSUPPORTED_SCHEMA_KEYS.contains(&k.as_str()))
                .map(|(k, v)| (k.clone(), sanitize_schema_for_gemini(v)))
                .collect(),
        ),
        serde_json::Value::Array(arr) => {
            serde_json::Value::Array(arr.iter().map(sanitize_schema_for_gemini).collect())
        }
        other => other.clone(),
    }
}

/// Build the `parts` array for one message (text, tool calls, tool results).
///
/// * `Content::Text` → `{ "text": … }`
/// * `Content::ToolUse` → `{ "functionCall": { name, args } }`, plus
///   `thoughtSignature` when the call carries one. Arguments that fail to
///   parse as JSON are sent as `null`.
/// * `Content::ToolResult` → `{ "functionResponse": { name, response } }`. A
///   result that is not valid JSON is forwarded as a JSON string holding the
///   raw text, so the model still sees the tool's output.
///
/// Any other content variant is skipped, so the returned vector may be empty.
///
/// NOTE(review): `functionResponse.name` is filled with the tool-call id, not
/// the tool's name. The ids this provider synthesizes look like `call-N`;
/// confirm against the Vertex API that a non-matching name is accepted, or
/// carry the function name on the tool result.
fn message_parts(msg: &Message) -> Vec<serde_json::Value> {
    let mut parts = Vec::new();
    for c in &msg.content {
        match c {
            Content::Text(t) => parts.push(serde_json::json!({ "text": t })),
            Content::ToolUse(tc) => {
                let args: serde_json::Value =
                    serde_json::from_str(&tc.args_json).unwrap_or(serde_json::Value::Null);
                let mut part = serde_json::json!({
                    "functionCall": { "name": tc.name, "args": args }
                });
                // Thinking models require the thought signature they emitted
                // alongside a function call to be echoed back on the part when
                // it reappears in the request history, or they reject the
                // request (400 INVALID_ARGUMENT).
                if let Some(sig) = &tc.signature {
                    part["thoughtSignature"] = serde_json::json!(sig);
                }
                parts.push(part);
            }
            Content::ToolResult(tr) => {
                // Prefer structured JSON; fall back to the raw text instead of
                // `null` so a plain-text tool output is not lost.
                let result = serde_json::from_str::<serde_json::Value>(&tr.result_json)
                    .unwrap_or_else(|_| serde_json::Value::String(tr.result_json.to_string()));
                parts.push(serde_json::json!({
                    "functionResponse": { "name": tr.tool_call_id, "response": { "result": result } }
                }));
            }
            // Images and any future content variants are not yet mapped.
            _ => {}
        }
    }
    parts
}

/// Fold a `generateContent` response into ordered [`Chunk`]s.
///
/// `tool_seq` is a stream-scoped counter for synthesizing unique tool-call
/// ids: the provider emits no ids of its own, and a turn's function calls can
/// arrive across multiple SSE events, so a per-event index would collide
/// (every event would restart at `call-0`). Threading one counter across all
/// events keeps ids unique and stable.
fn map_response(
    resp: GenerateContentResponse,
    tool_seq: &mut usize,
) -> Vec<Result<Chunk, VertexError>> {
    let mut chunks = Vec::new();
    let candidate = resp.candidates.into_iter().next();

    let mut text = String::new();
    let mut tool_calls = Vec::new();
    let mut finish = None;
    if let Some(c) = candidate {
        finish = c.finish_reason;
        if let Some(content) = c.content {
            for part in content.parts {
                if let Some(t) = part.text {
                    text.push_str(&t);
                }
                if let Some(fc) = part.function_call {
                    // Carry the per-part thought signature with its call so it
                    // can be echoed back on the follow-up request.
                    tool_calls.push((fc, part.thought_signature));
                }
            }
        }
    }

    if !text.is_empty() {
        chunks.push(Ok(Chunk::text_delta(text)));
    }
    for (fc, signature) in &tool_calls {
        let id = format!("call-{}", *tool_seq);
        *tool_seq += 1;
        chunks.push(Ok(Chunk::tool_call_start_signed(
            id.clone(),
            fc.name.clone(),
            signature.clone(),
        )));
        chunks.push(Ok(Chunk::tool_call_args_delta(
            id.clone(),
            fc.args.to_string(),
        )));
        chunks.push(Ok(Chunk::tool_call_end(id)));
    }
    if let Some(u) = resp.usage_metadata {
        chunks.push(Ok(Chunk::Usage(Usage {
            input_tokens: u.prompt_token_count,
            output_tokens: u.candidates_token_count,
        })));
    }
    // Only terminate the stream when this event actually signals end of turn:
    // either an explicit `finish_reason` (the canonical end marker) or a
    // tool-use payload (the model handed control back). Partial SSE events
    // have neither, so they don't push a Stop — keeping the stream open for
    // subsequent token batches.
    if finish.is_some() || !tool_calls.is_empty() {
        let mapped = map_finish_reason(finish.as_deref());
        // Tool calls present → the model yielded for tool use, INCLUDING the
        // common streamed case where the call arrives with no finish reason
        // (mapped → EndTurn). But a *hard* finish (MaxTokens / Refusal /
        // StopSequence) means the output was truncated or refused, so it wins:
        // the call may be incomplete and must not be executed downstream.
        let stop = if !tool_calls.is_empty() && matches!(mapped, StopReason::EndTurn) {
            StopReason::ToolUse
        } else {
            mapped
        };
        chunks.push(Ok(Chunk::Stop(stop)));
    }
    chunks
}

/// Translate the provider's `finishReason` string into a [`StopReason`].
///
/// A missing reason and any value not listed below both map to
/// [`StopReason::EndTurn`].
fn map_finish_reason(reason: Option<&str>) -> StopReason {
    let Some(code) = reason else {
        return StopReason::EndTurn;
    };
    match code {
        "MAX_TOKENS" => StopReason::MaxTokens,
        "STOP_SEQUENCE" => StopReason::StopSequence,
        // Content-policy style terminations are all reported as a refusal.
        "SAFETY" | "RECITATION" | "BLOCKLIST" | "PROHIBITED_CONTENT" | "SPII" => {
            StopReason::Refusal
        }
        _ => StopReason::EndTurn,
    }
}

// ── response wire types ─────────────────────────────────────────────────────

/// One decoded response payload (a single SSE event when streaming). Every
/// field is optional on the wire.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct GenerateContentResponse {
    /// Candidate replies; empty when the key is absent.
    #[serde(default)]
    candidates: Vec<Candidate>,
    /// Token accounting, read from `usageMetadata` when present.
    #[serde(default)]
    usage_metadata: Option<UsageMetadata>,
}

/// A single candidate reply inside a response payload.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct Candidate {
    /// The content carried by this payload, if any.
    #[serde(default)]
    content: Option<RespContent>,
    /// Raw `finishReason` string, if the payload has one.
    #[serde(default)]
    finish_reason: Option<String>,
}

/// The `content` object of a candidate; only its `parts` are read.
#[derive(Deserialize)]
struct RespContent {
    /// Ordered parts of the reply; empty when the key is absent.
    #[serde(default)]
    parts: Vec<Part>,
}

/// One part of a reply: text, a function call, or both, optionally tagged
/// with a thought signature.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct Part {
    /// Text fragment, if this part carries text.
    #[serde(default)]
    text: Option<String>,
    /// Function call requested by the model (`functionCall`), if any.
    #[serde(default)]
    function_call: Option<FunctionCall>,
    /// Signature attached to this part (`thoughtSignature`), if any.
    #[serde(default)]
    thought_signature: Option<String>,
}

/// A function call emitted by the model.
#[derive(Deserialize)]
struct FunctionCall {
    /// Name of the function to call (required on the wire).
    name: String,
    /// Call arguments as raw JSON; `null` when the key is absent.
    #[serde(default)]
    args: serde_json::Value,
}

/// Token counts reported by the provider; a missing count reads as `0`.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct UsageMetadata {
    /// `promptTokenCount` from the wire.
    #[serde(default)]
    prompt_token_count: u64,
    /// `candidatesTokenCount` from the wire.
    #[serde(default)]
    candidates_token_count: u64,
}

// Unit tests for the pure mapping helpers (`map_response`, `build_request`);
// nothing here touches the network.
#[cfg(test)]
mod tests {
    #![allow(clippy::pedantic, clippy::nursery, missing_docs)]

    use super::*;

    /// A text reply with `finishReason: STOP` yields the text delta first and
    /// an `EndTurn` stop last.
    #[test]
    fn maps_text_and_usage_and_stop() {
        let resp: GenerateContentResponse = serde_json::from_value(serde_json::json!({
            "candidates": [{
                "content": { "role": "model", "parts": [{ "text": "parity" }] },
                "finishReason": "STOP"
            }],
            "usageMetadata": { "promptTokenCount": 5, "candidatesTokenCount": 2 }
        }))
        .unwrap();
        let chunks: Vec<_> = map_response(resp, &mut 0)
            .into_iter()
            .map(Result::unwrap)
            .collect();
        assert_eq!(chunks[0], Chunk::text_delta("parity"));
        assert!(matches!(
            chunks[chunks.len() - 1],
            Chunk::Stop(StopReason::EndTurn)
        ));
    }

    /// A function call produces tool-call chunks, and the `STOP` finish reason
    /// is upgraded to `ToolUse` because a call is present.
    #[test]
    fn maps_function_call_to_tool_chunks() {
        let resp: GenerateContentResponse = serde_json::from_value(serde_json::json!({
            "candidates": [{
                "content": { "parts": [{ "functionCall": { "name": "search", "args": { "q": "rust" } } }] },
                "finishReason": "STOP"
            }]
        }))
        .unwrap();
        let chunks: Vec<_> = map_response(resp, &mut 0)
            .into_iter()
            .map(Result::unwrap)
            .collect();
        assert!(
            chunks
                .iter()
                .any(|c| matches!(c, Chunk::ToolCallStart { name, .. } if name == "search"))
        );
        assert!(matches!(
            chunks[chunks.len() - 1],
            Chunk::Stop(StopReason::ToolUse)
        ));
    }

    /// System messages go to `systemInstruction`; user messages become the
    /// first `contents` turn.
    #[test]
    fn build_request_maps_roles_and_system() {
        let mut req = CompletionRequest::new("m");
        req.system = None;
        req.messages = vec![Message::system("be terse"), Message::user("hi")];
        let body = build_request(&req);
        assert_eq!(body["systemInstruction"]["parts"][0]["text"], "be terse");
        assert_eq!(body["contents"][0]["role"], "user");
        assert_eq!(body["contents"][0]["parts"][0]["text"], "hi");
    }

    /// Unsupported draft-07 keywords are removed at the top level and inside a
    /// property's schema, while supported keywords are kept.
    #[test]
    fn build_request_strips_gemini_incompatible_tool_schema_keys() {
        use polyc_llm::ToolSpec;
        let mut req = CompletionRequest::new("m");
        // Gemini's function-declaration parser rejects draft-07 keywords with a
        // 400 (verified on gemini-3.1-pro-preview 2026-06-05), so build_request
        // must strip them recursively before sending.
        req.tools = vec![ToolSpec {
            name: "list_recent".to_owned(),
            description: "recent".to_owned(),
            schema_json: serde_json::json!({
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
                "additionalProperties": false,
                "properties": {
                    "limit": { "type": "integer", "exclusiveMinimum": 0, "maximum": 100 }
                }
            }),
            title: None,
            needs_approval: false,
        }];
        let params = &build_request(&req)["tools"][0]["functionDeclarations"][0]["parameters"];
        assert!(params.get("$schema").is_none());
        assert!(params.get("additionalProperties").is_none());
        assert!(
            params["properties"]["limit"]
                .get("exclusiveMinimum")
                .is_none()
        );
        // Valid keywords survive.
        assert_eq!(params["type"], "object");
        assert_eq!(params["properties"]["limit"]["type"], "integer");
        assert_eq!(params["properties"]["limit"]["maximum"], 100);
    }

    /// Covers the three `tools` shapes: absent, a lone `googleSearch` entry,
    /// and function declarations followed by `googleSearch`.
    #[test]
    fn web_search_adds_google_search_grounding_tool() {
        use polyc_llm::ToolSpec;
        // web_search off: no tools array at all when there are no function tools.
        let off = CompletionRequest::new("m");
        assert!(build_request(&off).get("tools").is_none());

        // web_search on, no function tools: a lone googleSearch entry.
        let mut grounded = CompletionRequest::new("m");
        grounded.web_search = true;
        let tools = build_request(&grounded)["tools"].clone();
        assert_eq!(tools, serde_json::json!([{ "googleSearch": {} }]));

        // web_search on WITH a function tool: both entries coexist in one array
        // (Gemini 3 combines built-in + custom tools in a single request).
        grounded.tools = vec![ToolSpec {
            name: "list_recent".to_owned(),
            description: "recent".to_owned(),
            schema_json: serde_json::json!({ "type": "object" }),
            title: None,
            needs_approval: false,
        }];
        let tools = build_request(&grounded)["tools"].clone();
        assert_eq!(tools.as_array().map(Vec::len), Some(2));
        assert!(tools[0].get("functionDeclarations").is_some());
        assert_eq!(tools[1], serde_json::json!({ "googleSearch": {} }));
    }
}