car-inference 0.15.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Model schema — declarative metadata for models, analogous to ToolSchema for tools.
//!
//! Every model (local GGUF, remote API, Ollama) is described by a `ModelSchema`
//! that declares identity, capabilities, constraints, cost, and source.
//! The router uses this schema for initial routing; observed outcomes refine it.

use serde::{Deserialize, Serialize};

/// What a model can do.
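///
/// Capabilities serialize as snake_case strings, so a hand-written JSON
/// catalog lists them as, e.g.:
///
/// ```json
/// ["generate", "code", "tool_use", "multi_tool_call"]
/// ```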
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ModelCapability {
    /// Text completion / chat generation
    Generate,
    /// Vector embeddings
    Embed,
    /// Cross-encoder relevance scoring (query + document → relevance
    /// score). Qwen3-Reranker is the canonical local implementation.
    Rerank,
    /// Label assignment / classification
    Classify,
    /// Code generation, repair, refactoring
    Code,
    /// Chain-of-thought, planning, analysis
    Reasoning,
    /// Text condensation
    Summarize,
    /// Function/tool calling
    ToolUse,
    /// Multiple tool calls in a single response (parallel tool execution)
    MultiToolCall,
    /// Vision / image understanding
    Vision,
    /// Video understanding (multi-frame sampling + temporal tokens).
    /// Distinct from `Vision` so routing can prefer video-trained
    /// models when the caller attaches a video content block.
    VideoUnderstanding,
    /// Audio understanding (speech + non-speech audio as an input to
    /// a chat/reasoning model). Distinct from `SpeechToText` which is
    /// the transcription-only task. Gemma 4 E2B/E4B and Gemini do
    /// this; Qwen2.5-VL does not.
    AudioUnderstanding,
    /// Visual grounding — structured object-localization output
    /// (bounding boxes keyed to object labels) in addition to text.
    Grounding,
    /// Speech recognition / transcription
    SpeechToText,
    /// Speech synthesis / text-to-speech
    TextToSpeech,
    /// Image generation
    ImageGeneration,
    /// Video generation
    VideoGeneration,
}

/// How to access the model.
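///
/// Serialized internally tagged (`tag = "type"`): a catalog entry's `source`
/// carries a snake_case discriminant alongside the variant's fields. Field
/// values below are illustrative:
///
/// ```json
/// { "type": "ollama", "model_tag": "qwen3:4b", "host": "http://localhost:11434" }
/// ```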
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ModelSource {
    /// Local GGUF file via Candle backend.
    Local {
        hf_repo: String,
        hf_filename: String,
        tokenizer_repo: String,
    },
    /// Remote API endpoint (OpenAI-compatible, Anthropic, etc.)
    RemoteApi {
        endpoint: String,
        /// Environment variable name containing the API key (never the key itself).
        /// The env var value may contain comma-separated keys for load balancing.
        api_key_env: String,
        /// Additional environment variable names for load balancing across multiple keys.
        /// Each env var may also contain comma-separated keys.
        #[serde(default)]
        api_key_envs: Vec<String>,
        #[serde(default)]
        api_version: Option<String>,
        protocol: ApiProtocol,
    },
    /// Ollama local server.
    Ollama {
        model_tag: String,
        #[serde(default = "default_ollama_host")]
        host: String,
    },
    /// Local MLX model via mlx-rs backend (Apple Silicon, safetensors format).
    /// Models from mlx-community on HuggingFace.
    Mlx {
        /// HuggingFace repo (e.g., "mlx-community/Qwen3-4B-4bit").
        hf_repo: String,
        /// Optional specific weight filename. If None, auto-discovers safetensors files.
        #[serde(default)]
        hf_weight_file: Option<String>,
    },
    /// Local vLLM-MLX server (Apple Silicon, OpenAI-compatible API).
    /// Routes through RemoteBackend with OpenAI protocol handler.
    VllmMlx {
        /// Server endpoint (e.g., "http://localhost:8000").
        endpoint: String,
        /// The model name as known to vLLM-MLX (e.g., "mlx-community/Qwen3-4B-4bit").
        model_name: String,
    },
    /// Apple's on-device system model via the FoundationModels framework
    /// (macOS 26+, Apple Silicon). Inference happens in-process through a
    /// Swift shim — there is no HTTP, no API key, and no model file: the
    /// OS owns the weights. Availability is checked at runtime via
    /// `@available(macOS 26.0, *)`; on older macOS or non-Apple-Silicon
    /// hosts the backend reports `UnsupportedMode` and the router falls
    /// through to the next candidate.
    AppleFoundationModels {
        /// Optional Apple use-case hint passed through to
        /// `LanguageModelSession`. Apple's framework tunes its prompt and
        /// safety scaffolding per use case (e.g. "general", "summarize").
        /// `None` uses the default.
        #[serde(default)]
        use_case: Option<String>,
    },
    /// Proprietary provider with custom auth and protocol.
    ///
    /// For vendor-specific APIs that aren't generic OpenAI-compatible endpoints.
    /// Parslee is the first proprietary provider — custom auth (OAuth2),
    /// custom response format, multi-provider routing built into the API.
    Proprietary {
        /// Provider identifier (e.g., "parslee").
        provider: String,
        /// Base URL for the API.
        endpoint: String,
        /// Auth configuration.
        auth: ProprietaryAuth,
        /// Custom protocol details.
        protocol: ProprietaryProtocol,
    },
    /// Inference is delegated to a host-registered runner. CAR does
    /// not own the wire format — the runner (typically a JS / Python
    /// host) translates the `GenerateRequest` to its provider's API,
    /// streams chunks back through the runner's event callback, and
    /// returns the final aggregated result.
    ///
    /// Closes Parslee-ai/car-releases#24. Use this when the host
    /// already has an SDK relationship with a provider (Anthropic,
    /// OpenAI, GitHub Models, Vercel AI SDK) and wants CAR to sit in
    /// the lifecycle / policy / replay path without learning every
    /// provider's wire format.
    ///
    /// Routing requires that a runner has been registered via
    /// [`crate::set_inference_runner`] (or its FFI equivalent —
    /// `registerInferenceRunner` on JS, `register_inference_runner`
    /// on Python, the `InferenceRunner` foreign trait on UniFFI,
    /// `inference.register_runner` on the WebSocket protocol).
    /// Without a runner, dispatch fails with `InferenceFailed`.
    Delegated {
        /// Opaque hint passed through to the runner — typically the
        /// provider id (`"anthropic"`, `"openai"`, `"vercel-ai-sdk"`)
        /// so a multi-provider runner can dispatch internally. CAR
        /// does not interpret this string.
        #[serde(default)]
        hint: Option<String>,
    },
}

/// Authentication method for proprietary providers.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ProprietaryAuth {
    /// OAuth2 PKCE flow (e.g., Azure AD for Parslee).
    OAuth2Pkce {
        authority: String,
        client_id: String,
        scopes: Vec<String>,
    },
    /// Static API key from environment variable.
    ApiKeyEnv { env_var: String },
    /// Bearer token from environment variable.
    BearerTokenEnv { env_var: String },
}

/// Protocol configuration for proprietary providers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProprietaryProtocol {
    /// Chat/completion endpoint path (appended to base URL).
    #[serde(default = "default_chat_path")]
    pub chat_path: String,
    /// Content type for requests.
    #[serde(default = "default_content_type")]
    pub content_type: String,
    /// Whether the API streams responses via SSE.
    #[serde(default)]
    pub streaming: bool,
    /// Custom headers to include in every request.
    #[serde(default)]
    pub extra_headers: std::collections::HashMap<String, String>,
}

impl Default for ProprietaryProtocol {
    fn default() -> Self {
        Self {
            chat_path: default_chat_path(),
            content_type: default_content_type(),
            streaming: false,
            extra_headers: std::collections::HashMap::new(),
        }
    }
}

fn default_chat_path() -> String {
    "/chat".to_string()
}

fn default_content_type() -> String {
    "application/json".to_string()
}

fn default_ollama_host() -> String {
    "http://localhost:11434".to_string()
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ApiProtocol {
    OpenAiCompat,
    /// OpenAI Responses API (/v1/responses) — works with all OpenAI models including codex.
    OpenAiResponses,
    Anthropic,
    Google,
    /// Azure OpenAI — uses api-key header and deployment-based URLs.
    /// Endpoint format: {base}/openai/deployments/{model}/chat/completions?api-version={version}
    AzureOpenAi,
}
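
// Concretely, the AzureOpenAi endpoint template above expands to a URL of
// this shape (resource, deployment, and api-version values are illustrative):
//
//     https://myresource.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-06-01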

/// Declared performance expectations. Overridden by observed data once available.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PerformanceEnvelope {
    /// Median latency in milliseconds (declared/estimated).
    #[serde(default)]
    pub latency_p50_ms: Option<u64>,
    /// 99th percentile latency in milliseconds.
    #[serde(default)]
    pub latency_p99_ms: Option<u64>,
    /// Tokens per second throughput.
    #[serde(default)]
    pub tokens_per_second: Option<f64>,
}

/// Generation parameters that a model may or may not support.
/// Models declare which params they accept. The inference layer
/// strips unsupported params before sending to the API.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GenerateParam {
    Temperature,
    TopP,
    TopK,
    MaxTokens,
    StopSequences,
    FrequencyPenalty,
    PresencePenalty,
    Seed,
    ResponseFormat,
    /// Extended thinking / internal reasoning before responding.
    ExtendedThinking,
}

/// Standard parameter set for most models.
pub fn standard_params() -> Vec<GenerateParam> {
    vec![
        GenerateParam::Temperature,
        GenerateParam::TopP,
        GenerateParam::MaxTokens,
        GenerateParam::StopSequences,
        GenerateParam::FrequencyPenalty,
        GenerateParam::PresencePenalty,
        GenerateParam::Seed,
    ]
}

/// Parameter set for reasoning models (no temperature, no top_p).
pub fn reasoning_params() -> Vec<GenerateParam> {
    vec![GenerateParam::MaxTokens, GenerateParam::StopSequences]
}
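
// A minimal sketch of the stripping rule these sets feed into (the
// `param_allowed` helper is illustrative, not part of this module's API):
// a parameter survives when the schema's `supported_params` is empty,
// meaning everything is supported, or when the set names it explicitly.
//
//     fn param_allowed(supported: &[GenerateParam], p: GenerateParam) -> bool {
//         supported.is_empty() || supported.contains(&p)
//     }
//
// Under this rule `reasoning_params()` drops `Temperature` and `TopP`,
// which is the point of the reasoning set.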

/// Cost model for routing optimization.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CostModel {
    /// USD per 1M input tokens (remote models).
    #[serde(default)]
    pub input_per_mtok: Option<f64>,
    /// USD per 1M output tokens (remote models).
    #[serde(default)]
    pub output_per_mtok: Option<f64>,
    /// On-disk size in MB (local models).
    #[serde(default)]
    pub size_mb: Option<u64>,
    /// RAM required during inference in MB.
    #[serde(default)]
    pub ram_mb: Option<u64>,
}

/// A score on a public benchmark from a published source (model card,
/// paper, leaderboard). The schema is deliberately permissive — no enum
/// of benchmark names — so the catalog can carry whichever benchmarks
/// the upstream provider chose to publish, and new ones can be added
/// without a code change. Scores are stored on a 0.0–1.0 scale (e.g.
/// 73.5% accuracy → 0.735) so they compare cleanly across benchmarks
/// and so `routing_ext::apply_benchmark_priors` can consume them
/// directly when wired in later.
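///
/// As a catalog entry this looks like (values taken from the examples above):
///
/// ```json
/// {
///   "name": "GPQA-Diamond",
///   "score": 0.735,
///   "harness": "0-shot CoT",
///   "measured_at": "2025-08-12"
/// }
/// ```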
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkScore {
    /// Benchmark name as published (e.g., "MMLU-Pro", "GPQA-Diamond",
    /// "SWE-bench-Verified", "HumanEval", "MATH").
    pub name: String,
    /// Score on a 0.0–1.0 scale.
    pub score: f64,
    /// Evaluation harness or setup label (e.g., "5-shot", "0-shot CoT",
    /// "agentic", "pass@1"). Optional but strongly recommended — the
    /// same benchmark name can mean different things under different
    /// harnesses.
    #[serde(default)]
    pub harness: Option<String>,
    /// Where the score came from (model card URL, paper, leaderboard
    /// snapshot). Empty when the source is the upstream provider's
    /// announcement and a stable URL is not yet known.
    #[serde(default)]
    pub source_url: Option<String>,
    /// ISO 8601 date of the score snapshot (e.g., "2025-08-12"). Lets
    /// downstream code judge how stale a number is.
    #[serde(default)]
    pub measured_at: Option<String>,
}

/// The full declarative schema for a model.
///
/// Analogous to `ToolSchema` — describes what a model is, what it can do,
/// and how to access it. The router uses this for constraint-based filtering
/// and cold-start scoring before observed performance data is available.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSchema {
    /// Unique identifier: "provider/model-name:variant" (e.g., "qwen/qwen3-4b:q4_k_m").
    pub id: String,
    /// Human-readable display name.
    pub name: String,
    /// Provider (qwen, openai, anthropic, google, meta, ollama, custom).
    pub provider: String,
    /// Model family for grouping (qwen3, gpt-4, claude-4, llama-3).
    pub family: String,
    /// Semantic version or checkpoint label.
    #[serde(default)]
    pub version: String,
    /// What this model can do — ordered with the primary capability first.
    pub capabilities: Vec<ModelCapability>,
    /// Context window in tokens.
    pub context_length: usize,
    /// Parameter count as human-readable string (e.g., "4B", "30B (3B active)").
    #[serde(default)]
    pub param_count: String,
    /// Quantization (Q4_K_M, Q8_0, F16, none).
    #[serde(default)]
    pub quantization: Option<String>,
    /// Declared performance envelope (initial estimate, overridden by observed data).
    #[serde(default)]
    pub performance: PerformanceEnvelope,
    /// Cost structure.
    #[serde(default)]
    pub cost: CostModel,
    /// How to access this model.
    pub source: ModelSource,
    /// Free-form tags for filtering (e.g., "fast", "multilingual", "moe").
    #[serde(default)]
    pub tags: Vec<String>,
    /// Supported generation parameters. The inference layer strips any parameter
    /// not in this set before sending to the API. Empty = all supported.
    #[serde(default)]
    pub supported_params: Vec<GenerateParam>,
    /// Public benchmark scores as published by the model provider or
    /// reproduced on a public leaderboard (MMLU-Pro, GPQA-Diamond,
    /// SWE-bench, HumanEval, etc.). The built-in catalog ships this
    /// empty — population is a curation step, not a code change. See
    /// `BenchmarkScore` for the field shape and the 0.0–1.0 scoring
    /// convention.
    #[serde(default)]
    pub public_benchmarks: Vec<BenchmarkScore>,
    /// Whether this model is currently available (downloaded / reachable).
    /// Not serialized — computed at runtime.
    #[serde(skip)]
    pub available: bool,
}

impl ModelSchema {
    /// Check if this model has a given capability.
    pub fn has_capability(&self, cap: ModelCapability) -> bool {
        self.capabilities.contains(&cap)
    }

    /// Check if this model is local (runs on-device).
    pub fn is_local(&self) -> bool {
        matches!(
            self.source,
            ModelSource::Local { .. }
                | ModelSource::Mlx { .. }
                | ModelSource::VllmMlx { .. }
                | ModelSource::AppleFoundationModels { .. }
        )
    }

    /// Check if this model delegates inference to a host-registered
    /// runner (closes Parslee-ai/car-releases#24).
    pub fn is_delegated(&self) -> bool {
        matches!(self.source, ModelSource::Delegated { .. })
    }

    /// Check if this model uses the MLX backend.
    pub fn is_mlx(&self) -> bool {
        matches!(self.source, ModelSource::Mlx { .. })
    }

    /// Check if this model routes to Apple's on-device FoundationModels
    /// framework. True only for `ModelSource::AppleFoundationModels`;
    /// callers must still verify runtime availability before dispatch
    /// (the schema can describe the model on any host, but execution
    /// requires macOS 26+ on Apple Silicon).
    pub fn is_foundation_models(&self) -> bool {
        matches!(self.source, ModelSource::AppleFoundationModels { .. })
    }

    /// Check if this model uses vLLM-MLX backend.
    pub fn is_vllm_mlx(&self) -> bool {
        matches!(self.source, ModelSource::VllmMlx { .. })
    }

    /// Check if this model is remote (requires API call).
    pub fn is_remote(&self) -> bool {
        matches!(
            self.source,
            ModelSource::RemoteApi { .. } | ModelSource::Proprietary { .. }
        )
    }

    /// Collect all API key env var names for this model (primary + extras).
    /// Returns empty vec for non-remote models.
    pub fn all_api_key_envs(&self) -> Vec<String> {
        match &self.source {
            ModelSource::RemoteApi {
                api_key_env,
                api_key_envs,
                ..
            } => {
                let mut all = vec![api_key_env.clone()];
                all.extend(api_key_envs.iter().cloned());
                all
            }
            ModelSource::Proprietary {
                auth: ProprietaryAuth::ApiKeyEnv { env_var },
                ..
            }
            | ModelSource::Proprietary {
                auth: ProprietaryAuth::BearerTokenEnv { env_var },
                ..
            } => vec![env_var.clone()],
            _ => vec![],
        }
    }

    /// Get the size in MB (from cost model or 0 if unknown).
    pub fn size_mb(&self) -> u64 {
        self.cost.size_mb.unwrap_or(0)
    }

    /// Get the RAM requirement in MB (from cost model, falls back to size_mb).
    pub fn ram_mb(&self) -> u64 {
        self.cost.ram_mb.unwrap_or_else(|| self.size_mb())
    }

    /// Estimated cost per 1K output tokens in USD. Returns 0.0 for local models.
    pub fn cost_per_1k_output(&self) -> f64 {
        self.cost.output_per_mtok.map(|c| c / 1000.0).unwrap_or(0.0)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn sample_local() -> ModelSchema {
        ModelSchema {
            id: "qwen/qwen3-4b:q4_k_m".into(),
            name: "Qwen3-4B".into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: "1.0".into(),
            capabilities: vec![ModelCapability::Generate, ModelCapability::Code],
            context_length: 32768,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope {
                tokens_per_second: Some(45.0),
                ..Default::default()
            },
            cost: CostModel {
                size_mb: Some(2500),
                ram_mb: Some(2500),
                ..Default::default()
            },
            source: ModelSource::Local {
                hf_repo: "Qwen/Qwen3-4B-GGUF".into(),
                hf_filename: "Qwen3-4B-Q4_K_M.gguf".into(),
                tokenizer_repo: "Qwen/Qwen3-4B".into(),
            },
            tags: vec!["code".into(), "fast".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        }
    }

    fn sample_remote() -> ModelSchema {
        ModelSchema {
            id: "anthropic/claude-sonnet-4-6:latest".into(),
            name: "Claude Sonnet 4.6".into(),
            provider: "anthropic".into(),
            family: "claude-4".into(),
            version: "latest".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Code,
                ModelCapability::Reasoning,
                ModelCapability::ToolUse,
                ModelCapability::Vision,
            ],
            context_length: 200000,
            param_count: String::new(),
            quantization: None,
            performance: PerformanceEnvelope {
                latency_p50_ms: Some(2000),
                latency_p99_ms: Some(8000),
                tokens_per_second: Some(80.0),
            },
            cost: CostModel {
                input_per_mtok: Some(3.0),
                output_per_mtok: Some(15.0),
                ..Default::default()
            },
            source: ModelSource::RemoteApi {
                endpoint: "https://api.anthropic.com/v1/messages".into(),
                api_key_env: "ANTHROPIC_API_KEY".into(),
                api_key_envs: vec![],
                api_version: Some("2023-06-01".into()),
                protocol: ApiProtocol::Anthropic,
            },
            tags: vec!["reasoning".into(), "tool_use".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        }
    }

    #[test]
    fn capabilities() {
        let m = sample_local();
        assert!(m.has_capability(ModelCapability::Code));
        assert!(!m.has_capability(ModelCapability::Vision));
    }

    #[test]
    fn local_vs_remote() {
        assert!(sample_local().is_local());
        assert!(!sample_local().is_remote());
        assert!(sample_remote().is_remote());
        assert!(!sample_remote().is_local());
    }

    #[test]
    fn cost() {
        let local = sample_local();
        assert_eq!(local.cost_per_1k_output(), 0.0);

        let remote = sample_remote();
        assert!(remote.cost_per_1k_output() > 0.0);
    }

    #[test]
    fn serde_roundtrip() {
        let local = sample_local();
        let json = serde_json::to_string(&local).unwrap();
        let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, local.id);
        assert_eq!(parsed.capabilities, local.capabilities);

        let remote = sample_remote();
        let json = serde_json::to_string(&remote).unwrap();
        let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, remote.id);
        // available is skip-serialized, defaults to false
        assert!(!parsed.available);
    }
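
    // Pins the wire format implied by the serde attributes above: capability
    // variants become snake_case strings, and ModelSource is internally
    // tagged through a "type" field.
    #[test]
    fn wire_format() {
        let cap = serde_json::to_string(&ModelCapability::MultiToolCall).unwrap();
        assert_eq!(cap, "\"multi_tool_call\"");

        let json = serde_json::to_string(&sample_local()).unwrap();
        assert!(json.contains("\"type\":\"local\""));
    }

    // Exercises the accessor behaviors documented on the impl: key-env
    // collection for remote sources, the ram_mb -> size_mb fallback, and the
    // per-1K output cost derived from output_per_mtok (15.0 / 1000 = 0.015).
    #[test]
    fn accessors() {
        assert_eq!(
            sample_remote().all_api_key_envs(),
            vec!["ANTHROPIC_API_KEY".to_string()]
        );
        assert!(sample_local().all_api_key_envs().is_empty());

        let mut m = sample_local();
        m.cost.ram_mb = None;
        assert_eq!(m.ram_mb(), m.size_mb());

        let delta = (sample_remote().cost_per_1k_output() - 0.015).abs();
        assert!(delta < 1e-12);
    }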
}