// car_inference/schema.rs
//! Model schema — declarative metadata for models, analogous to ToolSchema for tools.
//!
//! Every model (local GGUF, remote API, Ollama) is described by a `ModelSchema`
//! that declares identity, capabilities, constraints, cost, and source.
//! The router uses this schema for initial routing; observed outcomes refine it.

use serde::{Deserialize, Serialize};
8
/// What a model can do.
///
/// Routing filters candidate models by these declared capabilities; a
/// model may list several, ordered by primary capability first (see
/// `ModelSchema::capabilities`). Serialized in snake_case (e.g.
/// `tool_use`, `video_understanding`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ModelCapability {
    /// Text completion / chat generation
    Generate,
    /// Vector embeddings
    Embed,
    /// Cross-encoder relevance scoring (query + document → relevance
    /// score). Qwen3-Reranker is the canonical local implementation.
    Rerank,
    /// Label assignment / classification
    Classify,
    /// Code generation, repair, refactoring
    Code,
    /// Chain-of-thought, planning, analysis
    Reasoning,
    /// Text condensation
    Summarize,
    /// Function/tool calling
    ToolUse,
    /// Multiple tool calls in a single response (parallel tool execution)
    MultiToolCall,
    /// Vision / image understanding
    Vision,
    /// Video understanding (multi-frame sampling + temporal tokens).
    /// Distinct from `Vision` so routing can prefer video-trained
    /// models when the caller attaches a video content block.
    VideoUnderstanding,
    /// Audio understanding (speech + non-speech audio as an input to
    /// a chat/reasoning model). Distinct from `SpeechToText` which is
    /// the transcription-only task. Gemma 4 E2B/E4B and Gemini do
    /// this; Qwen2.5-VL does not.
    AudioUnderstanding,
    /// Visual grounding — structured object-localization output
    /// (bounding boxes keyed to object labels) in addition to text.
    Grounding,
    /// Speech recognition / transcription
    SpeechToText,
    /// Speech synthesis / text-to-speech
    TextToSpeech,
    /// Image generation
    ImageGeneration,
    /// Video generation
    VideoGeneration,
}
55
/// How to access the model.
///
/// Externally tagged via `type` in snake_case (e.g. `{"type": "local", ...}`),
/// so new sources can be added without breaking existing serialized catalogs.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ModelSource {
    /// Local GGUF file via Candle backend.
    Local {
        /// HuggingFace repo holding the GGUF weights (e.g., "Qwen/Qwen3-4B-GGUF").
        hf_repo: String,
        /// Weight filename within the repo (e.g., "Qwen3-4B-Q4_K_M.gguf").
        hf_filename: String,
        /// Repo to load the tokenizer from — typically the non-GGUF base
        /// repo (e.g., "Qwen/Qwen3-4B").
        tokenizer_repo: String,
    },
    /// Remote API endpoint (OpenAI-compatible, Anthropic, etc.)
    RemoteApi {
        /// Full endpoint URL (e.g., "https://api.anthropic.com/v1/messages").
        endpoint: String,
        /// Environment variable name containing the API key (never the key itself).
        /// The env var value may contain comma-separated keys for load balancing.
        api_key_env: String,
        /// Additional environment variable names for load balancing across multiple keys.
        /// Each env var may also contain comma-separated keys.
        #[serde(default)]
        api_key_envs: Vec<String>,
        /// Optional API version string (e.g., "2023-06-01" for Anthropic).
        #[serde(default)]
        api_version: Option<String>,
        /// Wire protocol the endpoint speaks; selects the request/response handler.
        protocol: ApiProtocol,
    },
    /// Ollama local server.
    Ollama {
        /// Model tag as known to Ollama (e.g., "llama3:8b").
        model_tag: String,
        /// Server base URL; defaults to "http://localhost:11434".
        #[serde(default = "default_ollama_host")]
        host: String,
    },
    /// Local MLX model via mlx-rs backend (Apple Silicon, safetensors format).
    /// Models from mlx-community on HuggingFace.
    Mlx {
        /// HuggingFace repo (e.g., "mlx-community/Qwen3-4B-4bit").
        hf_repo: String,
        /// Optional specific weight filename. If None, auto-discovers safetensors files.
        #[serde(default)]
        hf_weight_file: Option<String>,
    },
    /// Local vLLM-MLX server (Apple Silicon, OpenAI-compatible API).
    /// Routes through RemoteBackend with OpenAI protocol handler.
    VllmMlx {
        /// Server endpoint (e.g., "http://localhost:8000").
        endpoint: String,
        /// The model name as known to vLLM-MLX (e.g., "mlx-community/Qwen3-4B-4bit").
        model_name: String,
    },
    /// Apple's on-device system model via the FoundationModels framework
    /// (macOS 26+, Apple Silicon). Inference happens in-process through a
    /// Swift shim — there is no HTTP, no API key, and no model file: the
    /// OS owns the weights. Availability is checked at runtime via
    /// `@available(macOS 26.0, *)`; on older macOS or non-Apple-Silicon
    /// hosts the backend reports `UnsupportedMode` and the router falls
    /// through to the next candidate.
    AppleFoundationModels {
        /// Optional Apple use-case hint passed through to
        /// `LanguageModelSession`. Apple's framework tunes its prompt and
        /// safety scaffolding per use case (e.g. "general", "summarize").
        /// `None` uses the default.
        #[serde(default)]
        use_case: Option<String>,
    },
    /// Proprietary provider with custom auth and protocol.
    ///
    /// For vendor-specific APIs that aren't generic OpenAI-compatible endpoints.
    /// Parslee is the first proprietary provider — custom auth (OAuth2),
    /// custom response format, multi-provider routing built into the API.
    Proprietary {
        /// Provider identifier (e.g., "parslee").
        provider: String,
        /// Base URL for the API.
        endpoint: String,
        /// Auth configuration.
        auth: ProprietaryAuth,
        /// Custom protocol details.
        protocol: ProprietaryProtocol,
    },
    /// Inference is delegated to a host-registered runner. CAR does
    /// not own the wire format — the runner (typically a JS / Python
    /// host) translates the `GenerateRequest` to its provider's API,
    /// streams chunks back through the runner's event callback, and
    /// returns the final aggregated result.
    ///
    /// Closes Parslee-ai/car-releases#24. Use this when the host
    /// already has an SDK relationship with a provider (Anthropic,
    /// OpenAI, GitHub Models, Vercel AI SDK) and wants CAR to sit in
    /// the lifecycle / policy / replay path without learning every
    /// provider's wire format.
    ///
    /// Routing requires that a runner has been registered via
    /// [`crate::set_inference_runner`] (or its FFI equivalent —
    /// `registerInferenceRunner` on JS, `register_inference_runner`
    /// on Python, the `InferenceRunner` foreign trait on UniFFI,
    /// `inference.register_runner` on the WebSocket protocol).
    /// Without a runner, dispatch fails with `InferenceFailed`.
    Delegated {
        /// Opaque hint passed through to the runner — typically the
        /// provider id (`"anthropic"`, `"openai"`, `"vercel-ai-sdk"`)
        /// so a multi-provider runner can dispatch internally. CAR
        /// does not interpret this string.
        #[serde(default)]
        hint: Option<String>,
    },
}
160
/// Authentication method for proprietary providers.
///
/// Internally tagged with `type`, matching the `ModelSource` convention.
/// Like `RemoteApi`, env-var variants carry the variable *name*, never
/// the secret itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ProprietaryAuth {
    /// OAuth2 PKCE flow (e.g., Azure AD for Parslee).
    OAuth2Pkce {
        /// OAuth2 authority (issuer) to authenticate against.
        authority: String,
        /// Client id registered with the authority.
        client_id: String,
        /// Scopes requested during the flow.
        scopes: Vec<String>,
    },
    /// Static API key from environment variable.
    ApiKeyEnv { env_var: String },
    /// Bearer token from environment variable.
    BearerTokenEnv { env_var: String },
}
176
/// Protocol configuration for proprietary providers.
///
/// Every field has a serde default, so a catalog entry only needs to
/// spell out what differs from "/chat" + "application/json" +
/// non-streaming + no extra headers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProprietaryProtocol {
    /// Chat/completion endpoint path (appended to base URL).
    #[serde(default = "default_chat_path")]
    pub chat_path: String,
    /// Content type for requests.
    #[serde(default = "default_content_type")]
    pub content_type: String,
    /// Whether the API streams responses via SSE.
    #[serde(default)]
    pub streaming: bool,
    /// Custom headers to include in every request.
    #[serde(default)]
    pub extra_headers: std::collections::HashMap<String, String>,
}
193
194impl Default for ProprietaryProtocol {
195 fn default() -> Self {
196 Self {
197 chat_path: default_chat_path(),
198 content_type: default_content_type(),
199 streaming: false,
200 extra_headers: std::collections::HashMap::new(),
201 }
202 }
203}
204
/// Serde default for `ProprietaryProtocol::chat_path`.
fn default_chat_path() -> String {
    String::from("/chat")
}
208
/// Serde default for `ProprietaryProtocol::content_type`.
fn default_content_type() -> String {
    String::from("application/json")
}
212
/// Serde default for `ModelSource::Ollama::host` — the stock local daemon port.
fn default_ollama_host() -> String {
    String::from("http://localhost:11434")
}
216
/// Wire protocol spoken by a `ModelSource::RemoteApi` endpoint; selects
/// the request/response handler on the inference side.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ApiProtocol {
    /// Generic OpenAI-compatible chat-completions API.
    OpenAiCompat,
    /// OpenAI Responses API (/v1/responses) — works with all OpenAI models including codex.
    OpenAiResponses,
    /// Anthropic Messages API (e.g., the /v1/messages endpoint).
    Anthropic,
    /// Google API protocol (presumably Gemini — confirm against the
    /// protocol handler).
    Google,
    /// Azure OpenAI — uses api-key header and deployment-based URLs.
    /// Endpoint format: {base}/openai/deployments/{model}/chat/completions?api-version={version}
    AzureOpenAi,
}
229
/// Declared performance expectations. Overridden by observed data once available.
///
/// Every field is an optional estimate; `Default` means "nothing declared".
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PerformanceEnvelope {
    /// Median latency in milliseconds (declared/estimated).
    #[serde(default)]
    pub latency_p50_ms: Option<u64>,
    /// 99th percentile latency in milliseconds.
    #[serde(default)]
    pub latency_p99_ms: Option<u64>,
    /// Tokens per second throughput.
    #[serde(default)]
    pub tokens_per_second: Option<f64>,
}
243
// NOTE(review): the doc line "Cost model for routing optimization." that
// previously sat here was `CostModel`'s orphaned doc comment (stranded
// when this enum was inserted above it); it belongs on `CostModel` below,
// not on `GenerateParam`, so it has been dropped from this item's docs.
/// Generation parameters that a model may or may not support.
/// Models declare which params they accept. The inference layer
/// strips unsupported params before sending to the API.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GenerateParam {
    Temperature,
    TopP,
    TopK,
    MaxTokens,
    StopSequences,
    FrequencyPenalty,
    PresencePenalty,
    Seed,
    ResponseFormat,
    /// Extended thinking / internal reasoning before responding.
    ExtendedThinking,
}
263
264/// Standard parameter set for most models.
265pub fn standard_params() -> Vec<GenerateParam> {
266 vec![
267 GenerateParam::Temperature,
268 GenerateParam::TopP,
269 GenerateParam::MaxTokens,
270 GenerateParam::StopSequences,
271 GenerateParam::FrequencyPenalty,
272 GenerateParam::PresencePenalty,
273 GenerateParam::Seed,
274 ]
275}
276
277/// Parameter set for reasoning models (no temperature, no top_p).
278pub fn reasoning_params() -> Vec<GenerateParam> {
279 vec![GenerateParam::MaxTokens, GenerateParam::StopSequences]
280}
281
/// Cost model for routing optimization.
///
/// (This doc line previously sat orphaned above `GenerateParam`.) Remote
/// models carry per-token prices; local models carry disk/RAM footprints.
/// All fields are optional — `Default` means "cost unknown".
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CostModel {
    /// USD per 1M input tokens (remote models).
    #[serde(default)]
    pub input_per_mtok: Option<f64>,
    /// USD per 1M output tokens (remote models).
    #[serde(default)]
    pub output_per_mtok: Option<f64>,
    /// On-disk size in MB (local models).
    #[serde(default)]
    pub size_mb: Option<u64>,
    /// RAM required during inference in MB.
    #[serde(default)]
    pub ram_mb: Option<u64>,
}
297
/// A score on a public benchmark from a published source (model card,
/// paper, leaderboard). The schema is deliberately permissive — no enum
/// of benchmark names — so the catalog can carry whichever benchmarks
/// the upstream provider chose to publish, and new ones can be added
/// without a code change. Scores are stored on a 0.0–1.0 scale (e.g.
/// 73.5% accuracy → 0.735) so they compare cleanly across benchmarks
/// and so `routing_ext::apply_benchmark_priors` can consume them
/// directly when wired in later.
///
/// NOTE(review): nothing at this layer validates that `score` is within
/// 0.0–1.0 — confirm whether catalog loading enforces the range.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkScore {
    /// Benchmark name as published (e.g., "MMLU-Pro", "GPQA-Diamond",
    /// "SWE-bench-Verified", "HumanEval", "MATH").
    pub name: String,
    /// Score on a 0.0–1.0 scale.
    pub score: f64,
    /// Evaluation harness or setup label (e.g., "5-shot", "0-shot CoT",
    /// "agentic", "pass@1"). Optional but strongly recommended — the
    /// same benchmark name can mean different things under different
    /// harnesses.
    #[serde(default)]
    pub harness: Option<String>,
    /// Where the score came from (model card URL, paper, leaderboard
    /// snapshot). Empty when the source is the upstream provider's
    /// announcement and a stable URL is not yet known.
    #[serde(default)]
    pub source_url: Option<String>,
    /// ISO 8601 date of the score snapshot (e.g., "2025-08-12"). Lets
    /// downstream code judge how stale a number is.
    #[serde(default)]
    pub measured_at: Option<String>,
}
329
/// The full declarative schema for a model.
///
/// Analogous to `ToolSchema` — describes what a model is, what it can do,
/// and how to access it. The router uses this for constraint-based filtering
/// and cold-start scoring before observed performance data is available.
///
/// Serialization note: `available` is `#[serde(skip)]`, so it always
/// deserializes to `false` and must be recomputed at runtime.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSchema {
    /// Unique identifier: "provider/model-name:variant" (e.g., "qwen/qwen3-4b:q4_k_m").
    pub id: String,
    /// Human-readable display name.
    pub name: String,
    /// Provider (qwen, openai, anthropic, google, meta, ollama, custom).
    pub provider: String,
    /// Model family for grouping (qwen3, gpt-4, claude-4, llama-3).
    pub family: String,
    /// Semantic version or checkpoint label. Empty string when unversioned.
    #[serde(default)]
    pub version: String,
    /// What this model can do — ordered by primary capability first.
    pub capabilities: Vec<ModelCapability>,
    /// Context window in tokens.
    pub context_length: usize,
    /// Parameter count as human-readable string (e.g., "4B", "30B (3B active)").
    #[serde(default)]
    pub param_count: String,
    /// Quantization (Q4_K_M, Q8_0, F16, none).
    #[serde(default)]
    pub quantization: Option<String>,
    /// Declared performance envelope (initial estimate, overridden by observed data).
    #[serde(default)]
    pub performance: PerformanceEnvelope,
    /// Cost structure.
    #[serde(default)]
    pub cost: CostModel,
    /// How to access this model.
    pub source: ModelSource,
    /// Free-form tags for filtering (e.g., "fast", "multilingual", "moe").
    #[serde(default)]
    pub tags: Vec<String>,
    /// Supported generation parameters. The inference layer strips any parameter
    /// not in this set before sending to the API. Empty = all supported.
    #[serde(default)]
    pub supported_params: Vec<GenerateParam>,
    /// Public benchmark scores as published by the model provider or
    /// reproduced on a public leaderboard (MMLU-Pro, GPQA-Diamond,
    /// SWE-bench, HumanEval, etc.). The built-in catalog ships this
    /// empty — population is a curation step, not a code change. See
    /// `BenchmarkScore` for the field shape and the 0.0–1.0 scoring
    /// convention.
    #[serde(default)]
    pub public_benchmarks: Vec<BenchmarkScore>,
    /// Whether this model is currently available (downloaded / reachable).
    /// Not serialized — computed at runtime.
    #[serde(skip)]
    pub available: bool,
}
386
387impl ModelSchema {
388 /// Check if this model has a given capability.
389 pub fn has_capability(&self, cap: ModelCapability) -> bool {
390 self.capabilities.contains(&cap)
391 }
392
393 /// Check if this model is local (runs on-device).
394 pub fn is_local(&self) -> bool {
395 matches!(
396 self.source,
397 ModelSource::Local { .. }
398 | ModelSource::Mlx { .. }
399 | ModelSource::VllmMlx { .. }
400 | ModelSource::AppleFoundationModels { .. }
401 )
402 }
403
404 /// Check if this model delegates inference to a host-registered
405 /// runner (closes Parslee-ai/car-releases#24).
406 pub fn is_delegated(&self) -> bool {
407 matches!(self.source, ModelSource::Delegated { .. })
408 }
409
410 /// Check if this model uses the MLX backend.
411 pub fn is_mlx(&self) -> bool {
412 matches!(self.source, ModelSource::Mlx { .. })
413 }
414
415 /// Check if this model routes to Apple's on-device FoundationModels
416 /// framework. True only for `ModelSource::AppleFoundationModels`;
417 /// callers must still verify runtime availability before dispatch
418 /// (the schema can describe the model on any host, but execution
419 /// requires macOS 26+ on Apple Silicon).
420 pub fn is_foundation_models(&self) -> bool {
421 matches!(self.source, ModelSource::AppleFoundationModels { .. })
422 }
423
424 /// Check if this model uses vLLM-MLX backend.
425 pub fn is_vllm_mlx(&self) -> bool {
426 matches!(self.source, ModelSource::VllmMlx { .. })
427 }
428
429 /// Check if this model is remote (requires API call).
430 pub fn is_remote(&self) -> bool {
431 matches!(
432 self.source,
433 ModelSource::RemoteApi { .. } | ModelSource::Proprietary { .. }
434 )
435 }
436
437 /// Collect all API key env var names for this model (primary + extras).
438 /// Returns empty vec for non-remote models.
439 pub fn all_api_key_envs(&self) -> Vec<String> {
440 match &self.source {
441 ModelSource::RemoteApi {
442 api_key_env,
443 api_key_envs,
444 ..
445 } => {
446 let mut all = vec![api_key_env.clone()];
447 all.extend(api_key_envs.iter().cloned());
448 all
449 }
450 ModelSource::Proprietary {
451 auth: ProprietaryAuth::ApiKeyEnv { env_var },
452 ..
453 }
454 | ModelSource::Proprietary {
455 auth: ProprietaryAuth::BearerTokenEnv { env_var },
456 ..
457 } => vec![env_var.clone()],
458 _ => vec![],
459 }
460 }
461
462 /// Get the size in MB (from cost model or 0 if unknown).
463 pub fn size_mb(&self) -> u64 {
464 self.cost.size_mb.unwrap_or(0)
465 }
466
467 /// Get the RAM requirement in MB (from cost model, falls back to size_mb).
468 pub fn ram_mb(&self) -> u64 {
469 self.cost.ram_mb.unwrap_or_else(|| self.size_mb())
470 }
471
472 /// Estimated cost per 1K output tokens in USD. Returns 0.0 for local models.
473 pub fn cost_per_1k_output(&self) -> f64 {
474 self.cost.output_per_mtok.map(|c| c / 1000.0).unwrap_or(0.0)
475 }
476}
477
#[cfg(test)]
mod tests {
    use super::*;

    /// Local GGUF fixture: Qwen3-4B quantized to Q4_K_M.
    fn local_fixture() -> ModelSchema {
        ModelSchema {
            id: "qwen/qwen3-4b:q4_k_m".into(),
            name: "Qwen3-4B".into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: "1.0".into(),
            capabilities: vec![ModelCapability::Generate, ModelCapability::Code],
            context_length: 32768,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope {
                latency_p50_ms: None,
                latency_p99_ms: None,
                tokens_per_second: Some(45.0),
            },
            cost: CostModel {
                input_per_mtok: None,
                output_per_mtok: None,
                size_mb: Some(2500),
                ram_mb: Some(2500),
            },
            source: ModelSource::Local {
                hf_repo: "Qwen/Qwen3-4B-GGUF".into(),
                hf_filename: "Qwen3-4B-Q4_K_M.gguf".into(),
                tokenizer_repo: "Qwen/Qwen3-4B".into(),
            },
            tags: vec!["code".into(), "fast".into()],
            supported_params: Vec::new(),
            public_benchmarks: Vec::new(),
            available: false,
        }
    }

    /// Remote API fixture: Claude Sonnet 4.6 over the Anthropic protocol.
    fn remote_fixture() -> ModelSchema {
        ModelSchema {
            id: "anthropic/claude-sonnet-4-6:latest".into(),
            name: "Claude Sonnet 4.6".into(),
            provider: "anthropic".into(),
            family: "claude-4".into(),
            version: "latest".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Code,
                ModelCapability::Reasoning,
                ModelCapability::ToolUse,
                ModelCapability::Vision,
            ],
            context_length: 200000,
            param_count: String::new(),
            quantization: None,
            performance: PerformanceEnvelope {
                latency_p50_ms: Some(2000),
                latency_p99_ms: Some(8000),
                tokens_per_second: Some(80.0),
            },
            cost: CostModel {
                input_per_mtok: Some(3.0),
                output_per_mtok: Some(15.0),
                size_mb: None,
                ram_mb: None,
            },
            source: ModelSource::RemoteApi {
                endpoint: "https://api.anthropic.com/v1/messages".into(),
                api_key_env: "ANTHROPIC_API_KEY".into(),
                api_key_envs: vec![],
                api_version: Some("2023-06-01".into()),
                protocol: ApiProtocol::Anthropic,
            },
            tags: vec!["reasoning".into(), "tool_use".into()],
            supported_params: Vec::new(),
            public_benchmarks: Vec::new(),
            available: false,
        }
    }

    #[test]
    fn capabilities() {
        let model = local_fixture();
        assert!(model.has_capability(ModelCapability::Code));
        assert!(!model.has_capability(ModelCapability::Vision));
    }

    #[test]
    fn local_vs_remote() {
        let local = local_fixture();
        let remote = remote_fixture();
        assert!(local.is_local() && !local.is_remote());
        assert!(remote.is_remote() && !remote.is_local());
    }

    #[test]
    fn cost() {
        // Local models have no declared output price → exactly 0.0.
        assert_eq!(local_fixture().cost_per_1k_output(), 0.0);
        assert!(remote_fixture().cost_per_1k_output() > 0.0);
    }

    #[test]
    fn serde_roundtrip() {
        for schema in [local_fixture(), remote_fixture()] {
            let json = serde_json::to_string(&schema).unwrap();
            let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
            assert_eq!(parsed.id, schema.id);
            assert_eq!(parsed.capabilities, schema.capabilities);
            // `available` is #[serde(skip)], so it always parses as false.
            assert!(!parsed.available);
        }
    }
}
594}