// car_inference/schema.rs
//! Model schema — declarative metadata for models, analogous to ToolSchema for tools.
//!
//! Every model (local GGUF, remote API, Ollama) is described by a `ModelSchema`
//! that declares identity, capabilities, constraints, cost, and source.
//! The router uses this schema for initial routing; observed outcomes refine it.

use serde::{Deserialize, Serialize};
8
/// What a model can do.
///
/// Routing filters candidate models by these declared capabilities; a
/// model may list several, ordered by primary capability first (see
/// `ModelSchema::capabilities`). Serialized in snake_case (e.g.
/// `tool_use`, `video_understanding`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ModelCapability {
    /// Text completion / chat generation
    Generate,
    /// Vector embeddings
    Embed,
    /// Cross-encoder relevance scoring (query + document → relevance
    /// score). Qwen3-Reranker is the canonical local implementation.
    Rerank,
    /// Label assignment / classification
    Classify,
    /// Code generation, repair, refactoring
    Code,
    /// Chain-of-thought, planning, analysis
    Reasoning,
    /// Text condensation
    Summarize,
    /// Function/tool calling
    ToolUse,
    /// Multiple tool calls in a single response (parallel tool execution)
    MultiToolCall,
    /// Vision / image understanding
    Vision,
    /// Video understanding (multi-frame sampling + temporal tokens).
    /// Distinct from `Vision` so routing can prefer video-trained
    /// models when the caller attaches a video content block.
    VideoUnderstanding,
    /// Audio understanding (speech + non-speech audio as an input to
    /// a chat/reasoning model). Distinct from `SpeechToText` which is
    /// the transcription-only task. Gemma 4 E2B/E4B and Gemini do
    /// this; Qwen2.5-VL does not.
    AudioUnderstanding,
    /// Visual grounding — structured object-localization output
    /// (bounding boxes keyed to object labels) in addition to text.
    Grounding,
    /// Speech recognition / transcription
    SpeechToText,
    /// Speech synthesis / text-to-speech
    TextToSpeech,
    /// Image generation
    ImageGeneration,
    /// Video generation
    VideoGeneration,
}
55
/// How to access the model.
///
/// Externally tagged via `type` in snake_case (e.g. `{"type": "local", ...}`),
/// so new sources can be added without breaking existing serialized catalogs.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ModelSource {
    /// Local GGUF file via Candle backend.
    Local {
        /// HuggingFace repo holding the GGUF weights (e.g., "Qwen/Qwen3-4B-GGUF").
        hf_repo: String,
        /// Weight filename within the repo (e.g., "Qwen3-4B-Q4_K_M.gguf").
        hf_filename: String,
        /// Repo to load the tokenizer from — typically the non-GGUF base
        /// repo (e.g., "Qwen/Qwen3-4B").
        tokenizer_repo: String,
    },
    /// Remote API endpoint (OpenAI-compatible, Anthropic, etc.)
    RemoteApi {
        /// Full endpoint URL (e.g., "https://api.anthropic.com/v1/messages").
        endpoint: String,
        /// Environment variable name containing the API key (never the key itself).
        /// The env var value may contain comma-separated keys for load balancing.
        api_key_env: String,
        /// Additional environment variable names for load balancing across multiple keys.
        /// Each env var may also contain comma-separated keys.
        #[serde(default)]
        api_key_envs: Vec<String>,
        /// Optional API version string (e.g., "2023-06-01" for Anthropic).
        #[serde(default)]
        api_version: Option<String>,
        /// Wire protocol the endpoint speaks; selects the request/response handler.
        protocol: ApiProtocol,
    },
    /// Ollama local server.
    Ollama {
        /// Model tag as known to Ollama (e.g., "llama3:8b").
        model_tag: String,
        /// Server base URL; defaults to "http://localhost:11434".
        #[serde(default = "default_ollama_host")]
        host: String,
    },
    /// Local MLX model via mlx-rs backend (Apple Silicon, safetensors format).
    /// Models from mlx-community on HuggingFace.
    Mlx {
        /// HuggingFace repo (e.g., "mlx-community/Qwen3-4B-4bit").
        hf_repo: String,
        /// Optional specific weight filename. If None, auto-discovers safetensors files.
        #[serde(default)]
        hf_weight_file: Option<String>,
    },
    /// Local vLLM-MLX server (Apple Silicon, OpenAI-compatible API).
    /// Routes through RemoteBackend with OpenAI protocol handler.
    VllmMlx {
        /// Server endpoint (e.g., "http://localhost:8000").
        endpoint: String,
        /// The model name as known to vLLM-MLX (e.g., "mlx-community/Qwen3-4B-4bit").
        model_name: String,
    },
    /// Apple's on-device system model via the FoundationModels framework
    /// (macOS 26+, Apple Silicon). Inference happens in-process through a
    /// Swift shim — there is no HTTP, no API key, and no model file: the
    /// OS owns the weights. Availability is checked at runtime via
    /// `@available(macOS 26.0, *)`; on older macOS or non-Apple-Silicon
    /// hosts the backend reports `UnsupportedMode` and the router falls
    /// through to the next candidate.
    AppleFoundationModels {
        /// Optional Apple use-case hint passed through to
        /// `LanguageModelSession`. Apple's framework tunes its prompt and
        /// safety scaffolding per use case (e.g. "general", "summarize").
        /// `None` uses the default.
        #[serde(default)]
        use_case: Option<String>,
    },
    /// Proprietary provider with custom auth and protocol.
    ///
    /// For vendor-specific APIs that aren't generic OpenAI-compatible endpoints.
    /// Parslee is the first proprietary provider — custom auth (OAuth2),
    /// custom response format, multi-provider routing built into the API.
    Proprietary {
        /// Provider identifier (e.g., "parslee").
        provider: String,
        /// Base URL for the API.
        endpoint: String,
        /// Auth configuration.
        auth: ProprietaryAuth,
        /// Custom protocol details.
        protocol: ProprietaryProtocol,
    },
    /// Inference is delegated to a host-registered runner. CAR does
    /// not own the wire format — the runner (typically a JS / Python
    /// host) translates the `GenerateRequest` to its provider's API,
    /// streams chunks back through the runner's event callback, and
    /// returns the final aggregated result.
    ///
    /// Closes Parslee-ai/car-releases#24. Use this when the host
    /// already has an SDK relationship with a provider (Anthropic,
    /// OpenAI, GitHub Models, Vercel AI SDK) and wants CAR to sit in
    /// the lifecycle / policy / replay path without learning every
    /// provider's wire format.
    ///
    /// Routing requires that a runner has been registered via
    /// [`crate::set_inference_runner`] (or its FFI equivalent —
    /// `registerInferenceRunner` on JS, `register_inference_runner`
    /// on Python, the `InferenceRunner` foreign trait on UniFFI,
    /// `inference.register_runner` on the WebSocket protocol).
    /// Without a runner, dispatch fails with `InferenceFailed`.
    Delegated {
        /// Opaque hint passed through to the runner — typically the
        /// provider id (`"anthropic"`, `"openai"`, `"vercel-ai-sdk"`)
        /// so a multi-provider runner can dispatch internally. CAR
        /// does not interpret this string.
        #[serde(default)]
        hint: Option<String>,
    },
}
160
/// Authentication method for proprietary providers.
///
/// Internally tagged with `type`, matching the `ModelSource` convention.
/// Like `RemoteApi`, env-var variants carry the variable *name*, never
/// the secret itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ProprietaryAuth {
    /// OAuth2 PKCE flow (e.g., Azure AD for Parslee).
    OAuth2Pkce {
        /// OAuth2 authority (issuer) to authenticate against.
        authority: String,
        /// Client id registered with the authority.
        client_id: String,
        /// Scopes requested during the flow.
        scopes: Vec<String>,
    },
    /// Static API key from environment variable.
    ApiKeyEnv { env_var: String },
    /// Bearer token from environment variable.
    BearerTokenEnv { env_var: String },
}
176
/// Protocol configuration for proprietary providers.
///
/// Every field has a serde default, so a catalog entry only needs to
/// spell out what differs from "/chat" + "application/json" +
/// non-streaming + no extra headers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProprietaryProtocol {
    /// Chat/completion endpoint path (appended to base URL).
    #[serde(default = "default_chat_path")]
    pub chat_path: String,
    /// Content type for requests.
    #[serde(default = "default_content_type")]
    pub content_type: String,
    /// Whether the API streams responses via SSE.
    #[serde(default)]
    pub streaming: bool,
    /// Custom headers to include in every request.
    #[serde(default)]
    pub extra_headers: std::collections::HashMap<String, String>,
}
193
194impl Default for ProprietaryProtocol {
195 fn default() -> Self {
196 Self {
197 chat_path: default_chat_path(),
198 content_type: default_content_type(),
199 streaming: false,
200 extra_headers: std::collections::HashMap::new(),
201 }
202 }
203}
204
/// Serde default for `ProprietaryProtocol::chat_path`.
fn default_chat_path() -> String {
    String::from("/chat")
}
208
/// Serde default for `ProprietaryProtocol::content_type`.
fn default_content_type() -> String {
    String::from("application/json")
}
212
/// Serde default for `ModelSource::Ollama::host` — the stock local daemon port.
fn default_ollama_host() -> String {
    String::from("http://localhost:11434")
}
216
/// Wire protocol spoken by a `ModelSource::RemoteApi` endpoint; selects
/// the request/response handler on the inference side.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ApiProtocol {
    /// Generic OpenAI-compatible chat-completions API.
    OpenAiCompat,
    /// OpenAI Responses API (/v1/responses) — works with all OpenAI models including codex.
    OpenAiResponses,
    /// Anthropic Messages API (e.g., the /v1/messages endpoint).
    Anthropic,
    /// Google API protocol (presumably Gemini — confirm against the
    /// protocol handler).
    Google,
    /// Azure OpenAI — uses api-key header and deployment-based URLs.
    /// Endpoint format: {base}/openai/deployments/{model}/chat/completions?api-version={version}
    AzureOpenAi,
}
229
/// Declared performance expectations. Overridden by observed data once available.
///
/// Every field is an optional estimate; `Default` means "nothing declared".
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PerformanceEnvelope {
    /// Median latency in milliseconds (declared/estimated).
    #[serde(default)]
    pub latency_p50_ms: Option<u64>,
    /// 99th percentile latency in milliseconds.
    #[serde(default)]
    pub latency_p99_ms: Option<u64>,
    /// Tokens per second throughput.
    #[serde(default)]
    pub tokens_per_second: Option<f64>,
}
243
// NOTE(review): the doc line "Cost model for routing optimization." that
// previously sat here was `CostModel`'s orphaned doc comment (stranded
// when this enum was inserted above it); it belongs on `CostModel` below,
// not on `GenerateParam`, so it has been dropped from this item's docs.
/// Generation parameters that a model may or may not support.
/// Models declare which params they accept. The inference layer
/// strips unsupported params before sending to the API.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GenerateParam {
    Temperature,
    TopP,
    TopK,
    MaxTokens,
    StopSequences,
    FrequencyPenalty,
    PresencePenalty,
    Seed,
    ResponseFormat,
    /// Extended thinking / internal reasoning before responding.
    ExtendedThinking,
}
263
264/// Standard parameter set for most models.
265pub fn standard_params() -> Vec<GenerateParam> {
266 vec![
267 GenerateParam::Temperature,
268 GenerateParam::TopP,
269 GenerateParam::MaxTokens,
270 GenerateParam::StopSequences,
271 GenerateParam::FrequencyPenalty,
272 GenerateParam::PresencePenalty,
273 GenerateParam::Seed,
274 ]
275}
276
277/// Parameter set for reasoning models (no temperature, no top_p).
278pub fn reasoning_params() -> Vec<GenerateParam> {
279 vec![GenerateParam::MaxTokens, GenerateParam::StopSequences]
280}
281
/// Cost model for routing optimization.
///
/// (This doc line previously sat orphaned above `GenerateParam`.) Remote
/// models carry per-token prices; local models carry disk/RAM footprints.
/// All fields are optional — `Default` means "cost unknown".
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CostModel {
    /// USD per 1M input tokens (remote models).
    #[serde(default)]
    pub input_per_mtok: Option<f64>,
    /// USD per 1M output tokens (remote models).
    #[serde(default)]
    pub output_per_mtok: Option<f64>,
    /// On-disk size in MB (local models).
    #[serde(default)]
    pub size_mb: Option<u64>,
    /// RAM required during inference in MB.
    #[serde(default)]
    pub ram_mb: Option<u64>,
}
297
/// A score on a public benchmark from a published source (model card,
/// paper, leaderboard). The schema is deliberately permissive — no enum
/// of benchmark names — so the catalog can carry whichever benchmarks
/// the upstream provider chose to publish, and new ones can be added
/// without a code change. Scores are stored on a 0.0–1.0 scale (e.g.
/// 73.5% accuracy → 0.735) so they compare cleanly across benchmarks
/// and so `routing_ext::apply_benchmark_priors` can consume them
/// directly when wired in later.
///
/// NOTE(review): nothing at this layer validates that `score` is within
/// 0.0–1.0 — confirm whether catalog loading enforces the range.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkScore {
    /// Benchmark name as published (e.g., "MMLU-Pro", "GPQA-Diamond",
    /// "SWE-bench-Verified", "HumanEval", "MATH").
    pub name: String,
    /// Score on a 0.0–1.0 scale.
    pub score: f64,
    /// Evaluation harness or setup label (e.g., "5-shot", "0-shot CoT",
    /// "agentic", "pass@1"). Optional but strongly recommended — the
    /// same benchmark name can mean different things under different
    /// harnesses.
    #[serde(default)]
    pub harness: Option<String>,
    /// Where the score came from (model card URL, paper, leaderboard
    /// snapshot). Empty when the source is the upstream provider's
    /// announcement and a stable URL is not yet known.
    #[serde(default)]
    pub source_url: Option<String>,
    /// ISO 8601 date of the score snapshot (e.g., "2025-08-12"). Lets
    /// downstream code judge how stale a number is.
    #[serde(default)]
    pub measured_at: Option<String>,
}
329
/// The full declarative schema for a model.
///
/// Analogous to `ToolSchema` — describes what a model is, what it can do,
/// and how to access it. The router uses this for constraint-based filtering
/// and cold-start scoring before observed performance data is available.
///
/// Serialization note: `available` is `#[serde(skip)]`, so it always
/// deserializes to `false` and must be recomputed at runtime.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSchema {
    /// Unique identifier: "provider/model-name:variant" (e.g., "qwen/qwen3-4b:q4_k_m").
    pub id: String,
    /// Human-readable display name.
    pub name: String,
    /// Provider (qwen, openai, anthropic, google, meta, ollama, custom).
    pub provider: String,
    /// Model family for grouping (qwen3, gpt-4, claude-4, llama-3).
    pub family: String,
    /// Semantic version or checkpoint label. Empty string when unversioned.
    #[serde(default)]
    pub version: String,
    /// What this model can do — ordered by primary capability first.
    pub capabilities: Vec<ModelCapability>,
    /// Context window in tokens.
    pub context_length: usize,
    /// Parameter count as human-readable string (e.g., "4B", "30B (3B active)").
    #[serde(default)]
    pub param_count: String,
    /// Quantization (Q4_K_M, Q8_0, F16, none).
    #[serde(default)]
    pub quantization: Option<String>,
    /// Declared performance envelope (initial estimate, overridden by observed data).
    #[serde(default)]
    pub performance: PerformanceEnvelope,
    /// Cost structure.
    #[serde(default)]
    pub cost: CostModel,
    /// How to access this model.
    pub source: ModelSource,
    /// Free-form tags for filtering (e.g., "fast", "multilingual", "moe").
    #[serde(default)]
    pub tags: Vec<String>,
    /// Supported generation parameters. The inference layer strips any parameter
    /// not in this set before sending to the API. Empty = all supported.
    #[serde(default)]
    pub supported_params: Vec<GenerateParam>,
    /// Public benchmark scores as published by the model provider or
    /// reproduced on a public leaderboard (MMLU-Pro, GPQA-Diamond,
    /// SWE-bench, HumanEval, etc.). The built-in catalog ships this
    /// empty — population is a curation step, not a code change. See
    /// `BenchmarkScore` for the field shape and the 0.0–1.0 scoring
    /// convention.
    #[serde(default)]
    pub public_benchmarks: Vec<BenchmarkScore>,
    /// Whether this model is currently available (downloaded / reachable).
    /// Not serialized — computed at runtime.
    #[serde(skip)]
    pub available: bool,
}
386
387impl ModelSchema {
388 /// Check if this model has a given capability.
389 pub fn has_capability(&self, cap: ModelCapability) -> bool {
390 self.capabilities.contains(&cap)
391 }
392
393 /// Check if this model is local (runs on-device).
394 pub fn is_local(&self) -> bool {
395 matches!(
396 self.source,
397 ModelSource::Local { .. }
398 | ModelSource::Mlx { .. }
399 | ModelSource::VllmMlx { .. }
400 | ModelSource::AppleFoundationModels { .. }
401 )
402 }
403
404 /// Check if this model delegates inference to a host-registered
405 /// runner (closes Parslee-ai/car-releases#24).
406 pub fn is_delegated(&self) -> bool {
407 matches!(self.source, ModelSource::Delegated { .. })
408 }
409
410 /// Check if this model uses the MLX backend.
411 pub fn is_mlx(&self) -> bool {
412 matches!(self.source, ModelSource::Mlx { .. })
413 }
414
415 /// Check if this model routes to Apple's on-device FoundationModels
416 /// framework. True only for `ModelSource::AppleFoundationModels`;
417 /// callers must still verify runtime availability before dispatch
418 /// (the schema can describe the model on any host, but execution
419 /// requires macOS 26+ on Apple Silicon).
420 pub fn is_foundation_models(&self) -> bool {
421 matches!(self.source, ModelSource::AppleFoundationModels { .. })
422 }
423
424 /// Check if this model uses vLLM-MLX backend.
425 pub fn is_vllm_mlx(&self) -> bool {
426 matches!(self.source, ModelSource::VllmMlx { .. })
427 }
428
429 /// Check if this model is remote (requires API call).
430 pub fn is_remote(&self) -> bool {
431 matches!(
432 self.source,
433 ModelSource::RemoteApi { .. } | ModelSource::Proprietary { .. }
434 )
435 }
436
437 /// Collect all API key env var names for this model (primary + extras).
438 /// Returns empty vec for non-remote models.
439 pub fn all_api_key_envs(&self) -> Vec<String> {
440 match &self.source {
441 ModelSource::RemoteApi {
442 api_key_env,
443 api_key_envs,
444 ..
445 } => {
446 let mut all = vec![api_key_env.clone()];
447 all.extend(api_key_envs.iter().cloned());
448 all
449 }
450 ModelSource::Proprietary {
451 auth: ProprietaryAuth::ApiKeyEnv { env_var },
452 ..
453 }
454 | ModelSource::Proprietary {
455 auth: ProprietaryAuth::BearerTokenEnv { env_var },
456 ..
457 } => vec![env_var.clone()],
458 _ => vec![],
459 }
460 }
461
462 /// Get the size in MB (from cost model or 0 if unknown).
463 pub fn size_mb(&self) -> u64 {
464 self.cost.size_mb.unwrap_or(0)
465 }
466
467 /// Get the RAM requirement in MB (from cost model, falls back to size_mb).
468 pub fn ram_mb(&self) -> u64 {
469 self.cost.ram_mb.unwrap_or_else(|| self.size_mb())
470 }
471
472 /// Estimated cost per 1K output tokens in USD. Returns 0.0 for local models.
473 pub fn cost_per_1k_output(&self) -> f64 {
474 self.cost.output_per_mtok.map(|c| c / 1000.0).unwrap_or(0.0)
475 }
476}
477
#[cfg(test)]
mod tests {
    use super::*;

    /// Local GGUF fixture: Qwen3-4B quantized to Q4_K_M.
    fn local_fixture() -> ModelSchema {
        ModelSchema {
            id: "qwen/qwen3-4b:q4_k_m".into(),
            name: "Qwen3-4B".into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: "1.0".into(),
            capabilities: vec![ModelCapability::Generate, ModelCapability::Code],
            context_length: 32768,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope {
                latency_p50_ms: None,
                latency_p99_ms: None,
                tokens_per_second: Some(45.0),
            },
            cost: CostModel {
                input_per_mtok: None,
                output_per_mtok: None,
                size_mb: Some(2500),
                ram_mb: Some(2500),
            },
            source: ModelSource::Local {
                hf_repo: "Qwen/Qwen3-4B-GGUF".into(),
                hf_filename: "Qwen3-4B-Q4_K_M.gguf".into(),
                tokenizer_repo: "Qwen/Qwen3-4B".into(),
            },
            tags: vec!["code".into(), "fast".into()],
            supported_params: Vec::new(),
            public_benchmarks: Vec::new(),
            available: false,
        }
    }

    /// Remote API fixture: Claude Sonnet 4.6 over the Anthropic protocol.
    fn remote_fixture() -> ModelSchema {
        ModelSchema {
            id: "anthropic/claude-sonnet-4-6:latest".into(),
            name: "Claude Sonnet 4.6".into(),
            provider: "anthropic".into(),
            family: "claude-4".into(),
            version: "latest".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Code,
                ModelCapability::Reasoning,
                ModelCapability::ToolUse,
                ModelCapability::Vision,
            ],
            context_length: 200000,
            param_count: String::new(),
            quantization: None,
            performance: PerformanceEnvelope {
                latency_p50_ms: Some(2000),
                latency_p99_ms: Some(8000),
                tokens_per_second: Some(80.0),
            },
            cost: CostModel {
                input_per_mtok: Some(3.0),
                output_per_mtok: Some(15.0),
                size_mb: None,
                ram_mb: None,
            },
            source: ModelSource::RemoteApi {
                endpoint: "https://api.anthropic.com/v1/messages".into(),
                api_key_env: "ANTHROPIC_API_KEY".into(),
                api_key_envs: vec![],
                api_version: Some("2023-06-01".into()),
                protocol: ApiProtocol::Anthropic,
            },
            tags: vec!["reasoning".into(), "tool_use".into()],
            supported_params: Vec::new(),
            public_benchmarks: Vec::new(),
            available: false,
        }
    }

    #[test]
    fn capabilities() {
        let model = local_fixture();
        assert!(model.has_capability(ModelCapability::Code));
        assert!(!model.has_capability(ModelCapability::Vision));
    }

    #[test]
    fn local_vs_remote() {
        let local = local_fixture();
        let remote = remote_fixture();
        assert!(local.is_local() && !local.is_remote());
        assert!(remote.is_remote() && !remote.is_local());
    }

    #[test]
    fn cost() {
        // Local models have no declared output price → exactly 0.0.
        assert_eq!(local_fixture().cost_per_1k_output(), 0.0);
        assert!(remote_fixture().cost_per_1k_output() > 0.0);
    }

    #[test]
    fn serde_roundtrip() {
        for schema in [local_fixture(), remote_fixture()] {
            let json = serde_json::to_string(&schema).unwrap();
            let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
            assert_eq!(parsed.id, schema.id);
            assert_eq!(parsed.capabilities, schema.capabilities);
            // `available` is #[serde(skip)], so it always parses as false.
            assert!(!parsed.available);
        }
    }
}
594}