car-inference 0.15.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Model schema — declarative metadata for models, analogous to ToolSchema for tools.
//!
//! Every model (local GGUF, remote API, Ollama) is described by a `ModelSchema`
//! that declares identity, capabilities, constraints, cost, and source.
//! The router uses this schema for initial routing; observed outcomes refine it.

use serde::{Deserialize, Serialize};

/// What a model can do.
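///
/// Capabilities serialize as snake_case strings, so a hand-written JSON
/// catalog lists them as, e.g.:
///
/// ```json
/// ["generate", "code", "tool_use", "multi_tool_call"]
/// ```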
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ModelCapability {
    /// Text completion / chat generation
    Generate,
    /// Vector embeddings
    Embed,
    /// Cross-encoder relevance scoring (query + document → relevance
    /// score). Qwen3-Reranker is the canonical local implementation.
    Rerank,
    /// Label assignment / classification
    Classify,
    /// Code generation, repair, refactoring
    Code,
    /// Chain-of-thought, planning, analysis
    Reasoning,
    /// Text condensation
    Summarize,
    /// Function/tool calling
    ToolUse,
    /// Multiple tool calls in a single response (parallel tool execution)
    MultiToolCall,
    /// Vision / image understanding
    Vision,
    /// Video understanding (multi-frame sampling + temporal tokens).
    /// Distinct from `Vision` so routing can prefer video-trained
    /// models when the caller attaches a video content block.
    VideoUnderstanding,
    /// Audio understanding (speech + non-speech audio as an input to
    /// a chat/reasoning model). Distinct from `SpeechToText` which is
    /// the transcription-only task. Gemma 4 E2B/E4B and Gemini do
    /// this; Qwen2.5-VL does not.
    AudioUnderstanding,
    /// Visual grounding — structured object-localization output
    /// (bounding boxes keyed to object labels) in addition to text.
    Grounding,
    /// Speech recognition / transcription
    SpeechToText,
    /// Speech synthesis / text-to-speech
    TextToSpeech,
    /// Image generation
    ImageGeneration,
    /// Video generation
    VideoGeneration,
}

/// How to access the model.
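///
/// Serialized internally tagged (`tag = "type"`): a catalog entry's `source`
/// carries a snake_case discriminant alongside the variant's fields. Field
/// values below are illustrative:
///
/// ```json
/// { "type": "ollama", "model_tag": "qwen3:4b", "host": "http://localhost:11434" }
/// ```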
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ModelSource {
    /// Local GGUF file via Candle backend.
    Local {
        hf_repo: String,
        hf_filename: String,
        tokenizer_repo: String,
    },
    /// Remote API endpoint (OpenAI-compatible, Anthropic, etc.)
    RemoteApi {
        endpoint: String,
        /// Environment variable name containing the API key (never the key itself).
        /// The env var value may contain comma-separated keys for load balancing.
        api_key_env: String,
        /// Additional environment variable names for load balancing across multiple keys.
        /// Each env var may also contain comma-separated keys.
        #[serde(default)]
        api_key_envs: Vec<String>,
        #[serde(default)]
        api_version: Option<String>,
        protocol: ApiProtocol,
    },
    /// Ollama local server.
    Ollama {
        model_tag: String,
        #[serde(default = "default_ollama_host")]
        host: String,
    },
    /// Local MLX model via mlx-rs backend (Apple Silicon, safetensors format).
    /// Models from mlx-community on HuggingFace.
    Mlx {
        /// HuggingFace repo (e.g., "mlx-community/Qwen3-4B-4bit").
        hf_repo: String,
        /// Optional specific weight filename. If None, auto-discovers safetensors files.
        #[serde(default)]
        hf_weight_file: Option<String>,
    },
    /// Local vLLM-MLX server (Apple Silicon, OpenAI-compatible API).
    /// Routes through RemoteBackend with OpenAI protocol handler.
    VllmMlx {
        /// Server endpoint (e.g., "http://localhost:8000").
        endpoint: String,
        /// The model name as known to vLLM-MLX (e.g., "mlx-community/Qwen3-4B-4bit").
        model_name: String,
    },
    /// Apple's on-device system model via the FoundationModels framework
    /// (macOS 26+, Apple Silicon). Inference happens in-process through a
    /// Swift shim — there is no HTTP, no API key, and no model file: the
    /// OS owns the weights. Availability is checked at runtime via
    /// `@available(macOS 26.0, *)`; on older macOS or non-Apple-Silicon
    /// hosts the backend reports `UnsupportedMode` and the router falls
    /// through to the next candidate.
    AppleFoundationModels {
        /// Optional Apple use-case hint passed through to
        /// `LanguageModelSession`. Apple's framework tunes its prompt and
        /// safety scaffolding per use case (e.g. "general", "summarize").
        /// `None` uses the default.
        #[serde(default)]
        use_case: Option<String>,
    },
    /// Proprietary provider with custom auth and protocol.
    ///
    /// For vendor-specific APIs that aren't generic OpenAI-compatible endpoints.
    /// Parslee is the first proprietary provider — custom auth (OAuth2),
    /// custom response format, multi-provider routing built into the API.
    Proprietary {
        /// Provider identifier (e.g., "parslee").
        provider: String,
        /// Base URL for the API.
        endpoint: String,
        /// Auth configuration.
        auth: ProprietaryAuth,
        /// Custom protocol details.
        protocol: ProprietaryProtocol,
    },
    /// Inference is delegated to a host-registered runner. CAR does
    /// not own the wire format — the runner (typically a JS / Python
    /// host) translates the `GenerateRequest` to its provider's API,
    /// streams chunks back through the runner's event callback, and
    /// returns the final aggregated result.
    ///
    /// Closes Parslee-ai/car-releases#24. Use this when the host
    /// already has an SDK relationship with a provider (Anthropic,
    /// OpenAI, GitHub Models, Vercel AI SDK) and wants CAR to sit in
    /// the lifecycle / policy / replay path without learning every
    /// provider's wire format.
    ///
    /// Routing requires that a runner has been registered via
    /// [`crate::set_inference_runner`] (or its FFI equivalent —
    /// `registerInferenceRunner` on JS, `register_inference_runner`
    /// on Python, the `InferenceRunner` foreign trait on UniFFI,
    /// `inference.register_runner` on the WebSocket protocol).
    /// Without a runner, dispatch fails with `InferenceFailed`.
    Delegated {
        /// Opaque hint passed through to the runner — typically the
        /// provider id (`"anthropic"`, `"openai"`, `"vercel-ai-sdk"`)
        /// so a multi-provider runner can dispatch internally. CAR
        /// does not interpret this string.
        #[serde(default)]
        hint: Option<String>,
    },
}

/// Authentication method for proprietary providers.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ProprietaryAuth {
    /// OAuth2 PKCE flow (e.g., Azure AD for Parslee).
    OAuth2Pkce {
        authority: String,
        client_id: String,
        scopes: Vec<String>,
    },
    /// Static API key from environment variable.
    ApiKeyEnv { env_var: String },
    /// Bearer token from environment variable.
    BearerTokenEnv { env_var: String },
}

/// Protocol configuration for proprietary providers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProprietaryProtocol {
    /// Chat/completion endpoint path (appended to base URL).
    #[serde(default = "default_chat_path")]
    pub chat_path: String,
    /// Content type for requests.
    #[serde(default = "default_content_type")]
    pub content_type: String,
    /// Whether the API streams responses via SSE.
    #[serde(default)]
    pub streaming: bool,
    /// Custom headers to include in every request.
    #[serde(default)]
    pub extra_headers: std::collections::HashMap<String, String>,
}

impl Default for ProprietaryProtocol {
    fn default() -> Self {
        Self {
            chat_path: default_chat_path(),
            content_type: default_content_type(),
            streaming: false,
            extra_headers: std::collections::HashMap::new(),
        }
    }
}

fn default_chat_path() -> String {
    "/chat".to_string()
}

fn default_content_type() -> String {
    "application/json".to_string()
}

fn default_ollama_host() -> String {
    "http://localhost:11434".to_string()
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ApiProtocol {
    OpenAiCompat,
    /// OpenAI Responses API (/v1/responses) — works with all OpenAI models including codex.
    OpenAiResponses,
    Anthropic,
    Google,
    /// Azure OpenAI — uses api-key header and deployment-based URLs.
    /// Endpoint format: {base}/openai/deployments/{model}/chat/completions?api-version={version}
    AzureOpenAi,
}
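
// Concretely, the AzureOpenAi endpoint template above expands to a URL of
// this shape (resource, deployment, and api-version values are illustrative):
//
//     https://myresource.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-06-01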

/// Declared performance expectations. Overridden by observed data once available.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PerformanceEnvelope {
    /// Median latency in milliseconds (declared/estimated).
    #[serde(default)]
    pub latency_p50_ms: Option<u64>,
    /// 99th percentile latency in milliseconds.
    #[serde(default)]
    pub latency_p99_ms: Option<u64>,
    /// Tokens per second throughput.
    #[serde(default)]
    pub tokens_per_second: Option<f64>,
}

/// Generation parameters that a model may or may not support.
/// Models declare which params they accept. The inference layer
/// strips unsupported params before sending to the API.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum GenerateParam {
    Temperature,
    TopP,
    TopK,
    MaxTokens,
    StopSequences,
    FrequencyPenalty,
    PresencePenalty,
    Seed,
    ResponseFormat,
    /// Extended thinking / internal reasoning before responding.
    ExtendedThinking,
}

/// Standard parameter set for most models.
pub fn standard_params() -> Vec<GenerateParam> {
    vec![
        GenerateParam::Temperature,
        GenerateParam::TopP,
        GenerateParam::MaxTokens,
        GenerateParam::StopSequences,
        GenerateParam::FrequencyPenalty,
        GenerateParam::PresencePenalty,
        GenerateParam::Seed,
    ]
}

/// Parameter set for reasoning models (no temperature, no top_p).
pub fn reasoning_params() -> Vec<GenerateParam> {
    vec![GenerateParam::MaxTokens, GenerateParam::StopSequences]
}
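
// A minimal sketch of the stripping rule these sets feed into (the
// `param_allowed` helper is illustrative, not part of this module's API):
// a parameter survives when the schema's `supported_params` is empty,
// meaning everything is supported, or when the set names it explicitly.
//
//     fn param_allowed(supported: &[GenerateParam], p: GenerateParam) -> bool {
//         supported.is_empty() || supported.contains(&p)
//     }
//
// Under this rule `reasoning_params()` drops `Temperature` and `TopP`,
// which is the point of the reasoning set.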

/// Cost model for routing optimization.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CostModel {
    /// USD per 1M input tokens (remote models).
    #[serde(default)]
    pub input_per_mtok: Option<f64>,
    /// USD per 1M output tokens (remote models).
    #[serde(default)]
    pub output_per_mtok: Option<f64>,
    /// On-disk size in MB (local models).
    #[serde(default)]
    pub size_mb: Option<u64>,
    /// RAM required during inference in MB.
    #[serde(default)]
    pub ram_mb: Option<u64>,
}

/// A score on a public benchmark from a published source (model card,
/// paper, leaderboard). The schema is deliberately permissive — no enum
/// of benchmark names — so the catalog can carry whichever benchmarks
/// the upstream provider chose to publish, and new ones can be added
/// without a code change. Scores are stored on a 0.0–1.0 scale (e.g.
/// 73.5% accuracy → 0.735) so they compare cleanly across benchmarks
/// and so `routing_ext::apply_benchmark_priors` can consume them
/// directly when wired in later.
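///
/// As a catalog entry this looks like (values taken from the examples above):
///
/// ```json
/// {
///   "name": "GPQA-Diamond",
///   "score": 0.735,
///   "harness": "0-shot CoT",
///   "measured_at": "2025-08-12"
/// }
/// ```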
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkScore {
    /// Benchmark name as published (e.g., "MMLU-Pro", "GPQA-Diamond",
    /// "SWE-bench-Verified", "HumanEval", "MATH").
    pub name: String,
    /// Score on a 0.0–1.0 scale.
    pub score: f64,
    /// Evaluation harness or setup label (e.g., "5-shot", "0-shot CoT",
    /// "agentic", "pass@1"). Optional but strongly recommended — the
    /// same benchmark name can mean different things under different
    /// harnesses.
    #[serde(default)]
    pub harness: Option<String>,
    /// Where the score came from (model card URL, paper, leaderboard
    /// snapshot). Empty when the source is the upstream provider's
    /// announcement and a stable URL is not yet known.
    #[serde(default)]
    pub source_url: Option<String>,
    /// ISO 8601 date of the score snapshot (e.g., "2025-08-12"). Lets
    /// downstream code judge how stale a number is.
    #[serde(default)]
    pub measured_at: Option<String>,
}

/// The full declarative schema for a model.
///
/// Analogous to `ToolSchema` — describes what a model is, what it can do,
/// and how to access it. The router uses this for constraint-based filtering
/// and cold-start scoring before observed performance data is available.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSchema {
    /// Unique identifier: "provider/model-name:variant" (e.g., "qwen/qwen3-4b:q4_k_m").
    pub id: String,
    /// Human-readable display name.
    pub name: String,
    /// Provider (qwen, openai, anthropic, google, meta, ollama, custom).
    pub provider: String,
    /// Model family for grouping (qwen3, gpt-4, claude-4, llama-3).
    pub family: String,
    /// Semantic version or checkpoint label.
    #[serde(default)]
    pub version: String,
    /// What this model can do — ordered with the primary capability first.
    pub capabilities: Vec<ModelCapability>,
    /// Context window in tokens.
    pub context_length: usize,
    /// Parameter count as human-readable string (e.g., "4B", "30B (3B active)").
    #[serde(default)]
    pub param_count: String,
    /// Quantization (Q4_K_M, Q8_0, F16, none).
    #[serde(default)]
    pub quantization: Option<String>,
    /// Declared performance envelope (initial estimate, overridden by observed data).
    #[serde(default)]
    pub performance: PerformanceEnvelope,
    /// Cost structure.
    #[serde(default)]
    pub cost: CostModel,
    /// How to access this model.
    pub source: ModelSource,
    /// Free-form tags for filtering (e.g., "fast", "multilingual", "moe").
    #[serde(default)]
    pub tags: Vec<String>,
    /// Supported generation parameters. The inference layer strips any parameter
    /// not in this set before sending to the API. Empty = all supported.
    #[serde(default)]
    pub supported_params: Vec<GenerateParam>,
    /// Public benchmark scores as published by the model provider or
    /// reproduced on a public leaderboard (MMLU-Pro, GPQA-Diamond,
    /// SWE-bench, HumanEval, etc.). The built-in catalog ships this
    /// empty — population is a curation step, not a code change. See
    /// `BenchmarkScore` for the field shape and the 0.0–1.0 scoring
    /// convention.
    #[serde(default)]
    pub public_benchmarks: Vec<BenchmarkScore>,
    /// Whether this model is currently available (downloaded / reachable).
    /// Not serialized — computed at runtime.
    #[serde(skip)]
    pub available: bool,
}

impl ModelSchema {
    /// Check if this model has a given capability.
    pub fn has_capability(&self, cap: ModelCapability) -> bool {
        self.capabilities.contains(&cap)
    }

    /// Check if this model is local (runs on-device).
    pub fn is_local(&self) -> bool {
        matches!(
            self.source,
            ModelSource::Local { .. }
                | ModelSource::Mlx { .. }
                | ModelSource::VllmMlx { .. }
                | ModelSource::AppleFoundationModels { .. }
        )
    }

    /// Check if this model delegates inference to a host-registered
    /// runner (closes Parslee-ai/car-releases#24).
    pub fn is_delegated(&self) -> bool {
        matches!(self.source, ModelSource::Delegated { .. })
    }

    /// Check if this model uses the MLX backend.
    pub fn is_mlx(&self) -> bool {
        matches!(self.source, ModelSource::Mlx { .. })
    }

    /// Check if this model routes to Apple's on-device FoundationModels
    /// framework. True only for `ModelSource::AppleFoundationModels`;
    /// callers must still verify runtime availability before dispatch
    /// (the schema can describe the model on any host, but execution
    /// requires macOS 26+ on Apple Silicon).
    pub fn is_foundation_models(&self) -> bool {
        matches!(self.source, ModelSource::AppleFoundationModels { .. })
    }

    /// Check if this model uses vLLM-MLX backend.
    pub fn is_vllm_mlx(&self) -> bool {
        matches!(self.source, ModelSource::VllmMlx { .. })
    }

    /// Check if this model is remote (requires API call).
    pub fn is_remote(&self) -> bool {
        matches!(
            self.source,
            ModelSource::RemoteApi { .. } | ModelSource::Proprietary { .. }
        )
    }

    /// Collect all API key env var names for this model (primary + extras).
    /// Returns empty vec for non-remote models.
    pub fn all_api_key_envs(&self) -> Vec<String> {
        match &self.source {
            ModelSource::RemoteApi {
                api_key_env,
                api_key_envs,
                ..
            } => {
                let mut all = vec![api_key_env.clone()];
                all.extend(api_key_envs.iter().cloned());
                all
            }
            ModelSource::Proprietary {
                auth: ProprietaryAuth::ApiKeyEnv { env_var },
                ..
            }
            | ModelSource::Proprietary {
                auth: ProprietaryAuth::BearerTokenEnv { env_var },
                ..
            } => vec![env_var.clone()],
            _ => vec![],
        }
    }

    /// Get the size in MB (from cost model or 0 if unknown).
    pub fn size_mb(&self) -> u64 {
        self.cost.size_mb.unwrap_or(0)
    }

    /// Get the RAM requirement in MB (from cost model, falls back to size_mb).
    pub fn ram_mb(&self) -> u64 {
        self.cost.ram_mb.unwrap_or_else(|| self.size_mb())
    }

    /// Estimated cost per 1K output tokens in USD. Returns 0.0 for local models.
    pub fn cost_per_1k_output(&self) -> f64 {
        self.cost.output_per_mtok.map(|c| c / 1000.0).unwrap_or(0.0)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn sample_local() -> ModelSchema {
        ModelSchema {
            id: "qwen/qwen3-4b:q4_k_m".into(),
            name: "Qwen3-4B".into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: "1.0".into(),
            capabilities: vec![ModelCapability::Generate, ModelCapability::Code],
            context_length: 32768,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope {
                tokens_per_second: Some(45.0),
                ..Default::default()
            },
            cost: CostModel {
                size_mb: Some(2500),
                ram_mb: Some(2500),
                ..Default::default()
            },
            source: ModelSource::Local {
                hf_repo: "Qwen/Qwen3-4B-GGUF".into(),
                hf_filename: "Qwen3-4B-Q4_K_M.gguf".into(),
                tokenizer_repo: "Qwen/Qwen3-4B".into(),
            },
            tags: vec!["code".into(), "fast".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        }
    }

    fn sample_remote() -> ModelSchema {
        ModelSchema {
            id: "anthropic/claude-sonnet-4-6:latest".into(),
            name: "Claude Sonnet 4.6".into(),
            provider: "anthropic".into(),
            family: "claude-4".into(),
            version: "latest".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Code,
                ModelCapability::Reasoning,
                ModelCapability::ToolUse,
                ModelCapability::Vision,
            ],
            context_length: 200000,
            param_count: String::new(),
            quantization: None,
            performance: PerformanceEnvelope {
                latency_p50_ms: Some(2000),
                latency_p99_ms: Some(8000),
                tokens_per_second: Some(80.0),
            },
            cost: CostModel {
                input_per_mtok: Some(3.0),
                output_per_mtok: Some(15.0),
                ..Default::default()
            },
            source: ModelSource::RemoteApi {
                endpoint: "https://api.anthropic.com/v1/messages".into(),
                api_key_env: "ANTHROPIC_API_KEY".into(),
                api_key_envs: vec![],
                api_version: Some("2023-06-01".into()),
                protocol: ApiProtocol::Anthropic,
            },
            tags: vec!["reasoning".into(), "tool_use".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: false,
        }
    }

    #[test]
    fn capabilities() {
        let m = sample_local();
        assert!(m.has_capability(ModelCapability::Code));
        assert!(!m.has_capability(ModelCapability::Vision));
    }

    #[test]
    fn local_vs_remote() {
        assert!(sample_local().is_local());
        assert!(!sample_local().is_remote());
        assert!(sample_remote().is_remote());
        assert!(!sample_remote().is_local());
    }

    #[test]
    fn cost() {
        let local = sample_local();
        assert_eq!(local.cost_per_1k_output(), 0.0);

        let remote = sample_remote();
        assert!(remote.cost_per_1k_output() > 0.0);
    }

    #[test]
    fn serde_roundtrip() {
        let local = sample_local();
        let json = serde_json::to_string(&local).unwrap();
        let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, local.id);
        assert_eq!(parsed.capabilities, local.capabilities);

        let remote = sample_remote();
        let json = serde_json::to_string(&remote).unwrap();
        let parsed: ModelSchema = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, remote.id);
        // available is skip-serialized, defaults to false
        assert!(!parsed.available);
    }
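
    // Pins the wire format implied by the serde attributes above: capability
    // variants become snake_case strings, and ModelSource is internally
    // tagged through a "type" field.
    #[test]
    fn wire_format() {
        let cap = serde_json::to_string(&ModelCapability::MultiToolCall).unwrap();
        assert_eq!(cap, "\"multi_tool_call\"");

        let json = serde_json::to_string(&sample_local()).unwrap();
        assert!(json.contains("\"type\":\"local\""));
    }

    // Exercises the accessor behaviors documented on the impl: key-env
    // collection for remote sources, the ram_mb -> size_mb fallback, and the
    // per-1K output cost derived from output_per_mtok (15.0 / 1000 = 0.015).
    #[test]
    fn accessors() {
        assert_eq!(
            sample_remote().all_api_key_envs(),
            vec!["ANTHROPIC_API_KEY".to_string()]
        );
        assert!(sample_local().all_api_key_envs().is_empty());

        let mut m = sample_local();
        m.cost.ram_mb = None;
        assert_eq!(m.ram_mb(), m.size_mb());

        let delta = (sample_remote().cost_per_1k_output() - 0.015).abs();
        assert!(delta < 1e-12);
    }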
}