car-inference 0.22.0

Local model inference for CAR — Candle backend with Qwen3 models
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
//! Caller-facing routing intent — express requirements, not model IDs.
//!
//! Tracks Parslee-ai/car-releases#18. The motivation is that callers
//! today choose between two extremes:
//!
//! - `model = None` → the adaptive router picks. Quality on average is
//!   good but per-request variability surfaces as UX inconsistency.
//! - `model = Some("claude-sonnet-4-7")` → the caller pins. Provider
//!   awareness leaks up the stack — exactly what CAR is supposed to
//!   prevent.
//!
//! `IntentHint` is the middle ground. The caller expresses *what* they
//! need; the router resolves intent → model. Existing `model = None`
//! and `model = Some(...)` paths are unchanged when no intent is
//! supplied.
//!
//! ## MVP scope
//!
//! Just `task`, `prefer_local`, `require` — plus the `prefer_fast`
//! latency knob that [`IntentHint`] also carries. Cost/latency ceilings
//! wait for clean registry numbers; `prefer_family` was cut as a soft
//! routing knob that accumulates tweaks without clear semantics
//! (Linus design review, 2026-05-04).
//!
//! ## Routing semantics
//!
//! `prefer_local: true` maps to a dedicated
//! [`crate::RoutingWorkload::LocalPreferred`] variant. Distinct from
//! `Background` (which is "this is a background job, latency barely
//! matters") — `LocalPreferred` keeps a quality-aware weight profile
//! and a strong local_bonus so the hint wins ties decisively.

use serde::{Deserialize, Serialize};

use crate::schema::ModelCapability;

/// What the caller is doing — coarse-grained categories the adaptive
/// router maps to `InferenceTask`. A closed enum so adding a new task
/// type is a deliberate FFI-visible change rather than a silent
/// fallback when the router doesn't recognize a string.
///
/// The MVP intentionally ships only the variants that map to a
/// distinct `InferenceTask` today. `Summarize` / `Extract` were cut
/// because both would have collapsed to `Generate` with no observable
/// behavior change — shipping enum variants that are accepted, parsed,
/// and silently discarded is exactly the routing variability the
/// intent surface is designed to remove. Add them back when the
/// registry actually distinguishes summarize-tuned or extract-tuned
/// models.
///
/// Serialized form: `rename_all = "snake_case"` means each variant is
/// written and read under its snake_case name (`chat`, `classify`,
/// `reasoning`, `code`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TaskHint {
    /// Conversational chat — maps to `InferenceTask::Generate`.
    Chat,
    /// Label assignment / categorization. Maps to
    /// `InferenceTask::Classify`.
    Classify,
    /// Chain-of-thought, planning, multi-step analysis. Maps to
    /// `InferenceTask::Reasoning` and tends to favor frontier
    /// reasoning models.
    Reasoning,
    /// Code generation, repair, refactoring. Maps to
    /// `InferenceTask::Code`.
    Code,
}

/// Caller-supplied routing intent. All fields are optional / additive.
/// An `IntentHint` with default values matches the no-intent path
/// exactly, so threading `Option<IntentHint>` through is safe.
///
/// Every field is `#[serde(default)]` and is skipped on serialization
/// when it holds its default value, so a default hint serializes to
/// `{}` and `{}` deserializes back to a default hint.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IntentHint {
    /// What the caller is doing. None = let the router infer from the
    /// prompt as today.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub task: Option<TaskHint>,

    /// Hard filter — every required capability must be present on the
    /// candidate. Empty = no extra filter.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub require: Vec<ModelCapability>,

    /// Bias the score profile toward local models (cost over quality).
    /// Internally this maps to `RoutingWorkload::Background` until the
    /// follow-up split lands (parslee-ai/car#106).
    ///
    /// NOTE(review): the module-level docs describe `prefer_local` as
    /// mapping to a dedicated `RoutingWorkload::LocalPreferred` variant
    /// that is distinct from `Background`. One of the two statements is
    /// stale — confirm against the router and update the other.
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_local: bool,

    /// Bias the score profile aggressively toward latency. Maps to
    /// [`crate::tasks::RoutingWorkload::Fastest`] — a weight profile
    /// that downweights quality and cost in favour of time-to-first-token.
    /// Designed for voice turns where a sub-500ms first-audio target
    /// beats a richer-but-slower answer. Takes precedence over
    /// `prefer_local`; if both are set, the request is routed by
    /// `Fastest` rules.
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_fast: bool,
}

/// Serde `skip_serializing_if` predicate: `true` exactly when the flag is
/// unset, so `false` booleans are omitted from the serialized output.
/// Takes `&bool` because serde hands the predicate a reference to the field.
fn is_false(&flag: &bool) -> bool {
    !flag
}

// ---------------------------------------------------------------------------
// Acquisition intent — which model to *recommend/install* for this machine.
//
// Distinct layer from `TaskHint`/`IntentHint` above, which are *inference-time*
// routing intent (which already-installed model should serve this request).
// The types below answer the earlier question: "given this hardware and what
// the user wants to do, which model should they acquire?" They are consumed by
// `ModelRecommender` (see docs/solutions/first-class-model-ux.md) and never
// expose model IDs, quantization, or HF repos to the caller.
// ---------------------------------------------------------------------------

/// The kind of model a use case needs. The recommender ranks only *within*
/// a role's lane — an embedding model and a chat model are not comparable,
/// so a retrieval pick never competes with a generative one. A use case that
/// spans roles resolves to a bundle (one recommendation per role).
///
/// Serialized under snake_case variant names (`generative`, `retrieval`,
/// `audio`) via `rename_all`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCaseRole {
    /// Produces text/tokens (chat, code, vision-to-text, summarize).
    Generative,
    /// Produces vectors / relevance scores (embeddings, rerank).
    Retrieval,
    /// Consumes audio (transcription).
    Audio,
}

/// What the user wants to do, in their terms — not a model ID. Closed enum:
/// adding a use case is a deliberate FFI-visible change, never a silent
/// string fallback. Each variant maps to a [`UseCaseRole`] and a required /
/// preferred [`ModelCapability`] set (see [`UseCase::required_capabilities`]).
///
/// [`Default`] is derived; the `#[default]` attribute marks
/// [`UseCase::Assistant`] as the default variant. Variants serialize under
/// their snake_case names (e.g. `"coding"`, `"search"`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCase {
    /// General chat / Q&A. The default.
    #[default]
    Assistant,
    /// Code generation, repair, refactoring.
    Coding,
    /// Text condensation.
    Summarize,
    /// Image understanding (a generative model that also sees).
    Vision,
    /// Audio → text.
    Transcription,
    /// Semantic search — an embedding model for retrieval. NOT an LLM
    /// performing web search with tools; this is the `Retrieval` role and
    /// maps to the `Embed` capability, so it is ranked separately from any
    /// generative chat model.
    Search,
}

impl UseCase {
    /// The role lane this use case is ranked within.
    ///
    /// The match is exhaustive on purpose (no `_` arm): adding a variant
    /// forces a decision about its lane at compile time.
    pub fn role(self) -> UseCaseRole {
        match self {
            Self::Assistant | Self::Coding | Self::Summarize | Self::Vision => {
                UseCaseRole::Generative
            }
            Self::Search => UseCaseRole::Retrieval,
            Self::Transcription => UseCaseRole::Audio,
        }
    }

    /// Hard eligibility filter — a model missing any of these is excluded.
    ///
    /// Returns a `'static` slice, so calling this never allocates.
    pub fn required_capabilities(self) -> &'static [ModelCapability] {
        use ModelCapability::*;
        match self {
            Self::Assistant => &[Generate],
            Self::Coding => &[Generate, Code],
            Self::Summarize => &[Generate],
            Self::Vision => &[Vision, Generate],
            Self::Transcription => &[SpeechToText],
            Self::Search => &[Embed],
        }
    }

    /// Soft preference — present capabilities add a ranking bonus but are
    /// never required for eligibility.
    ///
    /// May be empty (`Vision`, `Transcription`). Kept disjoint from
    /// [`UseCase::required_capabilities`] for every variant.
    pub fn preferred_capabilities(self) -> &'static [ModelCapability] {
        use ModelCapability::*;
        match self {
            Self::Assistant => &[ToolUse],
            Self::Coding => &[ToolUse, Reasoning],
            Self::Summarize => &[Summarize],
            Self::Vision => &[],
            Self::Transcription => &[],
            Self::Search => &[Rerank],
        }
    }
}

/// Speed/quality knob. Each tier is a fixed weighting over the recommender's
/// soft-score axes, applied *after* the hard eligibility filter, so tier
/// semantics are explicit rather than reinvented per call site.
///
/// [`Default`] is derived; the `#[default]` attribute marks
/// [`QualityTier::Balanced`] as the default variant. Variants serialize
/// under their snake_case names via `rename_all`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum QualityTier {
    /// Smallest eligible model; lowest latency.
    Fastest,
    /// Best quality that fits with KV-cache headroom. The default.
    #[default]
    Balanced,
    /// Largest model that fits at all; accepts slower output.
    MostCapable,
}

/// Relative weights a [`QualityTier`] places on each soft-score axis. The
/// recommender normalizes each axis to `[0,1]` and combines them with these.
///
/// Derives `PartialEq` only (no `Eq`/`Hash`) because the fields are `f32`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TierWeights {
    /// Toward higher-quality models (benchmarks / param-count prior).
    pub quality: f32,
    /// Toward lower-latency models (smaller, better-accelerated).
    pub latency: f32,
    /// Toward leaving memory headroom (smaller fraction of budget used).
    pub memory_pressure: f32,
}

impl QualityTier {
    /// Looks up the hard-wired soft-score weighting for this tier. The
    /// numbers mirror the table in docs/solutions/first-class-model-ux.md.
    pub fn weights(self) -> TierWeights {
        // Each arm is (quality, latency, memory_pressure).
        let (quality, latency, memory_pressure) = match self {
            Self::Fastest => (0.2, 0.6, 0.2),
            Self::Balanced => (0.5, 0.2, 0.3),
            Self::MostCapable => (0.8, 0.0, 0.2),
        };

        TierWeights {
            quality,
            latency,
            memory_pressure,
        }
    }
}

/// Where the user is willing to run inference. Orthogonal to [`QualityTier`].
/// Choosing the cloud is never silent — `CloudOk` only makes remote models
/// *eligible*; the recommender still flags them as requiring one-time consent.
///
/// [`Default`] is derived; the `#[default]` attribute marks
/// [`Privacy::OnDevice`] as the default variant, so remote execution is
/// always opt-in. Variants serialize under their snake_case names via
/// `rename_all`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Privacy {
    /// Local models only. The default.
    #[default]
    OnDevice,
    /// Remote APIs / the Parslee gateway may compete and win.
    CloudOk,
}

// Unit tests for both layers in this module: the inference-time
// `IntentHint` serde contract and the acquisition-intent enums
// (`UseCase` / `QualityTier` / `Privacy`).
#[cfg(test)]
mod tests {
    use super::*;

    /// A default hint must serialize to an empty JSON object.
    #[test]
    fn empty_intent_serializes_compactly() {
        // No-intent must round-trip through serde without verbose
        // null fields — the FFI layer transmits as JSON and clients
        // shouldn't see
        // {"task":null,"require":[],"prefer_local":false,"prefer_fast":false}.
        let hint = IntentHint::default();
        let json = serde_json::to_string(&hint).unwrap();
        assert_eq!(json, "{}");
    }

    /// A fully populated hint survives a serialize → deserialize cycle.
    #[test]
    fn round_trip_with_capability_require() {
        let hint = IntentHint {
            task: Some(TaskHint::Code),
            require: vec![ModelCapability::Code, ModelCapability::ToolUse],
            prefer_local: true,
            prefer_fast: false,
        };
        let json = serde_json::to_string(&hint).unwrap();
        let back: IntentHint = serde_json::from_str(&json).unwrap();
        assert_eq!(back.task, Some(TaskHint::Code));
        assert_eq!(
            back.require,
            vec![ModelCapability::Code, ModelCapability::ToolUse]
        );
        assert!(back.prefer_local);
        assert!(!back.prefer_fast);
    }

    /// `{}` deserializes to the same values as `IntentHint::default()`.
    #[test]
    fn missing_fields_default_cleanly() {
        // Pre-MVP clients that don't know about IntentHint may send
        // partial JSON. Defaults must match the no-intent path.
        let hint: IntentHint = serde_json::from_str("{}").unwrap();
        assert_eq!(hint.task, None);
        assert!(hint.require.is_empty());
        assert!(!hint.prefer_local);
        assert!(!hint.prefer_fast);
    }

    /// `prefer_fast` is omitted when false and preserved when true.
    #[test]
    fn prefer_fast_round_trips_and_skips_when_false() {
        let off = IntentHint::default();
        assert_eq!(serde_json::to_string(&off).unwrap(), "{}");

        let on = IntentHint {
            prefer_fast: true,
            ..IntentHint::default()
        };
        let json = serde_json::to_string(&on).unwrap();
        assert!(json.contains("prefer_fast"));
        let back: IntentHint = serde_json::from_str(&json).unwrap();
        assert!(back.prefer_fast);
    }

    // --- acquisition intent (UseCase / QualityTier / Privacy) ---

    /// Pins the `Default` variant of each acquisition-intent enum.
    #[test]
    fn use_case_defaults_to_assistant_and_balanced_on_device() {
        assert_eq!(UseCase::default(), UseCase::Assistant);
        assert_eq!(QualityTier::default(), QualityTier::Balanced);
        assert_eq!(Privacy::default(), Privacy::OnDevice);
    }

    #[test]
    fn coding_requires_both_generate_and_code() {
        // Regression guard for the design-review point that a coding
        // model that can't generate is useless — Code alone is not enough.
        let req = UseCase::Coding.required_capabilities();
        assert!(req.contains(&ModelCapability::Generate));
        assert!(req.contains(&ModelCapability::Code));
    }

    #[test]
    fn search_is_a_retrieval_role_not_generative() {
        // Search must never be ranked against chat models.
        assert_eq!(UseCase::Search.role(), UseCaseRole::Retrieval);
        assert_eq!(UseCase::Assistant.role(), UseCaseRole::Generative);
        assert_eq!(UseCase::Transcription.role(), UseCaseRole::Audio);
        assert_eq!(
            UseCase::Search.required_capabilities(),
            &[ModelCapability::Embed]
        );
    }

    #[test]
    fn required_and_preferred_are_disjoint() {
        // A capability listed as required must not also be "preferred" —
        // that would double-count it in scoring.
        // The variant list is spelled out by hand; extend it whenever a
        // `UseCase` variant is added.
        for uc in [
            UseCase::Assistant,
            UseCase::Coding,
            UseCase::Summarize,
            UseCase::Vision,
            UseCase::Transcription,
            UseCase::Search,
        ] {
            for p in uc.preferred_capabilities() {
                assert!(
                    !uc.required_capabilities().contains(p),
                    "{uc:?}: {p:?} is both required and preferred"
                );
            }
        }
    }

    #[test]
    fn tier_weights_match_documented_table() {
        // Balanced is pinned to exact values; the other two tiers are only
        // checked for the ordering their names promise.
        let b = QualityTier::Balanced.weights();
        assert_eq!((b.quality, b.latency, b.memory_pressure), (0.5, 0.2, 0.3));
        let f = QualityTier::Fastest.weights();
        assert!(f.latency > f.quality, "Fastest must favor latency");
        let c = QualityTier::MostCapable.weights();
        assert!(c.quality > c.latency, "MostCapable must favor quality");
    }

    #[test]
    fn tier_weights_are_non_negative_and_sum_to_one() {
        // Guards against a future edit silently letting one axis dominate
        // by making weights sum to ≠ 1.0. Epsilon compare for floats.
        for tier in [
            QualityTier::Fastest,
            QualityTier::Balanced,
            QualityTier::MostCapable,
        ] {
            let w = tier.weights();
            for axis in [w.quality, w.latency, w.memory_pressure] {
                assert!(axis >= 0.0, "{tier:?}: negative weight {axis}");
            }
            let sum = w.quality + w.latency + w.memory_pressure;
            assert!(
                (sum - 1.0).abs() < 1e-6,
                "{tier:?}: weights sum to {sum}, expected 1.0"
            );
        }
    }

    /// `UseCase` is written and read under snake_case variant names.
    #[test]
    fn use_case_round_trips_snake_case() {
        let json = serde_json::to_string(&UseCase::Coding).unwrap();
        assert_eq!(json, "\"coding\"");
        let back: UseCase = serde_json::from_str("\"search\"").unwrap();
        assert_eq!(back, UseCase::Search);
    }
}