Skip to main content

car_inference/
intent.rs

//! Caller-facing routing intent — express requirements, not model IDs.
//!
//! Tracks Parslee-ai/car-releases#18. The motivation is that callers
//! today choose between two extremes:
//!
//! - `model = None` → the adaptive router picks. Quality on average is
//!   good but per-request variability surfaces as UX inconsistency.
//! - `model = Some("claude-sonnet-4-7")` → the caller pins. Provider
//!   awareness leaks up the stack — exactly what CAR is supposed to
//!   prevent.
//!
//! `IntentHint` is the middle ground. The caller expresses *what* they
//! need; the router resolves intent → model. Existing `model = None`
//! and `model = Some(...)` paths are unchanged when no intent is
//! supplied.
//!
//! ## MVP scope
//!
//! Just `task`, `prefer_local`, `require`. Cost/latency ceilings wait
//! for clean registry numbers; `prefer_family` was cut as a soft
//! routing knob that accumulates tweaks without clear semantics
//! (Linus design review, 2026-05-04).
//!
//! ## Routing semantics
//!
//! `prefer_local: true` maps to a dedicated
//! [`crate::RoutingWorkload::LocalPreferred`] variant. Distinct from
//! `Background` (which is "this is a background job, latency barely
//! matters") — `LocalPreferred` keeps a quality-aware weight profile
//! and a strong local_bonus so the hint wins ties decisively.
31
32use serde::{Deserialize, Serialize};
33
34use crate::schema::ModelCapability;
35
36/// What the caller is doing — coarse-grained categories the adaptive
37/// router maps to `InferenceTask`. A closed enum so adding a new task
38/// type is a deliberate FFI-visible change rather than a silent
39/// fallback when the router doesn't recognize a string.
40///
41/// The MVP intentionally ships only the variants that map to a
42/// distinct `InferenceTask` today. `Summarize` / `Extract` were cut
43/// because both would have collapsed to `Generate` with no observable
44/// behavior change — shipping enum variants that are accepted, parsed,
45/// and silently discarded is exactly the routing variability the
46/// intent surface is designed to remove. Add them back when the
47/// registry actually distinguishes summarize-tuned or extract-tuned
48/// models.
49#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
50#[serde(rename_all = "snake_case")]
51pub enum TaskHint {
52    /// Conversational chat — maps to `InferenceTask::Generate`.
53    Chat,
54    /// Label assignment / categorization. Maps to
55    /// `InferenceTask::Classify`.
56    Classify,
57    /// Chain-of-thought, planning, multi-step analysis. Maps to
58    /// `InferenceTask::Reasoning` and tends to favor frontier
59    /// reasoning models.
60    Reasoning,
61    /// Code generation, repair, refactoring. Maps to
62    /// `InferenceTask::Code`.
63    Code,
64}
65
66/// Caller-supplied routing intent. All fields are optional / additive.
67/// An `IntentHint` with default values matches the no-intent path
68/// exactly, so threading `Option<IntentHint>` through is safe.
69#[derive(Debug, Clone, Default, Serialize, Deserialize)]
70pub struct IntentHint {
71    /// What the caller is doing. None = let the router infer from the
72    /// prompt as today.
73    #[serde(default, skip_serializing_if = "Option::is_none")]
74    pub task: Option<TaskHint>,
75
76    /// Hard filter — every required capability must be present on the
77    /// candidate. Empty = no extra filter.
78    #[serde(default, skip_serializing_if = "Vec::is_empty")]
79    pub require: Vec<ModelCapability>,
80
81    /// Bias the score profile toward local models (cost over quality).
82    /// Internally this maps to `RoutingWorkload::Background` until the
83    /// follow-up split lands (parslee-ai/car#106).
84    #[serde(default, skip_serializing_if = "is_false")]
85    pub prefer_local: bool,
86
87    /// Bias the score profile aggressively toward latency. Maps to
88    /// [`crate::tasks::RoutingWorkload::Fastest`] — a weight profile
89    /// that downweights quality and cost in favour of time-to-first-token.
90    /// Designed for voice turns where a sub-500ms first-audio target
91    /// beats a richer-but-slower answer. Takes precedence over
92    /// `prefer_local`; if both are set, the request is routed by
93    /// `Fastest` rules.
94    #[serde(default, skip_serializing_if = "is_false")]
95    pub prefer_fast: bool,
96}
97
/// Predicate for serde's `skip_serializing_if`: returns `true` when the flag
/// is unset so the field is left out of the output. Takes `&bool` because
/// that is the signature serde calls the predicate with.
fn is_false(b: &bool) -> bool {
    let flag_set = *b;
    !flag_set
}
101
// ---------------------------------------------------------------------------
// Acquisition intent — which model to *recommend/install* for this machine.
//
// Distinct layer from `TaskHint`/`IntentHint` above, which are *inference-time*
// routing intent (which already-installed model should serve this request).
// The types below answer the earlier question: "given this hardware and what
// the user wants to do, which model should they acquire?" They are consumed by
// `ModelRecommender` (see docs/solutions/first-class-model-ux.md) and never
// expose model IDs, quantization, or HF repos to the caller.
// ---------------------------------------------------------------------------
112
113/// The kind of model a use case needs. The recommender ranks only *within*
114/// a role's lane — an embedding model and a chat model are not comparable,
115/// so a retrieval pick never competes with a generative one. A use case that
116/// spans roles resolves to a bundle (one recommendation per role).
117#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
118#[serde(rename_all = "snake_case")]
119pub enum UseCaseRole {
120    /// Produces text/tokens (chat, code, vision-to-text, summarize).
121    Generative,
122    /// Produces vectors / relevance scores (embeddings, rerank).
123    Retrieval,
124    /// Consumes audio (transcription).
125    Audio,
126}
127
128/// What the user wants to do, in their terms — not a model ID. Closed enum:
129/// adding a use case is a deliberate FFI-visible change, never a silent
130/// string fallback. Each variant maps to a [`UseCaseRole`] and a required /
131/// preferred [`ModelCapability`] set (see [`UseCase::required_capabilities`]).
132#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
133#[serde(rename_all = "snake_case")]
134pub enum UseCase {
135    /// General chat / Q&A. The default.
136    Assistant,
137    /// Code generation, repair, refactoring.
138    Coding,
139    /// Text condensation.
140    Summarize,
141    /// Image understanding (a generative model that also sees).
142    Vision,
143    /// Audio → text.
144    Transcription,
145    /// Semantic search — an embedding model for retrieval. NOT an LLM
146    /// performing web search with tools; this is the `Retrieval` role and
147    /// maps to the `Embed` capability, so it is ranked separately from any
148    /// generative chat model.
149    Search,
150}
151
152impl UseCase {
153    /// The role lane this use case is ranked within.
154    pub fn role(self) -> UseCaseRole {
155        match self {
156            UseCase::Assistant
157            | UseCase::Coding
158            | UseCase::Summarize
159            | UseCase::Vision => UseCaseRole::Generative,
160            UseCase::Search => UseCaseRole::Retrieval,
161            UseCase::Transcription => UseCaseRole::Audio,
162        }
163    }
164
165    /// Hard eligibility filter — a model missing any of these is excluded.
166    pub fn required_capabilities(self) -> &'static [ModelCapability] {
167        use ModelCapability::*;
168        match self {
169            UseCase::Assistant => &[Generate],
170            UseCase::Coding => &[Generate, Code],
171            UseCase::Summarize => &[Generate],
172            UseCase::Vision => &[Vision, Generate],
173            UseCase::Transcription => &[SpeechToText],
174            UseCase::Search => &[Embed],
175        }
176    }
177
178    /// Soft preference — present capabilities add a ranking bonus but are
179    /// never required for eligibility.
180    pub fn preferred_capabilities(self) -> &'static [ModelCapability] {
181        use ModelCapability::*;
182        match self {
183            UseCase::Assistant => &[ToolUse],
184            UseCase::Coding => &[ToolUse, Reasoning],
185            UseCase::Summarize => &[Summarize],
186            UseCase::Vision => &[],
187            UseCase::Transcription => &[],
188            UseCase::Search => &[Rerank],
189        }
190    }
191}
192
193impl Default for UseCase {
194    fn default() -> Self {
195        UseCase::Assistant
196    }
197}
198
199/// Speed/quality knob. Each tier is a fixed weighting over the recommender's
200/// soft-score axes, applied *after* the hard eligibility filter, so tier
201/// semantics are explicit rather than reinvented per call site.
202#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
203#[serde(rename_all = "snake_case")]
204pub enum QualityTier {
205    /// Smallest eligible model; lowest latency.
206    Fastest,
207    /// Best quality that fits with KV-cache headroom. The default.
208    Balanced,
209    /// Largest model that fits at all; accepts slower output.
210    MostCapable,
211}
212
213impl Default for QualityTier {
214    fn default() -> Self {
215        QualityTier::Balanced
216    }
217}
218
/// Per-axis relative weights attached to a [`QualityTier`]. The recommender
/// normalizes every soft-score axis to `[0,1]` and then combines the axes
/// using these weights.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TierWeights {
    /// Pull toward higher-quality models (benchmarks / param-count prior).
    pub quality: f32,
    /// Pull toward lower-latency models (smaller, better-accelerated).
    pub latency: f32,
    /// Pull toward keeping memory headroom (a smaller share of the budget
    /// in use).
    pub memory_pressure: f32,
}
230
231impl QualityTier {
232    /// The fixed axis weighting for this tier. Mirrors the table in
233    /// docs/solutions/first-class-model-ux.md.
234    pub fn weights(self) -> TierWeights {
235        match self {
236            QualityTier::Fastest => TierWeights {
237                quality: 0.2,
238                latency: 0.6,
239                memory_pressure: 0.2,
240            },
241            QualityTier::Balanced => TierWeights {
242                quality: 0.5,
243                latency: 0.2,
244                memory_pressure: 0.3,
245            },
246            QualityTier::MostCapable => TierWeights {
247                quality: 0.8,
248                latency: 0.0,
249                memory_pressure: 0.2,
250            },
251        }
252    }
253}
254
255/// Where the user is willing to run inference. Orthogonal to [`QualityTier`].
256/// Choosing the cloud is never silent — `CloudOk` only makes remote models
257/// *eligible*; the recommender still flags them as requiring one-time consent.
258#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
259#[serde(rename_all = "snake_case")]
260pub enum Privacy {
261    /// Local models only.
262    OnDevice,
263    /// Remote APIs / the Parslee gateway may compete and win.
264    CloudOk,
265}
266
267impl Default for Privacy {
268    fn default() -> Self {
269        Privacy::OnDevice
270    }
271}
272
#[cfg(test)]
mod tests {
    use super::*;

    // --- inference-time intent (IntentHint / TaskHint) ---

    #[test]
    fn empty_intent_serializes_compactly() {
        // The FFI layer transmits intents as JSON. A default hint must not
        // leak noisy null/empty/false fields such as
        // {"task":null,"require":[],"prefer_local":false} to clients.
        let encoded = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(encoded, "{}");
    }

    #[test]
    fn round_trip_with_capability_require() {
        let required = vec![ModelCapability::Code, ModelCapability::ToolUse];
        let original = IntentHint {
            task: Some(TaskHint::Code),
            require: required.clone(),
            prefer_local: true,
            prefer_fast: false,
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: IntentHint = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.task, Some(TaskHint::Code));
        assert_eq!(decoded.require, required);
        assert!(decoded.prefer_local);
        assert!(!decoded.prefer_fast);
    }

    #[test]
    fn missing_fields_default_cleanly() {
        // Clients that predate IntentHint may send partial JSON; whatever
        // is absent must fall back to the no-intent defaults.
        let decoded: IntentHint = serde_json::from_str("{}").unwrap();
        assert!(decoded.task.is_none());
        assert!(decoded.require.is_empty());
        assert!(!decoded.prefer_local);
        assert!(!decoded.prefer_fast);
    }

    #[test]
    fn prefer_fast_round_trips_and_skips_when_false() {
        // Flag unset: the field is skipped entirely.
        let unset = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(unset, "{}");

        // Flag set: the field is emitted and survives a round trip.
        let enabled = IntentHint {
            prefer_fast: true,
            ..IntentHint::default()
        };
        let encoded = serde_json::to_string(&enabled).unwrap();
        assert!(encoded.contains("prefer_fast"));
        let decoded: IntentHint = serde_json::from_str(&encoded).unwrap();
        assert!(decoded.prefer_fast);
    }

    // --- acquisition intent (UseCase / QualityTier / Privacy) ---

    #[test]
    fn use_case_defaults_to_assistant_and_balanced_on_device() {
        assert_eq!(UseCase::default(), UseCase::Assistant);
        assert_eq!(QualityTier::default(), QualityTier::Balanced);
        assert_eq!(Privacy::default(), Privacy::OnDevice);
    }

    #[test]
    fn coding_requires_both_generate_and_code() {
        // Regression guard from the design review: a coding model that
        // cannot generate is useless, so `Code` alone is insufficient.
        let required = UseCase::Coding.required_capabilities();
        for capability in [ModelCapability::Generate, ModelCapability::Code] {
            assert!(required.contains(&capability));
        }
    }

    #[test]
    fn search_is_a_retrieval_role_not_generative() {
        // Search must never be ranked against chat models.
        let expected_roles = [
            (UseCase::Search, UseCaseRole::Retrieval),
            (UseCase::Assistant, UseCaseRole::Generative),
            (UseCase::Transcription, UseCaseRole::Audio),
        ];
        for (use_case, role) in expected_roles {
            assert_eq!(use_case.role(), role);
        }
        assert_eq!(
            UseCase::Search.required_capabilities(),
            &[ModelCapability::Embed]
        );
    }

    #[test]
    fn required_and_preferred_are_disjoint() {
        // Listing a capability as both required and preferred would
        // double-count it in scoring.
        let every_use_case = [
            UseCase::Assistant,
            UseCase::Coding,
            UseCase::Summarize,
            UseCase::Vision,
            UseCase::Transcription,
            UseCase::Search,
        ];
        for uc in every_use_case {
            let required = uc.required_capabilities();
            for p in uc.preferred_capabilities() {
                assert!(
                    !required.contains(p),
                    "{uc:?}: {p:?} is both required and preferred"
                );
            }
        }
    }

    #[test]
    fn tier_weights_match_documented_table() {
        let balanced = QualityTier::Balanced.weights();
        assert_eq!(
            (balanced.quality, balanced.latency, balanced.memory_pressure),
            (0.5, 0.2, 0.3)
        );

        let fastest = QualityTier::Fastest.weights();
        assert!(
            fastest.latency > fastest.quality,
            "Fastest must favor latency"
        );

        let most_capable = QualityTier::MostCapable.weights();
        assert!(
            most_capable.quality > most_capable.latency,
            "MostCapable must favor quality"
        );
    }

    #[test]
    fn tier_weights_are_non_negative_and_sum_to_one() {
        // Protects against a later edit that lets one axis silently
        // dominate because the weights no longer add up to 1.0. Floats are
        // compared with an epsilon.
        let all_tiers = [
            QualityTier::Fastest,
            QualityTier::Balanced,
            QualityTier::MostCapable,
        ];
        for tier in all_tiers {
            let TierWeights {
                quality,
                latency,
                memory_pressure,
            } = tier.weights();

            for axis in [quality, latency, memory_pressure] {
                assert!(axis >= 0.0, "{tier:?}: negative weight {axis}");
            }

            let sum = quality + latency + memory_pressure;
            assert!(
                (sum - 1.0).abs() < 1e-6,
                "{tier:?}: weights sum to {sum}, expected 1.0"
            );
        }
    }

    #[test]
    fn use_case_round_trips_snake_case() {
        let encoded = serde_json::to_string(&UseCase::Coding).unwrap();
        assert_eq!(encoded, "\"coding\"");

        let decoded: UseCase = serde_json::from_str("\"search\"").unwrap();
        assert_eq!(decoded, UseCase::Search);
    }
}
417        assert_eq!(json, "\"coding\"");
418        let back: UseCase = serde_json::from_str("\"search\"").unwrap();
419        assert_eq!(back, UseCase::Search);
420    }
421}