car_inference/intent.rs
1//! Caller-facing routing intent — express requirements, not model IDs.
2//!
3//! Tracks Parslee-ai/car-releases#18. The motivation is that callers
4//! today choose between two extremes:
5//!
6//! - `model = None` → the adaptive router picks. Quality on average is
7//! good but per-request variability surfaces as UX inconsistency.
8//! - `model = Some("claude-sonnet-4-7")` → the caller pins. Provider
9//! awareness leaks up the stack — exactly what CAR is supposed to
10//! prevent.
11//!
12//! `IntentHint` is the middle ground. The caller expresses *what* they
13//! need; the router resolves intent → model. Existing `model = None`
14//! and `model = Some(...)` paths are unchanged when no intent is
15//! supplied.
16//!
17//! ## MVP scope
18//!
//! Just `task`, `require`, and the two bias flags `prefer_local` /
//! `prefer_fast`. Hard cost/latency ceilings wait for clean registry
//! numbers; `prefer_family` was cut as a soft routing knob that
//! accumulates tweaks without clear semantics (Linus design review,
//! 2026-05-04).
23//!
24//! ## Routing semantics
25//!
//! `prefer_local: true` maps to a dedicated
//! [`crate::RoutingWorkload::LocalPreferred`] variant. It is distinct
//! from `Background` (which is "this is a background job, latency
//! barely matters") — `LocalPreferred` keeps a quality-aware weight
//! profile and a strong local_bonus so the hint wins ties decisively.
//!
//! `prefer_fast: true` maps to `RoutingWorkload::Fastest`, a
//! latency-first weight profile. It takes precedence over
//! `prefer_local` when both are set.
31
32use serde::{Deserialize, Serialize};
33
34use crate::schema::ModelCapability;
35
/// What the caller is doing — coarse-grained categories the adaptive
/// router maps to `InferenceTask`.
///
/// This is a closed enum so that adding a new task type is a deliberate,
/// FFI-visible change rather than a silent fallback when the router
/// doesn't recognize a string.
///
/// The MVP intentionally ships only the variants that map to a
/// distinct `InferenceTask` today. `Summarize` / `Extract` were cut
/// because both would have collapsed to `Generate` with no observable
/// behavior change — shipping enum variants that are accepted, parsed,
/// and silently discarded is exactly the routing variability the
/// intent surface is designed to remove. Add them back when the
/// registry actually distinguishes summarize-tuned or extract-tuned
/// models.
///
/// Variants are serialized in `snake_case`: `"chat"`, `"classify"`,
/// `"reasoning"`, `"code"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TaskHint {
    /// Conversational chat — maps to `InferenceTask::Generate`.
    Chat,
    /// Label assignment / categorization. Maps to
    /// `InferenceTask::Classify`.
    Classify,
    /// Chain-of-thought, planning, multi-step analysis. Maps to
    /// `InferenceTask::Reasoning` and tends to favor frontier
    /// reasoning models.
    Reasoning,
    /// Code generation, repair, refactoring. Maps to
    /// `InferenceTask::Code`.
    Code,
}
65
/// Caller-supplied routing intent. All fields are optional / additive.
///
/// An `IntentHint` with default values matches the no-intent path
/// exactly, so threading `Option<IntentHint>` through is safe.
///
/// Wire format: every field is omitted from the serialized form while it
/// holds its default value (`None`, empty, `false`), so
/// `IntentHint::default()` serializes to `{}`; fields absent from the
/// input deserialize to those same defaults.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IntentHint {
    /// What the caller is doing. None = let the router infer from the
    /// prompt as today.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub task: Option<TaskHint>,

    /// Hard filter — every required capability must be present on the
    /// candidate. Empty = no extra filter.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub require: Vec<ModelCapability>,

    /// Bias the score profile toward local models.
    ///
    /// NOTE(review): this comment previously said the flag trades cost
    /// over quality and maps to `RoutingWorkload::Background` until the
    /// follow-up split lands (parslee-ai/car#106), whereas the
    /// module-level docs describe a dedicated, quality-aware
    /// `RoutingWorkload::LocalPreferred` profile. Both cannot be
    /// current — confirm against the router and keep only the accurate
    /// statement.
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_local: bool,

    /// Bias the score profile aggressively toward latency. Maps to
    /// [`crate::tasks::RoutingWorkload::Fastest`] — a weight profile
    /// that downweights quality and cost in favour of time-to-first-token.
    /// Designed for voice turns where a sub-500ms first-audio target
    /// beats a richer-but-slower answer. Takes precedence over
    /// `prefer_local`; if both are set, the request is routed by
    /// `Fastest` rules.
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_fast: bool,
}
97
/// Serde `skip_serializing_if` predicate for boolean flags: returns `true`
/// when the flag is unset, so `false` values are left out of the output.
///
/// Takes `&bool` (rather than `bool`) because serde calls the predicate
/// with a reference to the field.
fn is_false(flag: &bool) -> bool {
    !flag
}
101
102// ---------------------------------------------------------------------------
103// Acquisition intent — which model to *recommend/install* for this machine.
104//
105// Distinct layer from `TaskHint`/`IntentHint` above, which are *inference-time*
106// routing intent (which already-installed model should serve this request).
107// The types below answer the earlier question: "given this hardware and what
108// the user wants to do, which model should they acquire?" They are consumed by
109// `ModelRecommender` (see docs/solutions/first-class-model-ux.md) and never
110// expose model IDs, quantization, or HF repos to the caller.
111// ---------------------------------------------------------------------------
112
/// The kind of model a use case needs. The recommender ranks only *within*
/// a role's lane — an embedding model and a chat model are not comparable,
/// so a retrieval pick never competes with a generative one. A use case that
/// spans roles resolves to a bundle (one recommendation per role).
///
/// Variants are serialized in `snake_case`: `"generative"`, `"retrieval"`,
/// `"audio"`. The role for a given [`UseCase`] is returned by
/// [`UseCase::role`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCaseRole {
    /// Produces text/tokens (chat, code, vision-to-text, summarize).
    Generative,
    /// Produces vectors / relevance scores (embeddings, rerank).
    Retrieval,
    /// Consumes audio (transcription).
    Audio,
}
127
/// What the user wants to do, in their terms — not a model ID. Closed enum:
/// adding a use case is a deliberate FFI-visible change, never a silent
/// string fallback. Each variant maps to a [`UseCaseRole`] and a required /
/// preferred [`ModelCapability`] set (see [`UseCase::required_capabilities`]).
///
/// Variants are serialized in `snake_case` (e.g. `"coding"`, `"search"`).
/// `UseCase::default()` is [`UseCase::Assistant`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCase {
    /// General chat / Q&A. The default.
    Assistant,
    /// Code generation, repair, refactoring.
    Coding,
    /// Text condensation.
    Summarize,
    /// Image understanding (a generative model that also sees).
    Vision,
    /// Audio → text. The only use case in the `Audio` role.
    Transcription,
    /// Semantic search — an embedding model for retrieval. NOT an LLM
    /// performing web search with tools; this is the `Retrieval` role and
    /// maps to the `Embed` capability, so it is ranked separately from any
    /// generative chat model.
    Search,
}
151
152impl UseCase {
153 /// The role lane this use case is ranked within.
154 pub fn role(self) -> UseCaseRole {
155 match self {
156 UseCase::Assistant
157 | UseCase::Coding
158 | UseCase::Summarize
159 | UseCase::Vision => UseCaseRole::Generative,
160 UseCase::Search => UseCaseRole::Retrieval,
161 UseCase::Transcription => UseCaseRole::Audio,
162 }
163 }
164
165 /// Hard eligibility filter — a model missing any of these is excluded.
166 pub fn required_capabilities(self) -> &'static [ModelCapability] {
167 use ModelCapability::*;
168 match self {
169 UseCase::Assistant => &[Generate],
170 UseCase::Coding => &[Generate, Code],
171 UseCase::Summarize => &[Generate],
172 UseCase::Vision => &[Vision, Generate],
173 UseCase::Transcription => &[SpeechToText],
174 UseCase::Search => &[Embed],
175 }
176 }
177
178 /// Soft preference — present capabilities add a ranking bonus but are
179 /// never required for eligibility.
180 pub fn preferred_capabilities(self) -> &'static [ModelCapability] {
181 use ModelCapability::*;
182 match self {
183 UseCase::Assistant => &[ToolUse],
184 UseCase::Coding => &[ToolUse, Reasoning],
185 UseCase::Summarize => &[Summarize],
186 UseCase::Vision => &[],
187 UseCase::Transcription => &[],
188 UseCase::Search => &[Rerank],
189 }
190 }
191}
192
193impl Default for UseCase {
194 fn default() -> Self {
195 UseCase::Assistant
196 }
197}
198
/// Speed/quality knob. Each tier is a fixed weighting over the recommender's
/// soft-score axes, applied *after* the hard eligibility filter, so tier
/// semantics are explicit rather than reinvented per call site.
///
/// The per-axis weights for a tier are returned by [`QualityTier::weights`].
/// Variants are serialized in `snake_case`: `"fastest"`, `"balanced"`,
/// `"most_capable"`. `QualityTier::default()` is [`QualityTier::Balanced`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum QualityTier {
    /// Smallest eligible model; lowest latency.
    Fastest,
    /// Best quality that fits with KV-cache headroom. The default.
    Balanced,
    /// Largest model that fits at all; accepts slower output.
    MostCapable,
}
212
213impl Default for QualityTier {
214 fn default() -> Self {
215 QualityTier::Balanced
216 }
217}
218
/// Relative weights a [`QualityTier`] places on each soft-score axis. The
/// recommender normalizes each axis to `[0,1]` and combines them with these.
///
/// The weights produced by [`QualityTier::weights`] are non-negative and sum
/// to 1.0 for every tier; this struct itself does not enforce that, since all
/// fields are public. Only `PartialEq` (not `Eq`) is derived because the
/// fields are `f32`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TierWeights {
    /// Toward higher-quality models (benchmarks / param-count prior).
    pub quality: f32,
    /// Toward lower-latency models (smaller, better-accelerated).
    pub latency: f32,
    /// Toward leaving memory headroom (smaller fraction of budget used).
    pub memory_pressure: f32,
}
230
231impl QualityTier {
232 /// The fixed axis weighting for this tier. Mirrors the table in
233 /// docs/solutions/first-class-model-ux.md.
234 pub fn weights(self) -> TierWeights {
235 match self {
236 QualityTier::Fastest => TierWeights {
237 quality: 0.2,
238 latency: 0.6,
239 memory_pressure: 0.2,
240 },
241 QualityTier::Balanced => TierWeights {
242 quality: 0.5,
243 latency: 0.2,
244 memory_pressure: 0.3,
245 },
246 QualityTier::MostCapable => TierWeights {
247 quality: 0.8,
248 latency: 0.0,
249 memory_pressure: 0.2,
250 },
251 }
252 }
253}
254
/// Where the user is willing to run inference. Orthogonal to [`QualityTier`].
/// Choosing the cloud is never silent — `CloudOk` only makes remote models
/// *eligible*; the recommender still flags them as requiring one-time consent.
///
/// Variants are serialized in `snake_case`: `"on_device"`, `"cloud_ok"`.
/// `Privacy::default()` is [`Privacy::OnDevice`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Privacy {
    /// Local models only. The default.
    OnDevice,
    /// Remote APIs / the Parslee gateway may compete and win.
    CloudOk,
}
266
267impl Default for Privacy {
268 fn default() -> Self {
269 Privacy::OnDevice
270 }
271}
272
#[cfg(test)]
mod tests {
    use super::*;

    /// Every acquisition use case, for table-driven checks.
    const EVERY_USE_CASE: [UseCase; 6] = [
        UseCase::Assistant,
        UseCase::Coding,
        UseCase::Summarize,
        UseCase::Vision,
        UseCase::Transcription,
        UseCase::Search,
    ];

    /// Every quality tier, for table-driven checks.
    const EVERY_TIER: [QualityTier; 3] = [
        QualityTier::Fastest,
        QualityTier::Balanced,
        QualityTier::MostCapable,
    ];

    // --- inference-time intent (IntentHint / TaskHint) ---

    #[test]
    fn empty_intent_serializes_compactly() {
        // The FFI layer ships intents as JSON. A default intent must not
        // leak verbose defaults such as
        // {"task":null,"require":[],"prefer_local":false,"prefer_fast":false}.
        let encoded = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(encoded, "{}");
    }

    #[test]
    fn round_trip_with_capability_require() {
        let wanted = [ModelCapability::Code, ModelCapability::ToolUse];
        let original = IntentHint {
            task: Some(TaskHint::Code),
            require: wanted.to_vec(),
            prefer_local: true,
            prefer_fast: false,
        };

        let wire = serde_json::to_string(&original).unwrap();
        let back = serde_json::from_str::<IntentHint>(&wire).unwrap();

        assert_eq!(back.task, Some(TaskHint::Code));
        assert_eq!(
            back.require,
            vec![ModelCapability::Code, ModelCapability::ToolUse]
        );
        assert!(back.prefer_local);
        assert!(!back.prefer_fast);
    }

    #[test]
    fn missing_fields_default_cleanly() {
        // Clients that predate IntentHint may send partial JSON; whatever
        // is absent must land on the same values as the no-intent path.
        let parsed = serde_json::from_str::<IntentHint>("{}").unwrap();
        assert_eq!(parsed.task, None);
        assert!(parsed.require.is_empty());
        assert!(!parsed.prefer_local);
        assert!(!parsed.prefer_fast);
    }

    #[test]
    fn prefer_fast_round_trips_and_skips_when_false() {
        // Unset: the flag is not emitted at all.
        let unset = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(unset, "{}");

        // Set: the flag is emitted and survives a round trip.
        let fast = IntentHint {
            prefer_fast: true,
            ..Default::default()
        };
        let wire = serde_json::to_string(&fast).unwrap();
        assert!(wire.contains("prefer_fast"));
        let back = serde_json::from_str::<IntentHint>(&wire).unwrap();
        assert!(back.prefer_fast);
    }

    // --- acquisition intent (UseCase / QualityTier / Privacy) ---

    #[test]
    fn use_case_defaults_to_assistant_and_balanced_on_device() {
        assert_eq!(UseCase::default(), UseCase::Assistant);
        assert_eq!(QualityTier::default(), QualityTier::Balanced);
        assert_eq!(Privacy::default(), Privacy::OnDevice);
    }

    #[test]
    fn coding_requires_both_generate_and_code() {
        // Design-review regression guard: a coding model that cannot
        // generate is useless, so `Code` on its own is not sufficient.
        let required = UseCase::Coding.required_capabilities();
        for needed in [ModelCapability::Generate, ModelCapability::Code] {
            assert!(required.contains(&needed));
        }
    }

    #[test]
    fn search_is_a_retrieval_role_not_generative() {
        // Search lives in its own lane and never competes with chat models.
        assert_eq!(UseCase::Search.role(), UseCaseRole::Retrieval);
        assert_eq!(UseCase::Assistant.role(), UseCaseRole::Generative);
        assert_eq!(UseCase::Transcription.role(), UseCaseRole::Audio);
        assert_eq!(
            UseCase::Search.required_capabilities(),
            &[ModelCapability::Embed]
        );
    }

    #[test]
    fn required_and_preferred_are_disjoint() {
        // Listing a capability as both required and preferred would count
        // it twice in scoring.
        for uc in EVERY_USE_CASE {
            let required = uc.required_capabilities();
            for p in uc.preferred_capabilities() {
                assert!(
                    !required.contains(p),
                    "{uc:?}: {p:?} is both required and preferred"
                );
            }
        }
    }

    #[test]
    fn tier_weights_match_documented_table() {
        let TierWeights {
            quality,
            latency,
            memory_pressure,
        } = QualityTier::Balanced.weights();
        assert_eq!((quality, latency, memory_pressure), (0.5, 0.2, 0.3));

        let fastest = QualityTier::Fastest.weights();
        assert!(
            fastest.latency > fastest.quality,
            "Fastest must favor latency"
        );

        let most_capable = QualityTier::MostCapable.weights();
        assert!(
            most_capable.quality > most_capable.latency,
            "MostCapable must favor quality"
        );
    }

    #[test]
    fn tier_weights_are_non_negative_and_sum_to_one() {
        // Catches a future edit that lets one axis dominate by making a
        // row sum to something other than 1.0. Floats are compared with
        // an epsilon.
        for tier in EVERY_TIER {
            let w = tier.weights();
            let axes = [w.quality, w.latency, w.memory_pressure];
            for axis in axes {
                assert!(axis >= 0.0, "{tier:?}: negative weight {axis}");
            }
            let sum = w.quality + w.latency + w.memory_pressure;
            assert!(
                (sum - 1.0).abs() < 1e-6,
                "{tier:?}: weights sum to {sum}, expected 1.0"
            );
        }
    }

    #[test]
    fn use_case_round_trips_snake_case() {
        let encoded = serde_json::to_string(&UseCase::Coding).unwrap();
        assert_eq!(encoded, "\"coding\"");

        let decoded = serde_json::from_str::<UseCase>("\"search\"").unwrap();
        assert_eq!(decoded, UseCase::Search);
    }
}