car_inference/
recommend.rs

1//! Model recommender — hardware + intent → ranked, explained model picks.
2//!
3//! This is the framing layer that turns the typed registry
4//! (`ModelSchema`), hardware facts (`HardwareInfo` / `SupportedAcceleration`),
5//! and the acquisition-intent vocabulary (`UseCase` / `QualityTier` /
6//! `Privacy`) into something a non-expert can act on: "given this machine
7//! and what I want to do, which model should I install, and why?"
8//!
9//! The full contract lives in docs/solutions/first-class-model-ux.md. The
10//! selection pipeline is deterministic — same `(models, hardware, intent)`
11//! always yields the same ranking — and pure (no disk, no network), so the
12//! hardware × use-case matrix is unit-testable.
13//!
14//! This supersedes `hardware::recommend_model` for the user-facing flows
15//! (`car setup`, `models.recommend`). That standalone heuristic survives only
16//! as a registry-less bootstrap inside `HardwareInfo::detect`; everything that
17//! has a registry in hand should call [`recommend`].
18
19use serde::{Deserialize, Serialize};
20
21use crate::hardware::{HardwareInfo, SupportedAcceleration};
22use crate::intent::{Privacy, QualityTier, UseCase, UseCaseRole};
23use crate::schema::{ModelSchema, TrustTier};
24
25/// Whether a model fits in the machine's memory for local execution.
26#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
27#[serde(rename_all = "snake_case")]
28pub enum FitStatus {
29    /// Comfortably fits with headroom for KV cache + overhead.
30    Fits,
31    /// Too large for this machine's memory budget.
32    TooBig,
33    /// Runs on an external server (vLLM-MLX) or remote API — local memory
34    /// does not apply. Never claimed as a local "fits".
35    ServerProvided,
36    /// The model declares no usable memory figure, so fit can't be asserted.
37    Unknown,
38}
39
40/// One ranked recommendation, explained in plain language.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct Recommendation {
43    /// Registry id (e.g. "qwen/qwen3-4b:q4_k_m"). The caller pulls by this.
44    pub model_id: String,
45    /// Human-readable name shown to the user.
46    pub display_name: String,
47    /// The role lane this pick serves.
48    pub role: UseCaseRole,
49    /// Plain-language reason, generated from the winning factors. Never
50    /// exposes quantization / repo / file jargon.
51    pub rationale: String,
52    /// Download size in MB.
53    pub download_mb: u64,
54    /// True if the model is already installed.
55    pub already_installed: bool,
56    /// Local memory fit.
57    pub fit: FitStatus,
58    /// The acceleration tier this machine would run it on.
59    pub acceleration: SupportedAcceleration,
60    /// True for on-device models, false for remote/cloud.
61    pub is_local: bool,
62    /// True when running this pick sends prompts off the machine — the
63    /// caller must obtain one-time consent before the first cloud inference.
64    pub requires_cloud_consent: bool,
65    /// How much the project vouches for this model.
66    pub trust_tier: TrustTier,
67    /// Internal blended score (higher is better). Exposed for tests/debug.
68    pub score: f32,
69}
70
// --- tuning constants (documented; consistent with hardware.rs scale) ------

/// Memory left to the OS and other applications; taken off the total before
/// the model budget is computed.
const OS_RESERVE_MB: u64 = 3 * 1024;
/// Framework/activation overhead when running on Metal.
const OVERHEAD_METAL_MB: u64 = 512;
/// Framework/activation overhead when running on CUDA.
const OVERHEAD_CUDA_MB: u64 = 512;
/// Framework/activation overhead when running on the CPU.
const OVERHEAD_CPU_MB: u64 = 1024;
/// Additional slack required before a model is declared to fit, so the
/// verdict stays honest under real load.
const SAFETY_MARGIN_MB: u64 = 1024;
/// Context window assumed when sizing the fit budget: a typical working
/// length rather than the model's maximum. Fit is dominated by the weights,
/// and sizing for 128k would reject models that run fine at everyday context
/// lengths.
const FIT_CONTEXT_TOKENS: usize = 8 * 1024;
85
86/// The result of a recommendation query — never a bare `Vec`, so the caller
87/// can tell "here are your picks" from "nothing fits, here's why".
88#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct RecommendationSet {
90    /// Ranked, actionable picks (best first). Empty when nothing is eligible
91    /// or everything is too big — read `note` in that case.
92    pub picks: Vec<Recommendation>,
93    /// Eligible models that don't fit this machine's memory, for an honest
94    /// "needs more RAM" listing. Ranked by score so the closest miss is first.
95    pub not_enough_memory: Vec<Recommendation>,
96    /// Plain-language explanation when `picks` is empty, or a heads-up worth
97    /// surfacing (e.g. the top pick runs in the cloud). `None` otherwise.
98    pub note: Option<String>,
99}
100
101/// Rank model picks for a machine + intent, best first. Pure over its inputs.
102///
103/// `models` is typically `UnifiedRegistry::list()`. Returns at most one
104/// role's lane today (every `UseCase` is single-role). Too-big-but-eligible
105/// models are surfaced separately in `not_enough_memory` rather than dropped
106/// silently, and an empty `picks` always comes with an explanatory `note`.
107pub fn recommend(
108    models: &[&ModelSchema],
109    hw: &HardwareInfo,
110    use_case: UseCase,
111    tier: QualityTier,
112    privacy: Privacy,
113) -> RecommendationSet {
114    let accel = hw.supported_acceleration();
115    let sort = |v: &mut Vec<Recommendation>| {
116        // Deterministic: score desc, installed first, smaller download, id.
117        v.sort_by(|a, b| {
118            b.score
119                .partial_cmp(&a.score)
120                .unwrap_or(std::cmp::Ordering::Equal)
121                .then(b.already_installed.cmp(&a.already_installed))
122                .then(a.download_mb.cmp(&b.download_mb))
123                .then(a.model_id.cmp(&b.model_id))
124        });
125    };
126
127    let (mut picks, mut not_enough_memory): (Vec<_>, Vec<_>) = models
128        .iter()
129        .filter(|m| passes_base_filter(m, hw, use_case, privacy))
130        .map(|m| build_recommendation(m, hw, &accel, use_case, tier))
131        .partition(|r| r.fit != FitStatus::TooBig);
132    sort(&mut picks);
133    sort(&mut not_enough_memory);
134
135    let note = explain_if_needed(&picks, &not_enough_memory, hw, use_case, tier, privacy);
136    RecommendationSet {
137        picks,
138        not_enough_memory,
139        note,
140    }
141}
142
143/// Hard eligibility, minus the fit check (fit is handled by partitioning so
144/// too-big models can still be surfaced). A model failing any of these is
145/// never shown at all.
146fn passes_base_filter(
147    m: &ModelSchema,
148    hw: &HardwareInfo,
149    use_case: UseCase,
150    privacy: Privacy,
151) -> bool {
152    if m.deprecated {
153        return false;
154    }
155    // Capability requirement encodes the role lane (Search⇒Embed, etc.).
156    if !use_case
157        .required_capabilities()
158        .iter()
159        .all(|c| m.has_capability(*c))
160    {
161        return false;
162    }
163    // Privacy: on-device excludes anything that leaves the machine.
164    if privacy == Privacy::OnDevice && !m.is_local() {
165        return false;
166    }
167    // A Metal-only model on a non-Apple machine can't run at all.
168    if m.requires_apple_silicon()
169        && !matches!(hw.supported_acceleration(), SupportedAcceleration::Apple { .. })
170    {
171        return false;
172    }
173    true
174}
175
176/// Build the honest `note` for the result set. Empty picks always get one.
177fn explain_if_needed(
178    picks: &[Recommendation],
179    too_big: &[Recommendation],
180    hw: &HardwareInfo,
181    use_case: UseCase,
182    tier: QualityTier,
183    privacy: Privacy,
184) -> Option<String> {
185    let purpose = use_case_purpose(use_case);
186    if picks.is_empty() {
187        let ram_gb = hw.total_ram_mb / 1024;
188        return Some(if !too_big.is_empty() {
189            match privacy {
190                Privacy::OnDevice => format!(
191                    "No on-device model for {purpose} fits your {ram_gb} GB machine. \
192                     Free up memory, pick a smaller tier, or allow cloud models."
193                ),
194                Privacy::CloudOk => format!(
195                    "No local model for {purpose} fits your {ram_gb} GB machine, and no \
196                     cloud model is configured. Add an API key or free up memory."
197                ),
198            }
199        } else {
200            format!("No model available for {purpose} on this machine.")
201        });
202    }
203    // Heads-up when the best pick we can offer is the most-capable tier but
204    // still ran out of bigger options, or when the top pick is cloud.
205    if picks[0].requires_cloud_consent {
206        return Some(format!(
207            "The best {purpose} pick runs in the cloud and needs your OK before first use. \
208             {} fits locally if you prefer on-device.",
209            picks
210                .iter()
211                .find(|p| p.is_local)
212                .map(|p| p.display_name.as_str())
213                .unwrap_or("No local model")
214        ));
215    }
216    let _ = tier;
217    None
218}
219
220fn use_case_purpose(use_case: UseCase) -> &'static str {
221    match use_case {
222        UseCase::Assistant => "chat & general help",
223        UseCase::Coding => "coding",
224        UseCase::Summarize => "summarizing",
225        UseCase::Vision => "understanding images",
226        UseCase::Transcription => "transcription",
227        UseCase::Search => "semantic search",
228    }
229}
230
231fn build_recommendation(
232    m: &ModelSchema,
233    hw: &HardwareInfo,
234    accel: &SupportedAcceleration,
235    use_case: UseCase,
236    tier: QualityTier,
237) -> Recommendation {
238    let fit = fit_status(m, hw);
239    let quality = quality_score(m);
240    let latency = latency_score(m, accel);
241    let pressure = memory_pressure(m, hw);
242    let w = tier.weights();
243    // Higher score is better, so memory *pressure* is inverted.
244    let mut score =
245        w.quality * quality + w.latency * latency + w.memory_pressure * (1.0 - pressure);
246    // Preferred capabilities are a soft bonus, never an eligibility gate.
247    let pref_hits = use_case
248        .preferred_capabilities()
249        .iter()
250        .filter(|c| m.has_capability(**c))
251        .count();
252    score += 0.05 * pref_hits as f32;
253
254    let is_local = m.is_local();
255    Recommendation {
256        model_id: m.id.clone(),
257        display_name: m.name.clone(),
258        role: use_case.role(),
259        rationale: rationale(m, hw, use_case, tier, fit, quality),
260        download_mb: m.size_mb(),
261        already_installed: m.available,
262        fit,
263        acceleration: accel.clone(),
264        is_local,
265        requires_cloud_consent: !is_local,
266        trust_tier: m.trust_tier,
267        score,
268    }
269}
270
271/// 0.0–1.0 quality prior: published benchmarks when present, else a
272/// param-count heuristic (bigger ⇒ generally more capable, with diminishing
273/// returns).
274fn quality_score(m: &ModelSchema) -> f32 {
275    if !m.public_benchmarks.is_empty() {
276        let sum: f64 = m.public_benchmarks.iter().map(|b| b.score).sum();
277        return (sum / m.public_benchmarks.len() as f64).clamp(0.0, 1.0) as f32;
278    }
279    // Quality tracks *total* params (a 30B MoE has 30B-class knowledge even
280    // with 3B active). Saturating: 0.6B≈0.08, 4B≈0.36, 8B≈0.53, 30B≈0.81.
281    let b = param_billions_total(m).max(0.1);
282    (b / (b + 7.0)).clamp(0.0, 1.0) as f32
283}
284
285/// 0.0–1.0 latency prior: smaller + better-accelerated ⇒ faster ⇒ higher.
286/// Uses *active* params — a 30B MoE with 3B active runs at 3B-ish speed.
287fn latency_score(m: &ModelSchema, accel: &SupportedAcceleration) -> f32 {
288    let b = param_billions_active(m).max(0.1);
289    // Smaller models score higher; 0.6B≈0.93, 4B≈0.6, 8B≈0.43, 30B≈0.14.
290    let size_term = (8.0 / (b + 8.0)) as f32;
291    let accel_bonus = match accel {
292        SupportedAcceleration::Apple { .. } | SupportedAcceleration::Cuda { .. } => 0.1,
293        _ => 0.0,
294    };
295    (size_term + accel_bonus).clamp(0.0, 1.0)
296}
297
298/// Fraction of the memory budget this model consumes (0.0–1.0+). Used
299/// inverted in scoring so leaner picks win under memory-pressure weight.
300fn memory_pressure(m: &ModelSchema, hw: &HardwareInfo) -> f32 {
301    let budget = memory_budget_mb(hw);
302    if budget == 0 {
303        return 1.0;
304    }
305    (memory_required_mb(m, hw) as f32 / budget as f32).clamp(0.0, 1.5)
306}
307
308/// Local memory fit verdict for this model on this machine.
309fn fit_status(m: &ModelSchema, hw: &HardwareInfo) -> FitStatus {
310    // External server / remote: local memory is irrelevant.
311    if m.is_vllm_mlx() || m.is_remote() || m.is_delegated() {
312        return FitStatus::ServerProvided;
313    }
314    // Apple FoundationModels: the OS owns the weights, no user budget.
315    if m.is_foundation_models() {
316        return FitStatus::Fits;
317    }
318    if m.size_mb() == 0 && m.ram_mb() == 0 {
319        return FitStatus::Unknown;
320    }
321    if memory_required_mb(m, hw) + SAFETY_MARGIN_MB <= memory_budget_mb(hw) {
322        FitStatus::Fits
323    } else {
324        FitStatus::TooBig
325    }
326}
327
328/// Resident memory this model needs: weights + KV cache at a typical context
329/// + backend overhead.
330fn memory_required_mb(m: &ModelSchema, hw: &HardwareInfo) -> u64 {
331    let weights = m.ram_mb().max(m.size_mb());
332    let kv = kv_cache_mb(m, FIT_CONTEXT_TOKENS);
333    weights + kv + backend_overhead_mb(hw)
334}
335
336/// KV-cache estimate. Scaled by model size and context; weights-dominated fit
337/// means this need only be roughly right (consistent with hardware.rs scale).
338fn kv_cache_mb(m: &ModelSchema, context_tokens: usize) -> u64 {
339    // KV cache scales with active params (what's resident per token).
340    let per_1k = (param_billions_active(m) as f64 * 0.12).max(0.05);
341    ((context_tokens as f64 / 1000.0) * per_1k).ceil() as u64
342}
343
344/// Memory available for a model after OS reserve, by acceleration tier.
345fn memory_budget_mb(hw: &HardwareInfo) -> u64 {
346    match hw.supported_acceleration() {
347        SupportedAcceleration::Apple { unified_memory_mb } => {
348            unified_memory_mb.saturating_sub(OS_RESERVE_MB)
349        }
350        SupportedAcceleration::Cuda { device_memory_mb } => {
351            device_memory_mb.unwrap_or(hw.total_ram_mb)
352        }
353        // CPU and unsupported-discrete both run from system RAM.
354        _ => hw.total_ram_mb.saturating_sub(OS_RESERVE_MB),
355    }
356}
357
358fn backend_overhead_mb(hw: &HardwareInfo) -> u64 {
359    match hw.supported_acceleration() {
360        SupportedAcceleration::Apple { .. } => OVERHEAD_METAL_MB,
361        SupportedAcceleration::Cuda { .. } => OVERHEAD_CUDA_MB,
362        _ => OVERHEAD_CPU_MB,
363    }
364}
365
366/// Total parameter count in billions from `param_count` ("4B", "30B (3B
367/// active)" → 30). Drives the *quality* prior. When `param_count` is blank
368/// (common for under-curated entries), estimate from on-disk size rather
369/// than treating the model as 0B — a 4-bit GGUF is ~0.6 GB per B params.
370fn param_billions_total(m: &ModelSchema) -> f32 {
371    if let Some(b) = parse_leading_billions(&m.param_count) {
372        return b;
373    }
374    let size = m.size_mb();
375    if size > 0 {
376        (size as f32 / 600.0).max(0.1)
377    } else {
378        0.0
379    }
380}
381
382/// Active parameter count in billions — the "(N active)" hint for MoE models,
383/// falling back to total for dense models. Drives *latency* and KV sizing.
384fn param_billions_active(m: &ModelSchema) -> f32 {
385    if let Some(active) = m
386        .param_count
387        .split_once('(')
388        .and_then(|(_, rest)| rest.split_once("active"))
389        .and_then(|(num, _)| parse_leading_billions(num))
390    {
391        return active;
392    }
393    param_billions_total(m)
394}
395
/// Parse the leading number of a parameter-count string into billions.
///
/// Surrounding whitespace is ignored. The number is the longest leading run
/// of ASCII digits and `.`; whatever follows (after optional whitespace)
/// selects the unit by its first letter, case-insensitively:
///
/// * `M` — millions, so the value is divided by 1000 ("600M" → 0.6).
/// * `T` — trillions, so the value is multiplied by 1000 ("1T" → 1000).
/// * anything else, including `B` or nothing — already billions.
///
/// Returns `None` when there is no leading number or it does not parse
/// (e.g. "", "B", "1.2.3B").
fn parse_leading_billions(s: &str) -> Option<f32> {
    let s = s.trim();
    // Digits and '.' are single-byte ASCII, so this count is a valid char
    // boundary for `split_at`.
    let number_len = s
        .bytes()
        .take_while(|b| b.is_ascii_digit() || *b == b'.')
        .count();
    let (number, suffix) = s.split_at(number_len);
    let value: f32 = number.parse().ok()?;

    let unit = suffix
        .trim_start()
        .chars()
        .next()
        .map(|c| c.to_ascii_lowercase());
    match unit {
        Some('m') => Some(value / 1000.0),
        Some('t') => Some(value * 1000.0),
        _ => Some(value),
    }
}
410
411/// Plain-language rationale from the winning factors. Fixed templates, no
412/// free-form prose, so copy stays consistent and translatable.
413fn rationale(
414    m: &ModelSchema,
415    hw: &HardwareInfo,
416    use_case: UseCase,
417    tier: QualityTier,
418    fit: FitStatus,
419    quality: f32,
420) -> String {
421    let purpose = use_case_purpose(use_case);
422    let machine = match hw.supported_acceleration() {
423        SupportedAcceleration::Apple { unified_memory_mb } => {
424            format!("your {} GB Apple Silicon Mac (Metal)", unified_memory_mb / 1024)
425        }
426        SupportedAcceleration::Cuda { device_memory_mb } => match device_memory_mb {
427            Some(mb) => format!("your {} GB NVIDIA GPU (CUDA)", mb / 1024),
428            None => "your NVIDIA GPU (CUDA)".to_string(),
429        },
430        SupportedAcceleration::UnsupportedDiscreteGpu { .. } | SupportedAcceleration::Cpu => {
431            format!("your {} GB machine (CPU)", hw.total_ram_mb / 1024)
432        }
433    };
434
435    match fit {
436        FitStatus::ServerProvided if m.is_remote() => format!(
437            "{}: cloud model for {} — runs on Parslee's servers, nothing to download",
438            m.name, purpose
439        ),
440        FitStatus::ServerProvided => format!(
441            "{}: served externally for {} — no local memory needed",
442            m.name, purpose
443        ),
444        _ => {
445            let tier_word = match tier {
446                QualityTier::Fastest => "fastest",
447                QualityTier::Balanced => "best-balanced",
448                QualityTier::MostCapable => "most capable",
449            };
450            let quality_note = if quality >= 0.7 {
451                "high-quality "
452            } else {
453                ""
454            };
455            let size = if m.size_mb() >= 1024 {
456                format!("{:.1} GB download", m.size_mb() as f64 / 1024.0)
457            } else {
458                format!("{} MB download", m.size_mb())
459            };
460            format!(
461                "{}: the {} {}{} model that fits {} ({})",
462                m.name, tier_word, quality_note, purpose, machine, size
463            )
464        }
465    }
466}
467
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hardware::{GpuBackend, GpuDevice, GpuVendor};
    use crate::schema::{CostModel, ModelCapability, ModelSource, PerformanceEnvelope};

    /// Hardware fixture with the given GPU backend, total RAM (MB), and
    /// optional GPU memory (MB). Every other field is a fixed placeholder.
    fn hw(accel_backend: GpuBackend, ram_mb: u64, gpu_mb: Option<u64>) -> HardwareInfo {
        HardwareInfo {
            os: "test".into(),
            arch: "test".into(),
            cpu_cores: 8,
            total_ram_mb: ram_mb,
            gpu_backend: accel_backend,
            gpu_memory_mb: gpu_mb,
            gpu_devices: vec![],
            recommended_model: String::new(),
            recommended_context: 4096,
            max_model_mb: 0,
        }
    }

    /// Metal-backed hardware fixture with `ram_gb` GB of RAM and no separate
    /// GPU memory figure.
    fn mac(ram_gb: u64) -> HardwareInfo {
        // Metal backend ⇒ SupportedAcceleration::Apple with unified memory.
        hw(GpuBackend::Metal, ram_gb * 1024, None)
    }

    /// Model fixture with a `ModelSource::Local` source and Generate + Code
    /// capabilities. `size_mb` is used for both the download size and the RAM
    /// figure; the model is curated, not deprecated, and not installed.
    fn local_model(id: &str, name: &str, params: &str, size_mb: u64) -> ModelSchema {
        ModelSchema {
            id: id.into(),
            name: name.into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: String::new(),
            capabilities: vec![ModelCapability::Generate, ModelCapability::Code],
            context_length: 32768,
            param_count: params.into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope::default(),
            cost: CostModel {
                size_mb: Some(size_mb),
                ram_mb: Some(size_mb),
                ..Default::default()
            },
            source: ModelSource::Local {
                hf_repo: "x/y".into(),
                hf_filename: "m.gguf".into(),
                tokenizer_repo: "x/y".into(),
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: TrustTier::Curated,
            deprecated: false,
            available: false,
        }
    }

    /// Four local fixtures of increasing size: 0.6B, 4B, 8B, and a 30B entry
    /// whose `param_count` carries a "(3B active)" hint.
    fn catalog() -> Vec<ModelSchema> {
        vec![
            local_model("qwen/qwen3-0.6b", "Qwen3-0.6B", "0.6B", 650),
            local_model("qwen/qwen3-4b", "Qwen3-4B", "4B", 2500),
            local_model("qwen/qwen3-8b", "Qwen3-8B", "8B", 4900),
            local_model("qwen/qwen3-30b", "Qwen3-30B-A3B", "30B (3B active)", 17000),
        ]
    }

    /// Borrow every model in `v`, giving the `&[&ModelSchema]` shape that
    /// `recommend` takes.
    fn refs(v: &[ModelSchema]) -> Vec<&ModelSchema> {
        v.iter().collect()
    }

    #[test]
    fn fastest_prefers_the_small_model() {
        let cat = catalog();
        let recs = recommend(
            &refs(&cat),
            &mac(36),
            UseCase::Coding,
            QualityTier::Fastest,
            Privacy::OnDevice,
        ).picks;
        assert_eq!(recs[0].display_name, "Qwen3-0.6B");
    }

    #[test]
    fn most_capable_prefers_the_big_model_when_it_fits() {
        let cat = catalog();
        let recs = recommend(
            &refs(&cat),
            &mac(36), // 36 GB Mac fits the 17 GB model
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        ).picks;
        assert_eq!(recs[0].display_name, "Qwen3-30B-A3B");
        assert_eq!(recs[0].fit, FitStatus::Fits);
    }

    #[test]
    fn too_big_models_are_excluded_on_small_machines() {
        let cat = catalog();
        let recs = recommend(
            &refs(&cat),
            &mac(8), // 8 GB: 17 GB and 8 GB models shouldn't be offered
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        ).picks;
        let names: Vec<&str> = recs.iter().map(|r| r.display_name.as_str()).collect();
        assert!(!names.contains(&"Qwen3-30B-A3B"), "30B must not fit 8GB");
        assert!(recs.iter().all(|r| r.fit == FitStatus::Fits));
        assert!(!recs.is_empty(), "the 0.6B model should still be offered");
    }

    #[test]
    fn balanced_picks_a_capable_model_that_fits() {
        let cat = catalog();
        let recs = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        ).picks;
        // On a 16 GB Mac, Balanced should land on a mid model, not the 0.6B
        // and not necessarily the largest.
        assert!(matches!(
            recs[0].display_name.as_str(),
            "Qwen3-4B" | "Qwen3-8B"
        ));
    }

    #[test]
    fn search_only_returns_embedding_models() {
        let mut cat = catalog();
        // The only fixture with the Embed capability.
        let mut embed = local_model("qwen/embed", "Qwen3-Embedding", "0.6B", 640);
        embed.capabilities = vec![ModelCapability::Embed];
        cat.push(embed);
        let recs = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Search,
            QualityTier::Balanced,
            Privacy::OnDevice,
        ).picks;
        assert_eq!(recs.len(), 1, "only the embed model is in the Search lane");
        assert_eq!(recs[0].display_name, "Qwen3-Embedding");
        assert_eq!(recs[0].role, UseCaseRole::Retrieval);
    }

    #[test]
    fn deprecated_models_are_never_recommended() {
        let mut cat = catalog();
        cat[1].deprecated = true; // deprecate the 4B
        let recs = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        ).picks;
        assert!(recs.iter().all(|r| r.display_name != "Qwen3-4B"));
    }

    #[test]
    fn on_device_excludes_cloud_but_cloud_ok_includes_it_with_consent() {
        let mut cat = catalog();
        // Start from the local fixture, then swap in a remote-API source and
        // one benchmark score.
        let mut cloud = local_model("anthropic/sonnet", "Claude Sonnet", "", 0);
        cloud.capabilities = vec![ModelCapability::Generate, ModelCapability::Code];
        cloud.source = ModelSource::RemoteApi {
            endpoint: "https://api".into(),
            api_key_env: "K".into(),
            api_key_envs: vec![],
            api_version: None,
            protocol: crate::schema::ApiProtocol::Anthropic,
        };
        cloud.public_benchmarks = vec![crate::schema::BenchmarkScore {
            name: "SWE-bench".into(),
            score: 0.7,
            harness: None,
            source_url: None,
            measured_at: None,
        }];
        cat.push(cloud);

        let on_device = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        )
        .picks;
        assert!(on_device.iter().all(|r| r.is_local));

        let cloud_ok = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::CloudOk,
        )
        .picks;
        let claude = cloud_ok
            .iter()
            .find(|r| r.display_name == "Claude Sonnet")
            .expect("cloud model eligible under CloudOk");
        assert!(claude.requires_cloud_consent);
        assert_eq!(claude.fit, FitStatus::ServerProvided);
    }

    #[test]
    fn metal_only_model_excluded_on_cpu_host() {
        let mut cat = catalog();
        // Same fixture, but sourced as an MLX model.
        let mut mlx = local_model("mlx/qwen3-4b", "Qwen3-4B-MLX", "4B", 2400);
        mlx.source = ModelSource::Mlx {
            hf_repo: "mlx-community/x".into(),
            hf_weight_file: None,
        };
        cat.push(mlx);
        // CPU-only Linux box.
        let recs = recommend(
            &refs(&cat),
            &hw(GpuBackend::Cpu, 32 * 1024, None),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        ).picks;
        assert!(recs.iter().all(|r| r.display_name != "Qwen3-4B-MLX"));
    }

    #[test]
    fn ranking_is_deterministic() {
        let cat = catalog();
        // Two identical queries must yield the same pick order.
        let a = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Assistant,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        let b = recommend(
            &refs(&cat),
            &mac(16),
            UseCase::Assistant,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        let ids_a: Vec<&str> = a.picks.iter().map(|r| r.model_id.as_str()).collect();
        let ids_b: Vec<&str> = b.picks.iter().map(|r| r.model_id.as_str()).collect();
        assert_eq!(ids_a, ids_b);
    }

    #[test]
    fn rationale_is_plain_language_no_jargon() {
        let cat = catalog();
        let recs = recommend(
            &refs(&cat),
            &mac(36),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        ).picks;
        let r = &recs[0].rationale;
        assert!(!r.contains("Q4_K_M"), "no quantization jargon");
        assert!(!r.contains("gguf") && !r.contains("hf_repo"));
        assert!(r.contains("coding"), "states the purpose");
    }

    #[test]
    fn all_too_big_surfaces_needs_more_ram_with_a_note() {
        // A 2 GB machine fits nothing in the catalog.
        let cat = catalog();
        let set = recommend(
            &refs(&cat),
            &hw(GpuBackend::Cpu, 2 * 1024, None),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        assert!(set.picks.is_empty(), "nothing should fit 2 GB");
        assert!(
            !set.not_enough_memory.is_empty(),
            "too-big models surfaced, not dropped"
        );
        let note = set.note.expect("empty picks must carry a note");
        assert!(note.contains("fits"), "note explains the no-fit: {note}");
        // Closest miss ranked first.
        assert_eq!(set.not_enough_memory[0].fit, FitStatus::TooBig);
    }

    #[test]
    fn all_deprecated_gives_generic_note_not_a_memory_note() {
        // Deprecated models are filtered before the fit partition, so they
        // land in neither picks nor not_enough_memory — the note must be the
        // generic "no model available", not a misleading "needs more RAM".
        let mut cat = catalog();
        for m in &mut cat {
            m.deprecated = true;
        }
        let set = recommend(
            &refs(&cat),
            &mac(36), // plenty of RAM — so a memory note would be wrong
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        assert!(set.picks.is_empty());
        assert!(set.not_enough_memory.is_empty());
        let note = set.note.expect("must explain");
        assert!(
            !note.contains("fits") && !note.contains("memory"),
            "deprecated-only must not claim a memory problem: {note}"
        );
    }

    #[test]
    fn not_enough_memory_is_ordered_deterministically() {
        let cat = catalog();
        // Runs the same query and returns the ids of the too-big list.
        let mk = || {
            recommend(
                &refs(&cat),
                &hw(GpuBackend::Cpu, 3 * 1024, None), // only the 0.6B fits
                UseCase::Coding,
                QualityTier::Balanced,
                Privacy::OnDevice,
            )
            .not_enough_memory
            .into_iter()
            .map(|r| r.model_id)
            .collect::<Vec<_>>()
        };
        assert!(mk().len() >= 2, "several models should be too big for 3 GB");
        assert_eq!(mk(), mk(), "too-big ordering must be deterministic");
    }

    #[test]
    fn empty_registry_returns_empty_with_a_note() {
        let set = recommend(
            &[],
            &mac(16),
            UseCase::Assistant,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        assert!(set.picks.is_empty());
        assert!(set.not_enough_memory.is_empty());
        assert!(set.note.is_some(), "no-model case must explain itself");
    }

    #[test]
    fn cuda_box_sizes_against_vram() {
        // A 24 GB CUDA GPU fits the 17 GB model under MostCapable.
        let cat = catalog();
        let h = hw(GpuBackend::Cuda, 64 * 1024, Some(24 * 1024));
        let recs = recommend(
            &refs(&cat),
            &h,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        )
        .picks;
        assert_eq!(recs[0].display_name, "Qwen3-30B-A3B");
    }

    #[test]
    fn unsupported_discrete_gpu_uses_system_ram_not_vram() {
        // A 24 GB discrete GPU CAR can't drive must NOT be used as the budget;
        // a 16 GB-RAM CPU host can't fit the 17 GB model despite the big card.
        let cat = catalog();
        let mut h = hw(GpuBackend::Cpu, 16 * 1024, None);
        h.gpu_devices = vec![GpuDevice {
            vendor: GpuVendor::Nvidia,
            name: "GeForce RTX 4090".into(),
            memory_mb: Some(24_000),
        }];
        // Sanity: this is the UnsupportedDiscreteGpu tier.
        assert!(matches!(
            h.supported_acceleration(),
            crate::hardware::SupportedAcceleration::UnsupportedDiscreteGpu { .. }
        ));
        let recs = recommend(
            &refs(&cat),
            &h,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        )
        .picks;
        assert!(
            recs.iter().all(|r| r.display_name != "Qwen3-30B-A3B"),
            "17 GB model must not fit a 16 GB-RAM CPU host"
        );
        assert!(!recs.is_empty(), "smaller models still fit");
    }

    #[test]
    fn recommendation_set_wire_shape_is_snake_case_and_stable() {
        // Guards the JSON contract that FFI / JSON-RPC clients decode.
        let cat = catalog();
        let set = recommend(
            &refs(&cat),
            &mac(36),
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        let json = serde_json::to_string(&set).unwrap();
        assert!(json.contains("\"picks\""));
        assert!(json.contains("\"not_enough_memory\""));
        assert!(json.contains("\"model_id\""));
        assert!(json.contains("\"already_installed\""));
        assert!(json.contains("\"requires_cloud_consent\""));
        assert!(json.contains("\"fit\""));
    }

    #[test]
    fn blank_param_count_estimates_from_size_not_zero() {
        // An under-curated entry with no param_count must not be treated as a
        // 0B model (which would falsely look tiny + low quality).
        let mut m = local_model("x/unknown", "Unknown-Model", "", 4900);
        m.param_count = String::new();
        assert!(
            param_billions_total(&m) > 5.0,
            "4.9 GB ⇒ roughly an 8B model, not 0B"
        );
    }
}
car_inference/recommend.rs

car_inference/
recommend.rs