car-inference 0.22.0

//! Model recommender — hardware + intent → ranked, explained model picks.
//!
//! This is the framing layer that turns the typed registry
//! (`ModelSchema`), hardware facts (`HardwareInfo` / `SupportedAcceleration`),
//! and the acquisition-intent vocabulary (`UseCase` / `QualityTier` /
//! `Privacy`) into something a non-expert can act on: "given this machine
//! and what I want to do, which model should I install, and why?"
//!
//! The full contract lives in docs/solutions/first-class-model-ux.md. The
//! selection pipeline is deterministic — same `(models, hardware, intent)`
//! always yields the same ranking — and pure (no disk, no network), so the
//! hardware × use-case matrix is unit-testable.
//!
//! This supersedes `hardware::recommend_model` for the user-facing flows
//! (`car setup`, `models.recommend`). That standalone heuristic survives only
//! as a registry-less bootstrap inside `HardwareInfo::detect`; everything that
//! has a registry in hand should call [`recommend`].

use serde::{Deserialize, Serialize};

use crate::hardware::{HardwareInfo, SupportedAcceleration};
use crate::intent::{Privacy, QualityTier, UseCase, UseCaseRole};
use crate::schema::{ModelSchema, TrustTier};

/// Whether a model fits in the machine's memory for local execution.
///
/// Serialized with `snake_case` variant names (`fits`, `too_big`,
/// `server_provided`, `unknown`). `recommend` routes `TooBig` results into
/// `RecommendationSet::not_enough_memory`; every other status stays in
/// `picks`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FitStatus {
    /// Comfortably fits with headroom for KV cache + overhead. Also reported
    /// for Apple FoundationModels, where the OS owns the weights.
    Fits,
    /// Too large for this machine's memory budget.
    TooBig,
    /// Runs on an external server (vLLM-MLX) or remote API — local memory
    /// does not apply. Never claimed as a local "fits".
    ServerProvided,
    /// The model declares no usable memory figure (both its download size
    /// and RAM figure are zero), so fit can't be asserted.
    Unknown,
}

/// One ranked recommendation, explained in plain language.
///
/// Field names are part of the JSON contract decoded by FFI / JSON-RPC
/// clients, so they must stay stable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    /// Registry id (e.g. "qwen/qwen3-4b:q4_k_m"). The caller pulls by this.
    pub model_id: String,
    /// Human-readable name shown to the user.
    pub display_name: String,
    /// The role lane this pick serves (derived from the requested use case).
    pub role: UseCaseRole,
    /// Plain-language reason, generated from the winning factors. Never
    /// exposes quantization / repo / file jargon.
    pub rationale: String,
    /// Download size in MB.
    pub download_mb: u64,
    /// True if the model is already installed.
    pub already_installed: bool,
    /// Local memory fit.
    pub fit: FitStatus,
    /// The acceleration tier this machine would run it on. This is a
    /// property of the machine, so it is the same for every pick in a set.
    pub acceleration: SupportedAcceleration,
    /// True for on-device models, false for remote/cloud.
    pub is_local: bool,
    /// True when running this pick sends prompts off the machine — the
    /// caller must obtain one-time consent before the first cloud inference.
    /// Currently always the negation of `is_local`.
    pub requires_cloud_consent: bool,
    /// How much the project vouches for this model.
    pub trust_tier: TrustTier,
    /// Internal blended score (higher is better). Exposed for tests/debug.
    /// Only comparable between picks from the same query, since it blends
    /// tier-specific weights.
    pub score: f32,
}

// --- tuning constants (documented; consistent with hardware.rs scale) ------

/// RAM the OS + other apps need; subtracted from total before the budget.
/// Applied to Apple unified memory and to system RAM on CPU-class hosts.
const OS_RESERVE_MB: u64 = 3072;
// Framework/activation overhead per backend, added to every model's
// resident-memory requirement.
/// Overhead when running on Apple Silicon (Metal), in MB.
const OVERHEAD_METAL_MB: u64 = 512;
/// Overhead when running on CUDA, in MB.
const OVERHEAD_CUDA_MB: u64 = 512;
/// Overhead for every other tier (plain CPU, unsupported discrete GPU), in MB.
const OVERHEAD_CPU_MB: u64 = 1024;
/// Extra slack so a "fits" verdict is honest under real load. Added on top
/// of the requirement only for the fit verdict, not for the pressure score.
const SAFETY_MARGIN_MB: u64 = 1024;
/// Context length used for the fit budget — a typical working window, not
/// the model's max. Fit is weights-dominated; sizing for 128k would reject
/// models that run fine at everyday context lengths.
const FIT_CONTEXT_TOKENS: usize = 8192;

/// The result of a recommendation query — never a bare `Vec`, so the caller
/// can tell "here are your picks" from "nothing fits, here's why".
///
/// Field names are part of the JSON contract decoded by FFI / JSON-RPC
/// clients.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecommendationSet {
    /// Ranked, actionable picks (best first). Empty when nothing is eligible
    /// or everything is too big — read `note` in that case. Contains every
    /// eligible model whose fit is anything other than `FitStatus::TooBig`.
    pub picks: Vec<Recommendation>,
    /// Eligible models that don't fit this machine's memory, for an honest
    /// "needs more RAM" listing. Sorted with the same comparator as `picks`
    /// (score first), so the order is deterministic.
    pub not_enough_memory: Vec<Recommendation>,
    /// Plain-language explanation when `picks` is empty, or a heads-up worth
    /// surfacing (e.g. the top pick runs in the cloud). `None` otherwise.
    pub note: Option<String>,
}

/// Rank model picks for a machine + intent, best first. Pure over its inputs.
///
/// `models` is typically `UnifiedRegistry::list()`. Returns at most one
/// role's lane today (every `UseCase` is single-role). Too-big-but-eligible
/// models are surfaced separately in `not_enough_memory` rather than dropped
/// silently, and an empty `picks` always comes with an explanatory `note`.
///
/// Ordering within each list is deterministic: score descending, then
/// already-installed first, then smaller download, then model id. A NaN
/// score (possible if a registry entry carries a NaN benchmark) ranks below
/// every real score instead of corrupting the comparison.
pub fn recommend(
    models: &[&ModelSchema],
    hw: &HardwareInfo,
    use_case: UseCase,
    tier: QualityTier,
    privacy: Privacy,
) -> RecommendationSet {
    /// Sort key for a score: NaN is demoted to the lowest possible rank so
    /// the comparator below is a genuine total order.
    fn rank_key(score: f32) -> f32 {
        if score.is_nan() {
            f32::NEG_INFINITY
        } else {
            score
        }
    }

    /// Deterministic ranking: score desc, installed first, smaller download,
    /// then id as the final tie-break.
    fn sort_ranked(list: &mut [Recommendation]) {
        list.sort_by(|a, b| {
            // `total_cmp` never returns "incomparable", unlike `partial_cmp`;
            // `sort_by` requires a total order and may otherwise misbehave.
            rank_key(b.score)
                .total_cmp(&rank_key(a.score))
                .then_with(|| b.already_installed.cmp(&a.already_installed))
                .then_with(|| a.download_mb.cmp(&b.download_mb))
                .then_with(|| a.model_id.cmp(&b.model_id))
        });
    }

    let accel = hw.supported_acceleration();

    // Eligibility first, then split on fit so too-big models stay visible.
    let (mut picks, mut not_enough_memory): (Vec<_>, Vec<_>) = models
        .iter()
        .filter(|m| passes_base_filter(m, hw, use_case, privacy))
        .map(|m| build_recommendation(m, hw, &accel, use_case, tier))
        .partition(|r| r.fit != FitStatus::TooBig);
    sort_ranked(&mut picks);
    sort_ranked(&mut not_enough_memory);

    let note = explain_if_needed(&picks, &not_enough_memory, hw, use_case, tier, privacy);
    RecommendationSet {
        picks,
        not_enough_memory,
        note,
    }
}

/// Hard eligibility gates, excluding memory fit (fit is decided later by
/// partitioning, so too-big models can still be listed). A model that fails
/// any gate is never shown at all.
///
/// Gates, checked in order with short-circuiting:
/// 1. not deprecated;
/// 2. has every capability the use case requires (this encodes the role
///    lane, e.g. Search ⇒ Embed);
/// 3. under `Privacy::OnDevice`, the model must be local;
/// 4. an Apple-Silicon-only model needs an Apple acceleration tier.
fn passes_base_filter(
    m: &ModelSchema,
    hw: &HardwareInfo,
    use_case: UseCase,
    privacy: Privacy,
) -> bool {
    // Closures keep each gate lazy so later checks only run when the
    // earlier ones pass.
    let has_required_capabilities = || {
        use_case
            .required_capabilities()
            .iter()
            .all(|cap| m.has_capability(*cap))
    };
    let privacy_allows = || privacy != Privacy::OnDevice || m.is_local();
    let platform_can_run = || {
        !m.requires_apple_silicon()
            || matches!(
                hw.supported_acceleration(),
                SupportedAcceleration::Apple { .. }
            )
    };

    !m.deprecated && has_required_capabilities() && privacy_allows() && platform_can_run()
}

/// Build the honest `note` for the result set. Empty picks always get one.
fn explain_if_needed(
    picks: &[Recommendation],
    too_big: &[Recommendation],
    hw: &HardwareInfo,
    use_case: UseCase,
    tier: QualityTier,
    privacy: Privacy,
) -> Option<String> {
    let purpose = use_case_purpose(use_case);
    if picks.is_empty() {
        let ram_gb = hw.total_ram_mb / 1024;
        return Some(if !too_big.is_empty() {
            match privacy {
                Privacy::OnDevice => format!(
                    "No on-device model for {purpose} fits your {ram_gb} GB machine. \
                     Free up memory, pick a smaller tier, or allow cloud models."
                ),
                Privacy::CloudOk => format!(
                    "No local model for {purpose} fits your {ram_gb} GB machine, and no \
                     cloud model is configured. Add an API key or free up memory."
                ),
            }
        } else {
            format!("No model available for {purpose} on this machine.")
        });
    }
    // Heads-up when the best pick we can offer is the most-capable tier but
    // still ran out of bigger options, or when the top pick is cloud.
    if picks[0].requires_cloud_consent {
        return Some(format!(
            "The best {purpose} pick runs in the cloud and needs your OK before first use. \
             {} fits locally if you prefer on-device.",
            picks
                .iter()
                .find(|p| p.is_local)
                .map(|p| p.display_name.as_str())
                .unwrap_or("No local model")
        ));
    }
    let _ = tier;
    None
}

fn use_case_purpose(use_case: UseCase) -> &'static str {
    match use_case {
        UseCase::Assistant => "chat & general help",
        UseCase::Coding => "coding",
        UseCase::Summarize => "summarizing",
        UseCase::Vision => "understanding images",
        UseCase::Transcription => "transcription",
        UseCase::Search => "semantic search",
    }
}

/// Score one eligible model and package it as a `Recommendation`.
///
/// The score blends three 0–1 priors using the tier's weights — quality,
/// latency, and memory headroom (1 − pressure, so leaner is better) — plus
/// a flat 0.05 bonus per preferred capability the model has. Preferred
/// capabilities only nudge the ranking; they never gate eligibility.
fn build_recommendation(
    m: &ModelSchema,
    hw: &HardwareInfo,
    accel: &SupportedAcceleration,
    use_case: UseCase,
    tier: QualityTier,
) -> Recommendation {
    let fit = fit_status(m, hw);
    let quality = quality_score(m);
    let latency = latency_score(m, accel);
    // Pressure is "how full", the score wants "how much room is left".
    let headroom = 1.0 - memory_pressure(m, hw);

    let weights = tier.weights();
    let blended =
        weights.quality * quality + weights.latency * latency + weights.memory_pressure * headroom;

    let preferred_hits = use_case
        .preferred_capabilities()
        .iter()
        .filter(|cap| m.has_capability(**cap))
        .count();
    let score = blended + 0.05 * preferred_hits as f32;

    let on_device = m.is_local();
    Recommendation {
        model_id: m.id.clone(),
        display_name: m.name.clone(),
        role: use_case.role(),
        rationale: rationale(m, hw, use_case, tier, fit, quality),
        download_mb: m.size_mb(),
        already_installed: m.available,
        fit,
        acceleration: accel.clone(),
        is_local: on_device,
        // Anything that is not on-device sends prompts off the machine.
        requires_cloud_consent: !on_device,
        trust_tier: m.trust_tier,
        score,
    }
}

/// Quality prior in 0.0–1.0.
///
/// With published benchmarks: their arithmetic mean, clamped to the unit
/// range. Without: a saturating function of the *total* parameter count,
/// `b / (b + 7)` with `b` floored at 0.1 — total rather than active because
/// a 30B MoE carries 30B-class knowledge even with 3B active. Sample values:
/// 0.6B≈0.08, 4B≈0.36, 8B≈0.53, 30B≈0.81.
fn quality_score(m: &ModelSchema) -> f32 {
    let benchmarks = &m.public_benchmarks;
    if benchmarks.is_empty() {
        let billions = param_billions_total(m).max(0.1);
        return (billions / (billions + 7.0)).clamp(0.0, 1.0);
    }

    let total: f64 = benchmarks.iter().map(|bench| bench.score).sum();
    let mean = total / benchmarks.len() as f64;
    mean.clamp(0.0, 1.0) as f32
}

/// 0.0–1.0 latency prior: smaller + better-accelerated ⇒ faster ⇒ higher.
/// Uses *active* params — a 30B MoE with 3B active runs at 3B-ish speed.
fn latency_score(m: &ModelSchema, accel: &SupportedAcceleration) -> f32 {
    let b = param_billions_active(m).max(0.1);
    // Smaller models score higher; 0.6B≈0.93, 4B≈0.6, 8B≈0.43, 30B≈0.14.
    let size_term = (8.0 / (b + 8.0)) as f32;
    let accel_bonus = match accel {
        SupportedAcceleration::Apple { .. } | SupportedAcceleration::Cuda { .. } => 0.1,
        _ => 0.0,
    };
    (size_term + accel_bonus).clamp(0.0, 1.0)
}

/// Share of the machine's memory budget this model would occupy, clamped to
/// 0.0–1.5 (values above 1.0 mean it exceeds the budget). A zero budget is
/// reported as fully pressured (1.0). Scoring uses the inverse, so leaner
/// picks win when the tier weights memory pressure.
fn memory_pressure(m: &ModelSchema, hw: &HardwareInfo) -> f32 {
    match memory_budget_mb(hw) {
        // Avoid dividing by zero on machines with no usable budget.
        0 => 1.0,
        budget_mb => (memory_required_mb(m, hw) as f32 / budget_mb as f32).clamp(0.0, 1.5),
    }
}

/// Decide whether this model fits this machine's memory for local use.
///
/// Checked in order:
/// * vLLM-MLX, remote, or delegated models → `ServerProvided` (local memory
///   is irrelevant);
/// * Apple FoundationModels → `Fits` (the OS owns the weights, no user
///   budget);
/// * no size and no RAM figure → `Unknown`;
/// * otherwise `Fits` when requirement + safety margin is within the
///   budget, else `TooBig`.
fn fit_status(m: &ModelSchema, hw: &HardwareInfo) -> FitStatus {
    let runs_elsewhere = m.is_vllm_mlx() || m.is_remote() || m.is_delegated();

    if runs_elsewhere {
        FitStatus::ServerProvided
    } else if m.is_foundation_models() {
        FitStatus::Fits
    } else if m.size_mb() == 0 && m.ram_mb() == 0 {
        FitStatus::Unknown
    } else {
        let needed_mb = memory_required_mb(m, hw) + SAFETY_MARGIN_MB;
        if needed_mb <= memory_budget_mb(hw) {
            FitStatus::Fits
        } else {
            FitStatus::TooBig
        }
    }
}

/// Estimated resident memory for running this model, in MB: weights (the
/// larger of the declared RAM figure and the download size), plus a KV
/// cache sized for `FIT_CONTEXT_TOKENS`, plus the backend's fixed overhead.
fn memory_required_mb(m: &ModelSchema, hw: &HardwareInfo) -> u64 {
    let weights_mb = std::cmp::max(m.ram_mb(), m.size_mb());
    let kv_mb = kv_cache_mb(m, FIT_CONTEXT_TOKENS);
    let overhead_mb = backend_overhead_mb(hw);
    weights_mb + kv_mb + overhead_mb
}

/// Rough KV-cache size in MB for `context_tokens` of context.
///
/// Scales linearly with active parameters (0.12 per billion per 1000
/// tokens, floored at 0.05 per 1000 tokens) and with context length, then
/// rounds up. Fit is weights-dominated, so this only needs to be roughly
/// right.
///
/// NOTE(review): for an 8B model at 8192 tokens this yields about 8 MB,
/// which looks small for a KV cache — confirm the intended units against
/// the scale used in hardware.rs.
fn kv_cache_mb(m: &ModelSchema, context_tokens: usize) -> u64 {
    let active_billions = f64::from(param_billions_active(m));
    let mb_per_1k_tokens = f64::max(active_billions * 0.12, 0.05);
    let thousands_of_tokens = context_tokens as f64 / 1000.0;
    (thousands_of_tokens * mb_per_1k_tokens).ceil() as u64
}

/// Memory available for a model after OS reserve, by acceleration tier.
///
/// * Apple: unified memory minus the OS reserve.
/// * CUDA: the reported device memory (the OS does not live in VRAM, so no
///   reserve is taken). When device memory is unknown, falls back to system
///   RAM minus the OS reserve, the same budget a CPU host gets.
/// * CPU and unsupported discrete GPUs: system RAM minus the OS reserve.
///
/// Subtractions saturate, so a machine smaller than the reserve gets a
/// budget of 0 rather than wrapping.
fn memory_budget_mb(hw: &HardwareInfo) -> u64 {
    let system_ram_budget = hw.total_ram_mb.saturating_sub(OS_RESERVE_MB);
    match hw.supported_acceleration() {
        SupportedAcceleration::Apple { unified_memory_mb } => {
            unified_memory_mb.saturating_sub(OS_RESERVE_MB)
        }
        // Unknown VRAM must not be treated as "all of system RAM is free":
        // the fallback is system RAM, so the OS reserve applies to it.
        SupportedAcceleration::Cuda { device_memory_mb } => {
            device_memory_mb.unwrap_or(system_ram_budget)
        }
        // CPU and unsupported-discrete both run from system RAM.
        SupportedAcceleration::UnsupportedDiscreteGpu { .. } | SupportedAcceleration::Cpu => {
            system_ram_budget
        }
    }
}

/// Fixed framework/activation overhead in MB for the backend this machine
/// would use: Metal on Apple tiers, CUDA on CUDA tiers, and the CPU figure
/// for everything else.
fn backend_overhead_mb(hw: &HardwareInfo) -> u64 {
    let accel = hw.supported_acceleration();
    if matches!(accel, SupportedAcceleration::Apple { .. }) {
        OVERHEAD_METAL_MB
    } else if matches!(accel, SupportedAcceleration::Cuda { .. }) {
        OVERHEAD_CUDA_MB
    } else {
        OVERHEAD_CPU_MB
    }
}

/// Total parameters in billions, read from the leading number of
/// `param_count` ("4B" → 4, "30B (3B active)" → 30). Feeds the *quality*
/// prior.
///
/// When `param_count` has no parseable leading number (common for
/// under-curated entries), the count is estimated from the download size at
/// roughly 600 MB per billion parameters (a 4-bit GGUF), floored at 0.1.
/// With no size either, the result is 0.0.
fn param_billions_total(m: &ModelSchema) -> f32 {
    parse_leading_billions(&m.param_count).unwrap_or_else(|| match m.size_mb() {
        0 => 0.0,
        size_mb => (size_mb as f32 / 600.0).max(0.1),
    })
}

/// Active parameters in billions. Feeds *latency* and KV-cache sizing.
///
/// For MoE entries this is the number that sits between the first `(` and
/// the word "active" in `param_count` (e.g. "30B (3B active)" → 3). When
/// that hint is missing or unparseable, the total count is used, which is
/// the right answer for dense models.
fn param_billions_active(m: &ModelSchema) -> f32 {
    let moe_hint = match m.param_count.split_once('(') {
        Some((_, after_paren)) => after_paren
            .split_once("active")
            .and_then(|(count, _)| parse_leading_billions(count)),
        None => None,
    };
    moe_hint.unwrap_or_else(|| param_billions_total(m))
}

/// Parse the number at the start of `s` as a parameter count in billions.
///
/// Leading/trailing whitespace is ignored. The number is the longest prefix
/// of ASCII digits and `.`; the first non-space character after it selects
/// the unit (case-insensitive):
///
/// * `M` — millions, so the value is divided by 1000 ("600M" → 0.6);
/// * `T` — trillions, so the value is multiplied by 1000 ("1T" → 1000);
/// * anything else (including `B` or nothing) — already billions.
///
/// Returns `None` when there is no parseable leading number, or when the
/// number overflows to a non-finite value (which would otherwise turn the
/// downstream scores into NaN).
fn parse_leading_billions(s: &str) -> Option<f32> {
    let s = s.trim();
    // The prefix is ASCII-only, so `number_end` is always a char boundary.
    let number_end = s
        .find(|c: char| !(c.is_ascii_digit() || c == '.'))
        .unwrap_or(s.len());
    let (number, suffix) = s.split_at(number_end);

    let value = number.parse::<f32>().ok().filter(|v| v.is_finite())?;

    let unit = suffix
        .trim_start()
        .chars()
        .next()
        .map(|c| c.to_ascii_lowercase());
    Some(match unit {
        Some('m') => value / 1000.0,
        Some('t') => value * 1000.0,
        _ => value,
    })
}

/// Plain-language rationale from the winning factors. Fixed templates, no
/// free-form prose, so copy stays consistent and translatable.
///
/// One template per fit verdict, so the text never contradicts `fit`:
///
/// * `ServerProvided` + remote → cloud wording, nothing to download;
/// * `ServerProvided` otherwise → "served externally";
/// * `Fits` → "the <tier> <purpose> model that fits <machine>";
/// * `TooBig` → says the model needs more memory than the machine has
///   (these entries are listed under `not_enough_memory`);
/// * `Unknown` → recommends the model without asserting a fit.
///
/// `quality` at or above 0.7 adds the "high-quality" qualifier. The text
/// never mentions quantization, repositories, or file names.
fn rationale(
    m: &ModelSchema,
    hw: &HardwareInfo,
    use_case: UseCase,
    tier: QualityTier,
    fit: FitStatus,
    quality: f32,
) -> String {
    let purpose = use_case_purpose(use_case);
    let machine = match hw.supported_acceleration() {
        SupportedAcceleration::Apple { unified_memory_mb } => {
            format!("your {} GB Apple Silicon Mac (Metal)", unified_memory_mb / 1024)
        }
        SupportedAcceleration::Cuda { device_memory_mb } => match device_memory_mb {
            Some(mb) => format!("your {} GB NVIDIA GPU (CUDA)", mb / 1024),
            None => "your NVIDIA GPU (CUDA)".to_string(),
        },
        SupportedAcceleration::UnsupportedDiscreteGpu { .. } | SupportedAcceleration::Cpu => {
            format!("your {} GB machine (CPU)", hw.total_ram_mb / 1024)
        }
    };

    let tier_word = match tier {
        QualityTier::Fastest => "fastest",
        QualityTier::Balanced => "best-balanced",
        QualityTier::MostCapable => "most capable",
    };
    let quality_note = if quality >= 0.7 { "high-quality " } else { "" };
    // Sizes of 1 GB and up read better in GB with one decimal.
    let size = if m.size_mb() >= 1024 {
        format!("{:.1} GB download", m.size_mb() as f64 / 1024.0)
    } else {
        format!("{} MB download", m.size_mb())
    };

    match fit {
        FitStatus::ServerProvided if m.is_remote() => format!(
            "{}: cloud model for {} — runs on Parslee's servers, nothing to download",
            m.name, purpose
        ),
        FitStatus::ServerProvided => format!(
            "{}: served externally for {} — no local memory needed",
            m.name, purpose
        ),
        FitStatus::Fits => format!(
            "{}: the {} {}{} model that fits {} ({})",
            m.name, tier_word, quality_note, purpose, machine, size
        ),
        // Never describe a too-big model as fitting.
        FitStatus::TooBig => format!(
            "{}: {}{} model — needs more memory than {} has ({})",
            m.name, quality_note, purpose, machine, size
        ),
        // No memory figure to check against, so make no fit claim.
        FitStatus::Unknown => format!(
            "{}: the {} {}{} model for {} — memory fit not verified ({})",
            m.name, tier_word, quality_note, purpose, machine, size
        ),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hardware::{GpuBackend, GpuDevice, GpuVendor};
    use crate::schema::{CostModel, ModelCapability, ModelSource, PerformanceEnvelope};

    /// Synthetic `HardwareInfo` for tests: 8 cores, the given GPU backend,
    /// system RAM, and optional GPU memory; no enumerated GPU devices.
    fn hw(accel_backend: GpuBackend, ram_mb: u64, gpu_mb: Option<u64>) -> HardwareInfo {
        HardwareInfo {
            // The three inputs that drive the recommender.
            gpu_backend: accel_backend,
            total_ram_mb: ram_mb,
            gpu_memory_mb: gpu_mb,
            // Fixed filler for everything else.
            os: "test".into(),
            arch: "test".into(),
            cpu_cores: 8,
            gpu_devices: Vec::new(),
            recommended_model: String::new(),
            recommended_context: 4096,
            max_model_mb: 0,
        }
    }

    /// An Apple Silicon Mac with `ram_gb` GB of memory. The Metal backend is
    /// what yields `SupportedAcceleration::Apple` with unified memory.
    fn mac(ram_gb: u64) -> HardwareInfo {
        let ram_mb = ram_gb * 1024;
        hw(GpuBackend::Metal, ram_mb, None)
    }

    /// A curated, not-yet-installed local model with Generate + Code
    /// capabilities. `size_mb` is used for both the download size and the
    /// RAM figure; `params` becomes `param_count` verbatim.
    fn local_model(id: &str, name: &str, params: &str, size_mb: u64) -> ModelSchema {
        let footprint = Some(size_mb);
        let cost = CostModel {
            size_mb: footprint,
            ram_mb: footprint,
            ..Default::default()
        };
        let source = ModelSource::Local {
            hf_repo: "x/y".into(),
            hf_filename: "m.gguf".into(),
            tokenizer_repo: "x/y".into(),
        };
        let capabilities = vec![ModelCapability::Generate, ModelCapability::Code];

        ModelSchema {
            id: id.into(),
            name: name.into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: String::new(),
            capabilities,
            context_length: 32768,
            param_count: params.into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope::default(),
            cost,
            source,
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: TrustTier::Curated,
            deprecated: false,
            available: false,
        }
    }

    /// Four-model test catalog spanning small to large: three dense models
    /// and one MoE ("30B (3B active)").
    fn catalog() -> Vec<ModelSchema> {
        // (id, display name, param_count, size in MB)
        let entries = [
            ("qwen/qwen3-0.6b", "Qwen3-0.6B", "0.6B", 650),
            ("qwen/qwen3-4b", "Qwen3-4B", "4B", 2500),
            ("qwen/qwen3-8b", "Qwen3-8B", "8B", 4900),
            ("qwen/qwen3-30b", "Qwen3-30B-A3B", "30B (3B active)", 17000),
        ];
        entries
            .iter()
            .map(|&(id, name, params, size_mb)| local_model(id, name, params, size_mb))
            .collect()
    }

    /// Borrow every model in `v`, in order, in the `&[&ModelSchema]` shape
    /// that `recommend` takes.
    fn refs(v: &[ModelSchema]) -> Vec<&ModelSchema> {
        let mut borrowed = Vec::with_capacity(v.len());
        borrowed.extend(v.iter());
        borrowed
    }

    /// With plenty of memory, the Fastest tier should lead with the smallest
    /// model.
    #[test]
    fn fastest_prefers_the_small_model() {
        let cat = catalog();
        let models = refs(&cat);
        let machine = mac(36);

        let set = recommend(
            &models,
            &machine,
            UseCase::Coding,
            QualityTier::Fastest,
            Privacy::OnDevice,
        );

        assert_eq!(set.picks[0].display_name, "Qwen3-0.6B");
    }

    /// MostCapable should lead with the largest model when the machine has
    /// room for it (a 36 GB Mac holds the 17 GB model).
    #[test]
    fn most_capable_prefers_the_big_model_when_it_fits() {
        let cat = catalog();
        let models = refs(&cat);
        let roomy_mac = mac(36);

        let set = recommend(
            &models,
            &roomy_mac,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        );

        let top = &set.picks[0];
        assert_eq!(top.display_name, "Qwen3-30B-A3B");
        assert_eq!(top.fit, FitStatus::Fits);
    }

    /// On an 8 GB Mac the 17 GB model must not be offered, every pick that
    /// is offered must genuinely fit, and something small must remain.
    #[test]
    fn too_big_models_are_excluded_on_small_machines() {
        let cat = catalog();
        let models = refs(&cat);
        let small_mac = mac(8);

        let picks = recommend(
            &models,
            &small_mac,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        )
        .picks;

        let offers_30b = picks.iter().any(|r| r.display_name == "Qwen3-30B-A3B");
        assert!(!offers_30b, "30B must not fit 8GB");
        assert!(picks.iter().all(|r| r.fit == FitStatus::Fits));
        assert!(!picks.is_empty(), "the 0.6B model should still be offered");
    }

    /// Balanced on a 16 GB Mac should land on a mid-sized model: not the
    /// 0.6B, and not necessarily the largest.
    #[test]
    fn balanced_picks_a_capable_model_that_fits() {
        let cat = catalog();
        let models = refs(&cat);
        let machine = mac(16);

        let set = recommend(
            &models,
            &machine,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );

        let top_name = set.picks[0].display_name.as_str();
        assert!(matches!(top_name, "Qwen3-4B" | "Qwen3-8B"));
    }

    /// The Search lane requires the Embed capability, so only the embedding
    /// model is eligible even though the generate/code models would fit.
    #[test]
    fn search_only_returns_embedding_models() {
        let embedder = ModelSchema {
            capabilities: vec![ModelCapability::Embed],
            ..local_model("qwen/embed", "Qwen3-Embedding", "0.6B", 640)
        };
        let mut cat = catalog();
        cat.push(embedder);
        let models = refs(&cat);
        let machine = mac(16);

        let picks = recommend(
            &models,
            &machine,
            UseCase::Search,
            QualityTier::Balanced,
            Privacy::OnDevice,
        )
        .picks;

        assert_eq!(picks.len(), 1, "only the embed model is in the Search lane");
        let only = &picks[0];
        assert_eq!(only.display_name, "Qwen3-Embedding");
        assert_eq!(only.role, UseCaseRole::Retrieval);
    }

    /// A deprecated model must not appear among the picks, even when it
    /// would otherwise be a strong candidate.
    #[test]
    fn deprecated_models_are_never_recommended() {
        let mut cat = catalog();
        // Deprecate the 4B.
        for model in cat.iter_mut().filter(|m| m.name == "Qwen3-4B") {
            model.deprecated = true;
        }
        let models = refs(&cat);
        let machine = mac(16);

        let picks = recommend(
            &models,
            &machine,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        )
        .picks;

        assert!(!picks.iter().any(|r| r.display_name == "Qwen3-4B"));
    }

    /// A remote-API model is hidden under `Privacy::OnDevice`; under
    /// `Privacy::CloudOk` it is eligible, flagged as needing consent, and
    /// reported as `ServerProvided` rather than as a local fit.
    #[test]
    fn on_device_excludes_cloud_but_cloud_ok_includes_it_with_consent() {
        let mut cloud = local_model("anthropic/sonnet", "Claude Sonnet", "", 0);
        cloud.capabilities = vec![ModelCapability::Generate, ModelCapability::Code];
        cloud.source = ModelSource::RemoteApi {
            endpoint: "https://api".into(),
            api_key_env: "K".into(),
            api_key_envs: vec![],
            api_version: None,
            protocol: crate::schema::ApiProtocol::Anthropic,
        };
        cloud.public_benchmarks = vec![crate::schema::BenchmarkScore {
            name: "SWE-bench".into(),
            score: 0.7,
            harness: None,
            source_url: None,
            measured_at: None,
        }];

        let mut cat = catalog();
        cat.push(cloud);

        // Same machine, use case and tier; only the privacy setting varies.
        let picks_under = |privacy: Privacy| {
            recommend(
                &refs(&cat),
                &mac(16),
                UseCase::Coding,
                QualityTier::MostCapable,
                privacy,
            )
            .picks
        };

        let on_device = picks_under(Privacy::OnDevice);
        assert!(on_device.iter().all(|r| r.is_local));

        let cloud_ok = picks_under(Privacy::CloudOk);
        let claude = cloud_ok
            .iter()
            .find(|r| r.display_name == "Claude Sonnet")
            .expect("cloud model eligible under CloudOk");
        assert!(claude.requires_cloud_consent);
        assert_eq!(claude.fit, FitStatus::ServerProvided);
    }

    /// An MLX-sourced model must not be offered on a CPU-only host, even
    /// with ample RAM.
    #[test]
    fn metal_only_model_excluded_on_cpu_host() {
        let mlx = ModelSchema {
            source: ModelSource::Mlx {
                hf_repo: "mlx-community/x".into(),
                hf_weight_file: None,
            },
            ..local_model("mlx/qwen3-4b", "Qwen3-4B-MLX", "4B", 2400)
        };
        let mut cat = catalog();
        cat.push(mlx);
        let models = refs(&cat);

        // CPU-only Linux box.
        let cpu_box = hw(GpuBackend::Cpu, 32 * 1024, None);
        let picks = recommend(
            &models,
            &cpu_box,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        )
        .picks;

        assert!(!picks.iter().any(|r| r.display_name == "Qwen3-4B-MLX"));
    }

    /// Two identical queries must return the picks in the same order.
    #[test]
    fn ranking_is_deterministic() {
        let cat = catalog();
        let ranked_ids = || -> Vec<String> {
            recommend(
                &refs(&cat),
                &mac(16),
                UseCase::Assistant,
                QualityTier::Balanced,
                Privacy::OnDevice,
            )
            .picks
            .into_iter()
            .map(|r| r.model_id)
            .collect()
        };

        let first = ranked_ids();
        let second = ranked_ids();
        assert_eq!(first, second);
    }

    /// The top pick's rationale must name the purpose and must not leak
    /// quantization, file-format, or repository jargon.
    #[test]
    fn rationale_is_plain_language_no_jargon() {
        let cat = catalog();
        let models = refs(&cat);
        let machine = mac(36);

        let set = recommend(
            &models,
            &machine,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );

        let text = set.picks[0].rationale.as_str();
        assert!(!text.contains("Q4_K_M"), "no quantization jargon");
        assert!(!(text.contains("gguf") || text.contains("hf_repo")));
        assert!(text.contains("coding"), "states the purpose");
    }

    /// When nothing fits (a 2 GB CPU host), the picks are empty, the
    /// too-big models are still listed, and the note explains the no-fit.
    #[test]
    fn all_too_big_surfaces_needs_more_ram_with_a_note() {
        let cat = catalog();
        let models = refs(&cat);
        let tiny_box = hw(GpuBackend::Cpu, 2 * 1024, None);

        let RecommendationSet {
            picks,
            not_enough_memory,
            note,
        } = recommend(
            &models,
            &tiny_box,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );

        assert!(picks.is_empty(), "nothing should fit 2 GB");
        assert!(
            !not_enough_memory.is_empty(),
            "too-big models surfaced, not dropped"
        );
        let note = note.expect("empty picks must carry a note");
        assert!(note.contains("fits"), "note explains the no-fit: {note}");
        // The head of the list is a too-big entry (its rank is not checked).
        assert_eq!(not_enough_memory[0].fit, FitStatus::TooBig);
    }

    /// Deprecated models are dropped before the fit partition, so they show
    /// up in neither list. The note must then be the generic "no model
    /// available" text, not a misleading memory complaint.
    #[test]
    fn all_deprecated_gives_generic_note_not_a_memory_note() {
        let mut cat = catalog();
        cat.iter_mut().for_each(|m| m.deprecated = true);
        let models = refs(&cat);
        // Plenty of RAM — so a memory note would be wrong.
        let roomy_mac = mac(36);

        let set = recommend(
            &models,
            &roomy_mac,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );

        assert!(set.picks.is_empty());
        assert!(set.not_enough_memory.is_empty());
        let note = set.note.expect("must explain");
        let blames_memory = note.contains("fits") || note.contains("memory");
        assert!(
            !blames_memory,
            "deprecated-only must not claim a memory problem: {note}"
        );
    }

    /// The `not_enough_memory` list must come back in the same order on
    /// every call.
    ///
    /// A 3 GB CPU host is used because the 3072 MB OS reserve leaves it a
    /// system-RAM budget of 0, so the catalog's models land in
    /// `not_enough_memory`.
    #[test]
    fn not_enough_memory_is_ordered_deterministically() {
        let cat = catalog();
        let too_big_ids = || -> Vec<String> {
            let set = recommend(
                &refs(&cat),
                &hw(GpuBackend::Cpu, 3 * 1024, None),
                UseCase::Coding,
                QualityTier::Balanced,
                Privacy::OnDevice,
            );
            set.not_enough_memory
                .into_iter()
                .map(|r| r.model_id)
                .collect()
        };

        let first = too_big_ids();
        let second = too_big_ids();
        assert!(
            first.len() >= 2,
            "several models should be too big for 3 GB"
        );
        assert_eq!(first, second, "too-big ordering must be deterministic");
    }

    /// With no models at all, both lists are empty and the note still
    /// explains why.
    #[test]
    fn empty_registry_returns_empty_with_a_note() {
        let no_models: [&ModelSchema; 0] = [];
        let machine = mac(16);

        let RecommendationSet {
            picks,
            not_enough_memory,
            note,
        } = recommend(
            &no_models,
            &machine,
            UseCase::Assistant,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );

        assert!(picks.is_empty());
        assert!(not_enough_memory.is_empty());
        assert!(note.is_some(), "no-model case must explain itself");
    }

    /// On a CUDA host the budget comes from VRAM: a 24 GB GPU fits the 17 GB
    /// model, which MostCapable should then rank first.
    #[test]
    fn cuda_box_sizes_against_vram() {
        let cat = catalog();
        let models = refs(&cat);
        let cuda_box = hw(GpuBackend::Cuda, 64 * 1024, Some(24 * 1024));

        let set = recommend(
            &models,
            &cuda_box,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        );

        assert_eq!(set.picks[0].display_name, "Qwen3-30B-A3B");
    }

    /// A discrete GPU that CAR can't drive must NOT supply the memory
    /// budget: a 16 GB-RAM CPU host can't fit the 17 GB model no matter how
    /// large the card is.
    #[test]
    fn unsupported_discrete_gpu_uses_system_ram_not_vram() {
        let big_unusable_card = GpuDevice {
            vendor: GpuVendor::Nvidia,
            name: "GeForce RTX 4090".into(),
            memory_mb: Some(24_000),
        };
        let host = HardwareInfo {
            gpu_devices: vec![big_unusable_card],
            ..hw(GpuBackend::Cpu, 16 * 1024, None)
        };
        // Sanity: this is the UnsupportedDiscreteGpu tier.
        assert!(matches!(
            host.supported_acceleration(),
            crate::hardware::SupportedAcceleration::UnsupportedDiscreteGpu { .. }
        ));

        let cat = catalog();
        let models = refs(&cat);
        let picks = recommend(
            &models,
            &host,
            UseCase::Coding,
            QualityTier::MostCapable,
            Privacy::OnDevice,
        )
        .picks;

        assert!(
            !picks.iter().any(|r| r.display_name == "Qwen3-30B-A3B"),
            "17 GB model must not fit a 16 GB-RAM CPU host"
        );
        assert!(!picks.is_empty(), "smaller models still fit");
    }

    /// Guards the JSON contract that FFI / JSON-RPC clients decode: the
    /// serialized set must contain these snake_case keys.
    #[test]
    fn recommendation_set_wire_shape_is_snake_case_and_stable() {
        let cat = catalog();
        let models = refs(&cat);
        let machine = mac(36);
        let set = recommend(
            &models,
            &machine,
            UseCase::Coding,
            QualityTier::Balanced,
            Privacy::OnDevice,
        );
        let json = serde_json::to_string(&set).unwrap();

        let expected_keys = [
            "\"picks\"",
            "\"not_enough_memory\"",
            "\"model_id\"",
            "\"already_installed\"",
            "\"requires_cloud_consent\"",
            "\"fit\"",
        ];
        for key in expected_keys.iter() {
            assert!(json.contains(key));
        }
    }

    /// An under-curated entry with a blank `param_count` must be sized from
    /// its download, not treated as a 0B model (which would make it look
    /// tiny and low quality).
    #[test]
    fn blank_param_count_estimates_from_size_not_zero() {
        let uncurated = ModelSchema {
            param_count: String::new(),
            ..local_model("x/unknown", "Unknown-Model", "", 4900)
        };

        let estimated_billions = param_billions_total(&uncurated);
        assert!(
            estimated_billions > 5.0,
            "4.9 GB ⇒ roughly an 8B model, not 0B"
        );
    }
}