car-inference 0.26.0

//! Proactive model concierge — turns "your hardware can run a great model for
//! X, but you have none installed" into a single, non-nagging suggestion.
//!
//! Sibling of [`crate::nudge`]: the upgrade nudge handles the *installed →
//! newer* case; the concierge handles the *acquisition gap* — a use-case lane
//! with no working model at all. Both are pure decision functions the daemon
//! drives on its periodic tick, and both keep their throttle + dismissal
//! bookkeeping in [`NudgeState`] and honor the [`UpdatePolicy`] `Off` switch.
//! Each has its own throttle timestamp there (`last_concierge_secs` for the
//! concierge, `last_nudge_secs` for the nudge) so neither can starve the
//! other. Keeping the decision here makes it pure and unit-testable: the
//! caller injects `now_secs` and `throttle_secs`.
//!
//! Rules:
//! - `policy == Off` → do nothing.
//! - Never interrupt active inference — defer to a later, idle tick.
//! - A use-case lane is "served" when any model the recommender ranks for it
//!   is already installed; served lanes never produce a suggestion. This is
//!   what keeps the concierge distinct from the upgrade nudge — it only fires
//!   when the user has *nothing* for a lane, never to push a marginal upgrade.
//! - Only on-device, memory-fitting picks are suggested — an acquisition the
//!   user can act on immediately, with no cloud consent and no "needs more
//!   RAM" caveat.
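//! - At most one suggestion round per `throttle_secs`, measured from the
//!   concierge's own `last_concierge_secs` (independent of the upgrade
//!   nudge's window). A dismissed suggestion is not repeated; a soft
//!   "not now" dismissal may resurface after `SOFT_DISMISS_COOLDOWN_SECS`.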

use serde::{Deserialize, Serialize};

use crate::hardware::HardwareInfo;
use crate::intent::{Privacy, QualityTier, UseCase};
use crate::nudge::NudgeState;
use crate::recommend::{recommend, FitStatus, Recommendation};
use crate::schema::ModelSchema;
use crate::update_prefs::{UpdatePolicy, UpdatePreferences};
use crate::usage_profile::{LaneUsage, UsageProfile};

// ---- Phase B2/B4: decision modes + labeled dismissals ----------------

/// What the concierge should do this cycle (Phase B2). Splitting the verb
/// is the whole point: observation is always on and silent; speaking
/// proactively (`Act`) only happens when every gate passes.
///
/// Serialized in `snake_case`: `observe`, `answer`, `ask`, `act`.
///
/// NOTE: `evaluate_concierge` (Phase B3) only ever returns `Act` or
/// `Observe`. `Answer` and `Ask` are forward-declarations for the
/// interactive path (Phase F `concierge.ask`); don't wire UI for them off
/// B3's output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ConciergeMode {
    /// Default: observe + learn, no UI.
    Observe,
    /// Respond to a user question (ledger-backed).
    Answer,
    /// Ask the user one clarifying bit (rare).
    Ask,
    /// Speak proactively with a specific, reversible suggestion.
    Act,
}

/// The concierge's decision for a cycle, with the receipts behind it.
///
/// Built either by [`ConciergeDecision::observe`] (the silent default) or by
/// `evaluate_concierge` when it decides to `Act`.
#[derive(Debug, Clone, Serialize)]
pub struct ConciergeDecision {
    /// What to do this cycle.
    pub mode: ConciergeMode,
    /// Diagnosis confidence in `[0,1]` — higher with more resolved
    /// samples and worse observed success. `0.0` for the silent default.
    pub confidence: f64,
    /// Human-readable receipts the suggestion is built on (the evidence a
    /// good concierge cites instead of asserting). Empty for the silent
    /// default.
    pub evidence: Vec<String>,
    /// The concrete suggestion, when there is one. Omitted from the
    /// serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suggestion: Option<ConciergeSuggestion>,
}

impl ConciergeDecision {
    /// Build the quiet default decision: `Observe` mode, zero confidence,
    /// no evidence, and no suggestion — nothing for the UI to surface.
    pub fn observe() -> Self {
        let evidence = Vec::new();
        let suggestion = None;
        Self {
            mode: ConciergeMode::Observe,
            confidence: 0.0,
            evidence,
            suggestion,
        }
    }
}

/// Why a user dismissed a suggestion (Phase B4). Carries different signal
/// than a single tombstone key.
///
/// Serialized in `snake_case` (`not_now`, `wrong`, `too_expensive`,
/// `privacy`, `never_for_project`). `NotNow` is the only soft reason; see
/// [`DismissReason::is_permanent`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DismissReason {
    /// Soft: cool down, may re-suggest later.
    NotNow,
    /// Hard: the recommendation was wrong — don't repeat it.
    Wrong,
    /// Hard: cost-sensitive — don't push paid/large options like this.
    TooExpensive,
    /// Hard: privacy — don't push this class.
    Privacy,
    /// Hard: not for this project.
    NeverForProject,
}

impl DismissReason {
    /// `true` for every hard reason (the same suggestion stays suppressed
    /// for good); `false` only for `NotNow`, which is a temporary cooldown.
    pub fn is_permanent(self) -> bool {
        match self {
            DismissReason::NotNow => false,
            DismissReason::Wrong
            | DismissReason::TooExpensive
            | DismissReason::Privacy
            | DismissReason::NeverForProject => true,
        }
    }
}

/// Outcome of a post-apply canary check (Phase F3), as produced by
/// `canary_verdict`. Serialized in `snake_case` (`keep`, `revert`,
/// `insufficient`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum CanaryVerdict {
    /// The switched-to model is performing acceptably — keep it.
    Keep,
    /// It's measurably worse than the prior — auto-revert.
    Revert,
    /// Not enough post-switch data yet — wait.
    Insufficient,
}

/// Decide whether a just-applied model switch should stand. Conservative:
/// revert ONLY with enough post-switch samples AND a success rate that is
/// worse than the prior baseline by more than `margin`. Never self-graded
/// — `new_success_rate` comes from observed verifier/outcome receipts.
///
/// * `new_success_rate` — observed success rate of the switched-to model,
///   or `None` when nothing has resolved yet.
/// * `new_samples` — number of resolved post-switch samples behind that rate.
/// * `baseline_success_rate` — success rate of the prior model.
/// * `min_samples` — sample floor below which no verdict is given.
/// * `margin` — how far below the baseline the new rate must fall to revert.
///
/// Returns `Insufficient` when there is no rate or too few samples,
/// `Revert` when `baseline - new` exceeds `margin`, and `Keep` otherwise
/// (including when any of the floating-point inputs is NaN, since every
/// comparison against NaN is false).
pub fn canary_verdict(
    new_success_rate: Option<f64>,
    new_samples: u64,
    baseline_success_rate: f64,
    min_samples: u64,
    margin: f64,
) -> CanaryVerdict {
    // Absolute slack on the margin comparison. Rates are fractions in
    // [0, 1], and a regression of *exactly* `margin` must keep the model,
    // but binary rounding can push such a difference a hair over the line
    // (e.g. `0.8 - 0.7` is slightly more than `0.1`). Staying conservative
    // means never reverting on rounding noise.
    const BOUNDARY_TOLERANCE: f64 = 1e-9;

    let Some(observed) = new_success_rate else {
        return CanaryVerdict::Insufficient;
    };
    if new_samples < min_samples {
        return CanaryVerdict::Insufficient;
    }

    let regression = baseline_success_rate - observed;
    if regression > margin + BOUNDARY_TOLERANCE {
        CanaryVerdict::Revert
    } else {
        CanaryVerdict::Keep
    }
}

/// Min post-switch resolved samples before a canary can revert.
/// Presumably the value callers pass as `canary_verdict`'s `min_samples` —
/// the call site is not in this file; confirm there.
pub const CANARY_MIN_SAMPLES: u64 = 5;
/// How much worse (success-rate points) the new model must be than the
/// prior baseline to trigger an auto-revert. Success rates are fractions,
/// so `0.1` means ten percentage points. Presumably the value callers pass
/// as `canary_verdict`'s `margin` — confirm at the call site.
pub const CANARY_REGRESSION_MARGIN: f64 = 0.1;

/// Observed health of one model, from the outcome profiles (Phase C1).
/// The pull-not-push "Model Health" surface renders these.
#[derive(Debug, Clone, Serialize)]
pub struct ModelHealth {
    /// Registry id of the model this row describes.
    pub model_id: String,
    /// Call count for this model. NOTE(review): presumably all calls, not
    /// just resolved ones — confirm where this struct is populated.
    pub calls: u64,
    /// Success rate over *resolved* outcomes, or `None` when nothing has
    /// resolved yet. `None` (not a fabricated 0.5) so the UI can render
    /// "no resolved signal" instead of a misleading confident "50%" for a
    /// never-measured model. Mirrors `LaneUsage::success_rate`.
    pub success_rate: Option<f64>,
    /// Average latency in milliseconds (per the field name).
    pub avg_latency_ms: f64,
    /// Quality score. NOTE(review): scale and source are not visible in
    /// this file — confirm against the outcome profile that fills it.
    pub quality: f64,
    /// Session-excluded (e.g. repeated 429).
    pub excluded: bool,
}

/// A model switch that has been standing long enough that we'd expect to
/// have verified it, but still lacks enough resolved outcomes to canary
/// (low-resolution lane). Surfaced so the *user* can decide, rather than
/// it sitting silently un-verifiable forever.
#[derive(Debug, Clone, Serialize)]
pub struct PendingVerification {
    /// Lane the switch applies to.
    pub use_case: UseCase,
    /// Model that was switched to.
    pub model_id: String,
    /// When the switch was applied. NOTE(review): presumably epoch seconds
    /// like the other timestamps in this module — confirm at the producer.
    pub set_at: u64,
    /// Resolved outcomes observed so far for the switched-to model.
    pub resolved_samples: u64,
    /// Resolved outcomes required before the canary can judge (presumably
    /// `CANARY_MIN_SAMPLES` — confirm at the producer).
    pub needed: u64,
}

/// The ambient concierge status the `concierge.status` WS method returns
/// (Phase C1): per-lane usage + the current decision (Observe or
/// Act+suggestion) + per-model health. Pull, not push.
#[derive(Debug, Clone, Serialize)]
pub struct ConciergeStatus {
    /// Per-lane usage rows.
    pub lanes: Vec<LaneUsage>,
    /// The concierge's current decision for this status snapshot.
    pub decision: ConciergeDecision,
    /// Per-model health rows.
    pub models: Vec<ModelHealth>,
    /// Standing switches too old to still be unverified yet lacking
    /// resolved samples to auto-verify (deferred-item surfacing).
    // NOTE(review): `#[serde(default)]` only affects deserialization, and
    // this struct derives `Serialize` alone, so the attribute currently has
    // no effect. If the intent was to omit an empty list from the payload,
    // that is `skip_serializing_if = "Vec::is_empty"`; if it was to tolerate
    // a missing field on read, the struct also needs `Deserialize`. Confirm.
    #[serde(default)]
    pub pending_verification: Vec<PendingVerification>,
}

/// A labeled dismissal record (Phase B4) persisted in `NudgeState`.
///
/// `is_dismissed` matches these against a suggestion's dismiss key: a
/// permanent reason suppresses it for good, a soft one only while
/// `now - timestamp` is below `SOFT_DISMISS_COOLDOWN_SECS`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DismissalRecord {
    /// The dismissed suggestion's key (compared for equality with the
    /// `concierge:<lane>=><model>` key of a candidate suggestion).
    pub key: String,
    /// Why the user dismissed it; decides soft vs. permanent suppression.
    pub reason: DismissReason,
    /// When the dismissal happened, in seconds on the same clock as the
    /// `now_secs` the decision functions receive.
    pub timestamp: u64,
}

/// Cooldown for a soft ("not now") dismissal before the same suggestion
/// may resurface: 14 days, in seconds.
pub const SOFT_DISMISS_COOLDOWN_SECS: u64 = 14 * 24 * 60 * 60;

/// Minimum resolved (success+failure) receipts in a lane before the
/// concierge will make a friction claim — don't diagnose on 1-2 samples.
const MIN_RESOLVED_SAMPLES: u64 = 5;

/// Below this success rate (with enough samples) a lane is "frictional".
/// The comparison is strict: a lane at exactly this rate is not frictional.
const FRICTION_SUCCESS_RATE: f64 = 0.6;

/// Minimum diagnosis confidence before the concierge will speak (`Act`).
/// Filters thin/borderline diagnoses that clear the sample floor but
/// aren't strong enough to interrupt the user over. Confidence is
/// `(1 - success_rate) * resolved / (resolved + 10)`, so e.g. 5 resolved
/// samples at 59% success score about 0.14 and stay silent.
const MIN_DIAGNOSIS_CONFIDENCE: f64 = 0.25;

/// A single, plain-language suggestion for one lane.
///
/// Produced by `decide_concierge` for an unserved lane (acquisition) and by
/// `evaluate_concierge` for a lane showing friction (a better-ranked
/// replacement); the field meanings are the same in both cases.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ConciergeSuggestion {
    /// The use-case lane the suggestion is for — one with no installed
    /// model (acquisition) or one whose current model is struggling
    /// (friction).
    pub use_case: UseCase,
    /// Registry id of the suggested model — the recommender's top on-device,
    /// memory-fitting pick for this lane. The caller pulls by this.
    pub model_id: String,
    /// Human-readable name shown to the user.
    pub display_name: String,
    /// Download size in MB.
    pub download_mb: u64,
    /// One-line, jargon-free message, e.g. "You have no model for … —
    /// install it?" for the acquisition case.
    pub message: String,
    /// Stable key the client echoes back to dismiss this suggestion. Encodes
    /// the lane and target model, so a *different* later pick can still fire
    /// even after this one is dismissed.
    pub dismiss_key: String,
}

/// Default throttle: at most one acquisition suggestion per week (7 days, in
/// seconds). Deliberately rarer than the upgrade nudge's daily cadence — "you
/// have nothing for X" is a bigger, more deliberate prompt than "a newer
/// version exists", and should not nag. Independent of the nudge throttle:
/// the concierge measures its window from `NudgeState::last_concierge_secs`,
/// not `last_nudge_secs`.
pub const DEFAULT_CONCIERGE_THROTTLE_SECS: u64 = 7 * 24 * 60 * 60;

/// The default lanes the concierge watches when the caller passes none.
/// (The fallback itself is the caller's job: `decide_concierge` simply
/// iterates whatever slice it is given.)
///
/// Core interactive uses that almost every user benefits from — deliberately
/// NOT the specialist lanes (`Vision` / `Transcription` / `Search`), which
/// many users never touch; suggesting those unprompted would nag. A caller
/// that knows the user's actual interests should pass them explicitly.
pub const DEFAULT_WATCHED_USE_CASES: &[UseCase] = &[UseCase::Assistant, UseCase::Coding];

/// Map a use-case lane to the slug embedded in persisted dismiss keys.
///
/// The slug is part of on-disk state, so it has to survive releases
/// unchanged: it is spelled out here rather than derived from `Debug`
/// output, where renaming a variant would silently invalidate every stored
/// dismissal. The match has no wildcard arm on purpose — adding a lane must
/// fail to compile until its slug is chosen.
fn use_case_slug(use_case: UseCase) -> &'static str {
    match use_case {
        UseCase::Search => "search",
        UseCase::Transcription => "transcription",
        UseCase::Vision => "vision",
        UseCase::Summarize => "summarize",
        UseCase::Coding => "coding",
        UseCase::Assistant => "assistant",
    }
}

/// Build the persisted dismiss key for a (lane, target model) pair, in the
/// form `concierge:<lane-slug>=><model_id>`. The `concierge:` prefix keeps
/// these keys from colliding with the upgrade nudge's bare `from=>to` keys,
/// which live in the same `NudgeState.dismissed` Vec.
fn dismiss_key(use_case: UseCase, model_id: &str) -> String {
    let lane = use_case_slug(use_case);
    format!("concierge:{lane}=>{model_id}")
}

/// Decide which acquisition suggestions to surface, given the registry,
/// hardware, lanes to watch, prefs, and bookkeeping. Pure over its inputs.
///
/// Does **not** mutate `state` — the caller records `last_nudge_secs` when it
/// actually surfaces a suggestion (so a dropped one can re-fire), and appends
/// to `dismissed` when the user waves one away.
#[allow(clippy::too_many_arguments)]
pub fn decide_concierge(
    models: &[&ModelSchema],
    hw: &HardwareInfo,
    use_cases: &[UseCase],
    tier: QualityTier,
    prefs: &UpdatePreferences,
    state: &NudgeState,
    now_secs: u64,
    throttle_secs: u64,
    inference_active: bool,
) -> Vec<ConciergeSuggestion> {
    // Defer entirely while inference is running (memory/compute contention) or
    // when the user has turned proactive prompts off.
    if inference_active || matches!(prefs.policy, UpdatePolicy::Off) {
        return Vec::new();
    }

    // Throttle: at most one round per window, off the concierge's OWN
    // `last_concierge_secs` — independent of the upgrade nudge's
    // `last_nudge_secs`. The two have different cadences ("you have nothing"
    // should be rarer and more deliberate than "newer exists") and must not
    // starve each other: sharing one field let whichever ran first each tick
    // burn the window forever.
    let throttled = state.last_concierge_secs != 0
        && now_secs.saturating_sub(state.last_concierge_secs) < throttle_secs;
    if throttled {
        return Vec::new();
    }

    let mut out = Vec::new();
    for &use_case in use_cases {
        // On-device only — a suggestion the user can act on now.
        let set = recommend(models, hw, use_case, tier, Privacy::OnDevice);

        // Lane already served by an installed model → nothing to suggest. This
        // is the line that separates "you have nothing" (concierge) from
        // "something better exists" (upgrade nudge).
        if set.picks.iter().any(|p| p.already_installed) {
            continue;
        }

        // The acquisition: first on-device, memory-fitting, not-installed pick.
        let pick = match set
            .picks
            .iter()
            .find(|p| !p.already_installed && p.is_local && p.fit == FitStatus::Fits)
        {
            Some(p) => p,
            None => continue,
        };

        let key = dismiss_key(use_case, &pick.model_id);
        if state.dismissed.iter().any(|k| k == &key) {
            continue;
        }

        out.push(ConciergeSuggestion {
            use_case,
            model_id: pick.model_id.clone(),
            display_name: pick.display_name.clone(),
            download_mb: pick.download_mb,
            message: suggestion_message(use_case, pick),
            dismiss_key: key,
        });
    }
    out
}

/// Render the single user-facing sentence for an acquisition suggestion.
///
/// Deliberately free of model ids, quantization, or repo names: only the
/// lane's purpose, the display name, and a human-sized download figure
/// (GB with one decimal from 1024 MB up, whole MB below). The lane match has
/// no wildcard arm, so a new `UseCase` fails to compile until it gets wording.
fn suggestion_message(use_case: UseCase, pick: &Recommendation) -> String {
    let size = {
        let mb = pick.download_mb;
        if mb < 1024 {
            format!("{mb} MB")
        } else {
            format!("{:.1} GB", mb as f64 / 1024.0)
        }
    };
    let purpose = match use_case {
        UseCase::Search => "semantic search",
        UseCase::Transcription => "transcription",
        UseCase::Vision => "understanding images",
        UseCase::Summarize => "summarizing",
        UseCase::Coding => "coding",
        UseCase::Assistant => "chat & general help",
    };
    format!(
        "You have no model for {purpose}. {} fits your machine ({size}) — install it?",
        pick.display_name
    )
}

// ---- Phase B3: friction-driven, receipts-backed evaluation ----------

/// Decide what the concierge should do from *observed usage*, not an
/// empty-lane rule. Walks `usage.active_lanes()` in the order it yields
/// them (presumably busiest first — confirm in `UsageProfile`); when one
/// shows real friction (enough resolved receipts AND a low success rate)
/// and `recommend()` grounds a higher-ranked on-device option that the
/// user hasn't dismissed, returns an `Act` decision carrying the receipts
/// behind it. Otherwise stays silent (`Observe`).
///
/// Per-lane gates, in order — a lane failing any of them is skipped:
/// 1. At least `MIN_RESOLVED_SAMPLES` resolved (success + failure) calls.
/// 2. Success rate strictly below `FRICTION_SUCCESS_RATE`.
/// 3. Diagnosis confidence at least `MIN_DIAGNOSIS_CONFIDENCE`.
/// 4. A local, memory-fitting pick that is not one of the lane's failing
///    models and is not an installed model the lane already uses.
/// 5. The pick's recommender score strictly beats the best score among the
///    lane's failing models (no rankable failing model → no claim).
/// 6. The pick's dismiss key is not suppressed.
///
/// The first lane to clear every gate decides the outcome: `Observe` if the
/// throttle window is still open, otherwise `Act` with one evidence line and
/// the suggestion. Later lanes are not examined.
///
/// The deterministic safety envelope is absolute: policy-off or
/// inference-active → silent; the throttle suppresses *proactive speech*
/// (a real diagnosis under throttle downgrades to `Observe`, it does not
/// nag). No LLM — `recommend()` is the grounding oracle; this only adds
/// evidence and gating.
///
/// Does not mutate `state`; `last_concierge_secs == 0` means "never fired"
/// and is never throttled.
#[allow(clippy::too_many_arguments)]
pub fn evaluate_concierge(
    models: &[&ModelSchema],
    hw: &HardwareInfo,
    usage: &UsageProfile,
    tier: QualityTier,
    prefs: &UpdatePreferences,
    state: &NudgeState,
    now_secs: u64,
    throttle_secs: u64,
    inference_active: bool,
) -> ConciergeDecision {
    // Hard envelope.
    if matches!(prefs.policy, UpdatePolicy::Off) || inference_active {
        return ConciergeDecision::observe();
    }

    // Computed once up front, but only consulted after a lane has produced
    // a real, undismissed diagnosis (see below).
    let throttled = state.last_concierge_secs != 0
        && now_secs.saturating_sub(state.last_concierge_secs) < throttle_secs;

    for lane in usage.active_lanes() {
        let resolved = lane.successes + lane.failures;
        if resolved < MIN_RESOLVED_SAMPLES {
            continue;
        }
        let Some(success_rate) = lane.success_rate() else {
            continue;
        };
        // Friction is a low success RATE — NOT the mere presence of one
        // failure. `failing_models` is for exclusion + the receipt text,
        // never the trigger (one fail in 50 successes is not friction).
        if success_rate >= FRICTION_SUCCESS_RATE {
            continue;
        }

        // Confidence floor: never speak on a thin or borderline diagnosis
        // (e.g. 5 samples at 59% success → very low confidence).
        let confidence = diagnosis_confidence(resolved, success_rate);
        if confidence < MIN_DIAGNOSIS_CONFIDENCE {
            continue;
        }

        // Grounded candidate: on-device, fits this machine, and NOT one of
        // the models already failing in this lane. recommend() (best-first)
        // is the only thing allowed to assert fit/ranking.
        let set = recommend(models, hw, lane.use_case, tier, Privacy::OnDevice);
        // The bar the pick must clear: the best recommender score among the
        // lane's failing models, if the recommender ranks them. Stays at
        // negative infinity when none of them appears in the ranking.
        let failing_score = set
            .picks
            .iter()
            .filter(|p| lane.failing_models.contains(&p.model_id))
            .map(|p| p.score)
            .fold(f32::NEG_INFINITY, f32::max);
        let Some(pick) = set.picks.iter().find(|p| {
            p.is_local
                && p.fit == FitStatus::Fits
                && !lane.failing_models.contains(&p.model_id)
        }) else {
            continue;
        };
        // Don't suggest a model the lane is already using fine.
        if pick.already_installed && lane.models_used.contains(&pick.model_id) {
            continue;
        }
        // Honesty gate: only suggest a model we can SHOW is better than
        // what's failing. If no failing model was rankable (can't compare),
        // stay silent rather than promise an improvement we didn't verify —
        // this is what backs the "ranks higher for this task" claim in the
        // message below.
        if !(failing_score.is_finite() && pick.score > failing_score) {
            continue;
        }

        let key = dismiss_key(lane.use_case, &pick.model_id);
        if is_dismissed(state, &key, now_secs) {
            continue;
        }

        // Throttle suppresses proactive SPEECH (not the diagnosis) — check
        // before doing the formatting work. This returns rather than
        // continuing, so a throttled diagnosis ends the walk.
        if throttled {
            return ConciergeDecision::observe();
        }

        // Receipt suffix naming the failing models. The honesty gate above
        // only passes when at least one failing model was ranked, so the
        // empty branch is a defensive fallback.
        let failing = if lane.failing_models.is_empty() {
            String::new()
        } else {
            format!(
                "; failing on {}",
                lane.failing_models
                    .iter()
                    .cloned()
                    .collect::<Vec<_>>()
                    .join(", ")
            )
        };
        let evidence = vec![format!(
            "{} lane: {}/{} resolved calls succeeded ({:.0}%) across {} call(s){}",
            lane_label(lane.use_case),
            lane.successes,
            resolved,
            success_rate * 100.0,
            lane.calls,
            failing
        )];
        let suggestion = ConciergeSuggestion {
            use_case: lane.use_case,
            model_id: pick.model_id.clone(),
            display_name: pick.display_name.clone(),
            download_mb: pick.download_mb,
            message: format!(
                "Your {} model is struggling ({:.0}% success). {} fits this machine and ranks higher for this task.",
                lane_label(lane.use_case),
                success_rate * 100.0,
                pick.display_name
            ),
            dismiss_key: key,
        };

        return ConciergeDecision {
            mode: ConciergeMode::Act,
            confidence,
            evidence,
            suggestion: Some(suggestion),
        };
    }

    ConciergeDecision::observe()
}

/// Report whether the suggestion identified by `key` is currently
/// suppressed. Three things suppress it:
/// - a legacy plain entry in `state.dismissed` (always permanent);
/// - a labeled dismissal whose reason is permanent;
/// - a labeled soft ("not now") dismissal made less than
///   `SOFT_DISMISS_COOLDOWN_SECS` before `now_secs`.
fn is_dismissed(state: &NudgeState, key: &str, now_secs: u64) -> bool {
    let legacy_hit = state.dismissed.iter().any(|k| k == key);
    if legacy_hit {
        return true;
    }

    for record in state.concierge_dismissals.iter() {
        if record.key != key {
            continue;
        }
        if record.reason.is_permanent() {
            return true;
        }
        // Soft dismissal: still counts while the cooldown is running. A
        // timestamp ahead of `now_secs` saturates to an age of zero.
        let age_secs = now_secs.saturating_sub(record.timestamp);
        if age_secs < SOFT_DISMISS_COOLDOWN_SECS {
            return true;
        }
    }
    false
}

/// Score how confident a friction diagnosis is, in `[0, 1]`.
///
/// The score is the failure share (`1 - success_rate`, clamped to `[0, 1]`)
/// scaled by an evidence weight `resolved / (resolved + 10)`, so a bad rate
/// seen over only a handful of resolved calls still scores low, and the
/// weight approaches 1 as samples accumulate.
fn diagnosis_confidence(resolved: u64, success_rate: f64) -> f64 {
    let samples = resolved as f64;
    let evidence_weight = samples / (samples + 10.0);
    let failure_share = (1.0 - success_rate).clamp(0.0, 1.0);
    (failure_share * evidence_weight).clamp(0.0, 1.0)
}

/// Human-readable lane name used in evidence lines and friction messages.
/// Display text only — unlike `use_case_slug` it is not persisted, and it
/// differs for `Summarize` ("summarization"). No wildcard arm, so a new
/// lane must be given a label before the crate compiles.
fn lane_label(use_case: UseCase) -> &'static str {
    match use_case {
        UseCase::Search => "search",
        UseCase::Transcription => "transcription",
        UseCase::Vision => "vision",
        UseCase::Summarize => "summarization",
        UseCase::Coding => "coding",
        UseCase::Assistant => "assistant",
    }
}

// Unit tests for the acquisition path (`decide_concierge`) and the persisted
// dismiss-key format. Every test drives the real `recommend()` with a
// one-model registry; unless stated otherwise the call uses now=100,
// throttle=10, and a default (never-fired, nothing-dismissed) `NudgeState`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hardware::GpuBackend;
    use crate::schema::{
        CostModel, ModelCapability, ModelSchema, ModelSource, PerformanceEnvelope, TrustTier,
    };

    // A local model schema parameterized by id, install state, download size,
    // and capabilities. `available == installed`. Mirrors recommend.rs's test
    // builder so the recommender ranks it the same way.
    fn model(id: &str, installed: bool, download_mb: u64, caps: &[ModelCapability]) -> ModelSchema {
        ModelSchema {
            id: id.into(),
            name: id.into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: String::new(),
            capabilities: caps.to_vec(),
            context_length: 32768,
            max_output_tokens: None,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope::default(),
            cost: CostModel {
                // Disk and RAM footprint are both set to the download size.
                size_mb: Some(download_mb),
                ram_mb: Some(download_mb),
                ..Default::default()
            },
            source: ModelSource::Local {
                hf_repo: "x/y".into(),
                hf_filename: "m.gguf".into(),
                tokenizer_repo: "x/y".into(),
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: TrustTier::Curated,
            deprecated: false,
            available: installed,
        }
    }

    // Default preferences with only the update policy overridden.
    fn prefs(policy: UpdatePolicy) -> UpdatePreferences {
        UpdatePreferences {
            policy,
            ..Default::default()
        }
    }

    // 16 GB Apple Silicon (Metal ⇒ Apple tier) so everything here fits.
    fn hw() -> HardwareInfo {
        HardwareInfo {
            os: "test".into(),
            arch: "test".into(),
            cpu_cores: 8,
            total_ram_mb: 16_384,
            gpu_backend: GpuBackend::Metal,
            gpu_memory_mb: None,
            gpu_devices: vec![],
            recommended_model: String::new(),
            recommended_context: 4096,
            max_model_mb: 0,
        }
    }

    // Policy `Off` silences the concierge even for an unserved lane.
    #[test]
    fn off_policy_suggests_nothing() {
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Off),
            &NudgeState::default(),
            100,
            10,
            false,
        );
        assert!(out.is_empty());
    }

    // `inference_active = true` defers the suggestion to a later tick.
    #[test]
    fn active_inference_defers() {
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &NudgeState::default(),
            100,
            10,
            true,
        );
        assert!(out.is_empty());
    }

    // Happy path: one uninstalled, fitting model for the lane → exactly one
    // suggestion naming it, with the acquisition wording.
    #[test]
    fn suggests_when_lane_unserved() {
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &NudgeState::default(),
            100,
            10,
            false,
        );
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].use_case, UseCase::Assistant);
        assert_eq!(out[0].model_id, "chat-a");
        assert!(out[0].message.contains("install it?"));
    }

    #[test]
    fn served_lane_suggests_nothing() {
        // An installed model for the lane → no acquisition suggestion.
        let m = [model("chat-a", true, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &NudgeState::default(),
            100,
            10,
            false,
        );
        assert!(out.is_empty());
    }

    // A key recorded through `NudgeState::dismiss` suppresses the matching
    // (lane, model) suggestion.
    #[test]
    fn dismissed_suggestion_not_repeated() {
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let mut state = NudgeState::default();
        state.dismiss(&dismiss_key(UseCase::Assistant, "chat-a"));
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &state,
            100,
            10,
            false,
        );
        assert!(out.is_empty());
    }

    // A concierge round 5 s ago with a 10 s window is still throttled.
    #[test]
    fn throttled_within_window() {
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let state = NudgeState {
            last_concierge_secs: 95,
            ..Default::default()
        };
        // now=100, last=95, throttle=10 → still throttled.
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &state,
            100,
            10,
            false,
        );
        assert!(out.is_empty());
    }

    #[test]
    fn not_throttled_by_upgrade_nudge_field() {
        // Regression guard for the shared-throttle starvation bug: a recent
        // *upgrade nudge* (last_nudge_secs) must NOT throttle the concierge,
        // which throttles only on its own last_concierge_secs.
        let m = [model("chat-a", false, 2000, &[ModelCapability::Generate])];
        let refs: Vec<&ModelSchema> = m.iter().collect();
        let state = NudgeState {
            last_nudge_secs: 99, // upgrade nudge fired one tick ago
            last_concierge_secs: 0, // concierge never has
            ..Default::default()
        };
        let out = decide_concierge(
            &refs,
            &hw(),
            &[UseCase::Assistant],
            QualityTier::Balanced,
            &prefs(UpdatePolicy::Notify),
            &state,
            100,
            10,
            false,
        );
        assert_eq!(out.len(), 1, "concierge must fire regardless of the upgrade nudge's window");
    }

    #[test]
    fn dismiss_key_is_stable_slug_not_debug() {
        // The persisted key must use the stable slug, not Debug formatting.
        assert_eq!(dismiss_key(UseCase::Assistant, "chat-a"), "concierge:assistant=>chat-a");
        assert_eq!(dismiss_key(UseCase::Coding, "code-x"), "concierge:coding=>code-x");
    }
}

#[cfg(test)]
mod b3_tests {
    use super::*;
    use crate::hardware::GpuBackend;
    use crate::outcome::{InferenceTask, OutcomeLedgerEntry};
    use crate::schema::{
        CostModel, ModelCapability, ModelSchema, ModelSource, PerformanceEnvelope, TrustTier,
    };

    // Local model fixture parameterized by id, install state, download size,
    // and capabilities. `available` mirrors `installed`, and both the disk
    // and RAM footprint are set to `download_mb`. Defaults to a "4B"
    // parameter count; callers override `param_count` where ranking matters.
    fn model(id: &str, installed: bool, download_mb: u64, caps: &[ModelCapability]) -> ModelSchema {
        ModelSchema {
            id: id.into(),
            name: id.into(),
            provider: "qwen".into(),
            family: "qwen3".into(),
            version: String::new(),
            capabilities: caps.to_vec(),
            context_length: 32768,
            max_output_tokens: None,
            param_count: "4B".into(),
            quantization: Some("Q4_K_M".into()),
            performance: PerformanceEnvelope::default(),
            cost: CostModel {
                size_mb: Some(download_mb),
                ram_mb: Some(download_mb),
                ..Default::default()
            },
            source: ModelSource::Local {
                hf_repo: "x/y".into(),
                hf_filename: "m.gguf".into(),
                tokenizer_repo: "x/y".into(),
            },
            tags: vec![],
            supported_params: vec![],
            public_benchmarks: vec![],
            trust_tier: TrustTier::Curated,
            deprecated: false,
            available: installed,
        }
    }

    // Hardware fixture: 8 cores, 16 GB RAM, Metal backend, no separate GPU
    // memory reported — roomy enough for the small models these tests use.
    fn hw() -> HardwareInfo {
        HardwareInfo {
            os: "test".into(),
            arch: "test".into(),
            cpu_cores: 8,
            total_ram_mb: 16_384,
            gpu_backend: GpuBackend::Metal,
            gpu_memory_mb: None,
            gpu_devices: vec![],
            recommended_model: String::new(),
            recommended_context: 4096,
            max_model_mb: 0,
        }
    }

    // Default preferences with only the update policy overridden.
    fn prefs(policy: UpdatePolicy) -> UpdatePreferences {
        UpdatePreferences {
            policy,
            ..Default::default()
        }
    }

    // Build `successes + failures` resolved ledger entries for one model on
    // the `Code` task. The first `successes` entries are marked successful
    // and the rest failed; every entry carries timestamp 100 and a unique
    // trace id (`t0`, `t1`, …).
    fn coding_ledger(failing_model: &str, successes: usize, failures: usize) -> Vec<OutcomeLedgerEntry> {
        (0..(successes + failures))
            .map(|i| OutcomeLedgerEntry {
                trace_id: format!("t{i}"),
                model_id: failing_model.to_string(),
                task: InferenceTask::Code,
                routing_reason: "r".into(),
                latency_ms: 100,
                input_tokens: 1,
                output_tokens: 1,
                // Successes come first, failures after.
                success: Some(i < successes),
                quality: None,
                error: None,
                project_id: None,
                intent: None,
                timestamp: 100,
            })
            .collect()
    }

    // Two-model coding registry: an installed 1B `small-coder` (the one the
    // ledger marks as failing) and an uninstalled 7B `big-coder` (the
    // candidate the concierge should suggest).
    fn coding_models() -> Vec<ModelSchema> {
        // The failing model is tiny (1B); the candidate is larger (7B), so
        // the recommender's quality prior ranks it strictly higher — which
        // the honesty gate now requires before suggesting it.
        let mut small = model(
            "small-coder",
            true,
            800,
            &[ModelCapability::Generate, ModelCapability::Code],
        );
        small.param_count = "1B".into();
        let mut big = model(
            "big-coder",
            false,
            5000,
            &[ModelCapability::Generate, ModelCapability::Code],
        );
        big.param_count = "7B".into();
        vec![small, big]
    }

    #[test]
    fn friction_triggers_act_with_receipts() {
        // A coding lane whose recorded model succeeds only 3 times out of 8
        // (37.5%) must move the concierge to `Act`, suggest a different
        // model, and back the decision with confidence and evidence.
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 3, 5);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let preferences = prefs(UpdatePolicy::Notify);
        let fresh_state = NudgeState::default();

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &preferences,
            &fresh_state,
            1_000_000,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Act);
        let pick = outcome.suggestion.expect("a suggestion");
        assert_eq!(pick.use_case, UseCase::Coding);
        assert_ne!(pick.model_id, "small-coder", "must not suggest the failing model");
        assert!(outcome.confidence > 0.0);
        assert!(!outcome.evidence.is_empty());
    }

    #[test]
    fn canary_reverts_only_on_worse_with_samples() {
        // Arguments shared by every case. NOTE(review): the roles (baseline
        // rate, minimum sample count, margin) are inferred from the cases
        // below — confirm against `canary_verdict`.
        let (baseline, min_samples, margin) = (0.8, 5, 0.1);

        // Worse than baseline by more than the margin, with enough samples
        // → revert.
        let clearly_worse = canary_verdict(Some(0.4), 6, baseline, min_samples, margin);
        assert_eq!(clearly_worse, CanaryVerdict::Revert);

        // Comparable (within the margin) → keep.
        let comparable = canary_verdict(Some(0.75), 6, baseline, min_samples, margin);
        assert_eq!(comparable, CanaryVerdict::Keep);

        // Better than baseline → keep.
        let better = canary_verdict(Some(0.95), 6, baseline, min_samples, margin);
        assert_eq!(better, CanaryVerdict::Keep);

        // Too few samples → insufficient (never revert on thin data).
        let thin_data = canary_verdict(Some(0.0), 2, baseline, min_samples, margin);
        assert_eq!(thin_data, CanaryVerdict::Insufficient);

        // No resolved signal at all → insufficient.
        let no_signal = canary_verdict(None, 0, baseline, min_samples, margin);
        assert_eq!(no_signal, CanaryVerdict::Insufficient);
    }

    #[test]
    fn one_stray_failure_is_not_friction() {
        // 49 successes and a single failure (98%) must NOT trigger, even
        // though failing_models is non-empty. Guards Neo's fix for "any
        // failing_model is too trigger-happy".
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 49, 1);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let preferences = prefs(UpdatePolicy::Notify);
        let fresh_state = NudgeState::default();

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &preferences,
            &fresh_state,
            1_000_000,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Observe);
    }

    #[test]
    fn healthy_lane_stays_silent() {
        // Eight calls, all successful (100%): the decision must stay in
        // `Observe`.
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 8, 0);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let preferences = prefs(UpdatePolicy::Notify);
        let fresh_state = NudgeState::default();

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &preferences,
            &fresh_state,
            1_000_000,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Observe);
    }

    #[test]
    fn throttled_diagnosis_downgrades_to_observe() {
        // Same failing ledger shape as the friction case (2 of 8 succeed),
        // but the previous concierge round is too recent, so the decision
        // must come back as `Observe`.
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 2, 6);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let preferences = prefs(UpdatePolicy::Notify);

        let now_secs = 1_000_000;
        let throttled_state = NudgeState {
            // Ten seconds ago — within the throttle window.
            last_concierge_secs: now_secs - 10,
            ..NudgeState::default()
        };

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &preferences,
            &throttled_state,
            now_secs,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Observe);
    }

    #[test]
    fn permanent_dismissal_suppresses() {
        // The ledger describes a failing lane (2 of 8 succeed), but the
        // Coding / `big-coder` suggestion already carries a `Wrong`
        // dismissal, so the decision must remain `Observe`.
        // NOTE(review): per the test name, `Wrong` is presumably a permanent
        // dismissal reason — confirm against `DismissReason`.
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 2, 6);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let preferences = prefs(UpdatePolicy::Notify);

        let dismissed = DismissalRecord {
            key: dismiss_key(UseCase::Coding, "big-coder"),
            reason: DismissReason::Wrong,
            timestamp: 0,
        };
        let dismissed_state = NudgeState {
            concierge_dismissals: vec![dismissed],
            ..NudgeState::default()
        };

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &preferences,
            &dismissed_state,
            1_000_000,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Observe);
    }

    #[test]
    fn policy_off_is_silent() {
        // Even with a failing ledger (2 of 8 succeed), `UpdatePolicy::Off`
        // must keep the decision in `Observe`.
        let catalog = coding_models();
        let catalog_refs: Vec<&ModelSchema> = catalog.iter().collect();
        let ledger = coding_ledger("small-coder", 2, 6);
        let profile = UsageProfile::from_ledger(&ledger, 100, 0);
        let hardware = hw();
        let disabled = prefs(UpdatePolicy::Off);
        let fresh_state = NudgeState::default();

        let outcome = evaluate_concierge(
            &catalog_refs,
            &hardware,
            &profile,
            QualityTier::Balanced,
            &disabled,
            &fresh_state,
            1_000_000,
            DEFAULT_CONCIERGE_THROTTLE_SECS,
            false,
        );

        assert_eq!(outcome.mode, ConciergeMode::Observe);
    }
}