// car-inference 0.26.0
//
// Local model inference for CAR — Candle backend with Qwen3 models
//! UsageProfile — what the user *actually does*, aggregated from the
//! outcome ledger (Phase B1).
//!
//! The old concierge triggered on "is a lane empty?". This turns the
//! durable per-call receipts (`OutcomeLedgerEntry`) into per-lane
//! evidence — activity, success, latency, and friction — so the
//! concierge can speak from what happened ("your coding lane failed 4 of
//! the last 9 calls on model X") instead of from a static catalog.
//!
//! Pure over its inputs (the caller supplies the ledger slice via
//! `read_ledger`), so it's fully unit-testable without disk.

use std::collections::{BTreeMap, BTreeSet};

use serde::Serialize;

use crate::intent::{TaskHint, UseCase};
use crate::outcome::{InferenceTask, OutcomeLedgerEntry};

/// Resolve the concierge lane a recorded [`InferenceTask`] counts toward.
///
/// Ledger receipts are keyed by `InferenceTask`, while the recommender
/// ranks within `UseCase` lanes; this is the bridge between the two. The
/// match is deliberately exhaustive (no wildcard arm) so adding a task
/// variant forces a decision about its lane.
pub fn use_case_for_task(task: InferenceTask) -> UseCase {
    match task {
        // Plain generation and reasoning are both treated as general
        // generative chat at lane granularity.
        InferenceTask::Generate => UseCase::Assistant,
        InferenceTask::Reasoning => UseCase::Assistant,
        InferenceTask::Code => UseCase::Coding,
        InferenceTask::Classify => UseCase::Summarize,
        InferenceTask::Embed => UseCase::Search,
    }
}

/// Resolve the concierge lane for a request's inference-time [`TaskHint`].
///
/// This is the request-side counterpart of `use_case_for_task` (which
/// maps the recorded `InferenceTask`): it lets a request's intent pick
/// the lane whose default model should apply. The match is exhaustive so
/// a new hint variant must be assigned a lane explicitly.
pub fn use_case_for_task_hint(hint: TaskHint) -> UseCase {
    match hint {
        // Chat and reasoning both land in the general generative lane.
        TaskHint::Chat => UseCase::Assistant,
        TaskHint::Reasoning => UseCase::Assistant,
        TaskHint::Code => UseCase::Coding,
        TaskHint::Classify => UseCase::Summarize,
    }
}

/// Aggregated activity + friction for one use-case lane.
///
/// Populated by `UsageProfile::from_ledger`, which folds every in-window
/// receipt for the lane into these counters. Every counted receipt lands
/// in exactly one of `successes`, `failures`, or `inconclusive`, so the
/// three always sum to `calls`.
#[derive(Debug, Clone, Serialize)]
pub struct LaneUsage {
    /// The lane these numbers describe.
    pub use_case: UseCase,
    /// All receipts in the window for this lane (resolved + inconclusive).
    pub calls: u64,
    /// Receipts whose outcome was recorded as `Some(true)`.
    pub successes: u64,
    /// Receipts whose outcome was recorded as `Some(false)`.
    pub failures: u64,
    /// Completed-but-unresolved receipts (no follow-up signal). Tracked
    /// so the resolution rate is visible and stats aren't read as if every
    /// call were judged.
    pub inconclusive: u64,
    /// Sum of `latency_ms` over every receipt counted in `calls`; divide
    /// by `calls` (see `avg_latency_ms`) for the mean.
    pub total_latency_ms: u64,
    /// Mean quality over receipts that carried a quality signal. Stays at
    /// the initial `0.0` when no receipt in the lane carried one, so a
    /// zero here does not by itself mean "low quality".
    pub avg_quality: f64,
    /// Distinct models seen serving this lane.
    pub models_used: BTreeSet<String>,
    /// Models that produced at least one failure in this lane — the
    /// friction signal a suggestion is built on.
    pub failing_models: BTreeSet<String>,
}

impl LaneUsage {
    /// Start an empty accumulator for `use_case`: every counter is zero
    /// and no models have been seen yet.
    fn new(use_case: UseCase) -> Self {
        Self {
            use_case,
            models_used: BTreeSet::new(),
            failing_models: BTreeSet::new(),
            avg_quality: 0.0,
            total_latency_ms: 0,
            calls: 0,
            successes: 0,
            failures: 0,
            inconclusive: 0,
        }
    }

    /// Share of *resolved* receipts (successes + failures) that succeeded.
    ///
    /// Returns `None` when the lane has no resolved receipts at all, so
    /// inconclusive-only data is never read as a quality verdict.
    pub fn success_rate(&self) -> Option<f64> {
        match self.successes + self.failures {
            0 => None,
            judged => Some(self.successes as f64 / judged as f64),
        }
    }

    /// Share of calls that received a resolution signal (success or
    /// failure) — an indicator of how much weight `success_rate` deserves.
    /// An empty lane reports `0.0`.
    pub fn resolution_rate(&self) -> f64 {
        if self.calls == 0 {
            return 0.0;
        }
        let judged = self.successes + self.failures;
        judged as f64 / self.calls as f64
    }

    /// Mean latency in milliseconds across every call in the lane, or
    /// `0.0` when the lane has no calls.
    pub fn avg_latency_ms(&self) -> f64 {
        match self.calls {
            0 => 0.0,
            n => self.total_latency_ms as f64 / n as f64,
        }
    }
}

/// Per-lane usage rolled up from the ledger over a recency window.
///
/// Built by `UsageProfile::from_ledger`; serializable so it can be
/// handed to whatever renders or reasons over the profile.
#[derive(Debug, Clone, Serialize)]
pub struct UsageProfile {
    /// Usage keyed by lane. A lane only has an entry if at least one
    /// in-window receipt mapped to it, so absent lanes saw no activity.
    pub lanes: BTreeMap<UseCase, LaneUsage>,
    /// Number of in-window receipts across all lanes.
    pub total_calls: u64,
    /// The recency window (seconds) the profile was built over.
    /// `0` means the profile covers all history.
    pub window_secs: u64,
}

impl UsageProfile {
    /// Roll ledger receipts up into per-lane usage.
    ///
    /// Only receipts whose `timestamp` is at or after `now - window_secs`
    /// are counted (the subtraction saturates at zero). Passing
    /// `window_secs == 0` disables the cutoff, i.e. "all history".
    pub fn from_ledger(entries: &[OutcomeLedgerEntry], now: u64, window_secs: u64) -> Self {
        let oldest_allowed = match window_secs {
            0 => 0,
            span => now.saturating_sub(span),
        };

        let mut lanes: BTreeMap<UseCase, LaneUsage> = BTreeMap::new();
        // Per-lane (sum, count) of quality scores; turned into a mean once
        // every receipt has been seen.
        let mut quality_acc: BTreeMap<UseCase, (f64, u64)> = BTreeMap::new();
        let mut total_calls: u64 = 0;

        for receipt in entries {
            if receipt.timestamp < oldest_allowed {
                continue;
            }

            let use_case = use_case_for_task(receipt.task);
            let usage = lanes
                .entry(use_case)
                .or_insert_with(|| LaneUsage::new(use_case));

            usage.calls += 1;
            total_calls += 1;
            usage.total_latency_ms += receipt.latency_ms;
            usage.models_used.insert(receipt.model_id.clone());

            if let Some(succeeded) = receipt.success {
                if succeeded {
                    usage.successes += 1;
                } else {
                    usage.failures += 1;
                    // A single failure is enough to flag the model as a
                    // source of friction for this lane.
                    usage.failing_models.insert(receipt.model_id.clone());
                }
            } else {
                usage.inconclusive += 1;
            }

            if let Some(score) = receipt.quality {
                let (sum, scored) = quality_acc.entry(use_case).or_insert((0.0, 0));
                *sum += score;
                *scored += 1;
            }
        }

        // Lanes without any quality signal keep their initial `avg_quality`.
        for (use_case, (sum, scored)) in quality_acc {
            if scored == 0 {
                continue;
            }
            if let Some(usage) = lanes.get_mut(&use_case) {
                usage.avg_quality = sum / scored as f64;
            }
        }

        Self {
            lanes,
            total_calls,
            window_secs,
        }
    }

    /// Usage for a single lane, or `None` if no in-window receipt mapped
    /// to it.
    pub fn lane(&self, use_case: UseCase) -> Option<&LaneUsage> {
        self.lanes.get(&use_case)
    }

    /// Lanes the user actually exercised, ordered by descending call
    /// count — what the concierge should reason about (vs. lanes they
    /// never touch). The sort is stable, so lanes with equal call counts
    /// keep the map's `UseCase` ordering.
    pub fn active_lanes(&self) -> Vec<&LaneUsage> {
        let mut busiest_first: Vec<&LaneUsage> = self.lanes.values().collect();
        busiest_first.sort_by_key(|lane| std::cmp::Reverse(lane.calls));
        busiest_first
    }
}

// Unit tests for the ledger roll-up. Everything runs on in-memory receipts,
// so no disk access is involved.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a ledger receipt for tests. Only the fields the aggregation
    /// reads (model, task, outcome, quality, latency, timestamp) are
    /// parameterized; the rest get fixed placeholder values. The trace id
    /// is derived from `ts` and `model`.
    fn entry(
        model: &str,
        task: InferenceTask,
        success: Option<bool>,
        quality: Option<f64>,
        latency: u64,
        ts: u64,
    ) -> OutcomeLedgerEntry {
        OutcomeLedgerEntry {
            trace_id: format!("t-{ts}-{model}"),
            model_id: model.to_string(),
            task,
            routing_reason: "test".into(),
            latency_ms: latency,
            input_tokens: 10,
            output_tokens: 20,
            success,
            quality,
            error: None,
            project_id: None,
            intent: None,
            timestamp: ts,
        }
    }

    /// Receipts are grouped into lanes, outcomes are tallied, failing
    /// models are tracked per lane, and quality is averaged only over
    /// receipts that carried a score.
    #[test]
    fn aggregates_lanes_and_friction() {
        let entries = vec![
            // Coding: 2 success, 1 failure on "small-coder"
            entry("big-coder", InferenceTask::Code, Some(true), Some(0.9), 1000, 100),
            entry("big-coder", InferenceTask::Code, Some(true), Some(0.8), 1100, 110),
            entry("small-coder", InferenceTask::Code, Some(false), None, 200, 120),
            // Assistant: 1 inconclusive
            entry("chat", InferenceTask::Generate, None, None, 300, 130),
        ];
        // window_secs = 0 → no cutoff, every receipt is counted.
        let profile = UsageProfile::from_ledger(&entries, 200, 0);

        assert_eq!(profile.total_calls, 4);

        let coding = profile.lane(UseCase::Coding).unwrap();
        assert_eq!(coding.calls, 3);
        assert_eq!(coding.successes, 2);
        assert_eq!(coding.failures, 1);
        assert_eq!(coding.success_rate(), Some(2.0 / 3.0));
        assert!(coding.failing_models.contains("small-coder"));
        assert!(!coding.failing_models.contains("big-coder"));
        // Mean of the two scored receipts (0.9, 0.8); the unscored failure
        // does not drag the average down.
        assert!((coding.avg_quality - 0.85).abs() < 1e-9);

        let assistant = profile.lane(UseCase::Assistant).unwrap();
        assert_eq!(assistant.inconclusive, 1);
        assert_eq!(assistant.success_rate(), None); // nothing resolved
        assert!((assistant.resolution_rate() - 0.0).abs() < 1e-9);

        // Busiest lane first.
        assert_eq!(profile.active_lanes()[0].use_case, UseCase::Coding);
    }

    /// Receipts older than the recency cutoff are excluded from the roll-up.
    #[test]
    fn window_filters_old_receipts() {
        let entries = vec![
            entry("m", InferenceTask::Generate, Some(true), Some(1.0), 1, 10),
            entry("m", InferenceTask::Generate, Some(true), Some(1.0), 1, 1000),
        ];
        // now=1000, window=100 → only the ts=1000 receipt survives.
        // (cutoff is 900, so ts=10 falls outside the window.)
        let profile = UsageProfile::from_ledger(&entries, 1000, 100);
        assert_eq!(profile.total_calls, 1);
    }
}