car-inference 0.26.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Persistent outcome-scoreboard aggregator.
//!
//! Folds the durable [`crate::outcome::OutcomeLedger`] (one privacy-bounded
//! receipt per resolved trace) into a per-model, OUTCOME-DENOMINATED scoreboard:
//! cost-per-success, tokens-per-success, success-rate. This is the
//! deployment-level view the "results, not KPIs / cry once" thesis is legible
//! in — the cheapest path to a *correct* outcome, not the cheapest token.
//!
//! Distinct from [`crate::usage_profile::UsageProfile`], which folds the same
//! ledger by use-case *lane* (routing evidence) and carries no dollar cost. This
//! scoreboard is keyed by *model* and priced.
//!
//! Pure + catalog-free: the caller supplies per-model prices via a lookup
//! closure (`model_id -> (input_per_mtok, output_per_mtok)` in USD per 1M
//! tokens), so this module stays decoupled from the registry. The math mirrors
//! [`crate::outcome::ModelProfile::usd_per_success`] but folds the *durable*
//! ledger (cross-session, de-biased by the pending-sweep) rather than the
//! in-memory live profiles.

use crate::outcome::OutcomeLedgerEntry;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

/// One model's outcome-denominated scoreboard row.
///
/// Produced by [`Scoreboard::from_ledger`]. Counts and token totals cover every
/// receipt folded for the model; the `Option` ratios are `None` whenever the
/// quantity they divide by is zero (or, for the dollar figure, unpriced).
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct ScoreboardRow {
    /// Model id (ledger `model_id`).
    pub model_id: String,
    /// Receipts resolved as a success (`success: Some(true)`).
    pub success_count: u64,
    /// Receipts resolved as a failure (`success: Some(false)`).
    pub fail_count: u64,
    /// Receipts that completed but never received a quality/conversation signal
    /// (swept Inconclusive, `success: None`). Surfaced — not hidden — so the
    /// denominator's coverage is legible rather than silently inflating rates.
    pub inconclusive_count: u64,
    /// Total input tokens across all of this model's receipts — successes,
    /// failures and inconclusive alike.
    pub total_input_tokens: u64,
    /// Total output tokens across all of this model's receipts — successes,
    /// failures and inconclusive alike.
    pub total_output_tokens: u64,
    /// Mean quality over the receipts that carried a quality signal; `None` if
    /// none did (never a fabricated `0.0`).
    pub avg_quality: Option<f64>,
    /// Mean latency (ms) over all of this model's receipts, regardless of
    /// outcome.
    pub avg_latency_ms: f64,
    /// ADJUDICATED success rate: `successes / (successes + failures)` — per
    /// *resolved* outcome, NOT per attempt (`inconclusive_count` is excluded
    /// from the denominator by design; pair the two for coverage =
    /// resolved/(resolved+inconclusive)). `None` when nothing resolved (don't
    /// fabricate a rate from an all-inconclusive denominator).
    pub success_rate: Option<f64>,
    /// `(input + output tokens) / successes`; `None` before any success.
    /// Price-free token efficiency. The numerator is the model's *total*
    /// tokens, so tokens burnt on failed and inconclusive receipts are charged
    /// to the successes.
    pub tokens_per_success: Option<f64>,
    /// Priced spend ÷ successes (USD); `None` when the model is unpriced or has
    /// no success yet. THE headline per-model metric — "cry once" is only
    /// legible in dollars-per-correct-outcome. Spend is priced over the model's
    /// total tokens, so failed and inconclusive attempts are paid for by the
    /// successes.
    pub usd_per_success: Option<f64>,
}

/// Deployment-level outcome scoreboard, folded from the durable ledger.
///
/// Built by [`Scoreboard::from_ledger`]; an empty ledger yields an empty board
/// with zero counts and `None` dollar figures.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct Scoreboard {
    /// Per-model rows, sorted cheapest-correct-outcome first: rows that have a
    /// `usd_per_success` ascending by it, then the remaining rows (unpriced or
    /// no success yet) by `success_rate` descending with no-rate rows last,
    /// then `model_id` for a stable tie-break.
    pub rows: Vec<ScoreboardRow>,
    /// Successes across every model.
    pub total_successes: u64,
    /// Failures across every model.
    pub total_failures: u64,
    /// Inconclusive (unscored-but-completed) receipts across every model.
    pub total_inconclusive: u64,
    /// Total priced spend (USD) across models that had prices; `None` when no
    /// model on the board was priced. A priced model contributes its spend even
    /// if it has no success yet.
    pub total_usd: Option<f64>,
    /// `total_usd / total_successes`; `None` when unpriced or no success. The
    /// single headline number: dollars per correct outcome across the deployment.
    ///
    /// NOTE(review): `total_usd` only includes priced models, while
    /// `total_successes` counts every model. On a board that mixes priced and
    /// unpriced models, the unpriced successes therefore enter the denominator
    /// at zero cost — confirm that treating unpriced as free is intended.
    pub overall_usd_per_success: Option<f64>,
    /// Distinct models on the board.
    pub model_count: usize,
    /// Receipts folded (successes + failures + inconclusive).
    pub receipts: u64,
}

/// Per-model running accumulator while folding the ledger.
///
/// One instance per distinct `model_id`; every receipt for that model updates
/// it exactly once, so `calls == success + fail + inconclusive`.
#[derive(Default)]
struct Acc {
    /// Receipts with `success: Some(true)`.
    success: u64,
    /// Receipts with `success: Some(false)`.
    fail: u64,
    /// Receipts with `success: None`.
    inconclusive: u64,
    /// Summed input tokens over all receipts, whatever their outcome.
    in_tok: u64,
    /// Summed output tokens over all receipts, whatever their outcome.
    out_tok: u64,
    /// Sum of the quality values of receipts that carried one.
    quality_sum: f64,
    /// Number of receipts that carried a quality value (divisor for the mean).
    quality_n: u64,
    /// Summed latency in milliseconds over all receipts.
    latency_sum: u64,
    /// Total receipts folded for this model (divisor for mean latency).
    calls: u64,
}

impl Scoreboard {
    /// Fold ledger `entries` into a scoreboard. `price_lookup` maps a model id
    /// to its `(input_per_mtok, output_per_mtok)` USD rates; return `None` for a
    /// model with no known price (its `usd_per_success` stays `None`).
    pub fn from_ledger(
        entries: &[OutcomeLedgerEntry],
        price_lookup: impl Fn(&str) -> Option<(f64, f64)>,
    ) -> Self {
        let mut accs: BTreeMap<String, Acc> = BTreeMap::new();
        for e in entries {
            let a = accs.entry(e.model_id.clone()).or_default();
            match e.success {
                Some(true) => a.success += 1,
                Some(false) => a.fail += 1,
                None => a.inconclusive += 1,
            }
            a.in_tok += e.input_tokens as u64;
            a.out_tok += e.output_tokens as u64;
            if let Some(q) = e.quality {
                a.quality_sum += q;
                a.quality_n += 1;
            }
            a.latency_sum += e.latency_ms;
            a.calls += 1;
        }

        let mut rows: Vec<ScoreboardRow> = Vec::with_capacity(accs.len());
        let (mut total_successes, mut total_failures, mut total_inconclusive) = (0u64, 0u64, 0u64);
        let mut total_usd = 0.0;
        let mut any_priced = false;

        for (model_id, a) in accs {
            let resolved = a.success + a.fail;
            let success_rate = (resolved > 0).then(|| a.success as f64 / resolved as f64);
            let total_tokens = a.in_tok + a.out_tok;
            let tokens_per_success =
                (a.success > 0).then(|| total_tokens as f64 / a.success as f64);
            // Priced spend for this model (USD), if a price was supplied.
            let usd = price_lookup(&model_id)
                .map(|(ip, op)| (a.in_tok as f64 * ip + a.out_tok as f64 * op) / 1_000_000.0);
            let usd_per_success = match usd {
                Some(u) if a.success > 0 => Some(u / a.success as f64),
                _ => None,
            };
            let avg_quality = (a.quality_n > 0).then(|| a.quality_sum / a.quality_n as f64);
            // An `Acc` only exists because at least one entry created it via
            // `or_default()`, and every entry increments `calls` — so `calls >= 1`.
            let avg_latency_ms = a.latency_sum as f64 / a.calls as f64;

            if let Some(u) = usd {
                total_usd += u;
                any_priced = true;
            }
            total_successes += a.success;
            total_failures += a.fail;
            total_inconclusive += a.inconclusive;

            rows.push(ScoreboardRow {
                model_id,
                success_count: a.success,
                fail_count: a.fail,
                inconclusive_count: a.inconclusive,
                total_input_tokens: a.in_tok,
                total_output_tokens: a.out_tok,
                avg_quality,
                avg_latency_ms,
                success_rate,
                tokens_per_success,
                usd_per_success,
            });
        }

        // Cheapest-correct-outcome first: priced rows ascending by
        // usd_per_success, then unpriced rows by success_rate descending, then
        // model_id for a stable, deterministic order.
        rows.sort_by(|x, y| {
            match (x.usd_per_success, y.usd_per_success) {
                (Some(a), Some(b)) => a
                    .partial_cmp(&b)
                    .unwrap_or(std::cmp::Ordering::Equal)
                    .then_with(|| x.model_id.cmp(&y.model_id)),
                (Some(_), None) => std::cmp::Ordering::Less,
                (None, Some(_)) => std::cmp::Ordering::Greater,
                (None, None) => {
                    // Neither priced/successful: better success_rate first
                    // (None rate sorts last), then model_id.
                    let xr = x.success_rate.unwrap_or(-1.0);
                    let yr = y.success_rate.unwrap_or(-1.0);
                    yr.partial_cmp(&xr)
                        .unwrap_or(std::cmp::Ordering::Equal)
                        .then_with(|| x.model_id.cmp(&y.model_id))
                }
            }
        });

        let total_usd = any_priced.then_some(total_usd);
        let overall_usd_per_success = match total_usd {
            Some(u) if total_successes > 0 => Some(u / total_successes as f64),
            _ => None,
        };
        let receipts = total_successes + total_failures + total_inconclusive;

        Scoreboard {
            model_count: rows.len(),
            rows,
            total_successes,
            total_failures,
            total_inconclusive,
            total_usd,
            overall_usd_per_success,
            receipts,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::outcome::InferenceTask;

    /// Build a minimal ledger receipt: fixed 100 ms latency, `Generate` task,
    /// everything not under test left empty.
    fn entry(
        model: &str,
        success: Option<bool>,
        quality: Option<f64>,
        inp: usize,
        out: usize,
    ) -> OutcomeLedgerEntry {
        OutcomeLedgerEntry {
            trace_id: format!("t-{model}-{inp}-{out}"),
            model_id: model.to_string(),
            task: InferenceTask::Generate,
            routing_reason: "test".into(),
            latency_ms: 100,
            input_tokens: inp,
            output_tokens: out,
            success,
            quality,
            error: None,
            project_id: None,
            intent: None,
            timestamp: 0,
        }
    }

    // Prices: expensive model is high-quality, cheap model is low-quality.
    // Any other id (e.g. "local", "local_hi") is unpriced.
    fn prices(id: &str) -> Option<(f64, f64)> {
        match id {
            "expensive" => Some((10.0, 30.0)),
            "cheap" => Some((0.1, 0.3)),
            _ => None,
        }
    }

    #[test]
    fn folds_outcome_denominated_metrics_per_model() {
        let entries = vec![
            // expensive: 2 success, 0 fail. 1000 in + 1000 out tokens each.
            entry("expensive", Some(true), Some(0.9), 1000, 1000),
            entry("expensive", Some(true), Some(0.8), 1000, 1000),
            // cheap: 1 success, 1 fail. same tokens.
            entry("cheap", Some(true), None, 1000, 1000),
            entry("cheap", Some(false), None, 1000, 1000),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);

        let exp = sb.rows.iter().find(|r| r.model_id == "expensive").unwrap();
        assert_eq!(exp.success_count, 2);
        assert_eq!(exp.fail_count, 0);
        assert_eq!(exp.success_rate, Some(1.0));
        // tokens: (1000+1000)*2 = 4000 / 2 successes = 2000
        assert_eq!(exp.tokens_per_success, Some(2000.0));
        // usd: (2000*10 + 2000*30)/1e6 = (20000+60000)/1e6 = 0.08 / 2 = 0.04
        assert!((exp.usd_per_success.unwrap() - 0.04).abs() < 1e-9);
        assert!((exp.avg_quality.unwrap() - 0.85).abs() < 1e-9);

        let cheap = sb.rows.iter().find(|r| r.model_id == "cheap").unwrap();
        assert_eq!(cheap.success_count, 1);
        assert_eq!(cheap.fail_count, 1);
        assert_eq!(cheap.success_rate, Some(0.5));
        assert_eq!(cheap.avg_quality, None, "no quality signal → None, not 0.0");
        // usd: (2000*0.1 + 2000*0.3)/1e6 = 800/1e6 = 0.0008 / 1 success = 0.0008
        assert!((cheap.usd_per_success.unwrap() - 0.0008).abs() < 1e-12);

        assert_eq!(sb.total_successes, 3);
        assert_eq!(sb.total_failures, 1);
        assert_eq!(sb.receipts, 4);
        assert_eq!(sb.model_count, 2);
    }

    #[test]
    fn deployment_totals_sum_priced_spend() {
        // All models priced, so the deployment totals are unambiguous.
        let entries = vec![
            entry("expensive", Some(true), None, 1000, 1000),
            entry("cheap", Some(true), None, 1000, 1000),
            entry("cheap", Some(false), None, 1000, 1000),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);

        // expensive: (1000*10 + 1000*30)/1e6 = 0.04
        // cheap:     (2000*0.1 + 2000*0.3)/1e6 = 0.0008 (the failure is paid for too)
        assert!((sb.total_usd.unwrap() - 0.0408).abs() < 1e-12);
        // 0.0408 / 2 successes = 0.0204
        assert!((sb.overall_usd_per_success.unwrap() - 0.0204).abs() < 1e-12);
        assert_eq!(sb.receipts, 3);
        // Every fixture receipt takes 100 ms, so each mean is exactly 100.
        for row in &sb.rows {
            assert_eq!(row.avg_latency_ms, 100.0);
        }
    }

    #[test]
    fn sorts_cheapest_correct_outcome_first() {
        // cheap is far cheaper per success than expensive; it must rank first
        // even though expensive has the higher success_rate — the board leads
        // with dollars-per-correct-outcome.
        let entries = vec![
            entry("expensive", Some(true), None, 1000, 1000),
            entry("cheap", Some(true), None, 1000, 1000),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);
        assert_eq!(sb.rows[0].model_id, "cheap", "cheapest usd_per_success first");
        assert_eq!(sb.rows[1].model_id, "expensive");
    }

    #[test]
    fn rows_with_a_dollar_figure_rank_ahead_of_those_without() {
        // The mixed comparator branches: a row with a usd_per_success always
        // leads, however good the success_rate of the rows without one.
        let entries = vec![
            // local: unpriced, success_rate 1.0 → no dollar figure.
            entry("local", Some(true), None, 100, 100),
            // expensive: priced, 1 success + 1 fail → has a dollar figure.
            entry("expensive", Some(true), None, 1000, 1000),
            entry("expensive", Some(false), None, 1000, 1000),
            // cheap: priced but never succeeded → no dollar figure, rate 0.0.
            entry("cheap", Some(false), None, 1000, 1000),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);
        let order: Vec<&str> = sb.rows.iter().map(|r| r.model_id.as_str()).collect();
        assert_eq!(
            order,
            vec!["expensive", "local", "cheap"],
            "dollar-figure rows first, then the rest by success_rate descending"
        );
    }

    #[test]
    fn unpriced_and_unresolved_are_honest_not_fabricated() {
        // "local" is unpriced; one success + one inconclusive (swept, no signal).
        let entries = vec![
            entry("local", Some(true), None, 500, 500),
            entry("local", None, None, 500, 500),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);
        let local = &sb.rows[0];
        assert_eq!(local.model_id, "local");
        assert_eq!(local.success_count, 1);
        assert_eq!(local.inconclusive_count, 1, "inconclusive surfaced, not dropped");
        // success_rate is over resolved (1 success / 1 resolved) — inconclusive
        // is NOT in the denominator.
        assert_eq!(local.success_rate, Some(1.0));
        assert_eq!(local.usd_per_success, None, "unpriced model → no dollar figure");
        assert_eq!(sb.total_usd, None, "no priced model → no deployment total");
        assert_eq!(sb.overall_usd_per_success, None);
        assert_eq!(sb.total_inconclusive, 1);
    }

    #[test]
    fn no_success_yields_none_not_zero() {
        // A model that only ever failed has no cost-per-success (can't divide by
        // zero successes) — None, never 0.0 or infinity.
        let entries = vec![entry("cheap", Some(false), None, 1000, 1000)];
        let sb = Scoreboard::from_ledger(&entries, prices);
        let row = &sb.rows[0];
        assert_eq!(row.success_rate, Some(0.0));
        assert_eq!(row.tokens_per_success, None);
        assert_eq!(row.usd_per_success, None);
        assert_eq!(sb.overall_usd_per_success, None);
    }

    #[test]
    fn unpriced_rows_sort_by_success_rate_then_id() {
        // The (None, None) comparator branch: unpriced models, none with a
        // dollar figure. Higher adjudicated success_rate ranks first; a
        // no-resolved (None rate) model sorts last.
        let entries = vec![
            // None of these ids is known to the test price fn, so all are unpriced.
            // local_lo: 1 success, 1 fail → success_rate 0.5
            entry("local_lo", Some(true), None, 100, 100),
            entry("local_lo", Some(false), None, 100, 100),
            // local_hi: 2 success → success_rate 1.0
            entry("local_hi", Some(true), None, 100, 100),
            entry("local_hi", Some(true), None, 100, 100),
            // local_none: only inconclusive → success_rate None (sorts last)
            entry("local_none", None, None, 100, 100),
        ];
        let sb = Scoreboard::from_ledger(&entries, prices);
        let order: Vec<&str> = sb.rows.iter().map(|r| r.model_id.as_str()).collect();
        assert_eq!(
            order,
            vec!["local_hi", "local_lo", "local_none"],
            "unpriced rows: higher success_rate first, None rate last"
        );
    }

    #[test]
    fn empty_ledger_is_empty_board() {
        let sb = Scoreboard::from_ledger(&[], prices);
        assert!(sb.rows.is_empty());
        assert_eq!(sb.receipts, 0);
        assert_eq!(sb.total_usd, None);
        assert_eq!(sb.overall_usd_per_success, None);
    }
}