car-inference 0.28.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Shadow calibration scaffolding for the router's cold-start quality constants
//! (#369).
//!
//! The adaptive router's quality math rests on conservative, **untuned**
//! cold-start constants — `LIVE_GRADE_WEIGHT` (W), `PRIOR_SHRINK_PSEUDOCOUNT`
//! (K), and the `BENCH_TIER_FLOOR`/`BENCH_TIER_CEIL` band (see
//! [`crate::adaptive_router`]). Fitting them empirically is **data-blocked**:
//! the durable outcome ledger is dominated by ungraded mechanical successes
//! (completed calls with no accept/edit/reject signal), which carry no quality
//! information. Fitting on a handful of graded receipts would be cargo-cult
//! calibration.
//!
//! This module is the "shadow metrics now, fit later" answer. It folds the same
//! durable ledger the scoreboard reads into:
//!   - the **graded-evidence volume** accumulating toward a fit, and
//!   - the **empirical quality band** that the tier bounds *would* tune to,
//!
//! emitted as telemetry. It NEVER touches the live constants and NEVER changes a
//! routing decision — it only makes the data arrive shaped and makes the
//! calibration self-improve as graded volume accumulates, rather than being a
//! future from-scratch effort. The gate before any constant is allowed to move
//! is [`MIN_GRADED_EVIDENCE`].

use crate::outcome::OutcomeLedgerEntry;

/// Minimum number of **graded** quality observations (ledger receipts carrying a
/// `quality` signal) that must accumulate before any router quality constant —
/// W, K, or the tier bounds — is allowed to move off its conservative
/// cold-start default.
///
/// This is the documented minimum-evidence bar (#369). It is deliberately
/// conservative: these constants reshape *routing*, so a fit must rest on enough
/// graded signal that an empirical band/quantile is stable rather than an
/// artifact of two or three receipts. 200 graded observations is a starting bar
/// (enough to estimate 5th/95th-percentile band edges with some stability), not
/// a tuned value — revisit it once production graded volume exists.
///
/// The comparison is inclusive: [`ShadowCalibration::ready`] becomes `true`
/// once the graded count is **at least** this value. Below the bar, `ready`
/// stays `false` and no band or mean is reported at all (those fields are
/// `None`). At or above the bar the band is reported, but only as telemetry —
/// nothing in this module acts on it.
pub const MIN_GRADED_EVIDENCE: u64 = 200;

/// Lower empirical-band percentile, used as the shadow analog of
/// `BENCH_TIER_FLOOR`. A robust quantile (not the raw minimum) so a single
/// outlier graded receipt can't define the band. Expressed on a 0–100 scale,
/// as expected by `percentile`.
const BAND_LOW_PCT: f64 = 5.0;
/// Upper empirical-band percentile, used as the shadow analog of
/// `BENCH_TIER_CEIL`. A robust quantile (not the raw maximum), for the same
/// reason as [`BAND_LOW_PCT`]. Expressed on a 0–100 scale.
const BAND_HIGH_PCT: f64 = 95.0;

/// A read-only snapshot of what the router's quality calibration *would* look
/// like given the graded evidence accumulated so far. Pure telemetry — see the
/// module docs. Never feeds back into routing.
///
/// Built by [`ShadowCalibration::from_ledger`]. The three `observed_*` fields
/// are all `Some` when [`ready`](Self::ready) is `true` and all `None`
/// otherwise.
#[derive(Debug, Clone, PartialEq)]
pub struct ShadowCalibration {
    /// Ledger receipts that carried a graded `quality` signal (entries whose
    /// `quality` is `Some(_)`) — the evidence a fit would rest on.
    pub graded_observations: u64,
    /// Receipts that were never graded: every ledger entry whose `quality` is
    /// `None` is counted here, regardless of its other fields. In practice
    /// this is expected to be mostly ungraded mechanical successes and
    /// inconclusive calls. Surfaced so the graded *coverage* is legible, not
    /// hidden behind the graded count.
    pub ungraded_observations: u64,
    /// Distinct models (by `model_id`) that have at least one graded receipt.
    pub models_with_grades: usize,
    /// `graded / (graded + ungraded)` — how much of the ledger actually informs
    /// quality. `None` when the ledger is empty.
    pub graded_coverage: Option<f64>,
    /// Shadow analog of `BENCH_TIER_FLOOR`: the low-percentile edge of observed
    /// graded quality. `None` until the evidence bar is reached (no honest band
    /// below threshold).
    pub observed_band_floor: Option<f64>,
    /// Shadow analog of `BENCH_TIER_CEIL`: the high-percentile edge of observed
    /// graded quality. `None` until the evidence bar is reached.
    pub observed_band_ceil: Option<f64>,
    /// Arithmetic mean of the observed graded quality values. `None` until the
    /// evidence bar is reached.
    pub observed_mean: Option<f64>,
    /// Whether at least [`MIN_GRADED_EVIDENCE`] graded observations have
    /// accumulated — the gate before any constant may move. When `false`, the
    /// band fields are `None` and the metrics are accumulation-tracking only.
    pub ready: bool,
    /// The bar in force ([`MIN_GRADED_EVIDENCE`] at snapshot time), echoed for
    /// telemetry legibility.
    pub min_evidence: u64,
}

impl ShadowCalibration {
    /// Fold `entries` (the durable outcome ledger) into a shadow-calibration
    /// snapshot. Pure: no IO, no mutation, no routing effect.
    ///
    /// An entry counts as *graded* when its `quality` is `Some(_)` and as
    /// *ungraded* otherwise. The band edges and mean are only computed once at
    /// least [`MIN_GRADED_EVIDENCE`] graded observations exist; below that they
    /// are `None`.
    ///
    /// A NaN grade is still counted as graded. It cannot make this function
    /// panic (the sort uses a total order), but it does propagate into
    /// `observed_mean` as NaN.
    pub fn from_ledger(entries: &[OutcomeLedgerEntry]) -> Self {
        let mut graded: Vec<f64> = Vec::new();
        let mut ungraded: u64 = 0;
        // Borrowed `&str` keys: the set only lives for this call, so there is
        // no need to clone model ids.
        let mut models_with_grades = std::collections::HashSet::new();
        for e in entries {
            match e.quality {
                Some(q) => {
                    graded.push(q);
                    models_with_grades.insert(e.model_id.as_str());
                }
                None => ungraded += 1,
            }
        }

        let graded_observations = graded.len() as u64;
        let total = graded_observations + ungraded;
        let graded_coverage =
            (total > 0).then(|| graded_observations as f64 / total as f64);
        let ready = graded_observations >= MIN_GRADED_EVIDENCE;

        // The empirical band is only honest once the bar is crossed — below it,
        // a "band" from a few receipts would invite exactly the cargo-cult fit
        // this scaffolding exists to prevent.
        let (observed_band_floor, observed_band_ceil, observed_mean) = if ready {
            // The mean is taken in ledger order *before* sorting, so `graded`
            // can then be sorted in place for the percentile band — no clone.
            let mean = graded.iter().sum::<f64>() / graded_observations as f64;
            // `f64::total_cmp` is a genuine total order. The previous
            // `partial_cmp(..).unwrap_or(Equal)` comparator was not one when a
            // NaN grade was present, and the slice sort is allowed to panic on
            // an inconsistent comparator. Elements that compare equal under
            // `total_cmp` are bit-identical, so an unstable sort is
            // indistinguishable from a stable one here.
            graded.sort_unstable_by(f64::total_cmp);
            (
                Some(percentile(&graded, BAND_LOW_PCT)),
                Some(percentile(&graded, BAND_HIGH_PCT)),
                Some(mean),
            )
        } else {
            (None, None, None)
        };

        Self {
            graded_observations,
            ungraded_observations: ungraded,
            models_with_grades: models_with_grades.len(),
            graded_coverage,
            observed_band_floor,
            observed_band_ceil,
            observed_mean,
            ready,
            min_evidence: MIN_GRADED_EVIDENCE,
        }
    }

    /// Emit the snapshot as structured telemetry (the #369 "visible via
    /// telemetry" surface). `info` when below the evidence bar (accumulation
    /// tracking), still `info` when ready but flagged so a calibration run is an
    /// obvious, deliberate next step rather than an automatic constant move.
    ///
    /// The ready event carries the band and mean fields; the not-ready event
    /// omits them (they are `None`) and reports progress toward the bar.
    pub fn emit(&self) {
        if self.ready {
            tracing::info!(
                graded = self.graded_observations,
                ungraded = self.ungraded_observations,
                coverage = ?self.graded_coverage,
                band_floor = ?self.observed_band_floor,
                band_ceil = ?self.observed_band_ceil,
                mean = ?self.observed_mean,
                min_evidence = self.min_evidence,
                "shadow-calibration: evidence bar crossed — empirical quality band \
                 available for a REVIEWED tier-bound fit (constants still untouched; #369)"
            );
        } else {
            tracing::info!(
                graded = self.graded_observations,
                ungraded = self.ungraded_observations,
                coverage = ?self.graded_coverage,
                min_evidence = self.min_evidence,
                "shadow-calibration: accumulating graded evidence \
                 ({}/{} toward the fit bar; constants stay at cold-start defaults; #369)",
                self.graded_observations,
                self.min_evidence,
            );
        }
    }
}

/// Linear-interpolated percentile of an ascending-sorted slice.
///
/// * `sorted_asc` — values already sorted in ascending order; this function
///   does not sort or verify the order.
/// * `pct` — percentile on a 0–100 scale. Values outside `[0, 100]` are
///   clamped to the nearest end, so they return the first or last element
///   instead of indexing out of bounds. A NaN `pct` yields NaN.
///
/// Returns the element at fractional rank `pct / 100 * (len - 1)`, linearly
/// interpolating between the two neighbouring elements when the rank is not a
/// whole number. A single-element slice returns that element.
///
/// Empty slice → 0.0 in release builds; debug builds assert instead, because
/// callers gate on non-empty via `ready`.
fn percentile(sorted_asc: &[f64], pct: f64) -> f64 {
    // Callers only reach here on the `ready` (non-empty) path; the empty→0.0
    // fallback must never masquerade as a real band edge.
    debug_assert!(
        !sorted_asc.is_empty(),
        "percentile called on empty slice — band edge would be a fake 0.0"
    );
    if sorted_asc.is_empty() {
        return 0.0;
    }
    if sorted_asc.len() == 1 {
        return sorted_asc[0];
    }
    // A NaN percentile has no meaningful rank; surface it rather than letting
    // the float→usize casts silently map it to index 0.
    if pct.is_nan() {
        return f64::NAN;
    }

    let last = sorted_asc.len() - 1;
    // Clamp so the rank always lies in `[0, last]`.
    let rank = (pct.clamp(0.0, 100.0) / 100.0) * last as f64;
    let lo = rank.floor() as usize;
    // `.min(last)` is belt-and-braces against float rounding.
    let hi = (rank.ceil() as usize).min(last);
    if lo == hi {
        // Exact rank: return the element itself. This also avoids `inf - inf`
        // producing NaN when the element is infinite.
        return sorted_asc[lo];
    }
    let frac = rank - lo as f64;
    sorted_asc[lo] + (sorted_asc[hi] - sorted_asc[lo]) * frac
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::outcome::InferenceTask;

    /// Build a minimal ledger entry for `model`; `quality` of `None` makes it
    /// an ungraded receipt.
    fn entry(model: &str, quality: Option<f64>) -> OutcomeLedgerEntry {
        OutcomeLedgerEntry {
            trace_id: format!("t-{model}-{:?}", quality),
            model_id: model.to_string(),
            task: InferenceTask::Generate,
            routing_reason: "test".into(),
            latency_ms: 100,
            input_tokens: 10,
            output_tokens: 10,
            success: quality.map(|q| q >= 0.5),
            quality,
            error: None,
            project_id: None,
            intent: None,
            timestamp: 0,
        }
    }

    #[test]
    fn below_bar_tracks_volume_but_suggests_no_band() {
        // The data-blocked reality: a few graded receipts among many ungraded.
        // We must surface the accumulation WITHOUT inventing a band to fit.
        let mut entries = vec![entry("m", Some(0.9)), entry("m", Some(0.4))];
        entries.extend((0..67).map(|_| entry("m", None)));
        let cal = ShadowCalibration::from_ledger(&entries);
        assert_eq!(cal.graded_observations, 2);
        assert_eq!(cal.ungraded_observations, 67);
        assert!(!cal.ready, "2 graded < MIN_GRADED_EVIDENCE must not be ready");
        assert_eq!(cal.observed_band_floor, None);
        assert_eq!(cal.observed_band_ceil, None);
        assert_eq!(cal.observed_mean, None);
        // Coverage is legible: 2 / 69.
        let cov = cal.graded_coverage.unwrap();
        assert!((cov - 2.0 / 69.0).abs() < 1e-9);
    }

    #[test]
    fn one_short_of_bar_is_not_ready() {
        // Boundary: the gate is `>= MIN_GRADED_EVIDENCE`, so one fewer graded
        // receipt must still withhold the band and mean.
        let n = MIN_GRADED_EVIDENCE as usize - 1;
        let entries: Vec<_> = (0..n).map(|_| entry("m", Some(0.5))).collect();
        let cal = ShadowCalibration::from_ledger(&entries);
        assert_eq!(cal.graded_observations, MIN_GRADED_EVIDENCE - 1);
        assert!(!cal.ready);
        assert_eq!(cal.observed_band_floor, None);
        assert_eq!(cal.observed_band_ceil, None);
        assert_eq!(cal.observed_mean, None);
        assert_eq!(cal.graded_coverage, Some(1.0));
        assert_eq!(cal.min_evidence, MIN_GRADED_EVIDENCE);
    }

    #[test]
    fn at_bar_reports_robust_band_and_mean() {
        // Once the bar is crossed, the empirical band becomes available. Use a
        // uniform 0.0..1.0 spread so the 5th/95th percentiles are predictable.
        let n = MIN_GRADED_EVIDENCE as usize;
        let entries: Vec<_> = (0..n)
            .map(|i| entry("m", Some(i as f64 / (n as f64 - 1.0))))
            .collect();
        let cal = ShadowCalibration::from_ledger(&entries);
        assert!(cal.ready);
        assert_eq!(cal.graded_observations, MIN_GRADED_EVIDENCE);
        // Robust band edges near the 5th/95th percentiles of a 0..1 ramp.
        let floor = cal.observed_band_floor.unwrap();
        let ceil = cal.observed_band_ceil.unwrap();
        assert!((floor - 0.05).abs() < 0.02, "band floor ~0.05, got {floor}");
        assert!((ceil - 0.95).abs() < 0.02, "band ceil ~0.95, got {ceil}");
        assert!((cal.observed_mean.unwrap() - 0.5).abs() < 0.01);
        assert_eq!(cal.models_with_grades, 1);
    }

    #[test]
    fn outliers_do_not_define_the_band() {
        // One extreme outlier must not blow out the band — robust quantiles, not
        // raw min/max. A run of 0.6..0.8 with a single 0.01 and a single 0.99.
        let mut entries: Vec<_> = (0..MIN_GRADED_EVIDENCE as usize)
            .map(|i| entry("m", Some(0.6 + 0.2 * (i as f64 / MIN_GRADED_EVIDENCE as f64))))
            .collect();
        entries[0] = entry("m", Some(0.01));
        *entries.last_mut().unwrap() = entry("m", Some(0.99));
        let cal = ShadowCalibration::from_ledger(&entries);
        assert!(cal.ready, "exactly MIN_GRADED_EVIDENCE graded receipts must be ready");
        let floor = cal.observed_band_floor.unwrap();
        let ceil = cal.observed_band_ceil.unwrap();
        assert!(floor > 0.4, "5th-pct floor ignores the 0.01 outlier, got {floor}");
        assert!(ceil < 0.9, "95th-pct ceil ignores the 0.99 outlier, got {ceil}");
    }

    #[test]
    fn distinct_models_are_counted_once() {
        // `models_with_grades` counts distinct model ids with at least one
        // graded receipt; a model that only has ungraded receipts is excluded.
        let entries = vec![
            entry("a", Some(0.9)),
            entry("a", Some(0.8)),
            entry("b", Some(0.7)),
            entry("c", None),
        ];
        let cal = ShadowCalibration::from_ledger(&entries);
        assert_eq!(cal.graded_observations, 3);
        assert_eq!(cal.ungraded_observations, 1);
        assert_eq!(cal.models_with_grades, 2);
    }

    #[test]
    fn empty_ledger_is_honest() {
        let cal = ShadowCalibration::from_ledger(&[]);
        assert_eq!(cal.graded_observations, 0);
        assert_eq!(cal.ungraded_observations, 0);
        assert_eq!(cal.models_with_grades, 0);
        assert_eq!(cal.graded_coverage, None);
        assert!(!cal.ready);
        assert_eq!(cal.observed_band_floor, None);
        assert_eq!(cal.observed_band_ceil, None);
        assert_eq!(cal.observed_mean, None);
    }

    #[test]
    fn percentile_interpolates_between_neighbours() {
        // Exact ranks return the element; fractional ranks interpolate.
        assert_eq!(percentile(&[1.0, 2.0, 3.0], 0.0), 1.0);
        assert_eq!(percentile(&[1.0, 2.0, 3.0], 50.0), 2.0);
        assert_eq!(percentile(&[1.0, 2.0, 3.0], 100.0), 3.0);
        assert_eq!(percentile(&[0.0, 10.0], 25.0), 2.5);
        // A single element is every percentile.
        assert_eq!(percentile(&[7.0], 95.0), 7.0);
    }
}