use crate::outcome::OutcomeLedgerEntry;
/// Number of graded observations a ledger must contain before
/// `ShadowCalibration::from_ledger` marks the snapshot `ready` and reports
/// an observed band and mean.
pub const MIN_GRADED_EVIDENCE: u64 = 200;
/// Percentile (expressed in percent, 0–100) used for the observed band floor.
const BAND_LOW_PCT: f64 = 5.0;
/// Percentile (expressed in percent, 0–100) used for the observed band ceiling.
const BAND_HIGH_PCT: f64 = 95.0;
/// Read-only summary of the graded quality evidence found in an outcome ledger.
///
/// Produced by `ShadowCalibration::from_ledger`. The band and mean fields are
/// populated only when `ready` is `true`.
#[derive(Debug, Clone, PartialEq)]
pub struct ShadowCalibration {
    /// Number of ledger entries that carried a quality grade.
    pub graded_observations: u64,
    /// Number of ledger entries without a quality grade.
    pub ungraded_observations: u64,
    /// Number of distinct model ids that have at least one graded entry.
    pub models_with_grades: usize,
    /// Graded entries divided by all entries; `None` when the ledger is empty.
    pub graded_coverage: Option<f64>,
    /// Low-percentile edge of the observed grades; `None` until `ready`.
    pub observed_band_floor: Option<f64>,
    /// High-percentile edge of the observed grades; `None` until `ready`.
    pub observed_band_ceil: Option<f64>,
    /// Arithmetic mean of the observed grades; `None` until `ready`.
    pub observed_mean: Option<f64>,
    /// `true` once `graded_observations` has reached `min_evidence`.
    pub ready: bool,
    /// The evidence bar that `ready` was evaluated against.
    pub min_evidence: u64,
}
impl ShadowCalibration {
    /// Builds a calibration snapshot from outcome-ledger entries.
    ///
    /// Entries whose `quality` is `Some` count as graded; all others count as
    /// ungraded. The band floor/ceiling (the `BAND_LOW_PCT` / `BAND_HIGH_PCT`
    /// percentiles of the grades) and the mean are reported only once at least
    /// [`MIN_GRADED_EVIDENCE`] grades exist; below that bar they are `None`.
    ///
    /// Grades are not validated here: a NaN grade still counts as graded and
    /// makes the reported mean NaN.
    pub fn from_ledger(entries: &[OutcomeLedgerEntry]) -> Self {
        let mut graded: Vec<f64> = Vec::new();
        let mut ungraded: u64 = 0;
        let mut models_with_grades = std::collections::HashSet::new();
        for e in entries {
            match e.quality {
                Some(q) => {
                    graded.push(q);
                    models_with_grades.insert(e.model_id.as_str());
                }
                None => ungraded += 1,
            }
        }

        let graded_observations = graded.len() as u64;
        let total = graded_observations + ungraded;
        // Coverage is undefined (not 0.0) for an empty ledger.
        let graded_coverage =
            (total > 0).then(|| graded_observations as f64 / total as f64);

        let ready = graded_observations >= MIN_GRADED_EVIDENCE;
        let (observed_band_floor, observed_band_ceil, observed_mean) = if ready {
            // Sum in ledger order, before sorting, so the mean does not depend
            // on the sort.
            let mean = graded.iter().sum::<f64>() / graded_observations as f64;
            // `total_cmp` is a genuine total order over f64 (NaN included).
            // A comparator that maps incomparable pairs to `Equal` is not, and
            // the slice sort's behaviour is then unspecified (it may panic).
            graded.sort_unstable_by(f64::total_cmp);
            (
                Some(percentile(&graded, BAND_LOW_PCT)),
                Some(percentile(&graded, BAND_HIGH_PCT)),
                Some(mean),
            )
        } else {
            (None, None, None)
        };

        Self {
            graded_observations,
            ungraded_observations: ungraded,
            models_with_grades: models_with_grades.len(),
            graded_coverage,
            observed_band_floor,
            observed_band_ceil,
            observed_mean,
            ready,
            min_evidence: MIN_GRADED_EVIDENCE,
        }
    }

    /// Logs this snapshot as a single `tracing` info event.
    ///
    /// When `ready`, the event carries the observed band and mean; otherwise
    /// it reports progress (`graded/min_evidence`) toward the evidence bar.
    /// This method only logs; it does not modify any state.
    pub fn emit(&self) {
        if self.ready {
            tracing::info!(
                graded = self.graded_observations,
                ungraded = self.ungraded_observations,
                coverage = ?self.graded_coverage,
                band_floor = ?self.observed_band_floor,
                band_ceil = ?self.observed_band_ceil,
                mean = ?self.observed_mean,
                min_evidence = self.min_evidence,
                "shadow-calibration: evidence bar crossed — empirical quality band \
                 available for a REVIEWED tier-bound fit (constants still untouched; #369)"
            );
        } else {
            tracing::info!(
                graded = self.graded_observations,
                ungraded = self.ungraded_observations,
                coverage = ?self.graded_coverage,
                min_evidence = self.min_evidence,
                "shadow-calibration: accumulating graded evidence \
                 ({}/{} toward the fit bar; constants stay at cold-start defaults; #369)",
                self.graded_observations,
                self.min_evidence,
            );
        }
    }
}
/// Returns the `pct`-th percentile of `sorted_asc` using linear interpolation
/// between the two nearest ranks.
///
/// * `sorted_asc` — samples sorted in ascending order (not re-checked here).
/// * `pct` — percentile in percent; values outside `0.0..=100.0` are clamped
///   to the nearest bound instead of indexing past the ends of the slice.
///
/// A single-element slice returns that element. An empty slice is a caller
/// bug: it trips the `debug_assert!` in debug builds and yields `0.0` in
/// release builds.
fn percentile(sorted_asc: &[f64], pct: f64) -> f64 {
    debug_assert!(
        !sorted_asc.is_empty(),
        "percentile called on empty slice — band edge would be a fake 0.0"
    );
    if sorted_asc.is_empty() {
        return 0.0;
    }
    if sorted_asc.len() == 1 {
        return sorted_asc[0];
    }

    let last = sorted_asc.len() - 1;
    // Clamping keeps the fractional rank inside `0..=last`, so both indices
    // derived from it are always in bounds.
    let rank = (pct.clamp(0.0, 100.0) / 100.0) * last as f64;
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    if lo == hi {
        // The rank lands exactly on a sample; returning it directly also
        // avoids `inf - inf` producing NaN for infinite samples.
        return sorted_asc[lo];
    }

    let frac = rank - lo as f64;
    sorted_asc[lo] + (sorted_asc[hi] - sorted_asc[lo]) * frac
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::outcome::InferenceTask;

    /// Test fixture: a ledger entry for `model` carrying the optional grade
    /// `quality`; `success` mirrors whether the grade is at least 0.5.
    fn entry(model: &str, quality: Option<f64>) -> OutcomeLedgerEntry {
        let trace_id = format!("t-{model}-{:?}", quality);
        let success = quality.map(|grade| grade >= 0.5);
        OutcomeLedgerEntry {
            trace_id,
            model_id: model.to_string(),
            task: InferenceTask::Generate,
            routing_reason: "test".into(),
            latency_ms: 100,
            input_tokens: 10,
            output_tokens: 10,
            success,
            quality,
            error: None,
            project_id: None,
            intent: None,
            timestamp: 0,
        }
    }

    /// Below the evidence bar, volumes and coverage are tracked but no band
    /// or mean is offered.
    #[test]
    fn below_bar_tracks_volume_but_suggests_no_band() {
        let graded_part = [0.9, 0.4].iter().map(|&q| entry("m", Some(q)));
        let ungraded_part = std::iter::repeat_with(|| entry("m", None)).take(67);
        let ledger: Vec<_> = graded_part.chain(ungraded_part).collect();

        let cal = ShadowCalibration::from_ledger(&ledger);

        assert_eq!(cal.graded_observations, 2);
        assert_eq!(cal.ungraded_observations, 67);
        assert!(!cal.ready, "2 graded < MIN_GRADED_EVIDENCE must not be ready");
        assert_eq!(cal.observed_band_floor, None);
        assert_eq!(cal.observed_band_ceil, None);
        assert_eq!(cal.observed_mean, None);

        let expected_coverage = 2.0 / 69.0;
        let cov = cal.graded_coverage.unwrap();
        assert!((cov - expected_coverage).abs() < 1e-9);
    }

    /// Exactly at the bar, a uniform 0..=1 ramp yields a ~[0.05, 0.95] band
    /// and a ~0.5 mean.
    #[test]
    fn at_bar_reports_robust_band_and_mean() {
        let n = MIN_GRADED_EVIDENCE as usize;
        let mut ledger = Vec::with_capacity(n);
        for i in 0..n {
            let grade = i as f64 / (n as f64 - 1.0);
            ledger.push(entry("m", Some(grade)));
        }

        let cal = ShadowCalibration::from_ledger(&ledger);

        assert!(cal.ready);
        assert_eq!(cal.graded_observations, MIN_GRADED_EVIDENCE);
        let floor = cal.observed_band_floor.unwrap();
        let ceil = cal.observed_band_ceil.unwrap();
        assert!((floor - 0.05).abs() < 0.02, "band floor ~0.05, got {floor}");
        assert!((ceil - 0.95).abs() < 0.02, "band ceil ~0.95, got {ceil}");
        assert!((cal.observed_mean.unwrap() - 0.5).abs() < 0.01);
        assert_eq!(cal.models_with_grades, 1);
    }

    /// A single extreme grade at each end must not drag the percentile band.
    #[test]
    fn outliers_do_not_define_the_band() {
        let count = MIN_GRADED_EVIDENCE as usize;
        let mut ledger: Vec<_> = (0..count)
            .map(|i| {
                let grade = 0.6 + 0.2 * (i as f64 / MIN_GRADED_EVIDENCE as f64);
                entry("m", Some(grade))
            })
            .collect();
        ledger[0] = entry("m", Some(0.01));
        let tail = ledger.len() - 1;
        ledger[tail] = entry("m", Some(0.99));

        let cal = ShadowCalibration::from_ledger(&ledger);

        let floor = cal.observed_band_floor.unwrap();
        let ceil = cal.observed_band_ceil.unwrap();
        assert!(floor > 0.4, "5th-pct floor ignores the 0.01 outlier, got {floor}");
        assert!(ceil < 0.9, "95th-pct ceil ignores the 0.99 outlier, got {ceil}");
    }

    /// An empty ledger reports zero volume, no coverage, and no band.
    #[test]
    fn empty_ledger_is_honest() {
        let no_entries: [OutcomeLedgerEntry; 0] = [];
        let cal = ShadowCalibration::from_ledger(&no_entries);
        assert_eq!(cal.graded_observations, 0);
        assert_eq!(cal.graded_coverage, None);
        assert!(!cal.ready);
        assert_eq!(cal.observed_band_floor, None);
    }
}