use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use crate::error::{Error, Result};
use crate::staleness::{ConflictReport, StalenessReport};
/// Summary statistics for a single metric over a set of per-query scores.
///
/// Usually built with `MetricReport::from_per_query`; an empty score set
/// produces a report whose statistics are all `0.0`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricReport {
/// Name of the metric (for example `recall@10`).
pub metric: String,
/// Number of per-query scores that were aggregated.
pub n: usize,
/// Arithmetic mean of the scores.
pub mean: f64,
/// Sample standard deviation (`n - 1` denominator); `0.0` when `n <= 1`.
pub stddev: f64,
/// Smallest score.
pub min: f64,
/// Largest score.
pub max: f64,
/// Median, linearly interpolated between neighbouring ranks.
pub p50: f64,
/// 95th percentile, linearly interpolated between neighbouring ranks.
pub p95: f64,
/// The raw `(query_id, score)` pairs the statistics were computed from.
pub per_query: Vec<(String, f64)>,
/// Optional bootstrap confidence interval for the mean. Omitted from the
/// serialized form when `None` and defaulted to `None` when absent on input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ci: Option<MetricCi>,
}
/// Bootstrap confidence interval for a metric's mean, as produced by
/// `MetricReport::bootstrap_ci`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub struct MetricCi {
/// Lower bound of the interval.
pub lower: f64,
/// Upper bound of the interval.
pub upper: f64,
/// Confidence level the interval was computed for (e.g. `0.95`).
pub level: f64,
/// Number of bootstrap resamples that were drawn.
pub iterations: usize,
}
impl MetricReport {
    /// Aggregates `(query_id, score)` pairs into summary statistics for `metric`.
    ///
    /// An empty `per_query` yields a report with `n == 0` and every statistic
    /// set to `0.0`. The standard deviation uses the sample (`n - 1`)
    /// denominator and is `0.0` for a single score. Percentiles are linearly
    /// interpolated. The returned report carries no confidence interval; use
    /// [`MetricReport::with_bootstrap_ci`] to attach one.
    pub fn from_per_query(metric: String, per_query: Vec<(String, f64)>) -> Self {
        let n = per_query.len();
        if n == 0 {
            return Self {
                metric,
                n: 0,
                mean: 0.0,
                stddev: 0.0,
                min: 0.0,
                max: 0.0,
                p50: 0.0,
                p95: 0.0,
                per_query,
                ci: None,
            };
        }
        let scores: Vec<f64> = per_query.iter().map(|(_, s)| *s).collect();
        let sum: f64 = scores.iter().sum();
        let mean = sum / n as f64;
        let var = if n > 1 {
            scores.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / (n as f64 - 1.0)
        } else {
            0.0
        };
        let stddev = var.sqrt();
        // `total_cmp` is a genuine total order, so the sort stays well-defined
        // (and cannot trip the sort's total-order check) even if a score is NaN.
        let mut sorted = scores;
        sorted.sort_by(f64::total_cmp);
        let min = sorted.first().copied().unwrap_or(0.0);
        let max = sorted.last().copied().unwrap_or(0.0);
        let p50 = percentile(&sorted, 0.50);
        let p95 = percentile(&sorted, 0.95);
        Self {
            metric,
            n,
            mean,
            stddev,
            min,
            max,
            p50,
            p95,
            per_query,
            ci: None,
        }
    }

    /// Computes a percentile-bootstrap confidence interval for the mean.
    ///
    /// Draws `iterations` resamples (with replacement, each the size of
    /// `per_query`) using a SplitMix64 stream seeded with `seed`, so the result
    /// is deterministic for a fixed seed. The interval bounds are the
    /// `(1 - level) / 2` and `1 - (1 - level) / 2` quantiles of the resample
    /// means.
    ///
    /// Returns `None` when there are no scores, when `iterations == 0`, or
    /// when `level` is not strictly between 0 and 1 (NaN included).
    #[must_use]
    pub fn bootstrap_ci(&self, iterations: usize, level: f64, seed: u64) -> Option<MetricCi> {
        if self.per_query.is_empty() || iterations == 0 {
            return None;
        }
        // Negated conjunction on purpose: a NaN `level` fails both comparisons.
        if !(level > 0.0 && level < 1.0) {
            return None;
        }
        let scores: Vec<f64> = self.per_query.iter().map(|(_, s)| *s).collect();
        let n = scores.len();
        let modulus = n as u64;
        let mut state = seed;
        let mut resample_means: Vec<f64> = Vec::with_capacity(iterations);
        for _ in 0..iterations {
            let mut sum = 0.0;
            for _ in 0..n {
                let r = splitmix64(&mut state);
                // Reduce in u64 *before* narrowing so 32-bit targets pick the
                // same indices as 64-bit ones for a given seed.
                let idx = (r % modulus) as usize;
                sum += scores.get(idx).copied().unwrap_or(0.0);
            }
            resample_means.push(sum / n as f64);
        }
        resample_means.sort_by(f64::total_cmp);
        let alpha = (1.0 - level) / 2.0;
        let lower = percentile(&resample_means, alpha);
        let upper = percentile(&resample_means, 1.0 - alpha);
        Some(MetricCi {
            lower,
            upper,
            level,
            iterations,
        })
    }

    /// Builder-style variant of [`MetricReport::bootstrap_ci`]: stores the
    /// computed interval (or `None` for invalid input) in `self.ci`.
    #[must_use]
    pub fn with_bootstrap_ci(mut self, iterations: usize, level: f64, seed: u64) -> Self {
        self.ci = self.bootstrap_ci(iterations, level, seed);
        self
    }
}
/// Advances `state` by one SplitMix64 step and returns the mixed output.
///
/// All arithmetic wraps, so every `u64` is a valid state.
fn splitmix64(state: &mut u64) -> u64 {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
    const MIX_A: u64 = 0xBF58_476D_1CE4_E5B9;
    const MIX_B: u64 = 0x94D0_49BB_1331_11EB;

    let advanced = state.wrapping_add(INCREMENT);
    *state = advanced;

    let first = (advanced ^ (advanced >> 30)).wrapping_mul(MIX_A);
    let second = (first ^ (first >> 27)).wrapping_mul(MIX_B);
    second ^ (second >> 31)
}
/// Returns the `q`-quantile of an ascending-sorted slice, linearly
/// interpolating between the two nearest ranks.
///
/// * `sorted` must already be sorted ascending; it is not re-sorted here.
/// * `q` is a fraction; values outside `[0, 1]` are clamped, so `q <= 0`
///   yields the first element and `q >= 1` the last.
///
/// Returns `0.0` for an empty slice and the sole element for a one-element
/// slice. A NaN `q` is not special-cased.
fn percentile(sorted: &[f64], q: f64) -> f64 {
    if sorted.is_empty() {
        return 0.0;
    }
    if sorted.len() == 1 {
        return sorted.first().copied().unwrap_or(0.0);
    }
    // Without the clamp, `q > 1` would index past the end and silently fall
    // back to `0.0` instead of the maximum.
    let q = q.clamp(0.0, 1.0);
    let rank = q * (sorted.len() as f64 - 1.0);
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    let lo_v = sorted.get(lo).copied().unwrap_or(0.0);
    let hi_v = sorted.get(hi).copied().unwrap_or(lo_v);
    let frac = rank - lo as f64;
    lo_v + (hi_v - lo_v) * frac
}
/// Reliability figures for one query across repeated trials.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryReliability {
/// Identifier of the query.
pub query_id: String,
/// Number of trial scores recorded for this query.
pub trials: usize,
/// Number of trials whose score was at least the report's threshold.
pub successes: usize,
/// `successes / trials`, or `0.0` when there were no trials.
pub pass_rate: f64,
/// Estimated probability that at least one of `k` trials drawn without
/// replacement passes.
pub pass_at_k: f64,
/// Estimated probability that all of `k` trials drawn without replacement
/// pass.
pub pass_all_k: f64,
}
/// Reliability of one metric across repeated trial runs of the same query set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReliabilityReport {
/// Name of the metric every trial report had to carry.
pub metric: String,
/// A trial passes when its score is `>= threshold`.
pub threshold: f64,
/// The `k` used for the pass@k / pass-all@k estimates.
pub k: usize,
/// Number of distinct queries.
pub n_queries: usize,
/// Number of trial reports supplied (one score per query per trial).
pub trials_per_query: usize,
/// Mean of the per-query pass rates.
pub mean_pass_rate: f64,
/// Mean of the per-query pass@k estimates.
pub pass_at_k: f64,
/// Mean of the per-query pass-all@k estimates.
pub pass_all_k: f64,
/// Per-query breakdown, in the query order of the first trial report.
pub per_query: Vec<QueryReliability>,
}
/// Per-query freshness figures, combining one staleness report and one
/// conflict report for the same query.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FreshnessQueryRollup {
/// Identifier of the query.
pub query_id: String,
/// The `considered` count shared by both input reports.
pub considered: usize,
/// Number of stale hits listed in the staleness report.
pub stale_hits: usize,
/// Value of the staleness report's `stale_rate()`.
pub stale_rate: f64,
/// Number of conflict groups listed in the conflict report.
pub conflict_groups: usize,
/// The conflict report's `conflicting_doc_count`.
pub conflicting_doc_count: usize,
/// Value of the conflict report's `conflict_rate()`.
pub conflict_rate: f64,
}
/// Freshness roll-up across all queries. Every rate is `0.0` when its
/// denominator is zero.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FreshnessReport {
/// The `k` supplied by the caller; also used as the `@k` suffix of the
/// derived metric names.
pub k: usize,
/// Number of queries rolled up.
pub query_count: usize,
/// Sum of `considered` over all queries.
pub total_considered: usize,
/// Sum of stale hits over all queries.
pub stale_hit_count: usize,
/// Number of queries with at least one stale hit.
pub stale_query_count: usize,
/// `stale_hit_count / total_considered`.
pub stale_rate: f64,
/// `stale_query_count / query_count`.
pub stale_query_rate: f64,
/// Sum of conflict groups over all queries.
pub conflict_group_count: usize,
/// Sum of conflicting documents over all queries.
pub conflicting_doc_count: usize,
/// Number of queries with at least one conflict group.
pub conflict_query_count: usize,
/// `conflicting_doc_count / total_considered`.
pub conflict_rate: f64,
/// `conflict_query_count / query_count`.
pub conflict_query_rate: f64,
/// Per-query rows, in input order.
pub per_query: Vec<FreshnessQueryRollup>,
}
impl FreshnessReport {
    /// Combines positionally-paired staleness and conflict reports into one
    /// freshness roll-up.
    ///
    /// # Errors
    ///
    /// Returns `Error::BaselineMismatch` when the two slices differ in length,
    /// or when a pair disagrees on `query_id` or on `considered`.
    pub fn from_query_reports(
        k: usize,
        staleness: &[StalenessReport],
        conflicts: &[ConflictReport],
    ) -> Result<Self> {
        if staleness.len() != conflicts.len() {
            return Err(Error::BaselineMismatch(format!(
                "freshness report count mismatch: stale={} conflict={}",
                staleness.len(),
                conflicts.len()
            )));
        }

        // Pair the reports up; the first inconsistent pair aborts the roll-up.
        let per_query = staleness
            .iter()
            .zip(conflicts)
            .map(|(stale, conflict)| {
                if stale.query_id != conflict.query_id {
                    return Err(Error::BaselineMismatch(format!(
                        "freshness query mismatch: stale={} conflict={}",
                        stale.query_id, conflict.query_id
                    )));
                }
                if stale.considered != conflict.considered {
                    return Err(Error::BaselineMismatch(format!(
                        "freshness considered mismatch for {}: stale={} conflict={}",
                        stale.query_id, stale.considered, conflict.considered
                    )));
                }
                Ok(FreshnessQueryRollup {
                    query_id: stale.query_id.clone(),
                    considered: stale.considered,
                    stale_hits: stale.stale_hits.len(),
                    stale_rate: stale.stale_rate(),
                    conflict_groups: conflict.groups.len(),
                    conflicting_doc_count: conflict.conflicting_doc_count,
                    conflict_rate: conflict.conflict_rate(),
                })
            })
            .collect::<Result<Vec<_>>>()?;

        // Accumulate every total in a single pass over the rows.
        let query_count = per_query.len();
        let mut total_considered = 0usize;
        let mut stale_hit_count = 0usize;
        let mut stale_query_count = 0usize;
        let mut conflict_group_count = 0usize;
        let mut conflicting_doc_count = 0usize;
        let mut conflict_query_count = 0usize;
        for row in &per_query {
            total_considered += row.considered;
            stale_hit_count += row.stale_hits;
            conflict_group_count += row.conflict_groups;
            conflicting_doc_count += row.conflicting_doc_count;
            if row.stale_hits > 0 {
                stale_query_count += 1;
            }
            if row.conflict_groups > 0 {
                conflict_query_count += 1;
            }
        }

        Ok(Self {
            k,
            query_count,
            total_considered,
            stale_hit_count,
            stale_query_count,
            stale_rate: ratio(stale_hit_count, total_considered),
            stale_query_rate: ratio(stale_query_count, query_count),
            conflict_group_count,
            conflicting_doc_count,
            conflict_query_count,
            conflict_rate: ratio(conflicting_doc_count, total_considered),
            conflict_query_rate: ratio(conflict_query_count, query_count),
            per_query,
        })
    }

    /// Derives two "higher is better" metric reports from the per-query rows:
    /// `freshness.stale_free_rate@k` (`1 - stale_rate`) followed by
    /// `freshness.conflict_free_rate@k` (`1 - conflict_rate`).
    #[must_use]
    pub fn metric_reports(&self) -> Vec<MetricReport> {
        let rows = self.per_query.len();
        let mut stale_free = Vec::with_capacity(rows);
        let mut conflict_free = Vec::with_capacity(rows);
        for row in &self.per_query {
            stale_free.push((row.query_id.clone(), 1.0 - row.stale_rate));
            conflict_free.push((row.query_id.clone(), 1.0 - row.conflict_rate));
        }

        let stale_report = MetricReport::from_per_query(
            format!("freshness.stale_free_rate@{}", self.k),
            stale_free,
        );
        let conflict_report = MetricReport::from_per_query(
            format!("freshness.conflict_free_rate@{}", self.k),
            conflict_free,
        );
        vec![stale_report, conflict_report]
    }
}
/// Divides two counts as `f64`, treating a zero denominator as a `0.0` ratio
/// rather than producing NaN or infinity.
fn ratio(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 0.0,
        nonzero => numerator as f64 / nonzero as f64,
    }
}
impl ReliabilityReport {
pub fn from_metric_reports(
metric: impl Into<String>,
threshold: f64,
k: usize,
reports: &[MetricReport],
) -> Result<Self> {
let metric = metric.into();
if reports.is_empty() {
return Err(Error::Config(
"at least one trial report is required".into(),
));
}
if k == 0 {
return Err(Error::Config("pass@k requires k > 0".into()));
}
if !threshold.is_finite() {
return Err(Error::Config("reliability threshold must be finite".into()));
}
if k > reports.len() {
return Err(Error::Config(format!(
"pass@k k={} exceeds trial count {}",
k,
reports.len()
)));
}
for report in reports {
if report.metric != metric {
return Err(Error::BaselineMismatch(format!(
"metric mismatch: expected {metric}, got {}",
report.metric
)));
}
}
let first = reports
.first()
.ok_or_else(|| Error::Config("at least one trial report is required".into()))?;
let mut query_order = Vec::with_capacity(first.per_query.len());
let mut seen = std::collections::BTreeSet::new();
for (query_id, score) in &first.per_query {
if !score.is_finite() {
return Err(Error::Config(format!(
"non-finite score for query {query_id}"
)));
}
if !seen.insert(query_id.as_str()) {
return Err(Error::BaselineMismatch(format!(
"duplicate query id in trial report: {query_id}"
)));
}
query_order.push(query_id.clone());
}
let mut scores_by_query: BTreeMap<String, Vec<f64>> = query_order
.iter()
.map(|query_id| (query_id.clone(), Vec::with_capacity(reports.len())))
.collect();
for report in reports {
let mut report_scores = BTreeMap::new();
for (query_id, score) in &report.per_query {
if !score.is_finite() {
return Err(Error::Config(format!(
"non-finite score for query {query_id}"
)));
}
if report_scores.insert(query_id.as_str(), *score).is_some() {
return Err(Error::BaselineMismatch(format!(
"duplicate query id in trial report: {query_id}"
)));
}
}
if report_scores.len() != query_order.len() {
return Err(Error::BaselineMismatch(format!(
"trial report has {} queries; expected {}",
report_scores.len(),
query_order.len()
)));
}
for query_id in &query_order {
let Some(score) = report_scores.get(query_id.as_str()).copied() else {
return Err(Error::BaselineMismatch(format!(
"trial report missing query id {query_id}"
)));
};
let Some(scores) = scores_by_query.get_mut(query_id) else {
return Err(Error::BaselineMismatch(format!(
"unexpected query id {query_id}"
)));
};
scores.push(score);
}
}
let mut per_query = Vec::with_capacity(query_order.len());
for query_id in query_order {
let Some(scores) = scores_by_query.remove(&query_id) else {
return Err(Error::BaselineMismatch(format!(
"missing scores for query id {query_id}"
)));
};
per_query.push(query_reliability(query_id, &scores, threshold, k));
}
let n_queries = per_query.len();
let trials_per_query = reports.len();
let mean_pass_rate = mean_by(&per_query, |q| q.pass_rate);
let pass_at_k = mean_by(&per_query, |q| q.pass_at_k);
let pass_all_k = mean_by(&per_query, |q| q.pass_all_k);
Ok(Self {
metric,
threshold,
k,
n_queries,
trials_per_query,
mean_pass_rate,
pass_at_k,
pass_all_k,
per_query,
})
}
}
/// Summarises one query's trial `scores`: counts the trials scoring at least
/// `threshold` and derives the pass rate plus the pass@k / pass-all@k
/// estimates for the given `k`.
fn query_reliability(
    query_id: String,
    scores: &[f64],
    threshold: f64,
    k: usize,
) -> QueryReliability {
    let trials = scores.len();
    let mut successes = 0usize;
    for &score in scores {
        if score >= threshold {
            successes += 1;
        }
    }
    let pass_rate = match trials {
        0 => 0.0,
        total => successes as f64 / total as f64,
    };
    let pass_at_k = pass_at_k_estimate(trials, successes, k);
    let pass_all_k = pass_all_k_estimate(trials, successes, k);
    QueryReliability {
        query_id,
        trials,
        successes,
        pass_rate,
        pass_at_k,
        pass_all_k,
    }
}
/// Averages the value `f` extracts from each row; an empty slice averages to
/// `0.0`.
fn mean_by(rows: &[QueryReliability], f: impl Fn(&QueryReliability) -> f64) -> f64 {
    match rows.len() {
        0 => 0.0,
        count => {
            let total: f64 = rows.iter().map(f).sum();
            total / count as f64
        }
    }
}
/// Estimates pass@k: the probability that at least one of `k` trials drawn
/// without replacement from `trials` (of which `successes` passed) is a pass.
///
/// Returns `0.0` when `k`, `trials` or `successes` is zero, and `1.0` when
/// there are fewer than `k` failing trials, so every draw of `k` must include
/// a pass. A `successes` count larger than `trials` is treated as "no
/// failures" instead of underflowing.
fn pass_at_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    if k == 0 || trials == 0 || successes == 0 {
        return 0.0;
    }
    // Saturating: an inconsistent `successes > trials` must not underflow.
    let failures = trials.saturating_sub(successes);
    if k > trials || failures < k {
        return 1.0;
    }
    // Probability that all k draws land on failing trials.
    let fail_all = (0..k).fold(1.0, |acc, offset| {
        acc * ((failures - offset) as f64 / (trials - offset) as f64)
    });
    1.0 - fail_all
}
/// Estimates pass-all@k: the probability that all `k` trials drawn without
/// replacement from `trials` (of which `successes` passed) are passes.
///
/// Returns `0.0` when `k` or `trials` is zero, when `k` exceeds `trials`, or
/// when fewer than `k` trials passed. `successes` is capped at `trials` so the
/// result stays within `[0, 1]` even for inconsistent counts.
fn pass_all_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    // Cap first: more successes than trials would push the product above 1.
    let successes = successes.min(trials);
    if k == 0 || trials == 0 || successes < k || k > trials {
        return 0.0;
    }
    (0..k).fold(1.0, |acc, offset| {
        acc * ((successes - offset) as f64 / (trials - offset) as f64)
    })
}
/// A bundle of metric reports for one evaluation run, plus optional
/// provenance. Every optional field is omitted from the serialized form when
/// `None` and defaults to `None` when absent on input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct MultiReport {
/// Optional identifier of the dataset the run used.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub dataset_id: Option<String>,
/// Optional label for the kind of store that was evaluated.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub store_kind: Option<String>,
/// Optional judge fingerprint; `MultiReport::diff` refuses to compare
/// reports whose fingerprints differ.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub judge_fingerprint: Option<String>,
/// The metric reports, in insertion order.
pub metrics: Vec<MetricReport>,
/// Optional freshness roll-up attached to the run.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub freshness: Option<FreshnessReport>,
}
impl MultiReport {
    /// Creates a report holding `metrics` with no provenance and no freshness
    /// data attached.
    #[must_use]
    pub fn new(metrics: Vec<MetricReport>) -> Self {
        Self {
            dataset_id: None,
            store_kind: None,
            judge_fingerprint: None,
            metrics,
            freshness: None,
        }
    }

    /// Sets the dataset identifier.
    #[must_use]
    pub fn with_dataset(self, id: impl Into<String>) -> Self {
        Self {
            dataset_id: Some(id.into()),
            ..self
        }
    }

    /// Sets the store kind.
    #[must_use]
    pub fn with_store(self, kind: impl Into<String>) -> Self {
        Self {
            store_kind: Some(kind.into()),
            ..self
        }
    }

    /// Sets the judge fingerprint that [`MultiReport::diff`] compares.
    #[must_use]
    pub fn with_judge_fingerprint(self, fp: impl Into<String>) -> Self {
        Self {
            judge_fingerprint: Some(fp.into()),
            ..self
        }
    }

    /// Recomputes the bootstrap confidence interval of every metric with the
    /// same `iterations`, `level` and `seed`, replacing any existing interval.
    #[must_use]
    pub fn with_bootstrap(mut self, iterations: usize, level: f64, seed: u64) -> Self {
        for report in &mut self.metrics {
            report.ci = report.bootstrap_ci(iterations, level, seed);
        }
        self
    }

    /// Attaches a freshness roll-up without touching the metric list.
    #[must_use]
    pub fn with_freshness(self, freshness: FreshnessReport) -> Self {
        Self {
            freshness: Some(freshness),
            ..self
        }
    }

    /// Attaches a freshness roll-up and appends its derived metric reports so
    /// they take part in diffs and regression gates.
    #[must_use]
    pub fn with_freshness_metrics(mut self, freshness: FreshnessReport) -> Self {
        let derived = freshness.metric_reports();
        self.metrics.extend(derived);
        self.freshness = Some(freshness);
        self
    }

    /// Serializes the report as pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Propagates any serialization error from `serde_json`.
    pub fn to_json(&self) -> Result<String> {
        let json = serde_json::to_string_pretty(self)?;
        Ok(json)
    }

    /// Renders one Markdown table row per metric, with values at four decimal
    /// places.
    #[must_use]
    pub fn to_markdown(&self) -> String {
        let mut table = [
            "| metric | n | mean | stddev | p50 | p95 | min | max |\n",
            "|---|---:|---:|---:|---:|---:|---:|---:|\n",
        ]
        .concat();
        table.extend(self.metrics.iter().map(|m| {
            format!(
                "| {} | {} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} |\n",
                m.metric, m.n, m.mean, m.stddev, m.p50, m.p95, m.min, m.max
            )
        }));
        table
    }

    /// Compares this report against `baseline`, one row per metric of `self`.
    ///
    /// A metric missing from the baseline gets a row with no baseline mean, no
    /// delta and empty per-query counts.
    ///
    /// # Errors
    ///
    /// Returns `Error::BaselineMismatch` when the judge fingerprints differ.
    pub fn diff(&self, baseline: &MultiReport) -> Result<ReportDiff> {
        if self.judge_fingerprint != baseline.judge_fingerprint {
            return Err(Error::BaselineMismatch(format!(
                "judge fingerprint mismatch: current={:?} baseline={:?}",
                self.judge_fingerprint, baseline.judge_fingerprint
            )));
        }

        // Index the baseline by metric name; a repeated name keeps the last.
        let mut baseline_index: BTreeMap<&str, &MetricReport> = BTreeMap::new();
        for report in &baseline.metrics {
            baseline_index.insert(report.metric.as_str(), report);
        }

        let rows = self
            .metrics
            .iter()
            .map(|current| {
                let matched = baseline_index.get(current.metric.as_str()).copied();
                let (query_changes, winners, losers, unchanged) = if let Some(found) = matched {
                    compute_query_changes(&current.per_query, &found.per_query)
                } else {
                    (Vec::new(), 0, 0, 0)
                };
                let baseline_mean = matched.map(|found| found.mean);
                MetricDelta {
                    metric: current.metric.clone(),
                    current_mean: current.mean,
                    baseline_mean,
                    delta: baseline_mean.map(|mean| current.mean - mean),
                    winners,
                    losers,
                    unchanged,
                    query_changes,
                    current_ci: current.ci,
                    baseline_ci: matched.and_then(|found| found.ci),
                }
            })
            .collect();
        Ok(ReportDiff { rows })
    }

    /// Shorthand for `self.diff(baseline)` rendered as a Markdown table.
    ///
    /// # Errors
    ///
    /// Same as [`MultiReport::diff`].
    pub fn delta_markdown(&self, baseline: &MultiReport) -> Result<String> {
        self.diff(baseline).map(|report_diff| report_diff.to_markdown())
    }
}
/// Per-query deltas whose magnitude is at most this count as "unchanged".
const EPSILON: f64 = 1e-9;

/// Matches current and baseline per-query scores by query id.
///
/// Returns `(changes, winners, losers, unchanged)`. `changes` holds one entry
/// per query present on both sides, sorted by descending absolute delta (ties
/// keep the order of `current`). Queries present on only one side are
/// ignored. A delta within `EPSILON` of zero counts as unchanged, as does a
/// NaN delta, since it is neither above nor below the tolerance.
fn compute_query_changes(
    current: &[(String, f64)],
    baseline: &[(String, f64)],
) -> (Vec<QueryDelta>, usize, usize, usize) {
    let base_by_query: BTreeMap<&str, f64> =
        baseline.iter().map(|(q, s)| (q.as_str(), *s)).collect();
    let mut changes = Vec::new();
    let mut winners = 0usize;
    let mut losers = 0usize;
    let mut unchanged = 0usize;
    for (query_id, cur_score) in current {
        let Some(base_score) = base_by_query.get(query_id.as_str()).copied() else {
            continue;
        };
        let delta = cur_score - base_score;
        if delta > EPSILON {
            winners += 1;
        } else if delta < -EPSILON {
            losers += 1;
        } else {
            unchanged += 1;
        }
        changes.push(QueryDelta {
            query_id: query_id.clone(),
            current: *cur_score,
            baseline: base_score,
            delta,
        });
    }
    // `total_cmp` is a real total order, so the sort stays well-defined even
    // when a delta is NaN; for ordinary values it orders exactly as before.
    changes.sort_by(|a, b| b.delta.abs().total_cmp(&a.delta.abs()));
    (changes, winners, losers, unchanged)
}
/// One row of a report diff: how a metric moved relative to the baseline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricDelta {
/// Name of the metric.
pub metric: String,
/// Mean of the metric in the current report.
pub current_mean: f64,
/// Mean of the metric in the baseline, if the baseline has it.
pub baseline_mean: Option<f64>,
/// `current_mean - baseline_mean`, if the baseline has the metric.
pub delta: Option<f64>,
/// Number of shared queries whose score improved beyond the tolerance.
#[serde(default)]
pub winners: usize,
/// Number of shared queries whose score dropped beyond the tolerance.
#[serde(default)]
pub losers: usize,
/// Number of shared queries whose score stayed within the tolerance.
#[serde(default)]
pub unchanged: usize,
/// Per-query deltas, largest absolute change first; omitted from the
/// serialized form when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub query_changes: Vec<QueryDelta>,
/// Confidence interval carried by the current metric report, if any.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub current_ci: Option<MetricCi>,
/// Confidence interval carried by the baseline metric report, if any.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub baseline_ci: Option<MetricCi>,
}
/// Score change of a single query between the current and baseline reports.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryDelta {
/// Identifier of the query.
pub query_id: String,
/// Score in the current report.
pub current: f64,
/// Score in the baseline report.
pub baseline: f64,
/// `current - baseline`.
pub delta: f64,
}
/// Result of `MultiReport::diff`: one `MetricDelta` per metric of the current
/// report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReportDiff {
/// Diff rows, in the order of the current report's metrics.
pub rows: Vec<MetricDelta>,
}
impl ReportDiff {
    /// Renders the diff as a Markdown table. Missing baseline values and
    /// deltas are shown as an em dash; deltas carry an explicit sign.
    #[must_use]
    pub fn to_markdown(&self) -> String {
        let mut table = [
            "| metric | current | baseline | Δ | win | lose | same |\n",
            "|---|---:|---:|---:|---:|---:|---:|\n",
        ]
        .concat();
        for r in &self.rows {
            let baseline = match r.baseline_mean {
                Some(v) => format!("{v:.4}"),
                None => "—".to_string(),
            };
            let delta = match r.delta {
                Some(v) => format!("{v:+.4}"),
                None => "—".to_string(),
            };
            let line = format!(
                "| {} | {:.4} | {} | {} | {} | {} | {} |\n",
                r.metric, r.current_mean, baseline, delta, r.winners, r.losers, r.unchanged
            );
            table.push_str(&line);
        }
        table
    }

    /// Serializes the diff as pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Propagates any serialization error from `serde_json`.
    pub fn to_json(&self) -> Result<String> {
        let json = serde_json::to_string_pretty(self)?;
        Ok(json)
    }

    /// Returns clones of the rows that regressed under `gate`: those whose
    /// metric has a configured threshold and whose delta is strictly below
    /// `-threshold`. Rows without a threshold or without a baseline delta are
    /// never flagged.
    #[must_use]
    pub fn regressions(&self, gate: &RegressionGate) -> Vec<MetricDelta> {
        let mut flagged = Vec::new();
        for row in &self.rows {
            let (Some(threshold), Some(delta)) = (gate.threshold(&row.metric), row.delta) else {
                continue;
            };
            if delta < -threshold {
                flagged.push(row.clone());
            }
        }
        flagged
    }

    /// `true` when no row regresses under `gate`.
    #[must_use]
    pub fn is_clean(&self, gate: &RegressionGate) -> bool {
        let offenders = self.regressions(gate);
        offenders.is_empty()
    }

    /// Process exit code for the gate: `0` when clean, `1` otherwise.
    #[must_use]
    pub fn exit_code(&self, gate: &RegressionGate) -> i32 {
        i32::from(!self.is_clean(gate))
    }
}
/// Per-metric regression thresholds: the largest drop in a metric's mean that
/// is still tolerated.
#[derive(Debug, Clone, Default)]
pub struct RegressionGate {
    thresholds: BTreeMap<String, f64>,
}

impl RegressionGate {
    /// Creates a gate with no thresholds configured.
    #[must_use]
    pub fn new() -> Self {
        Self {
            thresholds: BTreeMap::new(),
        }
    }

    /// Sets (or replaces) the threshold for `metric`. Negative thresholds are
    /// raised to `0.0`.
    #[must_use]
    pub fn with_threshold(self, metric: impl Into<String>, threshold: f64) -> Self {
        let Self { mut thresholds } = self;
        let key = metric.into();
        let floored = f64::max(threshold, 0.0);
        thresholds.insert(key, floored);
        Self { thresholds }
    }

    /// Looks up the threshold configured for `metric`, if any.
    #[must_use]
    pub fn threshold(&self, metric: &str) -> Option<f64> {
        let limit = self.thresholds.get(metric)?;
        Some(*limit)
    }
}
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::panic, clippy::indexing_slicing)]
mod tests {
use super::*;
use crate::staleness::{ConflictGroup, ConflictReport, StaleHit, StalenessReport};
/// Three evenly spaced scores aggregate to the expected n/mean/min/max/p50.
#[test]
fn metric_report_aggregates() {
let r = MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.0), ("q2".into(), 0.5), ("q3".into(), 1.0)],
);
assert_eq!(r.n, 3);
assert!((r.mean - 0.5).abs() < 1e-9);
assert!((r.min - 0.0).abs() < 1e-9);
assert!((r.max - 1.0).abs() < 1e-9);
assert!((r.p50 - 0.5).abs() < 1e-9);
}
/// An empty score list yields `n == 0` and a zero mean.
#[test]
fn empty_report_is_zero() {
let r = MetricReport::from_per_query("m".into(), vec![]);
assert_eq!(r.n, 0);
assert_eq!(r.mean, 0.0);
}
/// Reports with different judge fingerprints cannot be diffed.
#[test]
fn diff_flags_fingerprint_mismatch() {
let a = MultiReport::new(vec![]).with_judge_fingerprint("a");
let b = MultiReport::new(vec![]).with_judge_fingerprint("b");
assert!(a.diff(&b).is_err());
}
/// The metric-level delta is current mean minus baseline mean (0.8 - 0.6).
#[test]
fn diff_computes_per_metric_delta() {
let cur = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.8)],
)]);
let base = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.6)],
)]);
let diff = cur.diff(&base).unwrap();
assert_eq!(diff.rows.len(), 1);
let row = &diff.rows[0];
assert!((row.delta.unwrap_or(0.0) - 0.2).abs() < 1e-9);
}
/// Shared queries are bucketed into winners/losers/unchanged; `q4` (current
/// only) and `q5` (baseline only) are ignored, and the zero-delta `q3` sorts
/// last because changes are ordered by descending absolute delta.
#[test]
fn diff_buckets_per_query_winners_losers_and_unchanged() {
let cur = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![
("q1".into(), 1.0), ("q2".into(), 0.0), ("q3".into(), 0.5), ("q4".into(), 0.9), ],
)]);
let base = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![
("q1".into(), 0.5),
("q2".into(), 0.5),
("q3".into(), 0.5),
("q5".into(), 1.0), ],
)]);
let diff = cur.diff(&base).unwrap();
let row = &diff.rows[0];
assert_eq!(row.winners, 1);
assert_eq!(row.losers, 1);
assert_eq!(row.unchanged, 1);
assert_eq!(row.query_changes.len(), 3);
assert_eq!(row.query_changes[2].query_id, "q3");
assert!((row.query_changes[2].delta).abs() < 1e-9);
}
/// A metric absent from the baseline still gets a row, but with no delta and
/// empty per-query counts.
#[test]
fn diff_query_changes_empty_when_baseline_missing_metric() {
let cur = MultiReport::new(vec![MetricReport::from_per_query(
"ndcg@10".into(),
vec![("q1".into(), 0.9)],
)]);
let base = MultiReport::new(vec![]);
let diff = cur.diff(&base).unwrap();
let row = &diff.rows[0];
assert!(row.delta.is_none());
assert_eq!(row.winners, 0);
assert_eq!(row.losers, 0);
assert_eq!(row.unchanged, 0);
assert!(row.query_changes.is_empty());
}
/// Only `recall@10` regresses: its drop (0.10) exceeds the 0.02 threshold,
/// the `ndcg@10` drop (0.005) stays within it, and `mrr` has no threshold
/// configured, so its large drop is not gated.
#[test]
fn regression_gate_flags_only_metrics_below_threshold() {
let cur = MultiReport::new(vec![
MetricReport::from_per_query("recall@10".into(), vec![("q1".into(), 0.50)]),
MetricReport::from_per_query("ndcg@10".into(), vec![("q1".into(), 0.595)]),
MetricReport::from_per_query("mrr".into(), vec![("q1".into(), 0.10)]),
]);
let base = MultiReport::new(vec![
MetricReport::from_per_query("recall@10".into(), vec![("q1".into(), 0.60)]),
MetricReport::from_per_query("ndcg@10".into(), vec![("q1".into(), 0.60)]),
MetricReport::from_per_query("mrr".into(), vec![("q1".into(), 0.90)]),
]);
let diff = cur.diff(&base).unwrap();
let gate = RegressionGate::new()
.with_threshold("recall@10", 0.02)
.with_threshold("ndcg@10", 0.02);
let regressed = diff.regressions(&gate);
assert_eq!(regressed.len(), 1);
assert_eq!(regressed[0].metric, "recall@10");
}
/// A negative threshold is stored as `0.0`.
#[test]
fn regression_gate_clamps_negative_thresholds() {
let gate = RegressionGate::new().with_threshold("recall@10", -0.5);
assert_eq!(gate.threshold("recall@10"), Some(0.0));
}
/// A diff survives a JSON serialize/deserialize round trip with its counts
/// and per-query changes intact.
#[test]
fn report_diff_to_json_round_trips() {
let cur = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 1.0), ("q2".into(), 0.0)],
)]);
let base = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.5), ("q2".into(), 0.5)],
)]);
let diff = cur.diff(&base).unwrap();
let json = diff.to_json().unwrap();
let parsed: ReportDiff = serde_json::from_str(&json).unwrap();
assert_eq!(parsed.rows.len(), 1);
assert_eq!(parsed.rows[0].winners, 1);
assert_eq!(parsed.rows[0].losers, 1);
assert_eq!(parsed.rows[0].query_changes.len(), 2);
}
/// Three trials with k = 2: `q1` passes 2 of 3 and `q2` passes 1 of 3, so the
/// averaged pass@k is (1 + 2/3) / 2 and pass-all@k is (1/3 + 0) / 2.
#[test]
fn reliability_report_computes_pass_at_k_and_pass_all_k() {
let reports = vec![
MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 1.0), ("q2".into(), 0.0)],
),
MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.0), ("q2".into(), 1.0)],
),
MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 1.0), ("q2".into(), 0.0)],
),
];
let reliability =
ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &reports).unwrap();
assert_eq!(reliability.n_queries, 2);
assert_eq!(reliability.trials_per_query, 3);
assert!((reliability.mean_pass_rate - 0.5).abs() < 1e-9);
assert!((reliability.pass_at_k - (5.0 / 6.0)).abs() < 1e-9);
assert!((reliability.pass_all_k - (1.0 / 6.0)).abs() < 1e-9);
assert_eq!(reliability.per_query[0].query_id, "q1");
assert_eq!(reliability.per_query[0].successes, 2);
}
/// A trial report for a different metric is rejected as a baseline mismatch.
#[test]
fn reliability_report_requires_matching_metrics() {
let reports = vec![
MetricReport::from_per_query("recall@10".into(), vec![("q1".into(), 1.0)]),
MetricReport::from_per_query("ndcg@10".into(), vec![("q1".into(), 1.0)]),
];
let err =
ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap_err();
match err {
Error::BaselineMismatch(message) => assert!(message.contains("metric mismatch")),
other => panic!("unexpected error: {other:?}"),
}
}
/// Trials must cover the same query ids; a trial lacking `q1` is rejected.
#[test]
fn reliability_report_requires_same_queries() {
let reports = vec![
MetricReport::from_per_query("recall@10".into(), vec![("q1".into(), 1.0)]),
MetricReport::from_per_query("recall@10".into(), vec![("q2".into(), 1.0)]),
];
let err =
ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap_err();
match err {
Error::BaselineMismatch(message) => assert!(message.contains("missing query id")),
other => panic!("unexpected error: {other:?}"),
}
}
/// `k` may not exceed the number of trial reports (here k = 2 with 1 trial).
#[test]
fn reliability_report_rejects_k_larger_than_trials() {
let reports = vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 1.0)],
)];
let err =
ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &reports).unwrap_err();
match err {
Error::Config(message) => assert!(message.contains("exceeds trial count")),
other => panic!("unexpected error: {other:?}"),
}
}
/// A reliability report survives a JSON round trip.
#[test]
fn reliability_report_serializes() {
let reports = vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 1.0)],
)];
let reliability =
ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap();
let json = serde_json::to_string(&reliability).unwrap();
let parsed: ReliabilityReport = serde_json::from_str(&json).unwrap();
assert_eq!(parsed.metric, "recall@10");
assert_eq!(parsed.per_query.len(), 1);
assert!((parsed.pass_at_k - 1.0).abs() < 1e-9);
}
/// With scores alternating between 0.6 and 0.8, the interval must straddle
/// the mean and stay inside the range of the observed scores.
#[test]
fn bootstrap_ci_brackets_the_mean_for_well_separated_scores() {
let r = MetricReport::from_per_query(
"recall@10".into(),
(0..50)
.map(|i| (format!("q{i}"), if i % 2 == 0 { 0.8 } else { 0.6 }))
.collect(),
)
.with_bootstrap_ci(1000, 0.95, 0xC0FFEE);
let ci = r.ci.unwrap();
assert!(ci.lower < r.mean);
assert!(ci.upper > r.mean);
assert!(ci.lower >= 0.6 - 1e-9);
assert!(ci.upper <= 0.8 + 1e-9);
assert_eq!(ci.iterations, 1000);
assert!((ci.level - 0.95).abs() < 1e-9);
}
/// The same scores, iteration count, level and seed give an identical interval.
#[test]
fn bootstrap_ci_is_deterministic_for_a_fixed_seed() {
let scores: Vec<(String, f64)> = (0..30)
.map(|i| (format!("q{i}"), (i % 5) as f64 / 4.0))
.collect();
let a = MetricReport::from_per_query("m".into(), scores.clone())
.with_bootstrap_ci(500, 0.9, 42)
.ci
.unwrap();
let b = MetricReport::from_per_query("m".into(), scores)
.with_bootstrap_ci(500, 0.9, 42)
.ci
.unwrap();
assert_eq!(a, b);
}
/// Zero iterations, a level of exactly 0 or 1, and an empty report all yield
/// `None`.
#[test]
fn bootstrap_ci_rejects_invalid_input() {
let r = MetricReport::from_per_query("m".into(), vec![("q1".into(), 0.5)]);
assert!(r.bootstrap_ci(0, 0.95, 1).is_none());
assert!(r.bootstrap_ci(100, 0.0, 1).is_none());
assert!(r.bootstrap_ci(100, 1.0, 1).is_none());
let empty = MetricReport::from_per_query("m".into(), vec![]);
assert!(empty.bootstrap_ci(100, 0.95, 1).is_none());
}
/// A 0.5 drop fails a 0.05 gate (exit code 1) but passes a 1.0 gate (exit
/// code 0).
#[test]
fn report_diff_exit_code_signals_regression() {
let baseline = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.9), ("q2".into(), 0.9)],
)]);
let candidate = MultiReport::new(vec![MetricReport::from_per_query(
"recall@10".into(),
vec![("q1".into(), 0.4), ("q2".into(), 0.4)],
)]);
let diff = candidate.diff(&baseline).unwrap();
let gate = RegressionGate::new().with_threshold("recall@10", 0.05);
assert!(!diff.is_clean(&gate));
assert_eq!(diff.exit_code(&gate), 1);
let lax_gate = RegressionGate::new().with_threshold("recall@10", 1.0);
assert!(diff.is_clean(&lax_gate));
assert_eq!(diff.exit_code(&lax_gate), 0);
}
/// Confidence intervals on the current and baseline metric reports are both
/// copied into the diff row.
#[test]
fn report_diff_carries_bootstrap_ci_on_both_sides() {
let scores: Vec<(String, f64)> = (0..20).map(|i| (format!("q{i}"), 0.5)).collect();
let candidate = MultiReport::new(vec![
MetricReport::from_per_query("recall@10".into(), scores.clone())
.with_bootstrap_ci(200, 0.95, 7),
]);
let baseline = MultiReport::new(vec![
MetricReport::from_per_query("recall@10".into(), scores)
.with_bootstrap_ci(200, 0.95, 7),
]);
let diff = candidate.diff(&baseline).unwrap();
let row = diff.rows.first().unwrap();
assert!(row.current_ci.is_some());
assert!(row.baseline_ci.is_some());
}
/// Two queries, one of them with a stale hit and a conflict group: checks the
/// aggregate counts and rates, then that `with_freshness_metrics` appends the
/// two derived metric reports.
#[test]
fn freshness_report_rolls_up_query_rates_and_appends_gateable_metrics() {
let staleness = vec![
StalenessReport {
query_id: "q1".into(),
stale_hits: vec![StaleHit {
doc_id: "old".into(),
rank: 0,
superseded_by: "new".into(),
}],
considered: 2,
},
StalenessReport {
query_id: "q2".into(),
stale_hits: vec![],
considered: 2,
},
];
let conflicts = vec![
ConflictReport {
query_id: "q1".into(),
groups: vec![ConflictGroup {
version_key: "alice:address".into(),
doc_ids: vec!["old".into(), "new".into()],
}],
conflicting_doc_count: 2,
considered: 2,
},
ConflictReport {
query_id: "q2".into(),
groups: vec![],
conflicting_doc_count: 0,
considered: 2,
},
];
let freshness = FreshnessReport::from_query_reports(2, &staleness, &conflicts).unwrap();
assert_eq!(freshness.query_count, 2);
assert_eq!(freshness.total_considered, 4);
assert_eq!(freshness.stale_hit_count, 1);
assert_eq!(freshness.stale_query_count, 1);
assert!((freshness.stale_rate - 0.25).abs() < 1e-9);
assert!((freshness.stale_query_rate - 0.5).abs() < 1e-9);
assert_eq!(freshness.conflict_group_count, 1);
assert_eq!(freshness.conflicting_doc_count, 2);
assert!((freshness.conflict_rate - 0.5).abs() < 1e-9);
assert_eq!(freshness.per_query[0].stale_rate, 0.5);
assert_eq!(freshness.per_query[0].conflict_rate, 1.0);
// The derived metrics are "free" rates, i.e. one minus the per-query rate.
let report = MultiReport::new(vec![]).with_freshness_metrics(freshness);
assert!(report.freshness.is_some());
assert_eq!(report.metrics.len(), 2);
assert_eq!(report.metrics[0].metric, "freshness.stale_free_rate@2");
assert!((report.metrics[0].mean - 0.75).abs() < 1e-9);
assert_eq!(report.metrics[1].metric, "freshness.conflict_free_rate@2");
assert!((report.metrics[1].mean - 0.5).abs() < 1e-9);
}
/// Introducing a stale hit in the candidate lowers the derived stale-free
/// metric enough to trip a 0.1 regression threshold.
#[test]
fn freshness_metrics_participate_in_existing_regression_gate() {
let baseline_freshness = FreshnessReport::from_query_reports(
2,
&[StalenessReport {
query_id: "q1".into(),
stale_hits: vec![],
considered: 2,
}],
&[ConflictReport {
query_id: "q1".into(),
groups: vec![],
conflicting_doc_count: 0,
considered: 2,
}],
)
.unwrap();
let candidate_freshness = FreshnessReport::from_query_reports(
2,
&[StalenessReport {
query_id: "q1".into(),
stale_hits: vec![StaleHit {
doc_id: "old".into(),
rank: 0,
superseded_by: "new".into(),
}],
considered: 2,
}],
&[ConflictReport {
query_id: "q1".into(),
groups: vec![],
conflicting_doc_count: 0,
considered: 2,
}],
)
.unwrap();
let baseline = MultiReport::new(vec![]).with_freshness_metrics(baseline_freshness);
let candidate = MultiReport::new(vec![]).with_freshness_metrics(candidate_freshness);
let diff = candidate.diff(&baseline).unwrap();
let gate = RegressionGate::new().with_threshold("freshness.stale_free_rate@2", 0.1);
let regressions = diff.regressions(&gate);
assert_eq!(regressions.len(), 1);
assert_eq!(regressions[0].metric, "freshness.stale_free_rate@2");
}
/// Staleness and conflict reports are paired by position, so differing query
/// ids at the same index are rejected.
#[test]
fn freshness_report_requires_matching_query_reports() {
let err = FreshnessReport::from_query_reports(
5,
&[StalenessReport {
query_id: "q1".into(),
stale_hits: vec![],
considered: 1,
}],
&[ConflictReport {
query_id: "q2".into(),
groups: vec![],
conflicting_doc_count: 0,
considered: 1,
}],
)
.unwrap_err();
match err {
Error::BaselineMismatch(message) => {
assert!(message.contains("freshness query mismatch"));
}
other => panic!("unexpected error: {other:?}"),
}
}
}