// rig_retrieval_evals/report.rs

//! Aggregation, serialization, and baseline diffing of per-query metric
//! scores produced by [`crate::harness::RetrievalHarness`].
//!
//! Three layers:
//!
//! - [`MetricReport`] — aggregates a single metric across all queries (mean,
//!   stddev, P50/P95, min/max, per-query scores).
//! - [`ReliabilityReport`] — aggregates repeated trials for one metric into
//!   pass@k / pass^k reliability estimates.
//! - [`MultiReport`] — bundles several [`MetricReport`]s with optional
//!   metadata (dataset id, store kind, judge fingerprint) so reports can be
//!   diffed across runs.

use std::collections::BTreeMap;
use std::fmt::Write as _;

use serde::{Deserialize, Serialize};

use crate::error::{Error, Result};
use crate::staleness::{ConflictReport, StalenessReport};

21/// Aggregated statistics for a single metric across a query set.
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct MetricReport {
24    /// Metric identifier (e.g. `"recall@10"`).
25    pub metric: String,
26    /// Number of queries scored.
27    pub n: usize,
28    /// Arithmetic mean.
29    pub mean: f64,
30    /// Sample standard deviation (N-1). `0.0` for `n < 2`.
31    pub stddev: f64,
32    /// Minimum observed score.
33    pub min: f64,
34    /// Maximum observed score.
35    pub max: f64,
36    /// 50th percentile (median) via linear interpolation.
37    pub p50: f64,
38    /// 95th percentile via linear interpolation.
39    pub p95: f64,
40    /// Per-query `(query_id, score)` pairs, in input order.
41    pub per_query: Vec<(String, f64)>,
42    /// Optional bootstrap confidence interval for [`MetricReport::mean`].
43    /// Populated by [`MetricReport::with_bootstrap_ci`]. Serialized as
44    /// `"ci"` when present, omitted when `None` so existing reports stay
45    /// schema-compatible.
46    #[serde(default, skip_serializing_if = "Option::is_none")]
47    pub ci: Option<MetricCi>,
48}
49
50/// Bootstrap confidence interval for [`MetricReport::mean`].
51///
52/// Produced by [`MetricReport::bootstrap_ci`] using a deterministic
53/// percentile bootstrap: resample the per-query scores `iterations` times
54/// with replacement, take the empirical mean of each resample, and report
55/// the two-sided quantile interval for the requested `level`. The same
56/// `seed` yields the same interval on the same input, so CI gates and
57/// reproducibility tests don't need extra fixtures.
58#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
59pub struct MetricCi {
60    /// Lower bound of the confidence interval (inclusive).
61    pub lower: f64,
62    /// Upper bound of the confidence interval (inclusive).
63    pub upper: f64,
64    /// Two-sided coverage probability the interval was computed for,
65    /// e.g. `0.95` for a 95 % CI.
66    pub level: f64,
67    /// Number of bootstrap resamples drawn.
68    pub iterations: usize,
69}
70
71impl MetricReport {
72    /// Build a [`MetricReport`] from per-query `(query_id, score)` pairs.
73    ///
74    /// Scores are aggregated in-place; the original ordering is preserved
75    /// in [`MetricReport::per_query`] for diff and audit use cases.
76    pub fn from_per_query(metric: String, per_query: Vec<(String, f64)>) -> Self {
77        let n = per_query.len();
78        if n == 0 {
79            return Self {
80                metric,
81                n: 0,
82                mean: 0.0,
83                stddev: 0.0,
84                min: 0.0,
85                max: 0.0,
86                p50: 0.0,
87                p95: 0.0,
88                per_query,
89                ci: None,
90            };
91        }
92        let scores: Vec<f64> = per_query.iter().map(|(_, s)| *s).collect();
93        let sum: f64 = scores.iter().sum();
94        let mean = sum / n as f64;
95        let var = if n > 1 {
96            scores.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / (n as f64 - 1.0)
97        } else {
98            0.0
99        };
100        let stddev = var.sqrt();
101
102        let mut sorted = scores.clone();
103        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
104        let min = sorted.first().copied().unwrap_or(0.0);
105        let max = sorted.last().copied().unwrap_or(0.0);
106        let p50 = percentile(&sorted, 0.50);
107        let p95 = percentile(&sorted, 0.95);
108
109        Self {
110            metric,
111            n,
112            mean,
113            stddev,
114            min,
115            max,
116            p50,
117            p95,
118            per_query,
119            ci: None,
120        }
121    }
122
123    /// Compute a percentile-bootstrap confidence interval for
124    /// [`MetricReport::mean`] without mutating `self`.
125    ///
126    /// Returns `None` when there are no per-query scores or when
127    /// `iterations` / `level` are out of range (`iterations == 0`, or
128    /// `level` not strictly inside `(0.0, 1.0)`). Otherwise draws
129    /// `iterations` resamples of size `n` with replacement using a
130    /// deterministic SplitMix64 stream seeded by `seed`, and returns the
131    /// two-sided percentile interval at the requested `level`.
132    #[must_use]
133    pub fn bootstrap_ci(&self, iterations: usize, level: f64, seed: u64) -> Option<MetricCi> {
134        if self.per_query.is_empty() || iterations == 0 {
135            return None;
136        }
137        if !(level > 0.0 && level < 1.0) {
138            return None;
139        }
140        let scores: Vec<f64> = self.per_query.iter().map(|(_, s)| *s).collect();
141        let n = scores.len();
142        let mut state = seed;
143        let mut resample_means: Vec<f64> = Vec::with_capacity(iterations);
144        for _ in 0..iterations {
145            let mut sum = 0.0;
146            for _ in 0..n {
147                let r = splitmix64(&mut state);
148                let idx = (r as usize) % n;
149                sum += scores.get(idx).copied().unwrap_or(0.0);
150            }
151            resample_means.push(sum / n as f64);
152        }
153        resample_means.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
154        let alpha = (1.0 - level) / 2.0;
155        let lower = percentile(&resample_means, alpha);
156        let upper = percentile(&resample_means, 1.0 - alpha);
157        Some(MetricCi {
158            lower,
159            upper,
160            level,
161            iterations,
162        })
163    }
164
165    /// Compute and attach a percentile-bootstrap confidence interval to
166    /// [`MetricReport::ci`]. See [`MetricReport::bootstrap_ci`] for the
167    /// algorithm and return-value semantics. Returns `self` so the call
168    /// chains directly off [`MetricReport::from_per_query`].
169    #[must_use]
170    pub fn with_bootstrap_ci(mut self, iterations: usize, level: f64, seed: u64) -> Self {
171        self.ci = self.bootstrap_ci(iterations, level, seed);
172        self
173    }
174}
175
/// Deterministic SplitMix64 PRNG. Stable, dependency-free, and good
/// enough for bootstrap resampling — we are not generating cryptographic
/// material here.
///
/// Advances `state` by a fixed odd increment and returns a mixed copy of
/// the new state, so the same starting `state` always yields the same
/// output sequence.
fn splitmix64(state: &mut u64) -> u64 {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
    const MIX_A: u64 = 0xBF58_476D_1CE4_E5B9;
    const MIX_B: u64 = 0x94D0_49BB_1331_11EB;

    *state = state.wrapping_add(INCREMENT);
    let mut z = *state;
    z = (z ^ (z >> 30)).wrapping_mul(MIX_A);
    z = (z ^ (z >> 27)).wrapping_mul(MIX_B);
    z ^ (z >> 31)
}

/// Linearly interpolated percentile of an ascending-sorted slice.
///
/// `q` is the quantile as a fraction and is clamped to `[0.0, 1.0]`, so an
/// out-of-range request returns the minimum or maximum instead of
/// extrapolating or falling off the end of the slice. Returns `0.0` for an
/// empty slice and the sole element for a one-element slice. A NaN `q`
/// propagates as a NaN result for slices of two or more elements.
///
/// The caller is responsible for passing `sorted` in ascending order.
fn percentile(sorted: &[f64], q: f64) -> f64 {
    let Some(&first) = sorted.first() else {
        return 0.0;
    };
    if sorted.len() == 1 {
        return first;
    }
    let q = q.clamp(0.0, 1.0);
    // Fractional index into the slice; interpolate between its neighbours.
    let rank = q * (sorted.len() as f64 - 1.0);
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    let lo_v = sorted.get(lo).copied().unwrap_or(0.0);
    let hi_v = sorted.get(hi).copied().unwrap_or(lo_v);
    let frac = rank - lo as f64;
    lo_v + (hi_v - lo_v) * frac
}

203/// Reliability summary for one query across repeated trials of the same
204/// metric.
205///
206/// A trial counts as successful when its score is greater than or equal to
207/// [`ReliabilityReport::threshold`]. The per-query pass@k estimate is the
208/// probability that at least one of `k` sampled trials succeeds; pass^k is
209/// the probability that all `k` sampled trials succeed.
210#[derive(Debug, Clone, Serialize, Deserialize)]
211pub struct QueryReliability {
212    /// Gold-query identifier.
213    pub query_id: String,
214    /// Number of repeated trials observed for this query.
215    pub trials: usize,
216    /// Number of trials whose score met or exceeded the threshold.
217    pub successes: usize,
218    /// `successes / trials`.
219    pub pass_rate: f64,
220    /// Empirical pass@k estimate for this query.
221    pub pass_at_k: f64,
222    /// Empirical pass^k estimate for this query.
223    pub pass_all_k: f64,
224}
225
226/// Repeated-trial reliability report for a single metric.
227///
228/// This report turns a set of repeated [`MetricReport`]s for the same metric
229/// into reliability estimates. Scores are thresholded into pass/fail outcomes
230/// first, then pass@k and pass^k are estimated per query and averaged.
231///
232/// ```
233/// use rig_retrieval_evals::{MetricReport, ReliabilityReport};
234///
235/// let trial_a = MetricReport::from_per_query(
236///     "recall@10".into(),
237///     vec![("q1".into(), 1.0), ("q2".into(), 0.0)],
238/// );
239/// let trial_b = MetricReport::from_per_query(
240///     "recall@10".into(),
241///     vec![("q1".into(), 1.0), ("q2".into(), 1.0)],
242/// );
243///
244/// let reliability = ReliabilityReport::from_metric_reports(
245///     "recall@10",
246///     1.0,
247///     2,
248///     &[trial_a, trial_b],
249/// )?;
250/// assert_eq!(reliability.n_queries, 2);
251/// assert_eq!(reliability.trials_per_query, 2);
252/// # Ok::<(), rig_retrieval_evals::Error>(())
253/// ```
254#[derive(Debug, Clone, Serialize, Deserialize)]
255pub struct ReliabilityReport {
256    /// Metric identifier shared by every trial report.
257    pub metric: String,
258    /// Score threshold used to convert each trial into pass/fail.
259    pub threshold: f64,
260    /// Number of attempts sampled in pass@k / pass^k estimates.
261    pub k: usize,
262    /// Number of queries included in the reliability estimate.
263    pub n_queries: usize,
264    /// Number of trials observed for each query.
265    pub trials_per_query: usize,
266    /// Mean per-query pass rate.
267    pub mean_pass_rate: f64,
268    /// Mean per-query pass@k.
269    pub pass_at_k: f64,
270    /// Mean per-query pass^k.
271    pub pass_all_k: f64,
272    /// Per-query reliability rows, in the first trial report's query order.
273    pub per_query: Vec<QueryReliability>,
274}
275
276/// Per-query freshness rollup derived from stale-hit and conflict detectors.
277#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
278pub struct FreshnessQueryRollup {
279    /// Query id shared by the stale and conflict reports.
280    pub query_id: String,
281    /// Number of top-k positions considered for this query.
282    pub considered: usize,
283    /// Number of stale hits inside the considered window.
284    pub stale_hits: usize,
285    /// Fraction of considered positions that were stale.
286    pub stale_rate: f64,
287    /// Number of conflict groups inside the considered window.
288    pub conflict_groups: usize,
289    /// Number of documents participating in conflict groups.
290    pub conflicting_doc_count: usize,
291    /// Fraction of considered positions that participated in a conflict group.
292    pub conflict_rate: f64,
293}
294
295/// Dataset-level freshness rollup for stale hits and version-key conflicts.
296///
297/// `FreshnessReport` is intentionally separate from IR metrics so callers can
298/// inspect stale/conflict details as freshness signals. Use
299/// [`MultiReport::with_freshness_metrics`] when those signals should also be
300/// converted into score-like metric rows that participate in
301/// [`RegressionGate`] checks.
302#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
303pub struct FreshnessReport {
304    /// Top-k window used to produce the underlying stale/conflict reports.
305    pub k: usize,
306    /// Number of queries rolled up.
307    pub query_count: usize,
308    /// Total top-k positions considered across all queries.
309    pub total_considered: usize,
310    /// Number of stale hits across all queries.
311    pub stale_hit_count: usize,
312    /// Number of queries with at least one stale hit.
313    pub stale_query_count: usize,
314    /// Dataset-level stale-hit rate: `stale_hit_count / total_considered`.
315    pub stale_rate: f64,
316    /// Fraction of queries that had at least one stale hit.
317    pub stale_query_rate: f64,
318    /// Number of conflict groups across all queries.
319    pub conflict_group_count: usize,
320    /// Number of documents participating in conflict groups across all queries.
321    pub conflicting_doc_count: usize,
322    /// Number of queries with at least one conflict group.
323    pub conflict_query_count: usize,
324    /// Dataset-level conflict rate: `conflicting_doc_count / total_considered`.
325    pub conflict_rate: f64,
326    /// Fraction of queries that had at least one conflict group.
327    pub conflict_query_rate: f64,
328    /// Per-query stale/conflict rollups, preserving stale-report input order.
329    pub per_query: Vec<FreshnessQueryRollup>,
330}
331
332impl FreshnessReport {
333    /// Build a dataset-level freshness rollup from per-query detector outputs.
334    ///
335    /// `staleness` and `conflicts` must cover the same query ids in the same
336    /// order and must use the same `considered` count per query.
337    pub fn from_query_reports(
338        k: usize,
339        staleness: &[StalenessReport],
340        conflicts: &[ConflictReport],
341    ) -> Result<Self> {
342        if staleness.len() != conflicts.len() {
343            return Err(Error::BaselineMismatch(format!(
344                "freshness report count mismatch: stale={} conflict={}",
345                staleness.len(),
346                conflicts.len()
347            )));
348        }
349
350        let mut per_query = Vec::with_capacity(staleness.len());
351        for (stale, conflict) in staleness.iter().zip(conflicts) {
352            if stale.query_id != conflict.query_id {
353                return Err(Error::BaselineMismatch(format!(
354                    "freshness query mismatch: stale={} conflict={}",
355                    stale.query_id, conflict.query_id
356                )));
357            }
358            if stale.considered != conflict.considered {
359                return Err(Error::BaselineMismatch(format!(
360                    "freshness considered mismatch for {}: stale={} conflict={}",
361                    stale.query_id, stale.considered, conflict.considered
362                )));
363            }
364            per_query.push(FreshnessQueryRollup {
365                query_id: stale.query_id.clone(),
366                considered: stale.considered,
367                stale_hits: stale.stale_hits.len(),
368                stale_rate: stale.stale_rate(),
369                conflict_groups: conflict.groups.len(),
370                conflicting_doc_count: conflict.conflicting_doc_count,
371                conflict_rate: conflict.conflict_rate(),
372            });
373        }
374
375        let query_count = per_query.len();
376        let total_considered = per_query.iter().map(|row| row.considered).sum();
377        let stale_hit_count = per_query.iter().map(|row| row.stale_hits).sum();
378        let stale_query_count = per_query.iter().filter(|row| row.stale_hits > 0).count();
379        let conflict_group_count = per_query.iter().map(|row| row.conflict_groups).sum();
380        let conflicting_doc_count = per_query.iter().map(|row| row.conflicting_doc_count).sum();
381        let conflict_query_count = per_query
382            .iter()
383            .filter(|row| row.conflict_groups > 0)
384            .count();
385
386        Ok(Self {
387            k,
388            query_count,
389            total_considered,
390            stale_hit_count,
391            stale_query_count,
392            stale_rate: ratio(stale_hit_count, total_considered),
393            stale_query_rate: ratio(stale_query_count, query_count),
394            conflict_group_count,
395            conflicting_doc_count,
396            conflict_query_count,
397            conflict_rate: ratio(conflicting_doc_count, total_considered),
398            conflict_query_rate: ratio(conflict_query_count, query_count),
399            per_query,
400        })
401    }
402
403    /// Convert freshness failures into score-like metric reports.
404    ///
405    /// Higher is better for these metrics, so the existing
406    /// [`RegressionGate`] can flag freshness regressions without a new gate
407    /// direction system.
408    #[must_use]
409    pub fn metric_reports(&self) -> Vec<MetricReport> {
410        let stale_free = self
411            .per_query
412            .iter()
413            .map(|row| (row.query_id.clone(), 1.0 - row.stale_rate))
414            .collect();
415        let conflict_free = self
416            .per_query
417            .iter()
418            .map(|row| (row.query_id.clone(), 1.0 - row.conflict_rate))
419            .collect();
420        vec![
421            MetricReport::from_per_query(
422                format!("freshness.stale_free_rate@{}", self.k),
423                stale_free,
424            ),
425            MetricReport::from_per_query(
426                format!("freshness.conflict_free_rate@{}", self.k),
427                conflict_free,
428            ),
429        ]
430    }
431}
432
/// `numerator / denominator` as an `f64`, defined as `0.0` when the
/// denominator is zero so empty rollups never produce NaN or infinity.
fn ratio(numerator: usize, denominator: usize) -> f64 {
    if denominator == 0 {
        return 0.0;
    }
    numerator as f64 / denominator as f64
}

441impl ReliabilityReport {
442    /// Build a repeated-trial reliability report from multiple
443    /// [`MetricReport`]s for the same metric.
444    ///
445    /// Every report must contain the same query ids exactly once. `k` must be
446    /// in `1..=reports.len()`. Scores must be finite.
447    pub fn from_metric_reports(
448        metric: impl Into<String>,
449        threshold: f64,
450        k: usize,
451        reports: &[MetricReport],
452    ) -> Result<Self> {
453        let metric = metric.into();
454        if reports.is_empty() {
455            return Err(Error::Config(
456                "at least one trial report is required".into(),
457            ));
458        }
459        if k == 0 {
460            return Err(Error::Config("pass@k requires k > 0".into()));
461        }
462        if !threshold.is_finite() {
463            return Err(Error::Config("reliability threshold must be finite".into()));
464        }
465        if k > reports.len() {
466            return Err(Error::Config(format!(
467                "pass@k k={} exceeds trial count {}",
468                k,
469                reports.len()
470            )));
471        }
472
473        for report in reports {
474            if report.metric != metric {
475                return Err(Error::BaselineMismatch(format!(
476                    "metric mismatch: expected {metric}, got {}",
477                    report.metric
478                )));
479            }
480        }
481
482        let first = reports
483            .first()
484            .ok_or_else(|| Error::Config("at least one trial report is required".into()))?;
485        let mut query_order = Vec::with_capacity(first.per_query.len());
486        let mut seen = std::collections::BTreeSet::new();
487        for (query_id, score) in &first.per_query {
488            if !score.is_finite() {
489                return Err(Error::Config(format!(
490                    "non-finite score for query {query_id}"
491                )));
492            }
493            if !seen.insert(query_id.as_str()) {
494                return Err(Error::BaselineMismatch(format!(
495                    "duplicate query id in trial report: {query_id}"
496                )));
497            }
498            query_order.push(query_id.clone());
499        }
500
501        let mut scores_by_query: BTreeMap<String, Vec<f64>> = query_order
502            .iter()
503            .map(|query_id| (query_id.clone(), Vec::with_capacity(reports.len())))
504            .collect();
505
506        for report in reports {
507            let mut report_scores = BTreeMap::new();
508            for (query_id, score) in &report.per_query {
509                if !score.is_finite() {
510                    return Err(Error::Config(format!(
511                        "non-finite score for query {query_id}"
512                    )));
513                }
514                if report_scores.insert(query_id.as_str(), *score).is_some() {
515                    return Err(Error::BaselineMismatch(format!(
516                        "duplicate query id in trial report: {query_id}"
517                    )));
518                }
519            }
520            if report_scores.len() != query_order.len() {
521                return Err(Error::BaselineMismatch(format!(
522                    "trial report has {} queries; expected {}",
523                    report_scores.len(),
524                    query_order.len()
525                )));
526            }
527            for query_id in &query_order {
528                let Some(score) = report_scores.get(query_id.as_str()).copied() else {
529                    return Err(Error::BaselineMismatch(format!(
530                        "trial report missing query id {query_id}"
531                    )));
532                };
533                let Some(scores) = scores_by_query.get_mut(query_id) else {
534                    return Err(Error::BaselineMismatch(format!(
535                        "unexpected query id {query_id}"
536                    )));
537                };
538                scores.push(score);
539            }
540        }
541
542        let mut per_query = Vec::with_capacity(query_order.len());
543        for query_id in query_order {
544            let Some(scores) = scores_by_query.remove(&query_id) else {
545                return Err(Error::BaselineMismatch(format!(
546                    "missing scores for query id {query_id}"
547                )));
548            };
549            per_query.push(query_reliability(query_id, &scores, threshold, k));
550        }
551
552        let n_queries = per_query.len();
553        let trials_per_query = reports.len();
554        let mean_pass_rate = mean_by(&per_query, |q| q.pass_rate);
555        let pass_at_k = mean_by(&per_query, |q| q.pass_at_k);
556        let pass_all_k = mean_by(&per_query, |q| q.pass_all_k);
557
558        Ok(Self {
559            metric,
560            threshold,
561            k,
562            n_queries,
563            trials_per_query,
564            mean_pass_rate,
565            pass_at_k,
566            pass_all_k,
567            per_query,
568        })
569    }
570}
571
572fn query_reliability(
573    query_id: String,
574    scores: &[f64],
575    threshold: f64,
576    k: usize,
577) -> QueryReliability {
578    let trials = scores.len();
579    let successes = scores.iter().filter(|score| **score >= threshold).count();
580    let pass_rate = if trials == 0 {
581        0.0
582    } else {
583        successes as f64 / trials as f64
584    };
585    QueryReliability {
586        query_id,
587        trials,
588        successes,
589        pass_rate,
590        pass_at_k: pass_at_k_estimate(trials, successes, k),
591        pass_all_k: pass_all_k_estimate(trials, successes, k),
592    }
593}
594
595fn mean_by(rows: &[QueryReliability], f: impl Fn(&QueryReliability) -> f64) -> f64 {
596    if rows.is_empty() {
597        return 0.0;
598    }
599    rows.iter().map(f).sum::<f64>() / rows.len() as f64
600}
601
/// Probability that at least one of `k` trials drawn without replacement
/// from `trials` observations (of which `successes` passed) is a success.
///
/// Returns `0.0` when `k`, `trials`, or `successes` is zero, and `1.0` when
/// there are fewer than `k` failures to draw (including `k > trials`), since
/// any draw of `k` must then contain a success. A `successes` count larger
/// than `trials` is treated as "no failures" rather than underflowing.
fn pass_at_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    if k == 0 || trials == 0 || successes == 0 {
        return 0.0;
    }
    let failures = trials.saturating_sub(successes);
    if k > trials || failures < k {
        return 1.0;
    }
    // P(all k draws fail) = prod_{i<k} (failures - i) / (trials - i).
    let fail_all = (0..k).fold(1.0, |acc, offset| {
        acc * ((failures - offset) as f64 / (trials - offset) as f64)
    });
    1.0 - fail_all
}

/// Probability that all `k` trials drawn without replacement from `trials`
/// observations (of which `successes` passed) are successes.
///
/// Returns `0.0` when `k` or `trials` is zero, when `k > trials`, or when
/// there are fewer than `k` successes. A `successes` count larger than
/// `trials` is clamped to `trials` so the result never exceeds `1.0`.
fn pass_all_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    let successes = successes.min(trials);
    if k == 0 || trials == 0 || successes < k || k > trials {
        return 0.0;
    }
    // P(all k draws succeed) = prod_{i<k} (successes - i) / (trials - i).
    (0..k).fold(1.0, |acc, offset| {
        acc * ((successes - offset) as f64 / (trials - offset) as f64)
    })
}

624/// A bundle of [`MetricReport`]s with optional run metadata, suitable for
625/// JSON persistence and baseline comparison.
626#[derive(Debug, Clone, Serialize, Deserialize, Default)]
627pub struct MultiReport {
628    /// Free-form dataset identifier (e.g. `"beir/nq"` or `"internal/v3"`).
629    #[serde(default, skip_serializing_if = "Option::is_none")]
630    pub dataset_id: Option<String>,
631    /// Free-form store identifier (e.g. `"memvid:livetest.mv2"`).
632    #[serde(default, skip_serializing_if = "Option::is_none")]
633    pub store_kind: Option<String>,
634    /// Opaque fingerprint of any LLM judges used. Reports with mismatched
635    /// fingerprints refuse to diff to prevent silent comparison drift.
636    /// Reserved for the upcoming `ragas` feature; pure retrieval runs leave
637    /// this empty.
638    #[serde(default, skip_serializing_if = "Option::is_none")]
639    pub judge_fingerprint: Option<String>,
640    /// One report per metric, in the order metrics were declared.
641    pub metrics: Vec<MetricReport>,
642    /// Optional stale/conflict freshness rollup for the same dataset.
643    #[serde(default, skip_serializing_if = "Option::is_none")]
644    pub freshness: Option<FreshnessReport>,
645}
646
647impl MultiReport {
648    /// Construct a [`MultiReport`] from a metric report vector. Other
649    /// metadata is filled in via the `with_*` builders.
650    #[must_use]
651    pub fn new(metrics: Vec<MetricReport>) -> Self {
652        Self {
653            metrics,
654            ..Default::default()
655        }
656    }
657
658    /// Attach a dataset identifier.
659    #[must_use]
660    pub fn with_dataset(mut self, id: impl Into<String>) -> Self {
661        self.dataset_id = Some(id.into());
662        self
663    }
664
665    /// Attach a store kind identifier.
666    #[must_use]
667    pub fn with_store(mut self, kind: impl Into<String>) -> Self {
668        self.store_kind = Some(kind.into());
669        self
670    }
671
672    /// Attach a judge fingerprint (reserved for `ragas`).
673    #[must_use]
674    pub fn with_judge_fingerprint(mut self, fp: impl Into<String>) -> Self {
675        self.judge_fingerprint = Some(fp.into());
676        self
677    }
678
679    /// Attach a deterministic percentile-bootstrap confidence interval to
680    /// every metric row. Equivalent to mapping [`MetricReport::with_bootstrap_ci`]
681    /// over [`MultiReport::metrics`]. See that method for the algorithm and
682    /// determinism guarantees.
683    #[must_use]
684    pub fn with_bootstrap(mut self, iterations: usize, level: f64, seed: u64) -> Self {
685        self.metrics = self
686            .metrics
687            .into_iter()
688            .map(|m| m.with_bootstrap_ci(iterations, level, seed))
689            .collect();
690        self
691    }
692
693    /// Attach a freshness rollup without modifying metric rows.
694    #[must_use]
695    pub fn with_freshness(mut self, freshness: FreshnessReport) -> Self {
696        self.freshness = Some(freshness);
697        self
698    }
699
700    /// Attach a freshness rollup and append score-like freshness metrics.
701    ///
702    /// Appended metric names are `freshness.stale_free_rate@k` and
703    /// `freshness.conflict_free_rate@k`. Because higher is better, these rows
704    /// can be gated with [`RegressionGate`] just like recall, nDCG, or MRR.
705    #[must_use]
706    pub fn with_freshness_metrics(mut self, freshness: FreshnessReport) -> Self {
707        self.metrics.extend(freshness.metric_reports());
708        self.freshness = Some(freshness);
709        self
710    }
711
712    /// Serialize as pretty-printed JSON.
713    pub fn to_json(&self) -> Result<String> {
714        Ok(serde_json::to_string_pretty(self)?)
715    }
716
717    /// Render a compact Markdown summary table.
718    #[must_use]
719    pub fn to_markdown(&self) -> String {
720        let mut out = String::new();
721        out.push_str("| metric | n | mean | stddev | p50 | p95 | min | max |\n");
722        out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|\n");
723        for m in &self.metrics {
724            out.push_str(&format!(
725                "| {} | {} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} |\n",
726                m.metric, m.n, m.mean, m.stddev, m.p50, m.p95, m.min, m.max
727            ));
728        }
729        out
730    }
731
732    /// Diff this report against a baseline. Returns a [`ReportDiff`] with
733    /// per-metric Δ-mean and per-query winners/losers. Fails if the two
734    /// reports were produced with different judge fingerprints (silent
735    /// comparison drift).
736    ///
737    /// Per-query deltas are computed by intersecting the two reports'
738    /// `per_query` vectors on `query_id`. Queries missing from either side
739    /// are skipped (they cannot be compared). `winners`, `losers`, and
740    /// `unchanged` use an absolute threshold of `1e-9` to filter floating
741    /// point noise; callers needing different sensitivity should inspect
742    /// [`MetricDelta::query_changes`] directly.
743    pub fn diff(&self, baseline: &MultiReport) -> Result<ReportDiff> {
744        if self.judge_fingerprint != baseline.judge_fingerprint {
745            return Err(Error::BaselineMismatch(format!(
746                "judge fingerprint mismatch: current={:?} baseline={:?}",
747                self.judge_fingerprint, baseline.judge_fingerprint
748            )));
749        }
750        let base_by_name: BTreeMap<&str, &MetricReport> = baseline
751            .metrics
752            .iter()
753            .map(|m| (m.metric.as_str(), m))
754            .collect();
755        let mut rows = Vec::with_capacity(self.metrics.len());
756        for m in &self.metrics {
757            let base = base_by_name.get(m.metric.as_str()).copied();
758            let baseline_mean = base.map(|b| b.mean);
759            let (query_changes, winners, losers, unchanged) = match base {
760                Some(b) => compute_query_changes(&m.per_query, &b.per_query),
761                None => (Vec::new(), 0, 0, 0),
762            };
763            rows.push(MetricDelta {
764                metric: m.metric.clone(),
765                current_mean: m.mean,
766                baseline_mean,
767                delta: baseline_mean.map(|b| m.mean - b),
768                winners,
769                losers,
770                unchanged,
771                query_changes,
772                current_ci: m.ci,
773                baseline_ci: base.and_then(|b| b.ci),
774            });
775        }
776        Ok(ReportDiff { rows })
777    }
778
779    /// Render a head-to-head Markdown delta table of this report against
780    /// `baseline` (current, baseline, and `Δ = current − baseline` per
781    /// metric). Convenience wrapper over `self.diff(baseline)?.to_markdown()`;
782    /// fails on a judge-fingerprint mismatch for the same reason [`diff`] does.
783    ///
784    /// [`diff`]: MultiReport::diff
785    pub fn delta_markdown(&self, baseline: &MultiReport) -> Result<String> {
786        Ok(self.diff(baseline)?.to_markdown())
787    }
788}
789
790/// Floating-point noise floor used when bucketing per-query deltas into
791/// winners / losers / unchanged. Deltas with `|delta| <= EPSILON` count as
792/// unchanged.
793const EPSILON: f64 = 1e-9;
794
795/// Intersect per-query scores and return `(changes, winners, losers, unchanged)`.
796/// `changes` is sorted by `|delta|` descending so the largest movers are
797/// surfaced first.
798fn compute_query_changes(
799    current: &[(String, f64)],
800    baseline: &[(String, f64)],
801) -> (Vec<QueryDelta>, usize, usize, usize) {
802    let base_by_query: BTreeMap<&str, f64> =
803        baseline.iter().map(|(q, s)| (q.as_str(), *s)).collect();
804    let mut changes = Vec::new();
805    let mut winners = 0usize;
806    let mut losers = 0usize;
807    let mut unchanged = 0usize;
808    for (query_id, cur_score) in current {
809        let Some(base_score) = base_by_query.get(query_id.as_str()).copied() else {
810            continue;
811        };
812        let delta = cur_score - base_score;
813        if delta > EPSILON {
814            winners += 1;
815        } else if delta < -EPSILON {
816            losers += 1;
817        } else {
818            unchanged += 1;
819        }
820        changes.push(QueryDelta {
821            query_id: query_id.clone(),
822            current: *cur_score,
823            baseline: base_score,
824            delta,
825        });
826    }
827    changes.sort_by(|a, b| {
828        b.delta
829            .abs()
830            .partial_cmp(&a.delta.abs())
831            .unwrap_or(std::cmp::Ordering::Equal)
832    });
833    (changes, winners, losers, unchanged)
834}
835
836/// Per-metric delta produced by [`MultiReport::diff`].
837#[derive(Debug, Clone, Serialize, Deserialize)]
838pub struct MetricDelta {
839    /// Metric identifier.
840    pub metric: String,
841    /// Mean from the current report.
842    pub current_mean: f64,
843    /// Mean from the baseline report, if the metric was present.
844    pub baseline_mean: Option<f64>,
845    /// `current_mean - baseline_mean`, if comparable.
846    pub delta: Option<f64>,
847    /// Number of queries whose score improved relative to the baseline.
848    #[serde(default)]
849    pub winners: usize,
850    /// Number of queries whose score regressed relative to the baseline.
851    #[serde(default)]
852    pub losers: usize,
853    /// Number of queries whose score was unchanged (within floating-point
854    /// noise) relative to the baseline.
855    #[serde(default)]
856    pub unchanged: usize,
857    /// Per-query deltas for queries present in both reports, sorted by
858    /// `|delta|` descending. Empty if the metric was missing from the
859    /// baseline.
860    #[serde(default, skip_serializing_if = "Vec::is_empty")]
861    pub query_changes: Vec<QueryDelta>,
862    /// Bootstrap CI on the current report's mean, if it was computed.
863    #[serde(default, skip_serializing_if = "Option::is_none")]
864    pub current_ci: Option<MetricCi>,
865    /// Bootstrap CI on the baseline report's mean, if it was computed.
866    #[serde(default, skip_serializing_if = "Option::is_none")]
867    pub baseline_ci: Option<MetricCi>,
868}
869
870/// Per-query score change for a single metric, produced by
871/// [`MultiReport::diff`].
872#[derive(Debug, Clone, Serialize, Deserialize)]
873pub struct QueryDelta {
874    /// Gold-query identifier.
875    pub query_id: String,
876    /// Score on the current report.
877    pub current: f64,
878    /// Score on the baseline report.
879    pub baseline: f64,
880    /// `current - baseline`.
881    pub delta: f64,
882}
883
884/// Result of [`MultiReport::diff`].
885#[derive(Debug, Clone, Serialize, Deserialize)]
886pub struct ReportDiff {
887    /// One row per metric in the current report.
888    pub rows: Vec<MetricDelta>,
889}
890
891impl ReportDiff {
892    /// Render the diff as a Markdown table including per-metric mean delta
893    /// and per-query winner/loser/unchanged counts. Per-query movers are
894    /// not inlined; inspect [`MetricDelta::query_changes`] for that detail.
895    #[must_use]
896    pub fn to_markdown(&self) -> String {
897        let mut out = String::new();
898        out.push_str("| metric | current | baseline | Δ | win | lose | same |\n");
899        out.push_str("|---|---:|---:|---:|---:|---:|---:|\n");
900        for r in &self.rows {
901            let baseline = r
902                .baseline_mean
903                .map(|v| format!("{v:.4}"))
904                .unwrap_or_else(|| "—".to_string());
905            let delta = r
906                .delta
907                .map(|v| format!("{v:+.4}"))
908                .unwrap_or_else(|| "—".to_string());
909            out.push_str(&format!(
910                "| {} | {:.4} | {} | {} | {} | {} | {} |\n",
911                r.metric, r.current_mean, baseline, delta, r.winners, r.losers, r.unchanged
912            ));
913        }
914        out
915    }
916
917    /// Serialize as pretty-printed JSON.
918    pub fn to_json(&self) -> Result<String> {
919        Ok(serde_json::to_string_pretty(self)?)
920    }
921
922    /// Evaluate the diff against a [`RegressionGate`]. Returns the subset
923    /// of [`MetricDelta`] rows whose mean delta is more negative than the
924    /// configured threshold for that metric. Metrics not listed in the
925    /// gate are ignored.
926    #[must_use]
927    pub fn regressions(&self, gate: &RegressionGate) -> Vec<MetricDelta> {
928        self.rows
929            .iter()
930            .filter(|r| match (gate.threshold(&r.metric), r.delta) {
931                (Some(threshold), Some(delta)) => delta < -threshold,
932                _ => false,
933            })
934            .cloned()
935            .collect()
936    }
937
938    /// True when [`ReportDiff::regressions`] returns no rows for `gate`.
939    /// Convenience accessor for CI scripts that just want a yes / no.
940    #[must_use]
941    pub fn is_clean(&self, gate: &RegressionGate) -> bool {
942        self.regressions(gate).is_empty()
943    }
944
945    /// Process exit code suitable for `std::process::exit` in a CI eval
946    /// binary: `0` when the diff passes `gate`, `1` when one or more
947    /// metrics regress beyond their tolerated drop. Mirrors the
948    /// long-standing UNIX convention of `0 = success, non-zero = failure`
949    /// and is the single bit consumers should branch on.
950    #[must_use]
951    pub fn exit_code(&self, gate: &RegressionGate) -> i32 {
952        if self.is_clean(gate) { 0 } else { 1 }
953    }
954}
955
/// Threshold-based regression gate over a [`ReportDiff`].
///
/// Each entry maps a metric name to the **maximum tolerated drop** in mean
/// score: a metric regresses when its `delta` is more negative than
/// `-threshold`. Thresholds are non-negative; negative values are clamped
/// to zero on insert.
///
/// ```
/// use rig_retrieval_evals::RegressionGate;
///
/// let gate = RegressionGate::new()
///     .with_threshold("recall@10", 0.02)
///     .with_threshold("ndcg@10", 0.01);
/// assert_eq!(gate.threshold("recall@10"), Some(0.02));
/// assert_eq!(gate.threshold("mrr"), None);
/// ```
#[derive(Debug, Clone, Default)]
pub struct RegressionGate {
    /// Tolerated drop per metric name; every stored value is `>= 0.0`.
    thresholds: BTreeMap<String, f64>,
}

impl RegressionGate {
    /// Build an empty gate. Metrics added via
    /// [`RegressionGate::with_threshold`] participate in regression checks;
    /// any others are ignored.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Register `threshold` as the maximum tolerated drop in mean score
    /// for `metric`, replacing any threshold previously registered for the
    /// same metric.
    ///
    /// Negative values are clamped to `0.0`. A NaN threshold is also stored
    /// as `0.0`, because `f64::max` returns its non-NaN operand.
    #[must_use]
    pub fn with_threshold(mut self, metric: impl Into<String>, threshold: f64) -> Self {
        let tolerated_drop = threshold.max(0.0);
        self.thresholds.insert(metric.into(), tolerated_drop);
        self
    }

    /// Threshold registered for `metric`, if any.
    #[must_use]
    pub fn threshold(&self, metric: &str) -> Option<f64> {
        self.thresholds.get(metric).copied()
    }
}
1000
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::panic, clippy::indexing_slicing)]
mod tests {
    use super::*;
    use crate::staleness::{ConflictGroup, ConflictReport, StaleHit, StalenessReport};

    /// Assert that two floats differ by less than `1e-9`.
    macro_rules! assert_close {
        ($actual:expr, $expected:expr) => {
            assert!((($actual) - ($expected)).abs() < 1e-9)
        };
    }

    /// Build a [`MetricReport`] named `metric` from `(query_id, score)`
    /// literals.
    fn scored(metric: &str, scores: &[(&str, f64)]) -> MetricReport {
        MetricReport::from_per_query(
            metric.into(),
            scores.iter().map(|&(id, score)| (id.into(), score)).collect(),
        )
    }

    /// Wrap a single metric in a [`MultiReport`] with no metadata.
    fn solo(metric: &str, scores: &[(&str, f64)]) -> MultiReport {
        MultiReport::new(vec![scored(metric, scores)])
    }

    // Three evenly spaced scores: count, mean, extremes and median.
    #[test]
    fn metric_report_aggregates() {
        let report = scored("recall@10", &[("q1", 0.0), ("q2", 0.5), ("q3", 1.0)]);
        assert_eq!(report.n, 3);
        assert_close!(report.mean, 0.5);
        assert_close!(report.min, 0.0);
        assert_close!(report.max, 1.0);
        assert_close!(report.p50, 0.5);
    }

    // A report over no queries has zero count and zero mean.
    #[test]
    fn empty_report_is_zero() {
        let report = scored("m", &[]);
        assert_eq!(report.n, 0);
        assert_eq!(report.mean, 0.0);
    }

    // Different judge fingerprints make the diff fail.
    #[test]
    fn diff_flags_fingerprint_mismatch() {
        let current = MultiReport::new(vec![]).with_judge_fingerprint("a");
        let baseline = MultiReport::new(vec![]).with_judge_fingerprint("b");
        assert!(current.diff(&baseline).is_err());
    }

    // 0.8 against 0.6 yields a single row with delta 0.2.
    #[test]
    fn diff_computes_per_metric_delta() {
        let current = solo("recall@10", &[("q1", 0.8)]);
        let baseline = solo("recall@10", &[("q1", 0.6)]);
        let diff = current.diff(&baseline).unwrap();
        assert_eq!(diff.rows.len(), 1);
        assert_close!(diff.rows[0].delta.unwrap_or(0.0), 0.2);
    }

    #[test]
    fn diff_buckets_per_query_winners_losers_and_unchanged() {
        let current = solo(
            "recall@10",
            &[
                ("q1", 1.0), // winner: 0.5 -> 1.0
                ("q2", 0.0), // loser:  0.5 -> 0.0
                ("q3", 0.5), // unchanged
                ("q4", 0.9), // only in the current report, skipped
            ],
        );
        let baseline = solo(
            "recall@10",
            &[
                ("q1", 0.5),
                ("q2", 0.5),
                ("q3", 0.5),
                ("q5", 1.0), // only in the baseline, skipped
            ],
        );
        let diff = current.diff(&baseline).unwrap();
        let row = &diff.rows[0];
        assert_eq!(row.winners, 1);
        assert_eq!(row.losers, 1);
        assert_eq!(row.unchanged, 1);
        // q4 and q5 each appear in one report only, so they are skipped.
        assert_eq!(row.query_changes.len(), 3);
        // Largest |delta| first: q1 and q2 tie at 0.5, q3 trails at 0.0.
        let smallest = &row.query_changes[2];
        assert_eq!(smallest.query_id, "q3");
        assert_close!(smallest.delta, 0.0);
    }

    // A metric the baseline lacks yields an empty comparison row.
    #[test]
    fn diff_query_changes_empty_when_baseline_missing_metric() {
        let current = solo("ndcg@10", &[("q1", 0.9)]);
        let baseline = MultiReport::new(vec![]);
        let diff = current.diff(&baseline).unwrap();
        let row = &diff.rows[0];
        assert!(row.delta.is_none());
        assert_eq!(row.winners, 0);
        assert_eq!(row.losers, 0);
        assert_eq!(row.unchanged, 0);
        assert!(row.query_changes.is_empty());
    }

    #[test]
    fn regression_gate_flags_only_metrics_below_threshold() {
        // recall@10 drops 0.10 (regression), ndcg@10 drops 0.005 (within
        // tolerance), mrr is not in the gate (ignored).
        let three_metrics = |recall: f64, ndcg: f64, mrr: f64| {
            MultiReport::new(vec![
                scored("recall@10", &[("q1", recall)]),
                scored("ndcg@10", &[("q1", ndcg)]),
                scored("mrr", &[("q1", mrr)]),
            ])
        };
        let current = three_metrics(0.50, 0.595, 0.10);
        let baseline = three_metrics(0.60, 0.60, 0.90);
        let gate = RegressionGate::new()
            .with_threshold("recall@10", 0.02)
            .with_threshold("ndcg@10", 0.02);
        let regressed = current.diff(&baseline).unwrap().regressions(&gate);
        assert_eq!(regressed.len(), 1);
        assert_eq!(regressed[0].metric, "recall@10");
    }

    // A negative threshold is stored as zero.
    #[test]
    fn regression_gate_clamps_negative_thresholds() {
        let gate = RegressionGate::new().with_threshold("recall@10", -0.5);
        assert_eq!(gate.threshold("recall@10"), Some(0.0));
    }

    // Serializing a diff to JSON and parsing it back keeps the counts.
    #[test]
    fn report_diff_to_json_round_trips() {
        let current = solo("recall@10", &[("q1", 1.0), ("q2", 0.0)]);
        let baseline = solo("recall@10", &[("q1", 0.5), ("q2", 0.5)]);
        let json = current.diff(&baseline).unwrap().to_json().unwrap();
        let parsed: ReportDiff = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.rows.len(), 1);
        let row = &parsed.rows[0];
        assert_eq!(row.winners, 1);
        assert_eq!(row.losers, 1);
        assert_eq!(row.query_changes.len(), 2);
    }

    #[test]
    fn reliability_report_computes_pass_at_k_and_pass_all_k() {
        // Three trials over the same two queries.
        let trials = vec![
            scored("recall@10", &[("q1", 1.0), ("q2", 0.0)]),
            scored("recall@10", &[("q1", 0.0), ("q2", 1.0)]),
            scored("recall@10", &[("q1", 1.0), ("q2", 0.0)]),
        ];

        let reliability =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &trials).unwrap();

        assert_eq!(reliability.n_queries, 2);
        assert_eq!(reliability.trials_per_query, 3);
        assert_close!(reliability.mean_pass_rate, 0.5);
        // q1: 2/3 successes => pass@2 = 1.0, pass^2 = 1/3.
        // q2: 1/3 successes => pass@2 = 2/3, pass^2 = 0.0.
        assert_close!(reliability.pass_at_k, 5.0 / 6.0);
        assert_close!(reliability.pass_all_k, 1.0 / 6.0);
        let first = &reliability.per_query[0];
        assert_eq!(first.query_id, "q1");
        assert_eq!(first.successes, 2);
    }

    // Trials for a different metric are rejected.
    #[test]
    fn reliability_report_requires_matching_metrics() {
        let trials = vec![
            scored("recall@10", &[("q1", 1.0)]),
            scored("ndcg@10", &[("q1", 1.0)]),
        ];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &trials).unwrap_err();

        let Error::BaselineMismatch(message) = &err else {
            panic!("unexpected error: {err:?}");
        };
        assert!(message.contains("metric mismatch"));
    }

    // Trials that do not cover the same query ids are rejected.
    #[test]
    fn reliability_report_requires_same_queries() {
        let trials = vec![
            scored("recall@10", &[("q1", 1.0)]),
            scored("recall@10", &[("q2", 1.0)]),
        ];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &trials).unwrap_err();

        let Error::BaselineMismatch(message) = &err else {
            panic!("unexpected error: {err:?}");
        };
        assert!(message.contains("missing query id"));
    }

    // Asking for k = 2 with a single trial is a configuration error.
    #[test]
    fn reliability_report_rejects_k_larger_than_trials() {
        let trials = vec![scored("recall@10", &[("q1", 1.0)])];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &trials).unwrap_err();

        let Error::Config(message) = &err else {
            panic!("unexpected error: {err:?}");
        };
        assert!(message.contains("exceeds trial count"));
    }

    // A reliability report survives a JSON round trip.
    #[test]
    fn reliability_report_serializes() {
        let trials = vec![scored("recall@10", &[("q1", 1.0)])];

        let reliability =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &trials).unwrap();
        let json = serde_json::to_string(&reliability).unwrap();
        let parsed: ReliabilityReport = serde_json::from_str(&json).unwrap();

        assert_eq!(parsed.metric, "recall@10");
        assert_eq!(parsed.per_query.len(), 1);
        assert_close!(parsed.pass_at_k, 1.0);
    }

    #[test]
    fn bootstrap_ci_brackets_the_mean_for_well_separated_scores() {
        // Fifty queries alternating between 0.8 and 0.6.
        let alternating: Vec<(String, f64)> = (0..50)
            .map(|i| (format!("q{i}"), if i % 2 == 0 { 0.8 } else { 0.6 }))
            .collect();
        let report = MetricReport::from_per_query("recall@10".into(), alternating)
            .with_bootstrap_ci(1000, 0.95, 0xC0FFEE);
        let ci = report.ci.unwrap();
        assert!(ci.lower < report.mean);
        assert!(ci.upper > report.mean);
        assert!(ci.lower >= 0.6 - 1e-9);
        assert!(ci.upper <= 0.8 + 1e-9);
        assert_eq!(ci.iterations, 1000);
        assert_close!(ci.level, 0.95);
    }

    // Same scores, same parameters and same seed give the same interval.
    #[test]
    fn bootstrap_ci_is_deterministic_for_a_fixed_seed() {
        let scores: Vec<(String, f64)> = (0..30)
            .map(|i| (format!("q{i}"), (i % 5) as f64 / 4.0))
            .collect();
        let first = MetricReport::from_per_query("m".into(), scores.clone())
            .with_bootstrap_ci(500, 0.9, 42)
            .ci
            .unwrap();
        let second = MetricReport::from_per_query("m".into(), scores)
            .with_bootstrap_ci(500, 0.9, 42)
            .ci
            .unwrap();
        assert_eq!(first, second);
    }

    // Zero iterations, a level of 0.0 or 1.0, or no scores yield no interval.
    #[test]
    fn bootstrap_ci_rejects_invalid_input() {
        let single = scored("m", &[("q1", 0.5)]);
        assert!(single.bootstrap_ci(0, 0.95, 1).is_none());
        assert!(single.bootstrap_ci(100, 0.0, 1).is_none());
        assert!(single.bootstrap_ci(100, 1.0, 1).is_none());
        let empty = scored("m", &[]);
        assert!(empty.bootstrap_ci(100, 0.95, 1).is_none());
    }

    #[test]
    fn report_diff_exit_code_signals_regression() {
        let baseline = solo("recall@10", &[("q1", 0.9), ("q2", 0.9)]);
        let candidate = solo("recall@10", &[("q1", 0.4), ("q2", 0.4)]);
        let diff = candidate.diff(&baseline).unwrap();

        // A 0.5 drop exceeds a 0.05 tolerance.
        let strict_gate = RegressionGate::new().with_threshold("recall@10", 0.05);
        assert!(!diff.is_clean(&strict_gate));
        assert_eq!(diff.exit_code(&strict_gate), 1);

        // The same drop fits inside a 1.0 tolerance.
        let lax_gate = RegressionGate::new().with_threshold("recall@10", 1.0);
        assert!(diff.is_clean(&lax_gate));
        assert_eq!(diff.exit_code(&lax_gate), 0);
    }

    // Confidence intervals from both reports are copied into the diff row.
    #[test]
    fn report_diff_carries_bootstrap_ci_on_both_sides() {
        let scores: Vec<(String, f64)> = (0..20).map(|i| (format!("q{i}"), 0.5)).collect();
        let with_ci = |per_query: Vec<(String, f64)>| {
            MultiReport::new(vec![
                MetricReport::from_per_query("recall@10".into(), per_query)
                    .with_bootstrap_ci(200, 0.95, 7),
            ])
        };
        let candidate = with_ci(scores.clone());
        let baseline = with_ci(scores);
        let diff = candidate.diff(&baseline).unwrap();
        let row = diff.rows.first().unwrap();
        assert!(row.current_ci.is_some());
        assert!(row.baseline_ci.is_some());
    }

    #[test]
    fn freshness_report_rolls_up_query_rates_and_appends_gateable_metrics() {
        // q1 has one stale hit and one conflict group; q2 has neither.
        let stale_q1 = StalenessReport {
            query_id: "q1".into(),
            stale_hits: vec![StaleHit {
                doc_id: "old".into(),
                rank: 0,
                superseded_by: "new".into(),
            }],
            considered: 2,
        };
        let fresh_q2 = StalenessReport {
            query_id: "q2".into(),
            stale_hits: vec![],
            considered: 2,
        };
        let conflict_q1 = ConflictReport {
            query_id: "q1".into(),
            groups: vec![ConflictGroup {
                version_key: "alice:address".into(),
                doc_ids: vec!["old".into(), "new".into()],
            }],
            conflicting_doc_count: 2,
            considered: 2,
        };
        let clean_q2 = ConflictReport {
            query_id: "q2".into(),
            groups: vec![],
            conflicting_doc_count: 0,
            considered: 2,
        };
        let staleness = vec![stale_q1, fresh_q2];
        let conflicts = vec![conflict_q1, clean_q2];

        let freshness = FreshnessReport::from_query_reports(2, &staleness, &conflicts).unwrap();

        assert_eq!(freshness.query_count, 2);
        assert_eq!(freshness.total_considered, 4);
        assert_eq!(freshness.stale_hit_count, 1);
        assert_eq!(freshness.stale_query_count, 1);
        assert_close!(freshness.stale_rate, 0.25);
        assert_close!(freshness.stale_query_rate, 0.5);
        assert_eq!(freshness.conflict_group_count, 1);
        assert_eq!(freshness.conflicting_doc_count, 2);
        assert_close!(freshness.conflict_rate, 0.5);
        assert_eq!(freshness.per_query[0].stale_rate, 0.5);
        assert_eq!(freshness.per_query[0].conflict_rate, 1.0);

        // Attaching the freshness report appends two metrics.
        let report = MultiReport::new(vec![]).with_freshness_metrics(freshness);
        assert!(report.freshness.is_some());
        assert_eq!(report.metrics.len(), 2);
        let stale_free = &report.metrics[0];
        assert_eq!(stale_free.metric, "freshness.stale_free_rate@2");
        assert_close!(stale_free.mean, 0.75);
        let conflict_free = &report.metrics[1];
        assert_eq!(conflict_free.metric, "freshness.conflict_free_rate@2");
        assert_close!(conflict_free.mean, 0.5);
    }

    #[test]
    fn freshness_metrics_participate_in_existing_regression_gate() {
        // Both sides share the same conflict-free report for q1; only the
        // candidate has a stale hit.
        let no_conflicts = || ConflictReport {
            query_id: "q1".into(),
            groups: vec![],
            conflicting_doc_count: 0,
            considered: 2,
        };
        let baseline_freshness = FreshnessReport::from_query_reports(
            2,
            &[StalenessReport {
                query_id: "q1".into(),
                stale_hits: vec![],
                considered: 2,
            }],
            &[no_conflicts()],
        )
        .unwrap();
        let candidate_freshness = FreshnessReport::from_query_reports(
            2,
            &[StalenessReport {
                query_id: "q1".into(),
                stale_hits: vec![StaleHit {
                    doc_id: "old".into(),
                    rank: 0,
                    superseded_by: "new".into(),
                }],
                considered: 2,
            }],
            &[no_conflicts()],
        )
        .unwrap();

        let baseline = MultiReport::new(vec![]).with_freshness_metrics(baseline_freshness);
        let candidate = MultiReport::new(vec![]).with_freshness_metrics(candidate_freshness);
        let gate = RegressionGate::new().with_threshold("freshness.stale_free_rate@2", 0.1);

        let regressions = candidate.diff(&baseline).unwrap().regressions(&gate);
        assert_eq!(regressions.len(), 1);
        assert_eq!(regressions[0].metric, "freshness.stale_free_rate@2");
    }

    // Staleness and conflict reports for different queries are rejected.
    #[test]
    fn freshness_report_requires_matching_query_reports() {
        let staleness = [StalenessReport {
            query_id: "q1".into(),
            stale_hits: vec![],
            considered: 1,
        }];
        let conflicts = [ConflictReport {
            query_id: "q2".into(),
            groups: vec![],
            conflicting_doc_count: 0,
            considered: 1,
        }];

        let err = FreshnessReport::from_query_reports(5, &staleness, &conflicts).unwrap_err();

        let Error::BaselineMismatch(message) = &err else {
            panic!("unexpected error: {err:?}");
        };
        assert!(message.contains("freshness query mismatch"));
    }
}