1use std::collections::BTreeMap;
15
16use serde::{Deserialize, Serialize};
17
18use crate::error::{Error, Result};
19use crate::staleness::{ConflictReport, StalenessReport};
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct MetricReport {
24 pub metric: String,
26 pub n: usize,
28 pub mean: f64,
30 pub stddev: f64,
32 pub min: f64,
34 pub max: f64,
36 pub p50: f64,
38 pub p95: f64,
40 pub per_query: Vec<(String, f64)>,
42 #[serde(default, skip_serializing_if = "Option::is_none")]
47 pub ci: Option<MetricCi>,
48}
49
50#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
59pub struct MetricCi {
60 pub lower: f64,
62 pub upper: f64,
64 pub level: f64,
67 pub iterations: usize,
69}
70
71impl MetricReport {
72 pub fn from_per_query(metric: String, per_query: Vec<(String, f64)>) -> Self {
77 let n = per_query.len();
78 if n == 0 {
79 return Self {
80 metric,
81 n: 0,
82 mean: 0.0,
83 stddev: 0.0,
84 min: 0.0,
85 max: 0.0,
86 p50: 0.0,
87 p95: 0.0,
88 per_query,
89 ci: None,
90 };
91 }
92 let scores: Vec<f64> = per_query.iter().map(|(_, s)| *s).collect();
93 let sum: f64 = scores.iter().sum();
94 let mean = sum / n as f64;
95 let var = if n > 1 {
96 scores.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / (n as f64 - 1.0)
97 } else {
98 0.0
99 };
100 let stddev = var.sqrt();
101
102 let mut sorted = scores.clone();
103 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
104 let min = sorted.first().copied().unwrap_or(0.0);
105 let max = sorted.last().copied().unwrap_or(0.0);
106 let p50 = percentile(&sorted, 0.50);
107 let p95 = percentile(&sorted, 0.95);
108
109 Self {
110 metric,
111 n,
112 mean,
113 stddev,
114 min,
115 max,
116 p50,
117 p95,
118 per_query,
119 ci: None,
120 }
121 }
122
123 #[must_use]
133 pub fn bootstrap_ci(&self, iterations: usize, level: f64, seed: u64) -> Option<MetricCi> {
134 if self.per_query.is_empty() || iterations == 0 {
135 return None;
136 }
137 if !(level > 0.0 && level < 1.0) {
138 return None;
139 }
140 let scores: Vec<f64> = self.per_query.iter().map(|(_, s)| *s).collect();
141 let n = scores.len();
142 let mut state = seed;
143 let mut resample_means: Vec<f64> = Vec::with_capacity(iterations);
144 for _ in 0..iterations {
145 let mut sum = 0.0;
146 for _ in 0..n {
147 let r = splitmix64(&mut state);
148 let idx = (r as usize) % n;
149 sum += scores.get(idx).copied().unwrap_or(0.0);
150 }
151 resample_means.push(sum / n as f64);
152 }
153 resample_means.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
154 let alpha = (1.0 - level) / 2.0;
155 let lower = percentile(&resample_means, alpha);
156 let upper = percentile(&resample_means, 1.0 - alpha);
157 Some(MetricCi {
158 lower,
159 upper,
160 level,
161 iterations,
162 })
163 }
164
165 #[must_use]
170 pub fn with_bootstrap_ci(mut self, iterations: usize, level: f64, seed: u64) -> Self {
171 self.ci = self.bootstrap_ci(iterations, level, seed);
172 self
173 }
174}
175
/// Advances `state` by one SplitMix64 step and returns the next 64-bit output.
///
/// All arithmetic wraps, so any seed value is acceptable.
fn splitmix64(state: &mut u64) -> u64 {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
    const MIX_FIRST: u64 = 0xBF58_476D_1CE4_E5B9;
    const MIX_SECOND: u64 = 0x94D0_49BB_1331_11EB;

    *state = state.wrapping_add(INCREMENT);
    let seeded = *state;
    let first = (seeded ^ (seeded >> 30)).wrapping_mul(MIX_FIRST);
    let second = (first ^ (first >> 27)).wrapping_mul(MIX_SECOND);
    second ^ (second >> 31)
}
186
/// Returns the `q`-quantile of an ascending-sorted slice using linear
/// interpolation between the two closest ranks.
///
/// * An empty slice yields `0.0`; a single element yields that element.
/// * `q` is clamped to `[0, 1]`, so out-of-range requests return the minimum
///   or maximum instead of reading past the end of the slice.
/// * A NaN `q` propagates and yields NaN.
///
/// The caller is responsible for passing a sorted slice; it is not checked.
fn percentile(sorted: &[f64], q: f64) -> f64 {
    if sorted.is_empty() {
        return 0.0;
    }
    if sorted.len() == 1 {
        return sorted.first().copied().unwrap_or(0.0);
    }
    // Keeps `rank` within `0..=len - 1`, so both lookups below stay in bounds.
    let q = q.clamp(0.0, 1.0);
    let rank = q * (sorted.len() as f64 - 1.0);
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    let lo_v = sorted.get(lo).copied().unwrap_or(0.0);
    let hi_v = sorted.get(hi).copied().unwrap_or(lo_v);
    let frac = rank - lo as f64;
    lo_v + (hi_v - lo_v) * frac
}
202
203#[derive(Debug, Clone, Serialize, Deserialize)]
211pub struct QueryReliability {
212 pub query_id: String,
214 pub trials: usize,
216 pub successes: usize,
218 pub pass_rate: f64,
220 pub pass_at_k: f64,
222 pub pass_all_k: f64,
224}
225
226#[derive(Debug, Clone, Serialize, Deserialize)]
255pub struct ReliabilityReport {
256 pub metric: String,
258 pub threshold: f64,
260 pub k: usize,
262 pub n_queries: usize,
264 pub trials_per_query: usize,
266 pub mean_pass_rate: f64,
268 pub pass_at_k: f64,
270 pub pass_all_k: f64,
272 pub per_query: Vec<QueryReliability>,
274}
275
276#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
278pub struct FreshnessQueryRollup {
279 pub query_id: String,
281 pub considered: usize,
283 pub stale_hits: usize,
285 pub stale_rate: f64,
287 pub conflict_groups: usize,
289 pub conflicting_doc_count: usize,
291 pub conflict_rate: f64,
293}
294
295#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
303pub struct FreshnessReport {
304 pub k: usize,
306 pub query_count: usize,
308 pub total_considered: usize,
310 pub stale_hit_count: usize,
312 pub stale_query_count: usize,
314 pub stale_rate: f64,
316 pub stale_query_rate: f64,
318 pub conflict_group_count: usize,
320 pub conflicting_doc_count: usize,
322 pub conflict_query_count: usize,
324 pub conflict_rate: f64,
326 pub conflict_query_rate: f64,
328 pub per_query: Vec<FreshnessQueryRollup>,
330}
331
332impl FreshnessReport {
333 pub fn from_query_reports(
338 k: usize,
339 staleness: &[StalenessReport],
340 conflicts: &[ConflictReport],
341 ) -> Result<Self> {
342 if staleness.len() != conflicts.len() {
343 return Err(Error::BaselineMismatch(format!(
344 "freshness report count mismatch: stale={} conflict={}",
345 staleness.len(),
346 conflicts.len()
347 )));
348 }
349
350 let mut per_query = Vec::with_capacity(staleness.len());
351 for (stale, conflict) in staleness.iter().zip(conflicts) {
352 if stale.query_id != conflict.query_id {
353 return Err(Error::BaselineMismatch(format!(
354 "freshness query mismatch: stale={} conflict={}",
355 stale.query_id, conflict.query_id
356 )));
357 }
358 if stale.considered != conflict.considered {
359 return Err(Error::BaselineMismatch(format!(
360 "freshness considered mismatch for {}: stale={} conflict={}",
361 stale.query_id, stale.considered, conflict.considered
362 )));
363 }
364 per_query.push(FreshnessQueryRollup {
365 query_id: stale.query_id.clone(),
366 considered: stale.considered,
367 stale_hits: stale.stale_hits.len(),
368 stale_rate: stale.stale_rate(),
369 conflict_groups: conflict.groups.len(),
370 conflicting_doc_count: conflict.conflicting_doc_count,
371 conflict_rate: conflict.conflict_rate(),
372 });
373 }
374
375 let query_count = per_query.len();
376 let total_considered = per_query.iter().map(|row| row.considered).sum();
377 let stale_hit_count = per_query.iter().map(|row| row.stale_hits).sum();
378 let stale_query_count = per_query.iter().filter(|row| row.stale_hits > 0).count();
379 let conflict_group_count = per_query.iter().map(|row| row.conflict_groups).sum();
380 let conflicting_doc_count = per_query.iter().map(|row| row.conflicting_doc_count).sum();
381 let conflict_query_count = per_query
382 .iter()
383 .filter(|row| row.conflict_groups > 0)
384 .count();
385
386 Ok(Self {
387 k,
388 query_count,
389 total_considered,
390 stale_hit_count,
391 stale_query_count,
392 stale_rate: ratio(stale_hit_count, total_considered),
393 stale_query_rate: ratio(stale_query_count, query_count),
394 conflict_group_count,
395 conflicting_doc_count,
396 conflict_query_count,
397 conflict_rate: ratio(conflicting_doc_count, total_considered),
398 conflict_query_rate: ratio(conflict_query_count, query_count),
399 per_query,
400 })
401 }
402
403 #[must_use]
409 pub fn metric_reports(&self) -> Vec<MetricReport> {
410 let stale_free = self
411 .per_query
412 .iter()
413 .map(|row| (row.query_id.clone(), 1.0 - row.stale_rate))
414 .collect();
415 let conflict_free = self
416 .per_query
417 .iter()
418 .map(|row| (row.query_id.clone(), 1.0 - row.conflict_rate))
419 .collect();
420 vec![
421 MetricReport::from_per_query(
422 format!("freshness.stale_free_rate@{}", self.k),
423 stale_free,
424 ),
425 MetricReport::from_per_query(
426 format!("freshness.conflict_free_rate@{}", self.k),
427 conflict_free,
428 ),
429 ]
430 }
431}
432
/// Divides two counts as floats, treating a zero denominator as a rate of `0.0`.
fn ratio(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 0.0,
        total => numerator as f64 / total as f64,
    }
}
440
441impl ReliabilityReport {
442 pub fn from_metric_reports(
448 metric: impl Into<String>,
449 threshold: f64,
450 k: usize,
451 reports: &[MetricReport],
452 ) -> Result<Self> {
453 let metric = metric.into();
454 if reports.is_empty() {
455 return Err(Error::Config(
456 "at least one trial report is required".into(),
457 ));
458 }
459 if k == 0 {
460 return Err(Error::Config("pass@k requires k > 0".into()));
461 }
462 if !threshold.is_finite() {
463 return Err(Error::Config("reliability threshold must be finite".into()));
464 }
465 if k > reports.len() {
466 return Err(Error::Config(format!(
467 "pass@k k={} exceeds trial count {}",
468 k,
469 reports.len()
470 )));
471 }
472
473 for report in reports {
474 if report.metric != metric {
475 return Err(Error::BaselineMismatch(format!(
476 "metric mismatch: expected {metric}, got {}",
477 report.metric
478 )));
479 }
480 }
481
482 let first = reports
483 .first()
484 .ok_or_else(|| Error::Config("at least one trial report is required".into()))?;
485 let mut query_order = Vec::with_capacity(first.per_query.len());
486 let mut seen = std::collections::BTreeSet::new();
487 for (query_id, score) in &first.per_query {
488 if !score.is_finite() {
489 return Err(Error::Config(format!(
490 "non-finite score for query {query_id}"
491 )));
492 }
493 if !seen.insert(query_id.as_str()) {
494 return Err(Error::BaselineMismatch(format!(
495 "duplicate query id in trial report: {query_id}"
496 )));
497 }
498 query_order.push(query_id.clone());
499 }
500
501 let mut scores_by_query: BTreeMap<String, Vec<f64>> = query_order
502 .iter()
503 .map(|query_id| (query_id.clone(), Vec::with_capacity(reports.len())))
504 .collect();
505
506 for report in reports {
507 let mut report_scores = BTreeMap::new();
508 for (query_id, score) in &report.per_query {
509 if !score.is_finite() {
510 return Err(Error::Config(format!(
511 "non-finite score for query {query_id}"
512 )));
513 }
514 if report_scores.insert(query_id.as_str(), *score).is_some() {
515 return Err(Error::BaselineMismatch(format!(
516 "duplicate query id in trial report: {query_id}"
517 )));
518 }
519 }
520 if report_scores.len() != query_order.len() {
521 return Err(Error::BaselineMismatch(format!(
522 "trial report has {} queries; expected {}",
523 report_scores.len(),
524 query_order.len()
525 )));
526 }
527 for query_id in &query_order {
528 let Some(score) = report_scores.get(query_id.as_str()).copied() else {
529 return Err(Error::BaselineMismatch(format!(
530 "trial report missing query id {query_id}"
531 )));
532 };
533 let Some(scores) = scores_by_query.get_mut(query_id) else {
534 return Err(Error::BaselineMismatch(format!(
535 "unexpected query id {query_id}"
536 )));
537 };
538 scores.push(score);
539 }
540 }
541
542 let mut per_query = Vec::with_capacity(query_order.len());
543 for query_id in query_order {
544 let Some(scores) = scores_by_query.remove(&query_id) else {
545 return Err(Error::BaselineMismatch(format!(
546 "missing scores for query id {query_id}"
547 )));
548 };
549 per_query.push(query_reliability(query_id, &scores, threshold, k));
550 }
551
552 let n_queries = per_query.len();
553 let trials_per_query = reports.len();
554 let mean_pass_rate = mean_by(&per_query, |q| q.pass_rate);
555 let pass_at_k = mean_by(&per_query, |q| q.pass_at_k);
556 let pass_all_k = mean_by(&per_query, |q| q.pass_all_k);
557
558 Ok(Self {
559 metric,
560 threshold,
561 k,
562 n_queries,
563 trials_per_query,
564 mean_pass_rate,
565 pass_at_k,
566 pass_all_k,
567 per_query,
568 })
569 }
570}
571
572fn query_reliability(
573 query_id: String,
574 scores: &[f64],
575 threshold: f64,
576 k: usize,
577) -> QueryReliability {
578 let trials = scores.len();
579 let successes = scores.iter().filter(|score| **score >= threshold).count();
580 let pass_rate = if trials == 0 {
581 0.0
582 } else {
583 successes as f64 / trials as f64
584 };
585 QueryReliability {
586 query_id,
587 trials,
588 successes,
589 pass_rate,
590 pass_at_k: pass_at_k_estimate(trials, successes, k),
591 pass_all_k: pass_all_k_estimate(trials, successes, k),
592 }
593}
594
595fn mean_by(rows: &[QueryReliability], f: impl Fn(&QueryReliability) -> f64) -> f64 {
596 if rows.is_empty() {
597 return 0.0;
598 }
599 rows.iter().map(f).sum::<f64>() / rows.len() as f64
600}
601
/// Estimates pass@k: the probability that at least one of `k` trials drawn
/// without replacement from `trials` (of which `successes` passed) passes.
///
/// Computed as `1 - C(trials - successes, k) / C(trials, k)`, evaluated as a
/// running product. Returns `0.0` when `k`, `trials` or `successes` is zero,
/// and `1.0` when there are fewer than `k` failures to draw from. A
/// `successes` value above `trials` is treated as "no failures" rather than
/// underflowing.
fn pass_at_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    if k == 0 || trials == 0 || successes == 0 {
        return 0.0;
    }
    // Saturating: an inconsistent `successes > trials` must not underflow.
    let failures = trials.saturating_sub(successes);
    if failures < k {
        // Also covers `k > trials`, because `failures <= trials`.
        return 1.0;
    }
    // From here `k <= failures <= trials`, so neither subtraction can underflow.
    let fail_all = (0..k).fold(1.0, |acc, offset| {
        acc * ((failures - offset) as f64 / (trials - offset) as f64)
    });
    1.0 - fail_all
}
614
/// Estimates the probability that all of `k` trials drawn without
/// replacement from `trials` (of which `successes` passed) pass.
///
/// Computed as `C(successes, k) / C(trials, k)`, evaluated as a running
/// product. Returns `0.0` when `k` or `trials` is zero, when `k` exceeds
/// `trials`, or when fewer than `k` trials passed. `successes` is capped at
/// `trials` so the result can never exceed `1.0`.
fn pass_all_k_estimate(trials: usize, successes: usize, k: usize) -> f64 {
    // More successes than trials is inconsistent input; cap it.
    let successes = successes.min(trials);
    // `successes <= trials`, so `successes < k` also rejects `k > trials`.
    if k == 0 || trials == 0 || successes < k {
        return 0.0;
    }
    (0..k).fold(1.0, |acc, offset| {
        acc * ((successes - offset) as f64 / (trials - offset) as f64)
    })
}
623
624#[derive(Debug, Clone, Serialize, Deserialize, Default)]
627pub struct MultiReport {
628 #[serde(default, skip_serializing_if = "Option::is_none")]
630 pub dataset_id: Option<String>,
631 #[serde(default, skip_serializing_if = "Option::is_none")]
633 pub store_kind: Option<String>,
634 #[serde(default, skip_serializing_if = "Option::is_none")]
639 pub judge_fingerprint: Option<String>,
640 pub metrics: Vec<MetricReport>,
642 #[serde(default, skip_serializing_if = "Option::is_none")]
644 pub freshness: Option<FreshnessReport>,
645}
646
647impl MultiReport {
648 #[must_use]
651 pub fn new(metrics: Vec<MetricReport>) -> Self {
652 Self {
653 metrics,
654 ..Default::default()
655 }
656 }
657
658 #[must_use]
660 pub fn with_dataset(mut self, id: impl Into<String>) -> Self {
661 self.dataset_id = Some(id.into());
662 self
663 }
664
665 #[must_use]
667 pub fn with_store(mut self, kind: impl Into<String>) -> Self {
668 self.store_kind = Some(kind.into());
669 self
670 }
671
672 #[must_use]
674 pub fn with_judge_fingerprint(mut self, fp: impl Into<String>) -> Self {
675 self.judge_fingerprint = Some(fp.into());
676 self
677 }
678
679 #[must_use]
684 pub fn with_bootstrap(mut self, iterations: usize, level: f64, seed: u64) -> Self {
685 self.metrics = self
686 .metrics
687 .into_iter()
688 .map(|m| m.with_bootstrap_ci(iterations, level, seed))
689 .collect();
690 self
691 }
692
693 #[must_use]
695 pub fn with_freshness(mut self, freshness: FreshnessReport) -> Self {
696 self.freshness = Some(freshness);
697 self
698 }
699
700 #[must_use]
706 pub fn with_freshness_metrics(mut self, freshness: FreshnessReport) -> Self {
707 self.metrics.extend(freshness.metric_reports());
708 self.freshness = Some(freshness);
709 self
710 }
711
712 pub fn to_json(&self) -> Result<String> {
714 Ok(serde_json::to_string_pretty(self)?)
715 }
716
717 #[must_use]
719 pub fn to_markdown(&self) -> String {
720 let mut out = String::new();
721 out.push_str("| metric | n | mean | stddev | p50 | p95 | min | max |\n");
722 out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|\n");
723 for m in &self.metrics {
724 out.push_str(&format!(
725 "| {} | {} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} | {:.4} |\n",
726 m.metric, m.n, m.mean, m.stddev, m.p50, m.p95, m.min, m.max
727 ));
728 }
729 out
730 }
731
732 pub fn diff(&self, baseline: &MultiReport) -> Result<ReportDiff> {
744 if self.judge_fingerprint != baseline.judge_fingerprint {
745 return Err(Error::BaselineMismatch(format!(
746 "judge fingerprint mismatch: current={:?} baseline={:?}",
747 self.judge_fingerprint, baseline.judge_fingerprint
748 )));
749 }
750 let base_by_name: BTreeMap<&str, &MetricReport> = baseline
751 .metrics
752 .iter()
753 .map(|m| (m.metric.as_str(), m))
754 .collect();
755 let mut rows = Vec::with_capacity(self.metrics.len());
756 for m in &self.metrics {
757 let base = base_by_name.get(m.metric.as_str()).copied();
758 let baseline_mean = base.map(|b| b.mean);
759 let (query_changes, winners, losers, unchanged) = match base {
760 Some(b) => compute_query_changes(&m.per_query, &b.per_query),
761 None => (Vec::new(), 0, 0, 0),
762 };
763 rows.push(MetricDelta {
764 metric: m.metric.clone(),
765 current_mean: m.mean,
766 baseline_mean,
767 delta: baseline_mean.map(|b| m.mean - b),
768 winners,
769 losers,
770 unchanged,
771 query_changes,
772 current_ci: m.ci,
773 baseline_ci: base.and_then(|b| b.ci),
774 });
775 }
776 Ok(ReportDiff { rows })
777 }
778
779 pub fn delta_markdown(&self, baseline: &MultiReport) -> Result<String> {
786 Ok(self.diff(baseline)?.to_markdown())
787 }
788}
789
790const EPSILON: f64 = 1e-9;
794
795fn compute_query_changes(
799 current: &[(String, f64)],
800 baseline: &[(String, f64)],
801) -> (Vec<QueryDelta>, usize, usize, usize) {
802 let base_by_query: BTreeMap<&str, f64> =
803 baseline.iter().map(|(q, s)| (q.as_str(), *s)).collect();
804 let mut changes = Vec::new();
805 let mut winners = 0usize;
806 let mut losers = 0usize;
807 let mut unchanged = 0usize;
808 for (query_id, cur_score) in current {
809 let Some(base_score) = base_by_query.get(query_id.as_str()).copied() else {
810 continue;
811 };
812 let delta = cur_score - base_score;
813 if delta > EPSILON {
814 winners += 1;
815 } else if delta < -EPSILON {
816 losers += 1;
817 } else {
818 unchanged += 1;
819 }
820 changes.push(QueryDelta {
821 query_id: query_id.clone(),
822 current: *cur_score,
823 baseline: base_score,
824 delta,
825 });
826 }
827 changes.sort_by(|a, b| {
828 b.delta
829 .abs()
830 .partial_cmp(&a.delta.abs())
831 .unwrap_or(std::cmp::Ordering::Equal)
832 });
833 (changes, winners, losers, unchanged)
834}
835
836#[derive(Debug, Clone, Serialize, Deserialize)]
838pub struct MetricDelta {
839 pub metric: String,
841 pub current_mean: f64,
843 pub baseline_mean: Option<f64>,
845 pub delta: Option<f64>,
847 #[serde(default)]
849 pub winners: usize,
850 #[serde(default)]
852 pub losers: usize,
853 #[serde(default)]
856 pub unchanged: usize,
857 #[serde(default, skip_serializing_if = "Vec::is_empty")]
861 pub query_changes: Vec<QueryDelta>,
862 #[serde(default, skip_serializing_if = "Option::is_none")]
864 pub current_ci: Option<MetricCi>,
865 #[serde(default, skip_serializing_if = "Option::is_none")]
867 pub baseline_ci: Option<MetricCi>,
868}
869
870#[derive(Debug, Clone, Serialize, Deserialize)]
873pub struct QueryDelta {
874 pub query_id: String,
876 pub current: f64,
878 pub baseline: f64,
880 pub delta: f64,
882}
883
884#[derive(Debug, Clone, Serialize, Deserialize)]
886pub struct ReportDiff {
887 pub rows: Vec<MetricDelta>,
889}
890
891impl ReportDiff {
892 #[must_use]
896 pub fn to_markdown(&self) -> String {
897 let mut out = String::new();
898 out.push_str("| metric | current | baseline | Δ | win | lose | same |\n");
899 out.push_str("|---|---:|---:|---:|---:|---:|---:|\n");
900 for r in &self.rows {
901 let baseline = r
902 .baseline_mean
903 .map(|v| format!("{v:.4}"))
904 .unwrap_or_else(|| "—".to_string());
905 let delta = r
906 .delta
907 .map(|v| format!("{v:+.4}"))
908 .unwrap_or_else(|| "—".to_string());
909 out.push_str(&format!(
910 "| {} | {:.4} | {} | {} | {} | {} | {} |\n",
911 r.metric, r.current_mean, baseline, delta, r.winners, r.losers, r.unchanged
912 ));
913 }
914 out
915 }
916
917 pub fn to_json(&self) -> Result<String> {
919 Ok(serde_json::to_string_pretty(self)?)
920 }
921
922 #[must_use]
927 pub fn regressions(&self, gate: &RegressionGate) -> Vec<MetricDelta> {
928 self.rows
929 .iter()
930 .filter(|r| match (gate.threshold(&r.metric), r.delta) {
931 (Some(threshold), Some(delta)) => delta < -threshold,
932 _ => false,
933 })
934 .cloned()
935 .collect()
936 }
937
938 #[must_use]
941 pub fn is_clean(&self, gate: &RegressionGate) -> bool {
942 self.regressions(gate).is_empty()
943 }
944
945 #[must_use]
951 pub fn exit_code(&self, gate: &RegressionGate) -> i32 {
952 if self.is_clean(gate) { 0 } else { 1 }
953 }
954}
955
/// Per-metric tolerances that decide when a drop counts as a regression.
///
/// Each entry maps a metric name to the largest drop in its mean that is
/// still acceptable. Metrics without an entry are not gated.
#[derive(Debug, Clone, Default)]
pub struct RegressionGate {
    /// Allowed drop per metric name; values are never negative.
    thresholds: BTreeMap<String, f64>,
}

impl RegressionGate {
    /// Creates a gate with no thresholds, which flags nothing.
    #[must_use]
    pub fn new() -> Self {
        Self {
            thresholds: BTreeMap::new(),
        }
    }

    /// Sets (or replaces) the allowed drop for `metric`. Negative values are
    /// raised to `0.0`.
    #[must_use]
    pub fn with_threshold(mut self, metric: impl Into<String>, threshold: f64) -> Self {
        let allowed_drop = f64::max(threshold, 0.0);
        self.thresholds.insert(metric.into(), allowed_drop);
        self
    }

    /// Looks up the allowed drop for `metric`, if one was configured.
    #[must_use]
    pub fn threshold(&self, metric: &str) -> Option<f64> {
        let allowed_drop = self.thresholds.get(metric)?;
        Some(*allowed_drop)
    }
}
1000
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::panic, clippy::indexing_slicing)]
mod tests {
    use super::*;
    use crate::staleness::{ConflictGroup, ConflictReport, StaleHit, StalenessReport};

    /// Tolerance used for floating-point comparisons in this module.
    const TOL: f64 = 1e-9;

    /// `true` when `actual` is within [`TOL`] of `expected`.
    fn approx(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < TOL
    }

    /// Builds a metric report from borrowed `(query id, score)` pairs.
    fn scored(name: &str, scores: &[(&str, f64)]) -> MetricReport {
        MetricReport::from_per_query(
            name.to_string(),
            scores
                .iter()
                .map(|(query_id, score)| ((*query_id).to_string(), *score))
                .collect(),
        )
    }

    /// Builds a multi-report containing a single metric.
    fn single(name: &str, scores: &[(&str, f64)]) -> MultiReport {
        MultiReport::new(vec![scored(name, scores)])
    }

    /// Staleness report with either no stale hits or the one `old -> new` hit.
    fn staleness(query_id: &str, with_stale_hit: bool, considered: usize) -> StalenessReport {
        let stale_hits = if with_stale_hit {
            vec![StaleHit {
                doc_id: "old".into(),
                rank: 0,
                superseded_by: "new".into(),
            }]
        } else {
            vec![]
        };
        StalenessReport {
            query_id: query_id.into(),
            stale_hits,
            considered,
        }
    }

    /// Conflict report with either no groups or one two-document group.
    fn conflicts(query_id: &str, with_group: bool, considered: usize) -> ConflictReport {
        let groups = if with_group {
            vec![ConflictGroup {
                version_key: "alice:address".into(),
                doc_ids: vec!["old".into(), "new".into()],
            }]
        } else {
            vec![]
        };
        ConflictReport {
            query_id: query_id.into(),
            groups,
            conflicting_doc_count: if with_group { 2 } else { 0 },
            considered,
        }
    }

    /// Asserts that `err` is a baseline mismatch whose message contains `needle`.
    fn assert_baseline_mismatch(err: Error, needle: &str) {
        match err {
            Error::BaselineMismatch(message) => assert!(message.contains(needle)),
            other => panic!("unexpected error: {other:?}"),
        }
    }

    #[test]
    fn metric_report_aggregates() {
        let r = scored("recall@10", &[("q1", 0.0), ("q2", 0.5), ("q3", 1.0)]);
        assert_eq!(r.n, 3);
        assert!(approx(r.mean, 0.5));
        assert!(approx(r.min, 0.0));
        assert!(approx(r.max, 1.0));
        assert!(approx(r.p50, 0.5));
    }

    #[test]
    fn empty_report_is_zero() {
        let r = MetricReport::from_per_query("m".into(), vec![]);
        assert_eq!(r.n, 0);
        assert_eq!(r.mean, 0.0);
    }

    #[test]
    fn diff_flags_fingerprint_mismatch() {
        let a = MultiReport::new(vec![]).with_judge_fingerprint("a");
        let b = MultiReport::new(vec![]).with_judge_fingerprint("b");
        assert!(a.diff(&b).is_err());
    }

    #[test]
    fn diff_computes_per_metric_delta() {
        let cur = single("recall@10", &[("q1", 0.8)]);
        let base = single("recall@10", &[("q1", 0.6)]);
        let diff = cur.diff(&base).unwrap();
        assert_eq!(diff.rows.len(), 1);
        let row = &diff.rows[0];
        assert!(approx(row.delta.unwrap_or(0.0), 0.2));
    }

    #[test]
    fn diff_buckets_per_query_winners_losers_and_unchanged() {
        // q4 exists only in the current run and q5 only in the baseline, so
        // neither may show up in the per-query changes.
        let cur = single(
            "recall@10",
            &[("q1", 1.0), ("q2", 0.0), ("q3", 0.5), ("q4", 0.9)],
        );
        let base = single(
            "recall@10",
            &[("q1", 0.5), ("q2", 0.5), ("q3", 0.5), ("q5", 1.0)],
        );
        let diff = cur.diff(&base).unwrap();
        let row = &diff.rows[0];
        assert_eq!(row.winners, 1);
        assert_eq!(row.losers, 1);
        assert_eq!(row.unchanged, 1);
        assert_eq!(row.query_changes.len(), 3);
        // Sorted by magnitude, so the unchanged query comes last.
        assert_eq!(row.query_changes[2].query_id, "q3");
        assert!(approx(row.query_changes[2].delta, 0.0));
    }

    #[test]
    fn diff_query_changes_empty_when_baseline_missing_metric() {
        let cur = single("ndcg@10", &[("q1", 0.9)]);
        let base = MultiReport::new(vec![]);
        let diff = cur.diff(&base).unwrap();
        let row = &diff.rows[0];
        assert!(row.delta.is_none());
        assert_eq!(row.winners, 0);
        assert_eq!(row.losers, 0);
        assert_eq!(row.unchanged, 0);
        assert!(row.query_changes.is_empty());
    }

    #[test]
    fn regression_gate_flags_only_metrics_below_threshold() {
        // recall drops past its threshold, ndcg stays within it, and mrr
        // drops a lot but has no threshold configured.
        let cur = MultiReport::new(vec![
            scored("recall@10", &[("q1", 0.50)]),
            scored("ndcg@10", &[("q1", 0.595)]),
            scored("mrr", &[("q1", 0.10)]),
        ]);
        let base = MultiReport::new(vec![
            scored("recall@10", &[("q1", 0.60)]),
            scored("ndcg@10", &[("q1", 0.60)]),
            scored("mrr", &[("q1", 0.90)]),
        ]);
        let diff = cur.diff(&base).unwrap();
        let gate = RegressionGate::new()
            .with_threshold("recall@10", 0.02)
            .with_threshold("ndcg@10", 0.02);
        let regressed = diff.regressions(&gate);
        assert_eq!(regressed.len(), 1);
        assert_eq!(regressed[0].metric, "recall@10");
    }

    #[test]
    fn regression_gate_clamps_negative_thresholds() {
        let gate = RegressionGate::new().with_threshold("recall@10", -0.5);
        assert_eq!(gate.threshold("recall@10"), Some(0.0));
    }

    #[test]
    fn report_diff_to_json_round_trips() {
        let cur = single("recall@10", &[("q1", 1.0), ("q2", 0.0)]);
        let base = single("recall@10", &[("q1", 0.5), ("q2", 0.5)]);
        let diff = cur.diff(&base).unwrap();
        let json = diff.to_json().unwrap();
        let parsed: ReportDiff = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.rows.len(), 1);
        assert_eq!(parsed.rows[0].winners, 1);
        assert_eq!(parsed.rows[0].losers, 1);
        assert_eq!(parsed.rows[0].query_changes.len(), 2);
    }

    #[test]
    fn reliability_report_computes_pass_at_k_and_pass_all_k() {
        let reports = vec![
            scored("recall@10", &[("q1", 1.0), ("q2", 0.0)]),
            scored("recall@10", &[("q1", 0.0), ("q2", 1.0)]),
            scored("recall@10", &[("q1", 1.0), ("q2", 0.0)]),
        ];

        let reliability =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &reports).unwrap();

        assert_eq!(reliability.n_queries, 2);
        assert_eq!(reliability.trials_per_query, 3);
        assert!(approx(reliability.mean_pass_rate, 0.5));
        assert!(approx(reliability.pass_at_k, 5.0 / 6.0));
        assert!(approx(reliability.pass_all_k, 1.0 / 6.0));
        assert_eq!(reliability.per_query[0].query_id, "q1");
        assert_eq!(reliability.per_query[0].successes, 2);
    }

    #[test]
    fn reliability_report_requires_matching_metrics() {
        let reports = vec![
            scored("recall@10", &[("q1", 1.0)]),
            scored("ndcg@10", &[("q1", 1.0)]),
        ];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap_err();

        assert_baseline_mismatch(err, "metric mismatch");
    }

    #[test]
    fn reliability_report_requires_same_queries() {
        let reports = vec![
            scored("recall@10", &[("q1", 1.0)]),
            scored("recall@10", &[("q2", 1.0)]),
        ];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap_err();

        assert_baseline_mismatch(err, "missing query id");
    }

    #[test]
    fn reliability_report_rejects_k_larger_than_trials() {
        let reports = vec![scored("recall@10", &[("q1", 1.0)])];

        let err =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 2, &reports).unwrap_err();

        match err {
            Error::Config(message) => assert!(message.contains("exceeds trial count")),
            other => panic!("unexpected error: {other:?}"),
        }
    }

    #[test]
    fn reliability_report_serializes() {
        let reports = vec![scored("recall@10", &[("q1", 1.0)])];

        let reliability =
            ReliabilityReport::from_metric_reports("recall@10", 1.0, 1, &reports).unwrap();
        let json = serde_json::to_string(&reliability).unwrap();
        let parsed: ReliabilityReport = serde_json::from_str(&json).unwrap();

        assert_eq!(parsed.metric, "recall@10");
        assert_eq!(parsed.per_query.len(), 1);
        assert!(approx(parsed.pass_at_k, 1.0));
    }

    #[test]
    fn bootstrap_ci_brackets_the_mean_for_well_separated_scores() {
        let alternating: Vec<(String, f64)> = (0..50)
            .map(|i| (format!("q{i}"), if i % 2 == 0 { 0.8 } else { 0.6 }))
            .collect();
        let r = MetricReport::from_per_query("recall@10".into(), alternating)
            .with_bootstrap_ci(1000, 0.95, 0xC0FFEE);
        let ci = r.ci.unwrap();
        assert!(ci.lower < r.mean);
        assert!(ci.upper > r.mean);
        assert!(ci.lower >= 0.6 - 1e-9);
        assert!(ci.upper <= 0.8 + 1e-9);
        assert_eq!(ci.iterations, 1000);
        assert!(approx(ci.level, 0.95));
    }

    #[test]
    fn bootstrap_ci_is_deterministic_for_a_fixed_seed() {
        let scores: Vec<(String, f64)> = (0..30)
            .map(|i| (format!("q{i}"), (i % 5) as f64 / 4.0))
            .collect();
        let interval = |input: Vec<(String, f64)>| {
            MetricReport::from_per_query("m".into(), input)
                .with_bootstrap_ci(500, 0.9, 42)
                .ci
                .unwrap()
        };
        let a = interval(scores.clone());
        let b = interval(scores);
        assert_eq!(a, b);
    }

    #[test]
    fn bootstrap_ci_rejects_invalid_input() {
        let r = scored("m", &[("q1", 0.5)]);
        assert!(r.bootstrap_ci(0, 0.95, 1).is_none());
        assert!(r.bootstrap_ci(100, 0.0, 1).is_none());
        assert!(r.bootstrap_ci(100, 1.0, 1).is_none());
        let empty = MetricReport::from_per_query("m".into(), vec![]);
        assert!(empty.bootstrap_ci(100, 0.95, 1).is_none());
    }

    #[test]
    fn report_diff_exit_code_signals_regression() {
        let baseline = single("recall@10", &[("q1", 0.9), ("q2", 0.9)]);
        let candidate = single("recall@10", &[("q1", 0.4), ("q2", 0.4)]);
        let diff = candidate.diff(&baseline).unwrap();
        let gate = RegressionGate::new().with_threshold("recall@10", 0.05);
        assert!(!diff.is_clean(&gate));
        assert_eq!(diff.exit_code(&gate), 1);

        let lax_gate = RegressionGate::new().with_threshold("recall@10", 1.0);
        assert!(diff.is_clean(&lax_gate));
        assert_eq!(diff.exit_code(&lax_gate), 0);
    }

    #[test]
    fn report_diff_carries_bootstrap_ci_on_both_sides() {
        let scores: Vec<(String, f64)> = (0..20).map(|i| (format!("q{i}"), 0.5)).collect();
        let with_ci = |input: Vec<(String, f64)>| {
            MultiReport::new(vec![
                MetricReport::from_per_query("recall@10".into(), input)
                    .with_bootstrap_ci(200, 0.95, 7),
            ])
        };
        let candidate = with_ci(scores.clone());
        let baseline = with_ci(scores);
        let diff = candidate.diff(&baseline).unwrap();
        let row = diff.rows.first().unwrap();
        assert!(row.current_ci.is_some());
        assert!(row.baseline_ci.is_some());
    }

    #[test]
    fn freshness_report_rolls_up_query_rates_and_appends_gateable_metrics() {
        let stale_reports = vec![staleness("q1", true, 2), staleness("q2", false, 2)];
        let conflict_reports = vec![conflicts("q1", true, 2), conflicts("q2", false, 2)];

        let freshness =
            FreshnessReport::from_query_reports(2, &stale_reports, &conflict_reports).unwrap();

        assert_eq!(freshness.query_count, 2);
        assert_eq!(freshness.total_considered, 4);
        assert_eq!(freshness.stale_hit_count, 1);
        assert_eq!(freshness.stale_query_count, 1);
        assert!(approx(freshness.stale_rate, 0.25));
        assert!(approx(freshness.stale_query_rate, 0.5));
        assert_eq!(freshness.conflict_group_count, 1);
        assert_eq!(freshness.conflicting_doc_count, 2);
        assert!(approx(freshness.conflict_rate, 0.5));
        assert_eq!(freshness.per_query[0].stale_rate, 0.5);
        assert_eq!(freshness.per_query[0].conflict_rate, 1.0);

        let report = MultiReport::new(vec![]).with_freshness_metrics(freshness);
        assert!(report.freshness.is_some());
        assert_eq!(report.metrics.len(), 2);
        assert_eq!(report.metrics[0].metric, "freshness.stale_free_rate@2");
        assert!(approx(report.metrics[0].mean, 0.75));
        assert_eq!(report.metrics[1].metric, "freshness.conflict_free_rate@2");
        assert!(approx(report.metrics[1].mean, 0.5));
    }

    #[test]
    fn freshness_metrics_participate_in_existing_regression_gate() {
        let baseline_freshness = FreshnessReport::from_query_reports(
            2,
            &[staleness("q1", false, 2)],
            &[conflicts("q1", false, 2)],
        )
        .unwrap();
        let candidate_freshness = FreshnessReport::from_query_reports(
            2,
            &[staleness("q1", true, 2)],
            &[conflicts("q1", false, 2)],
        )
        .unwrap();

        let baseline = MultiReport::new(vec![]).with_freshness_metrics(baseline_freshness);
        let candidate = MultiReport::new(vec![]).with_freshness_metrics(candidate_freshness);
        let diff = candidate.diff(&baseline).unwrap();
        let gate = RegressionGate::new().with_threshold("freshness.stale_free_rate@2", 0.1);

        let regressions = diff.regressions(&gate);
        assert_eq!(regressions.len(), 1);
        assert_eq!(regressions[0].metric, "freshness.stale_free_rate@2");
    }

    #[test]
    fn freshness_report_requires_matching_query_reports() {
        let err = FreshnessReport::from_query_reports(
            5,
            &[staleness("q1", false, 1)],
            &[conflicts("q2", false, 1)],
        )
        .unwrap_err();

        assert_baseline_mismatch(err, "freshness query mismatch");
    }
}