shadow_core/diff/
judge.rs1use async_trait::async_trait;
10
11use crate::agentlog::Record;
12use crate::diff::axes::{Axis, AxisStat};
13use crate::diff::bootstrap::{median, paired_ci};
14
15#[async_trait]
19pub trait Judge: Send + Sync {
20 async fn score(&self, baseline: &Record, candidate: &Record) -> f64;
22}
23
24pub async fn compute<J: Judge + ?Sized>(
26 pairs: &[(&Record, &Record)],
27 judge: &J,
28 seed: Option<u64>,
29) -> AxisStat {
30 if pairs.is_empty() {
31 return AxisStat::empty(Axis::Judge);
32 }
33 let mut scores = Vec::with_capacity(pairs.len());
34 for (b, c) in pairs {
35 scores.push(judge.score(b, c).await);
36 }
37 let baseline_ones: Vec<f64> = (0..scores.len()).map(|_| 1.0).collect();
38 let bm = 1.0;
39 let cm = median(&scores);
40 let delta = cm - bm;
41 let ci = paired_ci(
42 &baseline_ones,
43 &scores,
44 |bs, cs| median(cs) - median(bs),
45 0,
46 seed,
47 );
48 AxisStat::new_value(Axis::Judge, bm, cm, delta, ci.low, ci.high, pairs.len())
49}
50
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::Severity;
    use async_trait::async_trait;
    use serde_json::json;

    /// Test double: a judge that hands back the wrapped score for every pair
    /// and never looks at the records.
    struct ConstantJudge(f64);

    #[async_trait]
    impl Judge for ConstantJudge {
        async fn score(&self, _baseline: &Record, _candidate: &Record) -> f64 {
            let ConstantJudge(fixed) = self;
            *fixed
        }
    }

    /// Builds a minimal chat-response record used as both sides of each pair.
    fn response() -> Record {
        let payload = json!({
            "model": "x",
            "content": [],
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// Runs `compute` over ten identical pairs with a judge that always
    /// returns `score`, using a fixed seed.
    async fn stat_for_constant_score(score: f64) -> AxisStat {
        let record = response();
        let pairs = vec![(&record, &record); 10];
        compute(&pairs, &ConstantJudge(score), Some(1)).await
    }

    #[tokio::test]
    async fn perfect_score_is_no_regression() {
        let stat = stat_for_constant_score(1.0).await;

        let distance_from_one = (stat.candidate_median - 1.0).abs();
        assert!(distance_from_one < 1e-9);
        assert_eq!(stat.severity, Severity::None);
    }

    #[tokio::test]
    async fn low_score_is_flagged_severe() {
        let stat = stat_for_constant_score(0.4).await;

        assert!(stat.candidate_median < 1.0);
        // Either of the two non-trivial severities is accepted here.
        assert!(matches!(
            stat.severity,
            Severity::Severe | Severity::Moderate
        ));
    }
}