shadow_core/diff/
judge.rs

//! Axis 8: LLM-judge (user-supplied rubric).
//!
//! This module defines the [`Judge`] trait that users implement (usually
//! in Python — see `python/src/shadow/llm/`). The Rust side provides only
//! the trait and the aggregation logic; no Rust-side default evaluator
//! is included, because calling an LLM from Rust is out of scope for v0.1
//! (SDKs are Python-first per CONTRIBUTING.md).
8
9use async_trait::async_trait;
10
11use crate::agentlog::Record;
12use crate::diff::axes::{Axis, AxisStat};
13use crate::diff::bootstrap::{median, paired_ci};
14
15/// User-supplied evaluator that scores a single (baseline, candidate)
16/// response pair. Scores are in `[0.0, 1.0]` where 1.0 means "candidate
17/// is at least as good as baseline."
18#[async_trait]
19pub trait Judge: Send + Sync {
20    /// Return a score in `[0.0, 1.0]` for the given pair.
21    async fn score(&self, baseline: &Record, candidate: &Record) -> f64;
22}
23
24/// Aggregate scores from a user-supplied judge into an [`AxisStat`].
25pub async fn compute<J: Judge + ?Sized>(
26    pairs: &[(&Record, &Record)],
27    judge: &J,
28    seed: Option<u64>,
29) -> AxisStat {
30    if pairs.is_empty() {
31        return AxisStat::empty(Axis::Judge);
32    }
33    let mut scores = Vec::with_capacity(pairs.len());
34    for (b, c) in pairs {
35        scores.push(judge.score(b, c).await);
36    }
37    let baseline_ones: Vec<f64> = (0..scores.len()).map(|_| 1.0).collect();
38    let bm = 1.0;
39    let cm = median(&scores);
40    let delta = cm - bm;
41    let ci = paired_ci(
42        &baseline_ones,
43        &scores,
44        |bs, cs| median(cs) - median(bs),
45        0,
46        seed,
47    );
48    AxisStat::new_value(Axis::Judge, bm, cm, delta, ci.low, ci.high, pairs.len())
49}
50
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::Severity;
    use async_trait::async_trait;
    use serde_json::json;

    /// Stub [`Judge`] that ignores both records and always answers with
    /// the wrapped score, so the aggregation path can be exercised
    /// without a real LLM.
    struct ConstantJudge(f64);

    #[async_trait]
    impl Judge for ConstantJudge {
        async fn score(&self, _baseline: &Record, _candidate: &Record) -> f64 {
            let Self(fixed) = self;
            *fixed
        }
    }

    /// Build a minimal chat-response record to use on both sides of a pair.
    fn response() -> Record {
        let payload = json!({
            "model": "x",
            "content": [],
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    #[tokio::test]
    async fn perfect_score_is_no_regression() {
        let record = response();
        // Ten identical pairs, every one scored 1.0 by the judge.
        let pairs = [(&record, &record); 10];
        let judge = ConstantJudge(1.0);

        let stat = compute(&pairs, &judge, Some(1)).await;

        let distance_from_one = (stat.candidate_median - 1.0).abs();
        assert!(distance_from_one < 1e-9);
        assert_eq!(stat.severity, Severity::None);
    }

    #[tokio::test]
    async fn low_score_is_flagged_severe() {
        let record = response();
        // Ten identical pairs, every one scored 0.4 by the judge.
        let pairs = [(&record, &record); 10];
        let judge = ConstantJudge(0.4);

        let stat = compute(&pairs, &judge, Some(1)).await;

        assert!(stat.candidate_median < 1.0);
        let flagged = matches!(stat.severity, Severity::Severe | Severity::Moderate);
        assert!(flagged);
    }
}
shadow_core/diff/judge.rs

shadow_core/diff/
judge.rs