Skip to main content

shadow_core/diff/
mod.rs

//! Nine-axis behavioral differ, bootstrap CI, and report renderers.
//!
//! See README.md "The nine axes" for the list of axes, and SPEC §Replay for
//! what "diverges" means in this context.
//!
//! Usage:
//! ```no_run
//! # use shadow_core::agentlog::Record;
//! # use shadow_core::diff;
//! # fn demo(baseline: Vec<Record>, candidate: Vec<Record>) {
//! let pricing = diff::cost::Pricing::new();
//! let report = diff::compute_report(&baseline, &candidate, &pricing, Some(42));
//! println!("{}", report.to_terminal());
//! # }
//! ```
16
17use crate::agentlog::{Kind, Record};
18
19pub mod alignment;
20pub mod axes;
21pub mod bootstrap;
22pub mod conformance;
23pub mod cost;
24pub mod drill_down;
25pub mod embedder;
26pub mod judge;
27pub mod latency;
28pub mod reasoning;
29pub mod recommendations;
30pub mod report;
31pub mod safety;
32pub mod semantic;
33pub mod trajectory;
34pub mod verbosity;
35
36pub use alignment::{DivergenceKind, FirstDivergence};
37pub use axes::{Axis, AxisStat, Severity};
38pub use bootstrap::{paired_ci, CiResult};
39pub use drill_down::{PairAxisScore, PairDrilldown};
40pub use recommendations::{ActionKind, Recommendation, RecommendationSeverity};
41pub use report::DiffReport;
42
43/// Extract (baseline_response, candidate_response) pairs by pairing the
44/// i-th `chat_response` in `baseline` with the i-th in `candidate`.
45///
46/// If the counts differ (e.g. candidate had backend errors), truncate to
47/// the shorter of the two. Callers that need divergence-on-count should
48/// consult the `replay_summary` record directly.
49pub fn extract_response_pairs<'a>(
50    baseline: &'a [Record],
51    candidate: &'a [Record],
52) -> Vec<(&'a Record, &'a Record)> {
53    let b: Vec<&Record> = baseline
54        .iter()
55        .filter(|r| r.kind == Kind::ChatResponse)
56        .collect();
57    let c: Vec<&Record> = candidate
58        .iter()
59        .filter(|r| r.kind == Kind::ChatResponse)
60        .collect();
61    b.into_iter().zip(c).collect()
62}
63
64/// Compute a [`DiffReport`] from a baseline and candidate trace.
65///
66/// The Judge axis is set to `empty(Axis::Judge)` because no Judge is
67/// supplied here; the Python layer plugs in a Judge via `compute_report_with_judge`.
68pub fn compute_report(
69    baseline: &[Record],
70    candidate: &[Record],
71    pricing: &cost::Pricing,
72    seed: Option<u64>,
73) -> DiffReport {
74    let pairs = extract_response_pairs(baseline, candidate);
75    let rows = vec![
76        semantic::compute(&pairs, seed),
77        trajectory::compute(&pairs, seed),
78        safety::compute(&pairs, seed),
79        verbosity::compute(&pairs, seed),
80        latency::compute(&pairs, seed),
81        cost::compute(&pairs, pricing, seed),
82        reasoning::compute(&pairs, seed),
83        AxisStat::empty(Axis::Judge),
84        conformance::compute(&pairs, seed),
85    ];
86    let first_divergence = alignment::detect(baseline, candidate);
87    let divergences = alignment::detect_top_k(baseline, candidate, alignment::DEFAULT_K);
88    let drill_down = drill_down::compute(&pairs, pricing, drill_down::DEFAULT_K);
89    let mut report = DiffReport {
90        rows,
91        baseline_trace_id: baseline.first().map(|r| r.id.clone()).unwrap_or_default(),
92        candidate_trace_id: candidate.first().map(|r| r.id.clone()).unwrap_or_default(),
93        pair_count: pairs.len(),
94        first_divergence,
95        divergences,
96        drill_down,
97        recommendations: Vec::new(),
98    };
99    // Recommendations are derived from the rest of the report, so fill
100    // the field last. Keeps the function ordering natural and avoids
101    // passing the half-built report around.
102    report.recommendations = recommendations::generate(&report);
103    report
104}
105
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    /// Build a synthetic trace: one metadata record followed by a
    /// request/response pair for each `(latency_ms, text)` entry.
    ///
    /// Each request's parent is the record pushed just before it, and each
    /// response's parent is its own request.
    fn make_trace(responses: Vec<(u64, &str)>) -> Vec<Record> {
        let mut trace = vec![Record::new(
            Kind::Metadata,
            json!({"sdk": {"name": "shadow"}}),
            "2026-04-21T10:00:00Z",
            None,
        )];
        for (i, (latency, text)) in responses.iter().enumerate() {
            let parent_id = trace.last().map(|previous| previous.id.clone());
            let request = Record::new(
                Kind::ChatRequest,
                json!({"model": "x", "messages": [{"role": "user", "content": format!("q{i}")}], "params": {}}),
                format!("2026-04-21T10:00:{:02}.000Z", i),
                parent_id,
            );
            let response = Record::new(
                Kind::ChatResponse,
                json!({
                    "model": "x",
                    "content": [{"type": "text", "text": text}],
                    "stop_reason": "end_turn",
                    "latency_ms": latency,
                    "usage": {"input_tokens": 10, "output_tokens": 5, "thinking_tokens": 0},
                }),
                format!("2026-04-21T10:00:{:02}.500Z", i),
                Some(request.id.clone()),
            );
            trace.extend([request, response]);
        }
        trace
    }

    /// The report has one row per axis, counts every pair, and reports a
    /// positive latency delta when the candidate latencies are higher.
    #[test]
    fn compute_report_shapes_to_nine_axes() {
        let baseline = make_trace(vec![(100, "yes"), (110, "ok"), (90, "sure")]);
        let candidate = make_trace(vec![(200, "yes"), (220, "ok"), (180, "sure")]);
        let pricing = cost::Pricing::new();

        let report = compute_report(&baseline, &candidate, &pricing, Some(42));

        assert_eq!(report.rows.len(), 9);
        assert_eq!(report.pair_count, 3);
        // Latency axis should show a delta.
        let latency_delta = report
            .rows
            .iter()
            .find(|row| row.axis == Axis::Latency)
            .map(|row| row.delta)
            .unwrap();
        assert!(latency_delta > 0.0);
    }

    /// Three baseline responses against two candidate responses give two pairs.
    #[test]
    fn extract_response_pairs_truncates_to_shorter() {
        let longer = make_trace(vec![(1, "a"), (2, "b"), (3, "c")]);
        let shorter = make_trace(vec![(1, "a"), (2, "b")]);
        assert_eq!(extract_response_pairs(&longer, &shorter).len(), 2);
    }
}