shadow_core/diff/
mod.rs

1//! Nine-axis behavioral differ, bootstrap CI, and report renderers.
2//!
3//! See README.md "The nine axes" for the list and SPEC §Replay for what
4//! "diverges" means in this context.
5//!
6//! Usage:
7//! ```no_run
8//! # use shadow_core::agentlog::Record;
9//! # use shadow_core::diff;
10//! # fn demo(baseline: Vec<Record>, candidate: Vec<Record>) {
11//! let pricing = diff::cost::Pricing::new();
12//! let report = diff::compute_report(&baseline, &candidate, &pricing, Some(42));
13//! println!("{}", report.to_terminal());
14//! # }
15//! ```
16
17use crate::agentlog::{Kind, Record};
18
19pub mod alignment;
20pub mod axes;
21pub mod bootstrap;
22pub mod conformance;
23pub mod cost;
24pub mod drill_down;
25pub mod embedder;
26pub mod judge;
27pub mod latency;
28pub mod reasoning;
29pub mod recommendations;
30pub mod report;
31pub mod safety;
32pub mod semantic;
33pub mod trajectory;
34pub mod verbosity;
35
36pub use alignment::{DivergenceKind, FirstDivergence};
37pub use axes::{Axis, AxisStat, Severity};
38pub use bootstrap::{paired_ci, CiResult};
39pub use drill_down::{PairAxisScore, PairDrilldown};
40pub use recommendations::{ActionKind, Recommendation, RecommendationSeverity};
41pub use report::DiffReport;
42
43/// Extract (baseline_response, candidate_response) pairs by pairing the
44/// i-th `chat_response` in `baseline` with the i-th in `candidate`.
45///
46/// If the counts differ (e.g. candidate had backend errors), truncate to
47/// the shorter of the two. Callers that need divergence-on-count should
48/// consult the `replay_summary` record directly.
49pub fn extract_response_pairs<'a>(
50    baseline: &'a [Record],
51    candidate: &'a [Record],
52) -> Vec<(&'a Record, &'a Record)> {
53    let b: Vec<&Record> = baseline
54        .iter()
55        .filter(|r| r.kind == Kind::ChatResponse)
56        .collect();
57    let c: Vec<&Record> = candidate
58        .iter()
59        .filter(|r| r.kind == Kind::ChatResponse)
60        .collect();
61    b.into_iter().zip(c).collect()
62}
63
64/// Stable trace identifier for a `.agentlog` record stream.
65///
66/// Two-step resolution:
67///
68/// 1. **Envelope `meta.trace_id`** — preferred. The Python SDK's
69///    `Session` mints a unique 128-bit hex `trace_id` per instance and
70///    stamps it on every record's envelope `meta`. Envelope meta is
71///    deliberately *not* part of the content hash (SPEC §6), so this
72///    stays unique even when two sessions emit byte-identical
73///    metadata payloads.
74///
75/// 2. **First record's content id** — fallback. Used for traces that
76///    don't carry envelope-level `meta.trace_id` (third-party
77///    OpenTelemetry imports, hand-constructed fixtures, traces from
78///    SDK versions older than v1.x). The `id` field of the first
79///    record is the SHA-256 of its canonical payload, so this is
80///    stable for any given input but can collide across runs whose
81///    metadata payloads happen to match exactly — which is the case
82///    `meta.trace_id` exists to prevent.
83///
84/// Returns `String::new()` for an empty record list.
85fn trace_id_for(records: &[Record]) -> String {
86    records
87        .iter()
88        .find_map(|r| {
89            r.meta
90                .as_ref()
91                .and_then(|m| m.get("trace_id"))
92                .and_then(|v| v.as_str())
93                .map(str::to_string)
94        })
95        .or_else(|| records.first().map(|r| r.id.clone()))
96        .unwrap_or_default()
97}
98
99/// Compute a [`DiffReport`] from a baseline and candidate trace.
100///
101/// The Judge axis is set to `empty(Axis::Judge)` because no Judge is
102/// supplied here; the Python layer plugs in a Judge via `compute_report_with_judge`.
103pub fn compute_report(
104    baseline: &[Record],
105    candidate: &[Record],
106    pricing: &cost::Pricing,
107    seed: Option<u64>,
108) -> DiffReport {
109    let pairs = extract_response_pairs(baseline, candidate);
110    let rows = vec![
111        semantic::compute(&pairs, seed),
112        trajectory::compute(&pairs, seed),
113        safety::compute(&pairs, seed),
114        verbosity::compute(&pairs, seed),
115        latency::compute(&pairs, seed),
116        cost::compute(&pairs, pricing, seed),
117        reasoning::compute(&pairs, seed),
118        AxisStat::empty(Axis::Judge),
119        conformance::compute(&pairs, seed),
120    ];
121    let first_divergence = alignment::detect(baseline, candidate);
122    let divergences = alignment::detect_top_k(baseline, candidate, alignment::DEFAULT_K);
123    let drill_down = drill_down::compute(&pairs, pricing, drill_down::DEFAULT_K);
124    let mut report = DiffReport {
125        rows,
126        baseline_trace_id: trace_id_for(baseline),
127        candidate_trace_id: trace_id_for(candidate),
128        pair_count: pairs.len(),
129        first_divergence,
130        divergences,
131        drill_down,
132        recommendations: Vec::new(),
133    };
134    // Recommendations are derived from the rest of the report, so fill
135    // the field last. Keeps the function ordering natural and avoids
136    // passing the half-built report around.
137    report.recommendations = recommendations::generate(&report);
138    report
139}
140
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    /// Build a synthetic trace: one metadata record followed by a
    /// `chat_request` / `chat_response` pair per `(latency_ms, text)`
    /// entry in `responses`.
    ///
    /// The last argument to `Record::new` is the id of the record
    /// emitted just before a request, and the request's own id for its
    /// response (presumably the parent link; see `Record::new`).
    fn make_trace(responses: Vec<(u64, &str)>) -> Vec<Record> {
        let mut trace = vec![Record::new(
            Kind::Metadata,
            json!({"sdk": {"name": "shadow"}}),
            "2026-04-21T10:00:00Z",
            None,
        )];
        for (turn, &(latency_ms, reply)) in responses.iter().enumerate() {
            let previous_id = trace.last().map(|prev| prev.id.clone());
            let request = Record::new(
                Kind::ChatRequest,
                json!({"model": "x", "messages": [{"role": "user", "content": format!("q{turn}")}], "params": {}}),
                format!("2026-04-21T10:00:{:02}.000Z", turn),
                previous_id,
            );
            let request_id = request.id.clone();
            let response = Record::new(
                Kind::ChatResponse,
                json!({
                    "model": "x",
                    "content": [{"type": "text", "text": reply}],
                    "stop_reason": "end_turn",
                    "latency_ms": latency_ms,
                    "usage": {"input_tokens": 10, "output_tokens": 5, "thinking_tokens": 0},
                }),
                format!("2026-04-21T10:00:{:02}.500Z", turn),
                Some(request_id),
            );
            trace.push(request);
            trace.push(response);
        }
        trace
    }

    #[test]
    fn compute_report_shapes_to_nine_axes() {
        let pricing = cost::Pricing::new();
        let baseline = make_trace(vec![(100, "yes"), (110, "ok"), (90, "sure")]);
        let candidate = make_trace(vec![(200, "yes"), (220, "ok"), (180, "sure")]);

        let report = compute_report(&baseline, &candidate, &pricing, Some(42));

        assert_eq!(report.rows.len(), 9);
        assert_eq!(report.pair_count, 3);

        // The candidate is slower on every turn, so the latency axis
        // must report a positive delta.
        let latency_delta = report
            .rows
            .iter()
            .find(|row| row.axis == Axis::Latency)
            .map(|row| row.delta)
            .unwrap();
        assert!(latency_delta > 0.0);
    }

    #[test]
    fn extract_response_pairs_truncates_to_shorter() {
        let longer = make_trace(vec![(1, "a"), (2, "b"), (3, "c")]);
        let shorter = make_trace(vec![(1, "a"), (2, "b")]);
        assert_eq!(extract_response_pairs(&longer, &shorter).len(), 2);
    }

    #[test]
    fn trace_ids_use_envelope_meta_to_avoid_payload_collisions() {
        // Regression guard. Both traces share a byte-identical metadata
        // payload (no tags tell them apart) and differ only in their
        // envelope-level meta.trace_id. A report that derives
        // `baseline_trace_id` / `candidate_trace_id` from `Record.id`
        // (the payload's content hash) collides on every such pair.
        //
        // The report must instead prefer the envelope's `meta.trace_id`,
        // which the Python SDK Session mints uniquely per instance.
        // Envelope meta is outside the content hash, so it stays unique
        // even when payloads match exactly.
        fn stamp_all(trace: Vec<Record>, trace_id: &str) -> Vec<Record> {
            trace
                .into_iter()
                .map(|mut rec| {
                    let mut envelope = serde_json::Map::new();
                    envelope.insert("trace_id".into(), json!(trace_id));
                    rec.meta = Some(envelope);
                    rec
                })
                .collect()
        }
        let baseline = stamp_all(make_trace(vec![(1, "hello")]), "trace-aaaa");
        let candidate = stamp_all(make_trace(vec![(2, "hello")]), "trace-bbbb");

        // Sanity: the first record's content id is the same on both
        // sides, i.e. this really is the collision scenario.
        assert_eq!(baseline[0].id, candidate[0].id);

        let pricing = cost::Pricing::new();
        let report = compute_report(&baseline, &candidate, &pricing, Some(42));

        assert_eq!(report.baseline_trace_id, "trace-aaaa");
        assert_eq!(report.candidate_trace_id, "trace-bbbb");
        assert_ne!(report.baseline_trace_id, report.candidate_trace_id);
    }

    #[test]
    fn trace_id_falls_back_to_first_record_id_when_meta_missing() {
        // With no envelope meta.trace_id anywhere (third-party imports,
        // hand-constructed fixtures, pre-1.0 SDK output) the report
        // uses the first record's content id, which keeps backward
        // compatibility for traces lacking a Session-stamped envelope.
        let pricing = cost::Pricing::new();
        let baseline = make_trace(vec![(1, "hello")]);
        let candidate = make_trace(vec![(2, "world")]);

        let report = compute_report(&baseline, &candidate, &pricing, Some(42));

        // The metadata payloads are identical, so the two ids collide
        // here; that is the documented behaviour of the fallback.
        assert_eq!(report.baseline_trace_id, baseline[0].id);
        assert_eq!(report.candidate_trace_id, candidate[0].id);
    }
}
shadow_core/diff/mod.rs

shadow_core/diff/
mod.rs