1use std::fmt::Write as _;
4
5use serde::{Deserialize, Serialize};
6
7use crate::diff::alignment::FirstDivergence;
8use crate::diff::axes::{AxisStat, Severity};
9use crate::diff::drill_down::PairDrilldown;
10use crate::diff::recommendations::Recommendation;
11
/// Result of comparing a baseline trace against a candidate trace.
///
/// The report is serde-serializable; the fields marked `#[serde(default)]`
/// may be absent from serialized input and then deserialize to `None` or an
/// empty vector.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DiffReport {
    /// Per-axis statistics; each entry becomes one table row in
    /// `to_markdown` and `to_terminal`.
    pub rows: Vec<AxisStat>,
    /// Identifier of the baseline trace (shortened in the Markdown summary,
    /// printed in full by the terminal renderer).
    pub baseline_trace_id: String,
    /// Identifier of the candidate trace (shortened in the Markdown summary,
    /// printed in full by the terminal renderer).
    pub candidate_trace_id: String,
    /// Number of response pairs, as shown in the rendered report heading.
    pub pair_count: usize,
    /// Divergence highlighted in the "first divergence" section of both
    /// renderers; `None` omits that section.
    #[serde(default)]
    pub first_divergence: Option<FirstDivergence>,
    /// Collection of divergences; not rendered by the methods in this file.
    // NOTE(review): presumably a superset that includes `first_divergence` —
    // confirm against the code that produces the report.
    #[serde(default)]
    pub divergences: Vec<FirstDivergence>,
    /// Recommendations attached to the report; not rendered by the methods in
    /// this file.
    #[serde(default)]
    pub recommendations: Vec<Recommendation>,
    /// Per-pair drill-down entries. The renderers show the first three in
    /// stored order and summarize the rest.
    // NOTE(review): presumably sorted most-regressive first — confirm against
    // the code that produces the report.
    #[serde(default)]
    pub drill_down: Vec<PairDrilldown>,
}
56
57impl DiffReport {
58 pub fn worst_severity(&self) -> Severity {
60 self.rows
61 .iter()
62 .map(|r| r.severity)
63 .max()
64 .unwrap_or(Severity::None)
65 }
66
67 pub fn to_markdown(&self) -> String {
70 let mut out = String::new();
71 writeln!(
72 out,
73 "# Shadow diff — {pair_count} response pair{s}\n",
74 pair_count = self.pair_count,
75 s = if self.pair_count == 1 { "" } else { "s" }
76 )
77 .ok();
78 writeln!(
79 out,
80 "| axis | baseline | candidate | delta | 95% CI | severity | flags |"
81 )
82 .ok();
83 writeln!(
84 out,
85 "|------|---------:|----------:|------:|--------|----------|-------|"
86 )
87 .ok();
88 for row in &self.rows {
89 let flags = if row.flags.is_empty() {
90 String::new()
91 } else {
92 row.flags
93 .iter()
94 .map(|f| f.label())
95 .collect::<Vec<_>>()
96 .join(",")
97 };
98 writeln!(
99 out,
100 "| {axis} | {bm:.3} | {cm:.3} | {d:+.3} | [{lo:+.3}, {hi:+.3}] | {sev} | {flags} |",
101 axis = row.axis.label(),
102 bm = row.baseline_median,
103 cm = row.candidate_median,
104 d = row.delta,
105 lo = row.ci95_low,
106 hi = row.ci95_high,
107 sev = row.severity.label(),
108 )
109 .ok();
110 }
111 writeln!(
112 out,
113 "\n**Worst severity:** `{}` · baseline `{}` · candidate `{}`",
114 self.worst_severity().label(),
115 short(&self.baseline_trace_id),
116 short(&self.candidate_trace_id),
117 )
118 .ok();
119 if let Some(fd) = &self.first_divergence {
120 writeln!(
121 out,
122 "\n### First divergence\n\n**Turn** baseline `#{}` ↔ candidate `#{}` · **Kind** `{}` · **Axis** `{}` · **Confidence** {:.0}%\n\n> {}",
123 fd.baseline_turn,
124 fd.candidate_turn,
125 fd.kind.label(),
126 fd.primary_axis.label(),
127 fd.confidence * 100.0,
128 fd.explanation,
129 )
130 .ok();
131 }
132 if !self.drill_down.is_empty() {
133 writeln!(out, "\n### Top regressive pairs").ok();
134 let shown = self.drill_down.len().min(3);
135 for row in &self.drill_down[..shown] {
136 writeln!(
137 out,
138 "\n- **pair `#{i}`** · dominant: `{axis}` · score `{score:.2}`",
139 i = row.pair_index,
140 axis = row.dominant_axis.label(),
141 score = row.regression_score,
142 )
143 .ok();
144 let mut contributions: Vec<_> = row.axis_scores.iter().collect();
145 contributions.sort_by(|a, b| {
146 b.normalized_delta
147 .partial_cmp(&a.normalized_delta)
148 .unwrap_or(std::cmp::Ordering::Equal)
149 });
150 for score in contributions.iter().take(2) {
151 if score.normalized_delta < 0.05 {
152 break;
153 }
154 writeln!(
155 out,
156 " - `{axis}`: {bv:.2} → {cv:.2} (delta `{d:+.2}`, norm `{n:.2}`)",
157 axis = score.axis.label(),
158 bv = score.baseline_value,
159 cv = score.candidate_value,
160 d = score.delta,
161 n = score.normalized_delta,
162 )
163 .ok();
164 }
165 }
166 if self.drill_down.len() > shown {
167 writeln!(
168 out,
169 "\n<details><summary>+ {} more regressive pair(s)</summary>\n",
170 self.drill_down.len() - shown
171 )
172 .ok();
173 for row in &self.drill_down[shown..] {
174 writeln!(
175 out,
176 "- pair `#{i}` · `{axis}` · score `{score:.2}`",
177 i = row.pair_index,
178 axis = row.dominant_axis.label(),
179 score = row.regression_score,
180 )
181 .ok();
182 }
183 writeln!(out, "\n</details>").ok();
184 }
185 }
186 out
187 }
188
189 pub fn to_terminal(&self) -> String {
191 let mut out = String::new();
192 writeln!(out, "Shadow diff — {} response pair(s)", self.pair_count).ok();
193 writeln!(out, "baseline : {}", self.baseline_trace_id).ok();
194 writeln!(out, "candidate: {}", self.candidate_trace_id).ok();
195 writeln!(out).ok();
196 writeln!(
197 out,
198 "{:<22} {:>10} {:>10} {:>10} {:>20} {:>10} flags",
199 "axis", "baseline", "candidate", "delta", "95% CI", "severity"
200 )
201 .ok();
202 writeln!(out, "{}", "-".repeat(100)).ok();
203 for row in &self.rows {
204 let flags = if row.flags.is_empty() {
205 String::new()
206 } else {
207 row.flags
208 .iter()
209 .map(|f| f.label())
210 .collect::<Vec<_>>()
211 .join(",")
212 };
213 writeln!(
214 out,
215 "{axis:<22} {bm:>10.3} {cm:>10.3} {d:>+10.3} {ci:>20} {sev:>10} {flags}",
216 axis = row.axis.label(),
217 bm = row.baseline_median,
218 cm = row.candidate_median,
219 d = row.delta,
220 ci = format!("[{:+.2}, {:+.2}]", row.ci95_low, row.ci95_high),
221 sev = row.severity.label(),
222 )
223 .ok();
224 }
225 writeln!(out, "\nworst severity: {}", self.worst_severity().label()).ok();
226 if let Some(fd) = &self.first_divergence {
227 writeln!(out).ok();
228 writeln!(
229 out,
230 "first divergence: baseline turn #{} ↔ candidate turn #{}",
231 fd.baseline_turn, fd.candidate_turn,
232 )
233 .ok();
234 writeln!(
235 out,
236 " kind: {} · axis: {} · confidence: {:.0}%",
237 fd.kind.label(),
238 fd.primary_axis.label(),
239 fd.confidence * 100.0,
240 )
241 .ok();
242 writeln!(out, " explanation: {}", fd.explanation).ok();
243 }
244 if !self.drill_down.is_empty() {
245 writeln!(out).ok();
246 let shown = self.drill_down.len().min(3);
247 writeln!(
248 out,
249 "top regressive pairs ({shown} shown of {total}):",
250 total = self.drill_down.len(),
251 )
252 .ok();
253 for row in &self.drill_down[..shown] {
254 writeln!(
255 out,
256 " pair #{i} · dominant axis: {axis} · score: {score:.2}",
257 i = row.pair_index,
258 axis = row.dominant_axis.label(),
259 score = row.regression_score,
260 )
261 .ok();
262 let mut contributions: Vec<_> = row.axis_scores.iter().collect();
264 contributions.sort_by(|a, b| {
265 b.normalized_delta
266 .partial_cmp(&a.normalized_delta)
267 .unwrap_or(std::cmp::Ordering::Equal)
268 });
269 for score in contributions.iter().take(2) {
270 if score.normalized_delta < 0.05 {
271 break;
272 }
273 writeln!(
274 out,
275 " {axis}: {bv:.2} → {cv:.2} (delta {d:+.2}, norm {n:.2})",
276 axis = score.axis.label(),
277 bv = score.baseline_value,
278 cv = score.candidate_value,
279 d = score.delta,
280 n = score.normalized_delta,
281 )
282 .ok();
283 }
284 }
285 if self.drill_down.len() > shown {
286 writeln!(out, " +{} more", self.drill_down.len() - shown).ok();
287 }
288 }
289 out
290 }
291}
292
/// Abbreviates a long identifier for display.
///
/// Identifiers of at most 16 bytes are returned unchanged. Longer ones are
/// rendered as the first 12 bytes, an ellipsis (`…`), and the last 4 bytes.
///
/// The cut points are moved to the nearest UTF-8 character boundary (the head
/// shrinks, the tail shrinks) so that identifiers containing multi-byte
/// characters never cause a slicing panic. For ASCII identifiers the result is
/// exactly "first 12 + … + last 4".
fn short(id: &str) -> String {
    const HEAD_BYTES: usize = 12;
    const TAIL_BYTES: usize = 4;

    if id.len() <= HEAD_BYTES + TAIL_BYTES {
        return id.to_string();
    }

    // Index 0 is always a boundary, so walking left terminates.
    let mut head_end = HEAD_BYTES;
    while !id.is_char_boundary(head_end) {
        head_end -= 1;
    }

    // Index `id.len()` is always a boundary, so walking right terminates.
    let mut tail_start = id.len() - TAIL_BYTES;
    while !id.is_char_boundary(tail_start) {
        tail_start += 1;
    }

    format!("{}…{}", &id[..head_end], &id[tail_start..])
}
300
#[cfg(test)]
mod tests {
    use super::*;
    use crate::diff::axes::Axis;

    /// Builds a report with one `Minor` row per axis, long `sha256:` trace
    /// ids, ten pairs, and every optional section left empty.
    fn fake_report() -> DiffReport {
        let rows: Vec<AxisStat> = Axis::all()
            .iter()
            .copied()
            .map(|axis| AxisStat {
                axis,
                baseline_median: 1.0,
                candidate_median: 1.1,
                delta: 0.1,
                ci95_low: 0.05,
                ci95_high: 0.15,
                severity: Severity::Minor,
                n: 10,
                flags: Vec::new(),
            })
            .collect();
        let baseline_trace_id =
            "sha256:aaaa0000bbbb1111cccc2222dddd3333eeee4444ffff5555aaaa6666bbbb".to_string();
        let candidate_trace_id =
            "sha256:0000aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666aaaa7777".to_string();

        DiffReport {
            rows,
            baseline_trace_id,
            candidate_trace_id,
            pair_count: 10,
            first_divergence: None,
            divergences: Vec::new(),
            recommendations: Vec::new(),
            drill_down: Vec::new(),
        }
    }

    #[test]
    fn markdown_has_nine_rows_plus_header() {
        // Line prefixes of the nine axis rows expected in the table.
        const ROW_PREFIXES: [&str; 9] = [
            "| semantic",
            "| tool-call",
            "| refusal",
            "| verbosity",
            "| latency",
            "| cost",
            "| reasoning",
            "| llm-judge",
            "| format",
        ];

        let rendered = fake_report().to_markdown();
        assert!(rendered.contains("| axis |"));

        // Count matching lines per prefix and add the counts together.
        let row_count: usize = ROW_PREFIXES
            .iter()
            .map(|prefix| {
                rendered
                    .lines()
                    .filter(|line| line.starts_with(*prefix))
                    .count()
            })
            .sum();
        assert_eq!(row_count, 9);
    }

    #[test]
    fn terminal_renders_all_axes() {
        let rendered = fake_report().to_terminal();
        for axis in Axis::all() {
            let label = axis.label();
            assert!(rendered.contains(label), "missing axis {:?}", axis);
        }
    }

    #[test]
    fn worst_severity_picks_highest() {
        let mut report = fake_report();
        // Two rows are raised above the default `Minor`; the higher must win.
        report.rows[0].severity = Severity::Moderate;
        report.rows[3].severity = Severity::Severe;
        assert_eq!(report.worst_severity(), Severity::Severe);
    }

    #[test]
    fn roundtrip_through_serde_json() {
        let original = fake_report();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DiffReport = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, original);
    }
}