Skip to main content

shadow_core/diff/
report.rs

1//! Rendering of [`DiffReport`] to markdown and terminal.
2
3use std::fmt::Write as _;
4
5use serde::{Deserialize, Serialize};
6
7use crate::diff::alignment::FirstDivergence;
8use crate::diff::axes::{AxisStat, Severity};
9use crate::diff::drill_down::PairDrilldown;
10use crate::diff::recommendations::Recommendation;
11
12/// Top-level diff result.
13#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
14pub struct DiffReport {
15    /// Per-axis rows, in [`Axis::all`] order (nine entries).
16    pub rows: Vec<AxisStat>,
17    /// Content id of the baseline trace used to produce this report.
18    pub baseline_trace_id: String,
19    /// Content id of the candidate trace used to produce this report.
20    pub candidate_trace_id: String,
21    /// Number of paired responses the report is based on.
22    pub pair_count: usize,
23    /// The first turn at which the candidate meaningfully diverged from
24    /// the baseline, with a classification (style / decision / structural).
25    /// `None` when the two traces agree end-to-end.
26    ///
27    /// Preserved for backward compatibility; equivalent to the first
28    /// element of [`Self::divergences`] sorted by alignment order (not
29    /// by importance rank).
30    #[serde(default)]
31    pub first_divergence: Option<FirstDivergence>,
32    /// Top-ranked divergences between the two traces, sorted by
33    /// importance (Structural > Decision > Style by class, then by
34    /// confidence within a class). Empty when the traces agree
35    /// end-to-end. Capped at `DEFAULT_K` entries in standard reports;
36    /// renderers typically show the top 3 inline and hide the rest in
37    /// a collapsible details section.
38    #[serde(default)]
39    pub divergences: Vec<FirstDivergence>,
40    /// Prescriptive fix recommendations derived from the divergence
41    /// set and the axis rows. Sorted by severity (Error > Warning >
42    /// Info), capped at 8 entries. Every recommendation names a
43    /// specific action (Restore / Remove / Revert / Review / Verify)
44    /// and the turn it targets. Empty when nothing is actionable.
45    #[serde(default)]
46    pub recommendations: Vec<Recommendation>,
47    /// Top-K most-regressive response pairs with per-axis breakdown,
48    /// ranked by an aggregate regression score. Surfaces *which*
49    /// specific turns drove the aggregate axis deltas — without this,
50    /// a reviewer looking at a PR with many paired traces has to
51    /// hand-audit each pair. Empty when no pairs are in the report.
52    /// Capped at `drill_down::DEFAULT_K` entries.
53    #[serde(default)]
54    pub drill_down: Vec<PairDrilldown>,
55}
56
57impl DiffReport {
58    /// The highest severity observed across all axes.
59    pub fn worst_severity(&self) -> Severity {
60        self.rows
61            .iter()
62            .map(|r| r.severity)
63            .max()
64            .unwrap_or(Severity::None)
65    }
66
67    /// Render as a markdown table (one row per axis, columns as in
68    /// the nine axes in README.md).
69    pub fn to_markdown(&self) -> String {
70        let mut out = String::new();
71        writeln!(
72            out,
73            "# Shadow diff — {pair_count} response pair{s}\n",
74            pair_count = self.pair_count,
75            s = if self.pair_count == 1 { "" } else { "s" }
76        )
77        .ok();
78        writeln!(
79            out,
80            "| axis | baseline | candidate | delta | 95% CI | severity | flags |"
81        )
82        .ok();
83        writeln!(
84            out,
85            "|------|---------:|----------:|------:|--------|----------|-------|"
86        )
87        .ok();
88        for row in &self.rows {
89            let flags = if row.flags.is_empty() {
90                String::new()
91            } else {
92                row.flags
93                    .iter()
94                    .map(|f| f.label())
95                    .collect::<Vec<_>>()
96                    .join(",")
97            };
98            writeln!(
99                out,
100                "| {axis} | {bm:.3} | {cm:.3} | {d:+.3} | [{lo:+.3}, {hi:+.3}] | {sev} | {flags} |",
101                axis = row.axis.label(),
102                bm = row.baseline_median,
103                cm = row.candidate_median,
104                d = row.delta,
105                lo = row.ci95_low,
106                hi = row.ci95_high,
107                sev = row.severity.label(),
108            )
109            .ok();
110        }
111        writeln!(
112            out,
113            "\n**Worst severity:** `{}` &nbsp; · &nbsp; baseline `{}` &nbsp; · &nbsp; candidate `{}`",
114            self.worst_severity().label(),
115            short(&self.baseline_trace_id),
116            short(&self.candidate_trace_id),
117        )
118        .ok();
119        if let Some(fd) = &self.first_divergence {
120            writeln!(
121                out,
122                "\n### First divergence\n\n**Turn** baseline `#{}` ↔ candidate `#{}` &nbsp; · &nbsp; **Kind** `{}` &nbsp; · &nbsp; **Axis** `{}` &nbsp; · &nbsp; **Confidence** {:.0}%\n\n> {}",
123                fd.baseline_turn,
124                fd.candidate_turn,
125                fd.kind.label(),
126                fd.primary_axis.label(),
127                fd.confidence * 100.0,
128                fd.explanation,
129            )
130            .ok();
131        }
132        if !self.drill_down.is_empty() {
133            writeln!(out, "\n### Top regressive pairs").ok();
134            let shown = self.drill_down.len().min(3);
135            for row in &self.drill_down[..shown] {
136                writeln!(
137                    out,
138                    "\n- **pair `#{i}`** &nbsp;·&nbsp; dominant: `{axis}` &nbsp;·&nbsp; score `{score:.2}`",
139                    i = row.pair_index,
140                    axis = row.dominant_axis.label(),
141                    score = row.regression_score,
142                )
143                .ok();
144                let mut contributions: Vec<_> = row.axis_scores.iter().collect();
145                contributions.sort_by(|a, b| {
146                    b.normalized_delta
147                        .partial_cmp(&a.normalized_delta)
148                        .unwrap_or(std::cmp::Ordering::Equal)
149                });
150                for score in contributions.iter().take(2) {
151                    if score.normalized_delta < 0.05 {
152                        break;
153                    }
154                    writeln!(
155                        out,
156                        "  - `{axis}`: {bv:.2} → {cv:.2} &nbsp;(delta `{d:+.2}`, norm `{n:.2}`)",
157                        axis = score.axis.label(),
158                        bv = score.baseline_value,
159                        cv = score.candidate_value,
160                        d = score.delta,
161                        n = score.normalized_delta,
162                    )
163                    .ok();
164                }
165            }
166            if self.drill_down.len() > shown {
167                writeln!(
168                    out,
169                    "\n<details><summary>+ {} more regressive pair(s)</summary>\n",
170                    self.drill_down.len() - shown
171                )
172                .ok();
173                for row in &self.drill_down[shown..] {
174                    writeln!(
175                        out,
176                        "- pair `#{i}` &nbsp;·&nbsp; `{axis}` &nbsp;·&nbsp; score `{score:.2}`",
177                        i = row.pair_index,
178                        axis = row.dominant_axis.label(),
179                        score = row.regression_score,
180                    )
181                    .ok();
182                }
183                writeln!(out, "\n</details>").ok();
184            }
185        }
186        out
187    }
188
189    /// Render a plain-text table suitable for `stdout`.
190    pub fn to_terminal(&self) -> String {
191        let mut out = String::new();
192        writeln!(out, "Shadow diff — {} response pair(s)", self.pair_count).ok();
193        writeln!(out, "baseline : {}", self.baseline_trace_id).ok();
194        writeln!(out, "candidate: {}", self.candidate_trace_id).ok();
195        writeln!(out).ok();
196        writeln!(
197            out,
198            "{:<22} {:>10} {:>10} {:>10} {:>20} {:>10}  flags",
199            "axis", "baseline", "candidate", "delta", "95% CI", "severity"
200        )
201        .ok();
202        writeln!(out, "{}", "-".repeat(100)).ok();
203        for row in &self.rows {
204            let flags = if row.flags.is_empty() {
205                String::new()
206            } else {
207                row.flags
208                    .iter()
209                    .map(|f| f.label())
210                    .collect::<Vec<_>>()
211                    .join(",")
212            };
213            writeln!(
214                out,
215                "{axis:<22} {bm:>10.3} {cm:>10.3} {d:>+10.3} {ci:>20} {sev:>10}  {flags}",
216                axis = row.axis.label(),
217                bm = row.baseline_median,
218                cm = row.candidate_median,
219                d = row.delta,
220                ci = format!("[{:+.2}, {:+.2}]", row.ci95_low, row.ci95_high),
221                sev = row.severity.label(),
222            )
223            .ok();
224        }
225        writeln!(out, "\nworst severity: {}", self.worst_severity().label()).ok();
226        if let Some(fd) = &self.first_divergence {
227            writeln!(out).ok();
228            writeln!(
229                out,
230                "first divergence: baseline turn #{}  ↔  candidate turn #{}",
231                fd.baseline_turn, fd.candidate_turn,
232            )
233            .ok();
234            writeln!(
235                out,
236                "  kind: {}  ·  axis: {}  ·  confidence: {:.0}%",
237                fd.kind.label(),
238                fd.primary_axis.label(),
239                fd.confidence * 100.0,
240            )
241            .ok();
242            writeln!(out, "  explanation: {}", fd.explanation).ok();
243        }
244        if !self.drill_down.is_empty() {
245            writeln!(out).ok();
246            let shown = self.drill_down.len().min(3);
247            writeln!(
248                out,
249                "top regressive pairs ({shown} shown of {total}):",
250                total = self.drill_down.len(),
251            )
252            .ok();
253            for row in &self.drill_down[..shown] {
254                writeln!(
255                    out,
256                    "  pair #{i}  ·  dominant axis: {axis}  ·  score: {score:.2}",
257                    i = row.pair_index,
258                    axis = row.dominant_axis.label(),
259                    score = row.regression_score,
260                )
261                .ok();
262                // Show the top 2 contributing axes inline.
263                let mut contributions: Vec<_> = row.axis_scores.iter().collect();
264                contributions.sort_by(|a, b| {
265                    b.normalized_delta
266                        .partial_cmp(&a.normalized_delta)
267                        .unwrap_or(std::cmp::Ordering::Equal)
268                });
269                for score in contributions.iter().take(2) {
270                    if score.normalized_delta < 0.05 {
271                        break;
272                    }
273                    writeln!(
274                        out,
275                        "    {axis}: {bv:.2} → {cv:.2} (delta {d:+.2}, norm {n:.2})",
276                        axis = score.axis.label(),
277                        bv = score.baseline_value,
278                        cv = score.candidate_value,
279                        d = score.delta,
280                        n = score.normalized_delta,
281                    )
282                    .ok();
283                }
284            }
285            if self.drill_down.len() > shown {
286                writeln!(out, "  +{} more", self.drill_down.len() - shown).ok();
287            }
288        }
289        out
290    }
291}
292
/// Abbreviate a long id for display as `<head>…<tail>`.
///
/// Ids of 16 bytes or fewer are returned unchanged. Longer ids keep
/// their first 12 and last 4 bytes, joined by a single `…`.
///
/// The cut points are snapped to UTF-8 character boundaries (the head
/// shrinks, the tail shrinks) so that an id containing multi-byte
/// characters is shortened instead of panicking on a mid-character
/// slice. For ASCII ids the result is exactly the 12 + 4 byte form.
fn short(id: &str) -> String {
    if id.len() <= 16 {
        return id.to_string();
    }

    // Walk the head cut left until it lands on a boundary; index 0 is
    // always one, so this terminates.
    let mut head_end = 12;
    while !id.is_char_boundary(head_end) {
        head_end -= 1;
    }

    // Walk the tail cut right until it lands on a boundary; `id.len()`
    // is always one, so this terminates.
    let mut tail_start = id.len() - 4;
    while !id.is_char_boundary(tail_start) {
        tail_start += 1;
    }

    format!("{}…{}", &id[..head_end], &id[tail_start..])
}
300
#[cfg(test)]
mod tests {
    use super::*;
    use crate::diff::axes::Axis;

    /// A report with one identical `Minor` row per axis, two long
    /// `sha256:` ids, ten pairs, and no divergence, recommendation or
    /// drill-down data.
    fn fake_report() -> DiffReport {
        let mut rows = Vec::new();
        for axis in Axis::all().iter() {
            rows.push(AxisStat {
                axis: *axis,
                baseline_median: 1.0,
                candidate_median: 1.1,
                delta: 0.1,
                ci95_low: 0.05,
                ci95_high: 0.15,
                severity: Severity::Minor,
                n: 10,
                flags: Vec::new(),
            });
        }

        let baseline_trace_id =
            "sha256:aaaa0000bbbb1111cccc2222dddd3333eeee4444ffff5555aaaa6666bbbb".to_string();
        let candidate_trace_id =
            "sha256:0000aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666aaaa7777".to_string();

        DiffReport {
            rows,
            baseline_trace_id,
            candidate_trace_id,
            pair_count: 10,
            first_divergence: None,
            divergences: Vec::new(),
            recommendations: Vec::new(),
            drill_down: Vec::new(),
        }
    }

    /// The markdown table has a header and exactly one body row for
    /// each of the nine expected axis labels.
    #[test]
    fn markdown_has_nine_rows_plus_header() {
        let rendered = fake_report().to_markdown();
        assert!(rendered.contains("| axis |"));

        // Row prefixes are spelled out so a renamed label fails here.
        let prefixes = [
            "| semantic",
            "| tool-call",
            "| refusal",
            "| verbosity",
            "| latency",
            "| cost",
            "| reasoning",
            "| llm-judge",
            "| format",
        ];
        let row_count: usize = prefixes
            .iter()
            .map(|prefix| {
                rendered
                    .lines()
                    .filter(|line| line.starts_with(prefix))
                    .count()
            })
            .sum();
        assert_eq!(row_count, 9);
    }

    /// Every axis label appears somewhere in the terminal rendering.
    #[test]
    fn terminal_renders_all_axes() {
        let rendered = fake_report().to_terminal();
        Axis::all().iter().for_each(|axis| {
            assert!(rendered.contains(axis.label()), "missing axis {:?}", axis);
        });
    }

    /// `worst_severity` reports the maximum, not the first or the last.
    #[test]
    fn worst_severity_picks_highest() {
        let mut report = fake_report();
        report.rows[0].severity = Severity::Moderate;
        report.rows[3].severity = Severity::Severe;
        assert_eq!(report.worst_severity(), Severity::Severe);
    }

    /// Serializing to JSON and back yields an equal report.
    #[test]
    fn roundtrip_through_serde_json() {
        let original = fake_report();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DiffReport = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, original);
    }
}