// shadow_core/diff/alignment.rs
1//! First-divergence detection over paired chat responses.
2//!
3//! Given two traces, this module identifies the **first turn at which
4//! the candidate meaningfully diverged from the baseline** and classifies
5//! the divergence as one of three kinds:
6//!
7//! - **Structural** — the tool-call sequence differs (missing, extra, or
8//!   reordered calls). Surfaces as a gap in the alignment, OR a same-
9//!   position pair with different `tool_name`s.
10//! - **Decision** — the tool sequence matches but the *decision* changed:
11//!   same tool, different arg values; final-answer semantic cosine < 0.8;
12//!   `stop_reason` flipped; refusal where there wasn't one.
13//! - **Style** — cosmetic wording differences only; semantic cosine ≥ 0.9,
14//!   identical tool shape, identical stop_reason.
15//!
16//! ## Algorithm
17//!
18//! A Needleman-Wunsch global alignment with Gotoh affine gap penalties
19//! pairs baseline and candidate chat_response records. The cost for
20//! aligning pair `(a, b)` is:
21//!
//! ```text
//!   cost(a, b) = w_struct * (1 - jaccard(tool_shape_a, tool_shape_b))
//!              + w_sem    * (1 - text_similarity(a, b))
//!              + w_stop   * stop_reason_mismatch(a, b)
//!              + w_args   * arg_value_mismatch(a, b)
//! ```
27//!
28//! After alignment, we walk the alignment path left-to-right and emit
29//! the first cell whose per-cell divergence exceeds the noise floor.
30//!
31//! ## Why NW and not position-match
32//!
33//! Position-match fails when one side inserts or drops a turn (common
34//! when one config retries where the other doesn't). NW pays a
35//! controlled gap cost instead of mis-pairing every subsequent turn.
36//! Cost is O(n·m) in DP cells; traces rarely exceed ~100 turns, so
37//! runtime is trivial in practice.
38
39use serde::{Deserialize, Serialize};
40use std::collections::BTreeSet;
41
42use crate::agentlog::{Kind, Record};
43use crate::diff::axes::Axis;
44
/// Classification of the first divergence between two traces.
///
/// Serialises with a consistent `_drift` suffix across Rust's `label()`,
/// serde JSON output, and Python-side string representation so the same
/// value appears identically everywhere a consumer might see it.
///
/// Variants are declared in increasing order of severity; `kind_rank`
/// (below) encodes the same ordering numerically for ranking.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DivergenceKind {
    /// Cosmetic wording only: semantic similarity high, tool shape
    /// identical, stop reason identical. Safe-to-merge signal, usually.
    #[serde(rename = "style_drift")]
    Style,
    /// Same structure, different decision: arg values differ, refusal
    /// flipped, or final-answer semantics shifted meaningfully.
    #[serde(rename = "decision_drift")]
    Decision,
    /// Tool-call sequence differs: insertion, deletion, or reorder.
    /// This is almost always a real behavioural regression.
    #[serde(rename = "structural_drift")]
    Structural,
}
65
66impl DivergenceKind {
67    /// Short machine-readable label used in terminal / markdown / JSON output.
68    pub fn label(&self) -> &'static str {
69        match self {
70            DivergenceKind::Style => "style_drift",
71            DivergenceKind::Decision => "decision_drift",
72            DivergenceKind::Structural => "structural_drift",
73        }
74    }
75}
76
/// First meaningful divergence between two traces.
///
/// Produced by `detect` / `detect_top_k`. All turn indices refer to the
/// filtered sequence of `chat_response` records, not raw record positions.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FirstDivergence {
    /// 0-based index in the baseline's chat_response sequence. For an
    /// insertion on the candidate side, this is the baseline index
    /// where the insertion appeared (effectively a "between" marker).
    pub baseline_turn: usize,
    /// Same for the candidate side. May differ from baseline_turn when
    /// gaps are present on the alignment path.
    pub candidate_turn: usize,
    /// Classification (Style / Decision / Structural).
    pub kind: DivergenceKind,
    /// Primary axis the divergence surfaces on (semantic, trajectory,
    /// safety, conformance). Provides a machine-readable hint for
    /// grouping regressions by root cause.
    pub primary_axis: Axis,
    /// One-line human-readable explanation. Designed to be embeddable
    /// in a PR comment without additional context.
    pub explanation: String,
    /// Confidence in 0..1. Higher means "the signal exceeds the noise
    /// floor by a wide margin". Callers can gate display on >= 0.5.
    /// Gap-step divergences (inserted / dropped turns) always carry
    /// 1.0; matched-pair divergences scale with how far the pair cost
    /// clears the noise floor (see `walk_collecting`).
    pub confidence: f64,
}
100
// ---------------------------------------------------------------------------
// Public entry point
// ---------------------------------------------------------------------------

/// Default number of top-ranked divergences returned by [`detect_top_k`].
/// The markdown / terminal renderers show the top 3; the full list goes
/// to the JSON output. Users can override via the explicit `k` parameter.
/// Note this is a calling convention only: `detect_top_k` takes `k`
/// explicitly and does not fall back to this value itself.
pub const DEFAULT_K: usize = 5;
109
110/// Detect the first meaningful divergence between two traces.
111///
112/// Returns `None` when the traces agree on every compared turn up to the
113/// length of the shorter one (and the longer tail is empty or also
114/// matches). Returns `Some` at the first pair whose combined per-cell
115/// cost exceeds the noise floor.
116///
117/// This is a thin convenience wrapper around [`detect_top_k`] that
118/// returns only the highest-ranked divergence by walk order. Callers
119/// who want multi-fork coverage should use `detect_top_k` directly.
120pub fn detect(baseline: &[Record], candidate: &[Record]) -> Option<FirstDivergence> {
121    // The original "first divergence" is literally the first cell on
122    // the alignment walk that exceeds the noise floor — i.e. rank=0
123    // in walk order, NOT in severity-weighted rank. We preserve that
124    // semantic here for backward compatibility by doing a single-step
125    // walk instead of sorting top-K.
126    let baseline_responses: Vec<&Record> = baseline
127        .iter()
128        .filter(|r| r.kind == Kind::ChatResponse)
129        .collect();
130    let candidate_responses: Vec<&Record> = candidate
131        .iter()
132        .filter(|r| r.kind == Kind::ChatResponse)
133        .collect();
134    if baseline_responses.is_empty() || candidate_responses.is_empty() {
135        return None;
136    }
137    let alignment = align(&baseline_responses, &candidate_responses);
138    walk_collecting(&alignment, &baseline_responses, &candidate_responses, 1)
139        .into_iter()
140        .next()
141}
142
143/// Detect up to `k` meaningful divergences between two traces, sorted
144/// by importance (kind severity × confidence, descending).
145///
146/// Returns an empty vec when the traces agree end-to-end. Returns at
147/// most `k` results; fewer if the walk produces fewer above-noise cells.
148/// Pass `k = DEFAULT_K` for the standard top-5.
149///
150/// **Ranking:** Structural > Decision > Style (by class), then by
151/// `confidence` within a class. This surfaces the most actionable
152/// regression first, not just the earliest. Walk order is preserved
153/// as a stable tiebreaker so identical-severity events are reported
154/// in temporal order (earlier turns before later ones).
155pub fn detect_top_k(baseline: &[Record], candidate: &[Record], k: usize) -> Vec<FirstDivergence> {
156    if k == 0 {
157        return Vec::new();
158    }
159    let baseline_responses: Vec<&Record> = baseline
160        .iter()
161        .filter(|r| r.kind == Kind::ChatResponse)
162        .collect();
163    let candidate_responses: Vec<&Record> = candidate
164        .iter()
165        .filter(|r| r.kind == Kind::ChatResponse)
166        .collect();
167    if baseline_responses.is_empty() || candidate_responses.is_empty() {
168        return Vec::new();
169    }
170    let alignment = align(&baseline_responses, &candidate_responses);
171    // Collect ALL above-noise divergences in walk order (there can't
172    // be more than baseline.len() + candidate.len() of them).
173    let max_possible = baseline_responses.len() + candidate_responses.len();
174    let mut all = walk_collecting(
175        &alignment,
176        &baseline_responses,
177        &candidate_responses,
178        max_possible,
179    );
180    // Stable sort by (kind rank desc, confidence desc, walk-order asc).
181    // Walk-order is captured by the current vec position (index), so we
182    // use a stable sort and only key on the two explicit ranks.
183    all.sort_by(|a, b| {
184        kind_rank(b.kind).cmp(&kind_rank(a.kind)).then_with(|| {
185            b.confidence
186                .partial_cmp(&a.confidence)
187                .unwrap_or(std::cmp::Ordering::Equal)
188        })
189    });
190    all.truncate(k);
191    all
192}
193
194/// Ranking weight for each kind. Higher = more actionable, ranks higher.
195/// Structural drift (tool sequence differs) is nearly always a real
196/// behavioural regression; decision drift (same shape, different call)
197/// needs investigation but less urgent; style drift is cosmetic.
198fn kind_rank(k: DivergenceKind) -> u8 {
199    match k {
200        DivergenceKind::Structural => 3,
201        DivergenceKind::Decision => 2,
202        DivergenceKind::Style => 1,
203    }
204}
205
// ---------------------------------------------------------------------------
// Alignment: Needleman-Wunsch with Gotoh affine gap penalties
// ---------------------------------------------------------------------------

/// Weights for the per-cell cost function. Tuned against the real
/// demo fixtures; exported as constants so tests and callers can see
/// the exact numbers without grepping source.
///
/// The four weights sum to 1.0; since every component is in [0, 1],
/// this is what keeps `pair_cost` inside [0, 1].
const W_STRUCT: f64 = 0.40; // Jaccard distance on tool_shape
const W_SEM: f64 = 0.25; // 1 - text_similarity
const W_STOP: f64 = 0.15; // stop_reason mismatch
const W_ARGS: f64 = 0.20; // tool_use input VALUE differences (same keys, different values)

/// Gotoh affine gap penalty: opening a gap is more expensive than
/// extending one. Prevents the aligner from fragmenting a multi-turn
/// insertion into many single-turn insertions.
const GAP_OPEN: f64 = 0.60;
const GAP_EXTEND: f64 = 0.15;

/// Noise floor for per-cell divergence. Cells below this are treated
/// as "no divergence" — covers bootstrap non-determinism, minor
/// token-count drift from prompt caching, etc.
const NOISE_FLOOR: f64 = 0.12;

/// Style-drift upper bound on the per-cell cost. Above this we call
/// it Decision or Structural. Calibrated for semantic cosine ≥ 0.9
/// with identical tool shape.
const STYLE_MAX_COST: f64 = 0.25;
233
/// One step on the alignment path. Payload indices are 0-based
/// positions into the filtered chat_response sequences.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Step {
    /// Diagonal: pair baseline[i-1] with candidate[j-1].
    Match(usize, usize),
    /// Horizontal: gap on baseline side (candidate inserted a turn).
    InsertCandidate(usize),
    /// Vertical: gap on candidate side (candidate dropped a turn).
    DeleteBaseline(usize),
}
243
/// Alignment result: a path of steps from (0,0) to (n,m).
/// Steps are stored in walk order (start of the traces first); the
/// traceback reverses them before constructing this value.
struct Alignment {
    steps: Vec<Step>,
}
248
/// Trace length beyond which the aligner switches from the exact
/// full-matrix Needleman-Wunsch to the banded variant. Under this,
/// quadratic work is negligible and the full matrix stays exact; over
/// it, the band keeps memory + CPU roughly linear in `max(N, M)`.
const SCALE_BAND_THRESHOLD: usize = 1000;

/// Minimum band half-width. Small enough to stay efficient, large
/// enough that typical drift between baseline and candidate (a tool
/// added / removed / reordered within a few turns) fits inside.
const MIN_BAND_HALF_WIDTH: usize = 100;

/// Band half-width for an (n, m) trace pair:
///
/// ```text
///   |n - m| + max(MIN_BAND_HALF_WIDTH, floor(sqrt(max(n, m))))
/// ```
///
/// The `|n - m|` term guarantees the corner cell (n, m) stays
/// reachable inside `|i - j| <= band`; the second term adds a local
/// scan radius that grows gently with trace length.
fn band_half_width(n: usize, m: usize) -> usize {
    let longest = n.max(m);
    let scan_radius = (longest as f64).sqrt() as usize;
    n.abs_diff(m) + MIN_BAND_HALF_WIDTH.max(scan_radius)
}
270
271fn align(baseline: &[&Record], candidate: &[&Record]) -> Alignment {
272    let n = baseline.len();
273    let m = candidate.len();
274    if n.max(m) > SCALE_BAND_THRESHOLD {
275        align_banded(baseline, candidate, band_half_width(n, m))
276    } else {
277        align_full(baseline, candidate)
278    }
279}
280
/// Full-matrix Needleman-Wunsch with Gotoh affine gaps. Exact over the
/// whole (n+1) × (m+1) DP table; used when both traces are at or below
/// `SCALE_BAND_THRESHOLD`, where the quadratic work is cheap.
fn align_full(baseline: &[&Record], candidate: &[&Record]) -> Alignment {
    let n = baseline.len();
    let m = candidate.len();
    // DP table: cost of aligning baseline[0..i] with candidate[0..j].
    // We carry three matrices (M, X, Y) per Gotoh to track whether the
    // previous op was a match, a horizontal gap (in baseline), or a
    // vertical gap (in candidate). INF sentinel: 1e18 (cannot realistically
    // be reached in practice).
    const INF: f64 = 1e18;
    let mut mat = vec![vec![INF; m + 1]; n + 1];
    let mut xg = vec![vec![INF; m + 1]; n + 1]; // gap in baseline (insertion)
    let mut yg = vec![vec![INF; m + 1]; n + 1]; // gap in candidate (deletion)
    let mut back = vec![vec![Step::Match(0, 0); m + 1]; n + 1];

    // Boundary rows: aligning a prefix against nothing costs one gap
    // open plus (len - 1) extensions.
    mat[0][0] = 0.0;
    for i in 1..=n {
        yg[i][0] = GAP_OPEN + (i as f64 - 1.0) * GAP_EXTEND;
        mat[i][0] = yg[i][0];
        back[i][0] = Step::DeleteBaseline(i - 1);
    }
    for j in 1..=m {
        xg[0][j] = GAP_OPEN + (j as f64 - 1.0) * GAP_EXTEND;
        mat[0][j] = xg[0][j];
        back[0][j] = Step::InsertCandidate(j - 1);
    }

    for i in 1..=n {
        for j in 1..=m {
            let c = pair_cost(baseline[i - 1], candidate[j - 1]);
            // Match path: best of (prev-match, prev-xgap, prev-ygap) + pair cost.
            let m_cost = mat[i - 1][j - 1]
                .min(xg[i - 1][j - 1])
                .min(yg[i - 1][j - 1])
                + c;
            // Open a horizontal gap (insertion on candidate) or extend one.
            let xg_cost = (mat[i][j - 1] + GAP_OPEN).min(xg[i][j - 1] + GAP_EXTEND);
            // Open a vertical gap (deletion on baseline) or extend one.
            let yg_cost = (mat[i - 1][j] + GAP_OPEN).min(yg[i - 1][j] + GAP_EXTEND);
            mat[i][j] = m_cost;
            xg[i][j] = xg_cost;
            yg[i][j] = yg_cost;
            // Record the back-pointer for the cell's *minimum* overall
            // reachable cost — we walk the cheapest path.
            // NOTE(review): one backpointer per cell (rather than one
            // per Gotoh state matrix) can in principle stitch a
            // traceback that differs from the true optimal affine-gap
            // path when states tie or cross — presumably acceptable
            // here since the walk re-scores each pair via pair_cost;
            // confirm intent.
            let best = m_cost.min(xg_cost).min(yg_cost);
            back[i][j] = if (best - m_cost).abs() < 1e-12 {
                Step::Match(i - 1, j - 1)
            } else if (best - xg_cost).abs() < 1e-12 {
                Step::InsertCandidate(j - 1)
            } else {
                Step::DeleteBaseline(i - 1)
            };
        }
    }

    // Traceback from (n, m) to (0, 0).
    let mut steps = Vec::new();
    let mut i = n;
    let mut j = m;
    while i > 0 || j > 0 {
        let s = back[i][j];
        steps.push(s);
        match s {
            Step::Match(_, _) => {
                i -= 1;
                j -= 1;
            }
            Step::InsertCandidate(_) => {
                j -= 1;
            }
            Step::DeleteBaseline(_) => {
                i -= 1;
            }
        }
    }
    steps.reverse();
    Alignment { steps }
}
358
// The commentary below describes `align_banded` (defined further down in
// this file). It was previously written as a `///` doc comment directly
// above `band_window`, which made rustdoc attach it to the wrong item;
// it is kept as a plain comment here instead.
//
// Banded Needleman-Wunsch — identical recurrence to `align_full` but
// only computes cells within `|i - j| <= band`. Memory + work drop
// from O(N*M) to O((N+M) * band). For well-behaved traces (baseline
// and candidate similar length with limited local drift) this yields
// the same optimal alignment as the full matrix; for adversarial
// inputs that drift further than `band`, the result is best-effort
// within the band and falls back to a forced diagonal outside.
//
// Cells outside the band are clamped to INF on read and never written
// to (they stay INF), so the recurrence naturally refuses to route
// through them.

/// Per-row column window for the banded matrix: `[j_lo, j_hi]`
/// inclusive, capturing the in-band columns at row `i`.
#[inline]
fn band_window(i: usize, m: usize, band: usize) -> (usize, usize) {
    (i.saturating_sub(band), (i + band).min(m))
}

/// Banded 2-D table: row `i` stores only the in-band columns
/// `[j_lo(i), j_hi(i)]`. Column `j` lives at offset `j - j_lo(i)`.
///
/// Memory: O(n * band) instead of O(n * m). At n = m = 10_000 and
/// band ≈ 200, this is ~30 MB per matrix vs ~800 MB for the full
/// layout. v3.2.4 and earlier allocated the full matrix even though
/// the compute only filled the band, defeating the whole point of
/// the banded variant. v3.2.5 stores what the compute actually uses.
struct Banded<T: Copy> {
    rows: Vec<Vec<T>>,
    band: usize,
    m: usize,
    default_val: T,
}

impl<T: Copy> Banded<T> {
    /// Allocate an (n+1)-row table whose row `i` holds exactly its
    /// in-band columns, each initialised to `default_val`.
    fn new(n: usize, m: usize, band: usize, default_val: T) -> Self {
        let mut rows = Vec::with_capacity(n + 1);
        for i in 0..=n {
            let (j_lo, j_hi) = band_window(i, m, band);
            rows.push(vec![default_val; j_hi - j_lo + 1]);
        }
        Self {
            rows,
            band,
            m,
            default_val,
        }
    }

    /// True when column `j` is stored for row `i`.
    #[inline]
    fn in_band(&self, i: usize, j: usize) -> bool {
        let (j_lo, j_hi) = band_window(i, self.m, self.band);
        j >= j_lo && j <= j_hi
    }

    /// Read cell (i, j). Out-of-band reads yield `default_val` (the
    /// INF sentinel for the cost matrices), so the DP recurrence never
    /// routes through uncomputed cells.
    #[inline]
    fn get(&self, i: usize, j: usize) -> T {
        if !self.in_band(i, j) {
            return self.default_val;
        }
        let (j_lo, _) = band_window(i, self.m, self.band);
        self.rows[i][j - j_lo]
    }

    /// Write cell (i, j). Callers must stay in band: an out-of-band
    /// write would underflow `j - j_lo` or index past the row and
    /// panic with an unhelpful message, so the contract is made
    /// explicit in debug builds.
    #[inline]
    fn set(&mut self, i: usize, j: usize, v: T) {
        debug_assert!(self.in_band(i, j), "Banded::set out of band: ({i}, {j})");
        let (j_lo, _) = band_window(i, self.m, self.band);
        self.rows[i][j - j_lo] = v;
    }
}
428
/// Banded variant of `align_full`: the same Gotoh recurrence, but only
/// cells with `|i - j| <= band` are computed or stored (see `Banded`).
/// Out-of-band reads return the INF sentinel, so no optimal path can
/// route through an uncomputed cell; if the true path leaves the band,
/// the traceback forces a diagonal (best-effort) instead.
fn align_banded(baseline: &[&Record], candidate: &[&Record], band: usize) -> Alignment {
    let n = baseline.len();
    let m = candidate.len();
    const INF: f64 = 1e18;
    let mut mat = Banded::new(n, m, band, INF);
    let mut xg = Banded::new(n, m, band, INF);
    let mut yg = Banded::new(n, m, band, INF);
    let mut back = Banded::new(n, m, band, Step::Match(0, 0));

    mat.set(0, 0, 0.0);
    // Boundary initialisation limited to the band.
    for i in 1..=n.min(band) {
        let v = GAP_OPEN + (i as f64 - 1.0) * GAP_EXTEND;
        yg.set(i, 0, v);
        mat.set(i, 0, v);
        back.set(i, 0, Step::DeleteBaseline(i - 1));
    }
    for j in 1..=m.min(band) {
        let v = GAP_OPEN + (j as f64 - 1.0) * GAP_EXTEND;
        xg.set(0, j, v);
        mat.set(0, j, v);
        back.set(0, j, Step::InsertCandidate(j - 1));
    }

    for i in 1..=n {
        // Column range clamped to both the band and the matrix edge;
        // j starts at 1 because column 0 was initialised above.
        let j_lo = i.saturating_sub(band).max(1);
        let j_hi = (i + band).min(m);
        for j in j_lo..=j_hi {
            let c = pair_cost(baseline[i - 1], candidate[j - 1]);
            let m_cost = mat
                .get(i - 1, j - 1)
                .min(xg.get(i - 1, j - 1))
                .min(yg.get(i - 1, j - 1))
                + c;
            let xg_cost = (mat.get(i, j - 1) + GAP_OPEN).min(xg.get(i, j - 1) + GAP_EXTEND);
            let yg_cost = (mat.get(i - 1, j) + GAP_OPEN).min(yg.get(i - 1, j) + GAP_EXTEND);
            mat.set(i, j, m_cost);
            xg.set(i, j, xg_cost);
            yg.set(i, j, yg_cost);
            // Single backpointer per cell: the cheapest of the three
            // Gotoh states wins (same scheme as the full-matrix path).
            let best = m_cost.min(xg_cost).min(yg_cost);
            let step = if (best - m_cost).abs() < 1e-12 {
                Step::Match(i - 1, j - 1)
            } else if (best - xg_cost).abs() < 1e-12 {
                Step::InsertCandidate(j - 1)
            } else {
                Step::DeleteBaseline(i - 1)
            };
            back.set(i, j, step);
        }
    }

    // Traceback from (n, m) to (0, 0). Out-of-band cells were never
    // populated so the backpointer at (n, m) is valid if the path
    // stayed within the band; otherwise we forcibly walk the diagonal.
    let mut steps = Vec::new();
    let mut i = n;
    let mut j = m;
    while i > 0 || j > 0 {
        // If the recorded backpointer is the default Match(0, 0)
        // at an out-of-band cell, force a diagonal or edge step.
        let s = if i > 0 && j > 0 && back.in_band(i, j) {
            back.get(i, j)
        } else if j == 0 {
            Step::DeleteBaseline(i - 1)
        } else if i == 0 {
            Step::InsertCandidate(j - 1)
        } else {
            // Out of band: force a diagonal step to converge to (0,0).
            Step::Match(i - 1, j - 1)
        };
        steps.push(s);
        match s {
            Step::Match(_, _) => {
                i -= 1;
                j -= 1;
            }
            Step::InsertCandidate(_) => {
                j -= 1;
            }
            Step::DeleteBaseline(_) => {
                i -= 1;
            }
        }
    }
    steps.reverse();
    Alignment { steps }
}
516
517// ---------------------------------------------------------------------------
518// Per-cell cost
519// ---------------------------------------------------------------------------
520
521/// Cost of aligning one baseline response with one candidate response.
522/// Always returns a value in [0, 1].
523fn pair_cost(a: &Record, b: &Record) -> f64 {
524    let tool_shape_a = tool_shape(a);
525    let tool_shape_b = tool_shape(b);
526    // Structural component: set Jaccard on tool shapes, BUT also penalise
527    // count mismatches. Without the count check, duplicate tool calls
528    // ("candidate called `lookup_order` twice where baseline called it
529    // once") are invisible because sets collapse duplicates.
530    let shape_dist = 1.0 - jaccard(&tool_shape_a, &tool_shape_b);
531    let count_a = count_tool_use(a);
532    let count_b = count_tool_use(b);
533    let count_dist = if count_a == count_b {
534        0.0
535    } else {
536        let diff = (count_a as f64 - count_b as f64).abs();
537        let denom = count_a.max(count_b) as f64;
538        if denom == 0.0 {
539            0.0
540        } else {
541            (diff / denom).min(1.0)
542        }
543    };
544    let structural = shape_dist.max(count_dist);
545
546    let text_a = response_text(a);
547    let text_b = response_text(b);
548    let semantic = 1.0 - text_similarity(&text_a, &text_b);
549
550    let stop_a = stop_reason(a);
551    let stop_b = stop_reason(b);
552    let stop = if stop_a != stop_b { 1.0 } else { 0.0 };
553
554    // Arg-value divergence: same tool name AND same arg-key set on
555    // both sides, but different arg VALUES. Without this component
556    // we'd miss the "`search(limit=10)` → `search(limit=50)`" case
557    // because structural + stop + (empty-text) semantic are all 0.
558    let args = if tool_shape_a == tool_shape_b && !tool_shape_a.is_empty() {
559        if arg_value_diff(a, b).is_some() {
560            1.0
561        } else {
562            0.0
563        }
564    } else {
565        0.0
566    };
567
568    W_STRUCT * structural + W_SEM * semantic + W_STOP * stop + W_ARGS * args
569}
570
571/// Extract a canonical tool-shape token per tool_use block. The token
572/// is `"<tool_name>(<sorted-comma-arg-keys>)"` — captures both the
573/// tool called and the KEYS (not values) of its input. Values are
574/// compared separately by `arg_values_differ`.
575fn tool_shape(r: &Record) -> BTreeSet<String> {
576    let mut out = BTreeSet::new();
577    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
578        return out;
579    };
580    for part in arr {
581        if part.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
582            continue;
583        }
584        let name = part.get("name").and_then(|n| n.as_str()).unwrap_or("_");
585        let mut keys: Vec<String> = part
586            .get("input")
587            .and_then(|i| i.as_object())
588            .map(|o| o.keys().cloned().collect())
589            .unwrap_or_default();
590        keys.sort();
591        out.insert(format!("{name}({})", keys.join(",")));
592    }
593    out
594}
595
596/// Count the number of tool_use blocks in a response. Useful as a
597/// structural signal in addition to the set-based tool_shape, because
598/// a set collapses duplicates — calling `lookup_order` twice looks
599/// identical to calling it once if we only compare shapes.
600fn count_tool_use(r: &Record) -> usize {
601    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
602        return 0;
603    };
604    arr.iter()
605        .filter(|p| p.get("type").and_then(|t| t.as_str()) == Some("tool_use"))
606        .count()
607}
608
609fn response_text(r: &Record) -> String {
610    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
611        return String::new();
612    };
613    arr.iter()
614        .filter_map(|p| {
615            if p.get("type").and_then(|t| t.as_str()) == Some("text") {
616                p.get("text")
617                    .and_then(|t| t.as_str())
618                    .map(ToString::to_string)
619            } else {
620                None
621            }
622        })
623        .collect::<Vec<_>>()
624        .join(" ")
625}
626
627fn stop_reason(r: &Record) -> String {
628    r.payload
629        .get("stop_reason")
630        .and_then(|v| v.as_str())
631        .unwrap_or("")
632        .to_string()
633}
634
/// Jaccard similarity on two string sets: |A ∩ B| / |A ∪ B|.
/// Two empty sets score 1.0 — both sides produced no tool calls, so
/// they agree structurally.
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    match a.union(b).count() {
        0 => 1.0,
        union => a.intersection(b).count() as f64 / union as f64,
    }
}
649
650/// Lightweight text-similarity proxy: character-shingle Jaccard over
651/// whitespace-normalised text. We avoid bringing in embeddings here
652/// because this module is in the Rust core and must not take a heavy
653/// ML dep. The Python layer can upgrade this via a similarity callback
654/// in v0.2.
655///
656/// Whitespace is normalised (collapsed and trimmed) before shingling:
657/// `"ok"` and `"o k"` should be treated as identical, not as totally
658/// different strings — whitespace-only diffs are the canonical style-
659/// drift signal and must not survive into the similarity score.
660///
661/// For empty (post-normalisation) strings, returns 1.0.
662fn text_similarity(a: &str, b: &str) -> f64 {
663    let na = normalise_whitespace(a);
664    let nb = normalise_whitespace(b);
665    if na.is_empty() && nb.is_empty() {
666        return 1.0;
667    }
668    if na == nb {
669        return 1.0;
670    }
671    let sa = shingles(&na, 4);
672    let sb = shingles(&nb, 4);
673    jaccard(&sa, &sb)
674}
675
/// Collapse every run of whitespace to a single ASCII space and strip
/// leading/trailing whitespace. Whitespace-only differences aren't
/// meaningful semantic signal for the alignment cost function.
fn normalise_whitespace(s: &str) -> String {
    // split_whitespace uses the same char::is_whitespace predicate as
    // the hand-rolled loop it replaces, skips empty segments, and
    // drops edge whitespace — identical output, far less code.
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
698
/// Character k-shingles of `s` as a set. Strings shorter than `k`
/// (but non-empty) contribute themselves as a single shingle; the
/// empty string yields the empty set.
fn shingles(s: &str, k: usize) -> BTreeSet<String> {
    let chars: Vec<char> = s.chars().collect();
    if chars.len() < k {
        return if s.is_empty() {
            BTreeSet::new()
        } else {
            std::iter::once(s.to_string()).collect()
        };
    }
    chars.windows(k).map(|w| w.iter().collect()).collect()
}
713
// ---------------------------------------------------------------------------
// Walk the alignment for the first divergence
// ---------------------------------------------------------------------------

/// Walk the alignment and collect up to `limit` above-noise divergences
/// in alignment order. Returns an empty vec when the traces agree
/// end-to-end. Cursor tracking lets gap steps report the correct
/// baseline / candidate positions even after previous gaps.
///
/// Gap steps (inserted / dropped turns) are emitted unconditionally
/// with confidence 1.0 — a turn that exists on only one side is
/// structural by definition. Matched pairs are emitted only when their
/// `pair_cost` clears `NOISE_FLOOR`, with confidence scaled by how far
/// the cost clears it.
fn walk_collecting(
    alignment: &Alignment,
    baseline: &[&Record],
    candidate: &[&Record],
    limit: usize,
) -> Vec<FirstDivergence> {
    let mut out: Vec<FirstDivergence> = Vec::new();
    if limit == 0 {
        return out;
    }
    // Track cursors through the walk so that gap steps can report the
    // baseline / candidate position correctly — without this, an
    // insertion on the candidate side can't tell which baseline turn
    // it lived BETWEEN.
    let mut b_cursor: usize = 0;
    let mut c_cursor: usize = 0;
    for step in &alignment.steps {
        if out.len() >= limit {
            return out;
        }
        match *step {
            Step::InsertCandidate(j) => {
                // Candidate inserted a turn the baseline didn't have —
                // structural by definition. The insertion sits between
                // `b_cursor - 1` and `b_cursor` on the baseline side.
                let cand = candidate[j];
                let insertion_point = b_cursor;
                let n_tools = tool_shape(cand).len();
                let detail = if n_tools == 0 {
                    "an extra response turn with no tool calls".to_string()
                } else if n_tools == 1 {
                    "an extra turn with 1 tool call".to_string()
                } else {
                    format!("an extra turn with {n_tools} tool calls")
                };
                // NOTE(review): when insertion_point == 0, the message
                // reads "between baseline turns #0 and #0" — cosmetic
                // only, but confirm the intended wording.
                out.push(FirstDivergence {
                    baseline_turn: insertion_point,
                    candidate_turn: j,
                    kind: DivergenceKind::Structural,
                    primary_axis: Axis::Trajectory,
                    explanation: format!(
                        "candidate inserted {detail} between baseline turns #{prev} and #{insertion_point}",
                        prev = insertion_point.saturating_sub(1),
                    ),
                    confidence: 1.0,
                });
                c_cursor = c_cursor.saturating_add(1);
            }
            Step::DeleteBaseline(i) => {
                // Baseline turn with no candidate counterpart — the
                // candidate dropped it. Also structural by definition.
                let b = baseline[i];
                let deletion_point = c_cursor;
                let n_tools = tool_shape(b).len();
                let detail = if n_tools == 0 {
                    "a response turn with no tool calls".to_string()
                } else if n_tools == 1 {
                    "a turn with 1 tool call".to_string()
                } else {
                    format!("a turn with {n_tools} tool calls")
                };
                out.push(FirstDivergence {
                    baseline_turn: i,
                    candidate_turn: deletion_point,
                    kind: DivergenceKind::Structural,
                    primary_axis: Axis::Trajectory,
                    explanation: format!(
                        "candidate dropped {detail} (baseline turn #{i} has no counterpart)",
                    ),
                    confidence: 1.0,
                });
                b_cursor = b_cursor.saturating_add(1);
            }
            Step::Match(i, j) => {
                let b = baseline[i];
                let c = candidate[j];
                let cost = pair_cost(b, c);
                // Cursors resync to the matched positions whether or
                // not this pair is reported.
                b_cursor = i.saturating_add(1);
                c_cursor = j.saturating_add(1);
                if cost <= NOISE_FLOOR {
                    continue;
                }
                // Above noise floor — classify and record. Confidence
                // is the cost's clearance over the noise floor,
                // rescaled to [0, 1].
                let (kind, axis, explanation) = classify(b, c, cost);
                let confidence = ((cost - NOISE_FLOOR) / (1.0 - NOISE_FLOOR)).clamp(0.0, 1.0);
                out.push(FirstDivergence {
                    baseline_turn: i,
                    candidate_turn: j,
                    kind,
                    primary_axis: axis,
                    explanation,
                    confidence,
                });
            }
        }
    }
    out
}
818
819/// Classify a significant (above-noise-floor) matched pair.
820fn classify(b: &Record, c: &Record, cost: f64) -> (DivergenceKind, Axis, String) {
821    let shape_b = tool_shape(b);
822    let shape_c = tool_shape(c);
823    let text_b = response_text(b);
824    let text_c = response_text(c);
825    let stop_b = stop_reason(b);
826    let stop_c = stop_reason(c);
827    let sem_sim = text_similarity(&text_b, &text_c);
828
829    // Structural: tool shapes differ (name or arg-key set).
830    if shape_b != shape_c {
831        let explanation = describe_tool_diff(&shape_b, &shape_c);
832        return (DivergenceKind::Structural, Axis::Trajectory, explanation);
833    }
834    // Structural: same tool shapes but different COUNTS (duplicated calls).
835    // Shape is a set so `{lookup_order(id)}` matches itself regardless of
836    // call count; we detect duplicates via an explicit count comparison.
837    let count_b = count_tool_use(b);
838    let count_c = count_tool_use(c);
839    if count_b != count_c {
840        let tool_names: Vec<&String> = shape_b.iter().collect();
841        let tools_summary = if tool_names.len() == 1 {
842            format!("`{}`", tool_names[0])
843        } else {
844            format!("{} tool(s)", tool_names.len())
845        };
846        let explanation = if count_c > count_b {
847            format!(
848                "candidate called {tools_summary} {count_c} time(s) vs baseline's {count_b} \
849                — duplicate tool invocation"
850            )
851        } else {
852            format!(
853                "candidate called {tools_summary} {count_c} time(s) vs baseline's {count_b} \
854                — dropped one or more repeat invocations"
855            )
856        };
857        return (DivergenceKind::Structural, Axis::Trajectory, explanation);
858    }
859
860    // Stop reason flipped — often signals refusal / filter.
861    if stop_b != stop_c {
862        return (
863            DivergenceKind::Decision,
864            Axis::Safety,
865            format!("stop_reason changed: `{stop_b}` → `{stop_c}`"),
866        );
867    }
868
869    // Tool shape matches and stop_reason matches but something still
870    // drives a divergence. Two sub-cases:
871    //   - tool_use.input values differ (same keys, different values)
872    //   - response text diverged
873    if let Some(arg_diff) = arg_value_diff(b, c) {
874        return (
875            DivergenceKind::Decision,
876            Axis::Trajectory,
877            format!("tool arg value changed: {arg_diff}"),
878        );
879    }
880
881    // Pure text divergence. Style vs decision depends on similarity.
882    if sem_sim >= 0.90 && cost <= STYLE_MAX_COST {
883        (
884            DivergenceKind::Style,
885            Axis::Semantic,
886            "cosmetic wording change — tool sequence and semantics preserved".to_string(),
887        )
888    } else {
889        (
890            DivergenceKind::Decision,
891            Axis::Semantic,
892            format!(
893                "response text diverged (text similarity {:.2}); same tool sequence",
894                sem_sim
895            ),
896        )
897    }
898}
899
900fn describe_tool_diff(a: &BTreeSet<String>, b: &BTreeSet<String>) -> String {
901    let only_a: Vec<&String> = a.difference(b).collect();
902    let only_b: Vec<&String> = b.difference(a).collect();
903    if !only_a.is_empty() && only_b.is_empty() {
904        format!("candidate dropped tool call(s): {}", list(&only_a))
905    } else if !only_b.is_empty() && only_a.is_empty() {
906        format!("candidate added tool call(s): {}", list(&only_b))
907    } else if !only_a.is_empty() && !only_b.is_empty() {
908        format!(
909            "tool set changed: removed {}, added {}",
910            list(&only_a),
911            list(&only_b)
912        )
913    } else {
914        "tool ordering differs".to_string()
915    }
916}
917
/// Join `items` into a comma-separated list with each name wrapped in
/// backticks, e.g. `` `search`, `lookup` ``. Empty input yields "".
fn list(items: &[&String]) -> String {
    let mut rendered = String::new();
    for (idx, item) in items.iter().enumerate() {
        if idx > 0 {
            rendered.push_str(", ");
        }
        rendered.push('`');
        rendered.push_str(item);
        rendered.push('`');
    }
    rendered
}
925
926/// Compare arg values for tools that have the same name and arg keys.
927/// Returns `Some(summary)` if any tool's values differ, `None` if every
928/// tool's values match.
929fn arg_value_diff(a: &Record, b: &Record) -> Option<String> {
930    let ta = tool_use_inputs(a);
931    let tb = tool_use_inputs(b);
932    for (name, va) in &ta {
933        if let Some(vb) = tb.get(name) {
934            if va != vb {
935                // Find the first differing key.
936                if let (Some(oa), Some(ob)) = (va.as_object(), vb.as_object()) {
937                    for (k, v) in oa {
938                        if ob.get(k) != Some(v) {
939                            let other = ob
940                                .get(k)
941                                .map(|x| x.to_string())
942                                .unwrap_or("<missing>".to_string());
943                            return Some(format!("`{name}({k})`: `{v}` → `{other}`"));
944                        }
945                    }
946                }
947                return Some(format!("`{name}`: input changed"));
948            }
949        }
950    }
951    None
952}
953
954/// Index a chat_response's tool_use blocks by tool_name → input value.
955/// First occurrence wins if a tool is called twice in the same turn.
956fn tool_use_inputs(r: &Record) -> std::collections::BTreeMap<String, serde_json::Value> {
957    let mut out = std::collections::BTreeMap::new();
958    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
959        return out;
960    };
961    for part in arr {
962        if part.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
963            continue;
964        }
965        let name = part
966            .get("name")
967            .and_then(|n| n.as_str())
968            .unwrap_or("_")
969            .to_string();
970        let input = part
971            .get("input")
972            .cloned()
973            .unwrap_or(serde_json::Value::Null);
974        out.entry(name).or_insert(input);
975    }
976    out
977}
978
979// ---------------------------------------------------------------------------
980// Tests
981// ---------------------------------------------------------------------------
982
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    // Fixture strategy: every trace starts with a `meta()` record so the
    // detectors' record-filtering is exercised, followed by synthetic
    // chat_response records built by the two helpers below.

    /// Build a chat_response record containing only a text block, with
    /// the given `stop_reason`. Latency/usage are fixed filler values.
    fn response_text_only(text: &str, stop: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{"type": "text", "text": text}],
                "stop_reason": stop,
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    /// Build a chat_response record containing a single tool_use block
    /// (`name` + `input`) and no text, with the given `stop_reason`.
    fn response_with_tool(name: &str, input: serde_json::Value, stop: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{
                    "type": "tool_use",
                    "id": "t1",
                    "name": name,
                    "input": input,
                }],
                "stop_reason": stop,
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    /// A non-chat_response record; the detectors should skip these when
    /// pairing turns.
    fn meta() -> Record {
        Record::new(
            Kind::Metadata,
            json!({"sdk": {"name": "shadow"}}),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    #[test]
    fn identical_traces_return_none() {
        let r = response_text_only("Paris is the capital of France.", "end_turn");
        let baseline = vec![meta(), r.clone(), r.clone()];
        let candidate = vec![meta(), r.clone(), r.clone()];
        assert_eq!(detect(&baseline, &candidate), None);
    }

    #[test]
    fn whitespace_only_diff_is_style() {
        let b = response_text_only("Paris is the capital of France.", "end_turn");
        let c = response_text_only("Paris is  the capital of France.", "end_turn");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        // At shingle-level, whitespace variance is tiny but nonzero.
        // Classification depends on cost; this case is typically below
        // NOISE_FLOOR in practice. The key assertion is that if ANY
        // divergence is reported, it's Style, not Structural/Decision.
        if let Some(d) = detect(&baseline, &candidate) {
            assert_eq!(d.kind, DivergenceKind::Style);
            assert_eq!(d.primary_axis, Axis::Semantic);
        }
    }

    #[test]
    fn different_tool_name_is_structural_on_trajectory_axis() {
        let b = response_with_tool("search", json!({"q": "cats"}), "tool_use");
        let c = response_with_tool("lookup", json!({"q": "cats"}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
        assert_eq!(d.primary_axis, Axis::Trajectory);
        // Turn indices count chat_responses only; the meta() record is
        // excluded, so the first response turn is index 0.
        assert_eq!(d.baseline_turn, 0);
        assert_eq!(d.candidate_turn, 0);
        assert!(d.explanation.contains("search") || d.explanation.contains("lookup"));
    }

    #[test]
    fn same_tool_different_arg_value_is_decision() {
        let b = response_with_tool("search", json!({"q": "cats", "limit": 10}), "tool_use");
        let c = response_with_tool("search", json!({"q": "cats", "limit": 50}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Trajectory);
        // Explanation must name the differing arg key.
        assert!(d.explanation.contains("limit"));
    }

    #[test]
    fn stop_reason_flip_is_decision_on_safety() {
        let b = response_text_only("Here is the answer.", "end_turn");
        let c = response_text_only("I can't help with that.", "content_filter");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Safety);
        // Explanation must quote both old and new stop_reason values.
        assert!(d.explanation.contains("end_turn"));
        assert!(d.explanation.contains("content_filter"));
    }

    #[test]
    fn candidate_drops_a_turn_is_structural() {
        let r1 = response_text_only("first turn", "end_turn");
        let r2 = response_text_only("second turn", "end_turn");
        let baseline = vec![meta(), r1.clone(), r2];
        let candidate = vec![meta(), r1]; // dropped the second
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
        assert_eq!(d.primary_axis, Axis::Trajectory);
    }

    #[test]
    fn candidate_inserts_a_turn_is_structural() {
        let r1 = response_text_only("turn one", "end_turn");
        let r2 = response_text_only("inserted", "end_turn");
        let r3 = response_text_only("turn two", "end_turn");
        let baseline = vec![meta(), r1.clone(), r3.clone()];
        let candidate = vec![meta(), r1, r2, r3];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
    }

    #[test]
    fn significant_text_shift_is_decision_on_semantic() {
        // Different topics entirely — semantic similarity low, no tools.
        let b = response_text_only(
            "Photosynthesis is the process by which plants convert sunlight.",
            "end_turn",
        );
        let c = response_text_only(
            "The stock market closed higher on Thursday after strong earnings.",
            "end_turn",
        );
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Semantic);
    }

    #[test]
    fn empty_traces_return_none() {
        // Metadata-only and fully-empty traces both have zero response
        // turns, so there is nothing to diverge.
        assert_eq!(detect(&[meta()], &[meta()]), None);
        assert_eq!(detect(&[], &[]), None);
    }

    #[test]
    fn first_divergence_is_truly_first() {
        // Three turns; only the SECOND differs. Detector must locate
        // turn index 1, not turn 2.
        let r1 = response_text_only("same", "end_turn");
        let r2b = response_text_only("baseline version of turn two with lots of text", "end_turn");
        let r2c = response_text_only(
            "CANDIDATE SAID SOMETHING COMPLETELY DIFFERENT HERE",
            "end_turn",
        );
        let r3 = response_text_only("also same", "end_turn");
        let baseline = vec![meta(), r1.clone(), r2b, r3.clone()];
        let candidate = vec![meta(), r1, r2c, r3];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.baseline_turn, 1);
        assert_eq!(d.candidate_turn, 1);
    }

    #[test]
    fn confidence_is_in_valid_range() {
        let b = response_with_tool("search", json!({"q": "a"}), "tool_use");
        let c = response_with_tool("other", json!({"q": "a"}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).unwrap();
        assert!((0.0..=1.0).contains(&d.confidence));
    }

    #[test]
    fn tool_shape_captures_name_and_arg_keys() {
        // A shape entry encodes both the tool name and the arg-key set,
        // e.g. "search(limit,q)" (exact formatting is tool_shape's).
        let r = response_with_tool("search", json!({"q": "a", "limit": 10}), "tool_use");
        let shape = tool_shape(&r);
        assert_eq!(shape.len(), 1);
        let entry = shape.iter().next().unwrap();
        assert!(entry.starts_with("search("));
        assert!(entry.contains("limit"));
        assert!(entry.contains("q"));
    }

    #[test]
    fn jaccard_on_empty_sets_is_one() {
        // Two empty tool shapes are "identical", not "no overlap".
        let empty = BTreeSet::new();
        assert_eq!(jaccard(&empty, &empty), 1.0);
    }

    #[test]
    fn alignment_prefers_matches_over_gaps_when_both_cheap() {
        // Two identical turns. NW should produce two Match steps and
        // no gaps.
        let r = response_text_only("same", "end_turn");
        let alignment = align(&[&r, &r], &[&r, &r]);
        let matches = alignment
            .steps
            .iter()
            .filter(|s| matches!(s, Step::Match(..)))
            .count();
        assert_eq!(matches, 2);
        let gaps = alignment.steps.len() - matches;
        assert_eq!(gaps, 0);
    }

    // -----------------------------------------------------------------
    // Top-K tests
    // -----------------------------------------------------------------

    #[test]
    fn top_k_with_zero_returns_empty() {
        let r1 = response_text_only("same", "end_turn");
        let r2 = response_text_only("different", "end_turn");
        let out = detect_top_k(&[meta(), r1], &[meta(), r2], 0);
        assert_eq!(out.len(), 0);
    }

    #[test]
    fn top_k_with_identical_returns_empty() {
        let r = response_text_only("same", "end_turn");
        let out = detect_top_k(&[meta(), r.clone(), r.clone()], &[meta(), r.clone(), r], 3);
        assert_eq!(out.len(), 0);
    }

    #[test]
    fn top_k_orders_structural_before_decision_before_style() {
        // Construct a candidate with one divergence of each kind, in
        // order: Style @ turn 0, Decision @ turn 1 (refusal), Structural
        // @ turn 2 (tool change). Top-K must rerank: Structural #1,
        // Decision #2, Style #3 — NOT walk order.
        let b0 = response_text_only(
            "Hello, here is a detailed answer explaining the topic in full.",
            "end_turn",
        );
        let b1 = response_text_only("The answer is 42.", "end_turn");
        let b2 = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c0 = response_text_only(
            "Hello, here is a detailed answer explaining the topic in full!",
            "end_turn",
        ); // cosmetic punctuation → style
        let c1 = response_text_only("I cannot answer that.", "content_filter"); // refusal → decision (safety)
        let c2 = response_with_tool("lookup", json!({"q": "x"}), "tool_use"); // tool name change → structural
        let baseline = vec![meta(), b0, b1, b2];
        let candidate = vec![meta(), c0, c1, c2];
        let out = detect_top_k(&baseline, &candidate, 5);
        // The Style case may fall below the noise floor, so only the
        // Structural and Decision entries are guaranteed to appear.
        assert!(
            out.len() >= 2,
            "expected at least 2 divergences, got {}",
            out.len()
        );
        // #1 must be structural
        assert_eq!(
            out[0].kind,
            DivergenceKind::Structural,
            "rank 1 should be Structural, got {:?}",
            out[0].kind
        );
        // If we have a rank 2, it must be Decision (Style is lowest priority)
        if out.len() >= 2 {
            assert_eq!(
                out[1].kind,
                DivergenceKind::Decision,
                "rank 2 should be Decision, got {:?}",
                out[1].kind
            );
        }
    }

    #[test]
    fn top_k_truncates_at_k() {
        // 5 divergent turns, ask for top 2.
        let same = response_text_only("unchanged", "end_turn");
        let _ = same.clone(); // avoid unused_assignments warning pattern
        let baseline = vec![
            meta(),
            response_with_tool("a", json!({}), "tool_use"),
            response_with_tool("b", json!({}), "tool_use"),
            response_with_tool("c", json!({}), "tool_use"),
            response_with_tool("d", json!({}), "tool_use"),
            response_with_tool("e", json!({}), "tool_use"),
        ];
        let candidate = vec![
            meta(),
            response_with_tool("A", json!({}), "tool_use"),
            response_with_tool("B", json!({}), "tool_use"),
            response_with_tool("C", json!({}), "tool_use"),
            response_with_tool("D", json!({}), "tool_use"),
            response_with_tool("E", json!({}), "tool_use"),
        ];
        let out = detect_top_k(&baseline, &candidate, 2);
        assert_eq!(out.len(), 2);
        // All should be Structural (tool name differs)
        for dv in &out {
            assert_eq!(dv.kind, DivergenceKind::Structural);
        }
    }

    #[test]
    fn top_k_preserves_walk_order_within_same_severity_and_confidence() {
        // Three Structural divergences with identical confidence → ties
        // broken by walk order (earlier turns before later ones).
        let baseline = vec![
            meta(),
            response_with_tool("a", json!({}), "tool_use"),
            response_with_tool("b", json!({}), "tool_use"),
            response_with_tool("c", json!({}), "tool_use"),
        ];
        let candidate = vec![
            meta(),
            response_with_tool("A", json!({}), "tool_use"),
            response_with_tool("B", json!({}), "tool_use"),
            response_with_tool("C", json!({}), "tool_use"),
        ];
        let out = detect_top_k(&baseline, &candidate, 3);
        assert_eq!(out.len(), 3);
        // Stable sort preserves walk order for equal keys
        assert_eq!(out[0].baseline_turn, 0);
        assert_eq!(out[1].baseline_turn, 1);
        assert_eq!(out[2].baseline_turn, 2);
    }

    #[test]
    fn top_k_of_1_matches_first_divergence_classifier() {
        // detect_top_k(.., 1) and detect() should name the same KIND
        // for simple single-divergence traces (walk order preserved
        // within the same kind rank).
        let b = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c = response_with_tool("search", json!({"q": "y"}), "tool_use");
        let first = detect(&[meta(), b.clone()], &[meta(), c.clone()]).unwrap();
        let top = detect_top_k(&[meta(), b], &[meta(), c], 1);
        assert_eq!(top.len(), 1);
        assert_eq!(top[0].kind, first.kind);
        assert_eq!(top[0].baseline_turn, first.baseline_turn);
    }

    #[test]
    fn banded_alignment_storage_is_banded_not_full_matrix() {
        // Regression test for an external-review finding: v3.2.4 and
        // earlier `align_banded` ONLY banded the compute loop —
        // storage was still `vec![vec![INF; m+1]; n+1]`, allocating
        // the full n × m matrix four times. At n = m = 10_000 that
        // produced a 3.6 GB RSS spike on a 500 MB budget.
        //
        // This test pins the contract: at n = m = 2_000, the total
        // working memory of the four banded matrices stays well under
        // what a full-matrix layout would cost. We measure structurally
        // by counting cells in each row, which is more reliable than
        // measuring RSS (RSS is noisy across platforms + allocators).
        let n = 2_000usize;
        let m = 2_000usize;
        let band = band_half_width(n, m);
        let banded: Banded<f64> = Banded::new(n, m, band, 0.0);
        let total_cells: usize = banded.rows.iter().map(|r| r.len()).sum();
        let full_cells = (n + 1) * (m + 1);
        // The banded layout must use less than a quarter of the full
        // layout. In practice it's much less (band ≈ √n + edge slack),
        // but the quarter bound is a permissive regression gate.
        assert!(
            total_cells < full_cells / 4,
            "banded storage size {total_cells} not meaningfully smaller than \
             full-matrix size {full_cells}; the storage-is-banded fix has regressed"
        );
        // And spot-check: every row at the dense middle of the band
        // must be exactly 2 * band + 1 cells, not m + 1.
        let middle_row_len = banded.rows[n / 2].len();
        assert!(
            middle_row_len <= 2 * band + 1,
            "middle row has {middle_row_len} cells; expected at most {} \
             (2 * band + 1). align_banded is allocating wider than the band.",
            2 * band + 1
        );
        assert!(
            middle_row_len < m + 1,
            "middle row has {middle_row_len} cells = m + 1; align_banded \
             has regressed to full-matrix storage"
        );
    }

    #[test]
    fn first_divergence_is_alignment_order_not_importance_rank() {
        // Explicit guarantee: `detect()` returns divergence by WALK
        // order (earliest above-noise cell), not importance. This
        // preserves backward compat with v1's semantic.
        let b0 = response_text_only("same across both", "end_turn");
        let b1 = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c0 = response_text_only("completely different response here", "end_turn");
        let c1 = response_with_tool("lookup", json!({"q": "x"}), "tool_use");
        // Turn 0 has Decision (text shift); turn 1 has Structural.
        // top_k will rank Structural #1 (higher class). first() must
        // still return the turn 0 divergence (walk order).
        let baseline = vec![meta(), b0, b1];
        let candidate = vec![meta(), c0, c1];
        let first = detect(&baseline, &candidate).unwrap();
        let top = detect_top_k(&baseline, &candidate, 3);
        assert_eq!(first.baseline_turn, 0);
        assert_eq!(top[0].baseline_turn, 1); // re-ranked Structural first
        assert_eq!(top[0].kind, DivergenceKind::Structural);
    }
}