// shadow_core/diff/alignment.rs
//! First-divergence detection over paired chat responses.
//!
//! Given two traces, this module identifies the **first turn at which
//! the candidate meaningfully diverged from the baseline** and classifies
//! the divergence as one of three kinds:
//!
//! - **Structural** — the tool-call sequence differs (missing, extra, or
//!   reordered calls). Surfaces as a gap in the alignment, OR a same-
//!   position pair with different `tool_name`s.
//! - **Decision** — the tool sequence matches but the *decision* changed:
//!   same tool, different arg values; final-answer semantic cosine < 0.8;
//!   `stop_reason` flipped; refusal where there wasn't one.
//! - **Style** — cosmetic wording differences only; semantic cosine ≥ 0.9,
//!   identical tool shape, identical stop_reason.
//!
//! ## Algorithm
//!
//! A Needleman-Wunsch global alignment with Gotoh affine gap penalties
//! pairs baseline and candidate chat_response records. The cost for
//! aligning pair `(a, b)` is:
//!
//! ```text
//!   cost(a, b) = w_struct * (1 - jaccard(tool_shape_a, tool_shape_b))
//!              + w_sem    * (1 - text_similarity(a, b))
//!              + w_stop   * stop_reason_mismatch(a, b)
//!              + w_args   * arg_value_mismatch(a, b)
//! ```
//!
//! After alignment, we walk the alignment path left-to-right and emit
//! the first cell whose per-cell divergence exceeds the noise floor.
//!
//! ## Why NW and not position-match
//!
//! Position-match fails when one side inserts or drops a turn (common
//! when one config retries where the other doesn't). NW pays a
//! controlled gap cost instead of mis-pairing every subsequent turn.
//! Cost is O(n·m) in DP cells; traces rarely exceed ~100 turns, so
//! runtime is trivial in practice.

39use serde::{Deserialize, Serialize};
40use std::collections::BTreeSet;
41
42use crate::agentlog::{Kind, Record};
43use crate::diff::axes::Axis;
44
/// Classification of the first divergence between two traces.
///
/// Serialises with a consistent `_drift` suffix across Rust's `label()`,
/// serde JSON output, and Python-side string representation so the same
/// value appears identically everywhere a consumer might see it.
///
/// NOTE: the `#[serde(rename = …)]` strings below must stay in sync with
/// the string literals returned by [`DivergenceKind::label`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DivergenceKind {
    /// Cosmetic wording only: semantic similarity high, tool shape
    /// identical, stop reason identical. Safe-to-merge signal, usually.
    #[serde(rename = "style_drift")]
    Style,
    /// Same structure, different decision: arg values differ, refusal
    /// flipped, or final-answer semantics shifted meaningfully.
    #[serde(rename = "decision_drift")]
    Decision,
    /// Tool-call sequence differs: insertion, deletion, or reorder.
    /// This is almost always a real behavioural regression.
    #[serde(rename = "structural_drift")]
    Structural,
}
65
66impl DivergenceKind {
67    /// Short machine-readable label used in terminal / markdown / JSON output.
68    pub fn label(&self) -> &'static str {
69        match self {
70            DivergenceKind::Style => "style_drift",
71            DivergenceKind::Decision => "decision_drift",
72            DivergenceKind::Structural => "structural_drift",
73        }
74    }
75}
76
/// First meaningful divergence between two traces.
///
/// Serialisable (serde) so it can round-trip through the JSON output
/// consumed by the Python layer and renderers.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FirstDivergence {
    /// 0-based index in the baseline's chat_response sequence. For an
    /// insertion on the candidate side, this is the baseline index
    /// where the insertion appeared (effectively a "between" marker).
    pub baseline_turn: usize,
    /// Same for the candidate side. May differ from baseline_turn when
    /// gaps are present on the alignment path.
    pub candidate_turn: usize,
    /// Classification (Style / Decision / Structural).
    pub kind: DivergenceKind,
    /// Primary axis the divergence surfaces on (semantic, trajectory,
    /// safety, conformance). Provides a machine-readable hint for
    /// grouping regressions by root cause.
    pub primary_axis: Axis,
    /// One-line human-readable explanation. Designed to be embeddable
    /// in a PR comment without additional context.
    pub explanation: String,
    /// Confidence in 0..1. Higher means "the signal exceeds the noise
    /// floor by a wide margin". Callers can gate display on >= 0.5.
    /// Gap (insertion/deletion) divergences are always reported at 1.0.
    pub confidence: f64,
}

// ---------------------------------------------------------------------------
// Public entry point
// ---------------------------------------------------------------------------
104
/// Default number of top-ranked divergences returned by [`detect_top_k`].
/// The markdown / terminal renderers show the top 3; the full list goes
/// to the JSON output. Users can override via the explicit `k` parameter
/// (this constant is a convention, not a cap).
pub const DEFAULT_K: usize = 5;
109
110/// Detect the first meaningful divergence between two traces.
111///
112/// Returns `None` when the traces agree on every compared turn up to the
113/// length of the shorter one (and the longer tail is empty or also
114/// matches). Returns `Some` at the first pair whose combined per-cell
115/// cost exceeds the noise floor.
116///
117/// This is a thin convenience wrapper around [`detect_top_k`] that
118/// returns only the highest-ranked divergence by walk order. Callers
119/// who want multi-fork coverage should use `detect_top_k` directly.
120pub fn detect(baseline: &[Record], candidate: &[Record]) -> Option<FirstDivergence> {
121    // The original "first divergence" is literally the first cell on
122    // the alignment walk that exceeds the noise floor — i.e. rank=0
123    // in walk order, NOT in severity-weighted rank. We preserve that
124    // semantic here for backward compatibility by doing a single-step
125    // walk instead of sorting top-K.
126    let baseline_responses: Vec<&Record> = baseline
127        .iter()
128        .filter(|r| r.kind == Kind::ChatResponse)
129        .collect();
130    let candidate_responses: Vec<&Record> = candidate
131        .iter()
132        .filter(|r| r.kind == Kind::ChatResponse)
133        .collect();
134    if baseline_responses.is_empty() || candidate_responses.is_empty() {
135        return None;
136    }
137    let alignment = align(&baseline_responses, &candidate_responses);
138    walk_collecting(&alignment, &baseline_responses, &candidate_responses, 1)
139        .into_iter()
140        .next()
141}
142
143/// Detect up to `k` meaningful divergences between two traces, sorted
144/// by importance (kind severity × confidence, descending).
145///
146/// Returns an empty vec when the traces agree end-to-end. Returns at
147/// most `k` results; fewer if the walk produces fewer above-noise cells.
148/// Pass `k = DEFAULT_K` for the standard top-5.
149///
150/// **Ranking:** Structural > Decision > Style (by class), then by
151/// `confidence` within a class. This surfaces the most actionable
152/// regression first, not just the earliest. Walk order is preserved
153/// as a stable tiebreaker so identical-severity events are reported
154/// in temporal order (earlier turns before later ones).
155pub fn detect_top_k(baseline: &[Record], candidate: &[Record], k: usize) -> Vec<FirstDivergence> {
156    if k == 0 {
157        return Vec::new();
158    }
159    let baseline_responses: Vec<&Record> = baseline
160        .iter()
161        .filter(|r| r.kind == Kind::ChatResponse)
162        .collect();
163    let candidate_responses: Vec<&Record> = candidate
164        .iter()
165        .filter(|r| r.kind == Kind::ChatResponse)
166        .collect();
167    if baseline_responses.is_empty() || candidate_responses.is_empty() {
168        return Vec::new();
169    }
170    let alignment = align(&baseline_responses, &candidate_responses);
171    // Collect ALL above-noise divergences in walk order (there can't
172    // be more than baseline.len() + candidate.len() of them).
173    let max_possible = baseline_responses.len() + candidate_responses.len();
174    let mut all = walk_collecting(
175        &alignment,
176        &baseline_responses,
177        &candidate_responses,
178        max_possible,
179    );
180    // Stable sort by (kind rank desc, confidence desc, walk-order asc).
181    // Walk-order is captured by the current vec position (index), so we
182    // use a stable sort and only key on the two explicit ranks.
183    all.sort_by(|a, b| {
184        kind_rank(b.kind).cmp(&kind_rank(a.kind)).then_with(|| {
185            b.confidence
186                .partial_cmp(&a.confidence)
187                .unwrap_or(std::cmp::Ordering::Equal)
188        })
189    });
190    all.truncate(k);
191    all
192}
193
194/// Ranking weight for each kind. Higher = more actionable, ranks higher.
195/// Structural drift (tool sequence differs) is nearly always a real
196/// behavioural regression; decision drift (same shape, different call)
197/// needs investigation but less urgent; style drift is cosmetic.
198fn kind_rank(k: DivergenceKind) -> u8 {
199    match k {
200        DivergenceKind::Structural => 3,
201        DivergenceKind::Decision => 2,
202        DivergenceKind::Style => 1,
203    }
204}

// ---------------------------------------------------------------------------
// Alignment: Needleman-Wunsch with Gotoh affine gap penalties
// ---------------------------------------------------------------------------

/// Weights for the per-cell cost function. Tuned against the real
/// demo fixtures; exported as constants so tests and callers can see
/// the exact numbers without grepping source.
///
/// The four weights sum to 1.0, so with every component bounded to
/// [0, 1] the blended `pair_cost` stays in [0, 1] as documented there.
const W_STRUCT: f64 = 0.40; // Jaccard distance on tool_shape
const W_SEM: f64 = 0.25; // 1 - text_similarity
const W_STOP: f64 = 0.15; // stop_reason mismatch
const W_ARGS: f64 = 0.20; // tool_use input VALUE differences (same keys, different values)

/// Gotoh affine gap penalty: opening a gap is more expensive than
/// extending one. Prevents the aligner from fragmenting a multi-turn
/// insertion into many single-turn insertions.
const GAP_OPEN: f64 = 0.60;
const GAP_EXTEND: f64 = 0.15;

/// Noise floor for per-cell divergence. Cells below this are treated
/// as "no divergence" — covers bootstrap non-determinism, minor
/// token-count drift from prompt caching, etc. Also the zero point of
/// the confidence rescale in `walk_collecting`.
const NOISE_FLOOR: f64 = 0.12;

/// Style-drift upper bound on the per-cell cost. Above this we call
/// it Decision or Structural. Calibrated for semantic cosine ≥ 0.9
/// with identical tool shape.
const STYLE_MAX_COST: f64 = 0.25;
233
/// One step on an alignment path. The `usize` payloads are 0-based
/// indices into the filtered chat_response slices (set during the DP
/// fill, read back during the walk).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Step {
    /// Diagonal: pair baseline[i-1] with candidate[j-1].
    Match(usize, usize),
    /// Horizontal: gap on baseline side (candidate inserted a turn).
    InsertCandidate(usize),
    /// Vertical: gap on candidate side (candidate dropped a turn).
    DeleteBaseline(usize),
}

/// Alignment result: a path of steps from (0,0) to (n,m).
struct Alignment {
    steps: Vec<Step>,
}
248
/// Threshold beyond which we switch from full-matrix Needleman-Wunsch
/// to a banded variant. Under the threshold, the quadratic work is
/// negligible and full-matrix stays exact. Above it, the band keeps
/// memory + CPU linear in `max(N, M)`.
const SCALE_BAND_THRESHOLD: usize = 1000;

/// Minimum band half-width. Small enough to stay efficient, large
/// enough that typical drift between baseline and candidate (tool
/// added / removed / reordered within a few turns) fits.
const MIN_BAND_HALF_WIDTH: usize = 100;

/// Pick the band half-width for an `(N, M)` pair.
///
/// Computed as `|N - M| + max(MIN_BAND_HALF_WIDTH, sqrt(max(N, M)))`:
/// the `|N - M|` term guarantees the DP corner `(N, M)` is reachable at
/// all (any path must drift at least that far off the main diagonal),
/// and the `max(…)` term adds a local scan radius that grows with trace
/// size but never drops below the configured minimum.
fn band_half_width(n: usize, m: usize) -> usize {
    let length_diff = n.abs_diff(m);
    // Truncated √max(N, M); the MIN_BAND_HALF_WIDTH floor keeps short
    // traces from getting an uselessly narrow band.
    let radius = (n.max(m) as f64).sqrt() as usize;
    length_diff + MIN_BAND_HALF_WIDTH.max(radius)
}
270
271fn align(baseline: &[&Record], candidate: &[&Record]) -> Alignment {
272    let n = baseline.len();
273    let m = candidate.len();
274    if n.max(m) > SCALE_BAND_THRESHOLD {
275        align_banded(baseline, candidate, band_half_width(n, m))
276    } else {
277        align_full(baseline, candidate)
278    }
279}
280
/// Exact full-matrix Needleman-Wunsch alignment with Gotoh affine gap
/// penalties. Returns the cheapest path of [`Step`]s from (0, 0) to
/// (n, m). O(n·m) time and memory.
fn align_full(baseline: &[&Record], candidate: &[&Record]) -> Alignment {
    let n = baseline.len();
    let m = candidate.len();
    // DP table: cost of aligning baseline[0..i] with candidate[0..j].
    // We carry three matrices (M, X, Y) per Gotoh to track whether the
    // previous op was a match, a horizontal gap (in baseline), or a
    // vertical gap (in candidate). INF sentinel: 1e18 (cannot realistically
    // be reached in practice).
    const INF: f64 = 1e18;
    let mut mat = vec![vec![INF; m + 1]; n + 1];
    let mut xg = vec![vec![INF; m + 1]; n + 1]; // gap in baseline (insertion)
    let mut yg = vec![vec![INF; m + 1]; n + 1]; // gap in candidate (deletion)
    // NOTE(review): a single shared backpointer table (rather than one
    // per matrix, as in textbook Gotoh) — the traceback follows the
    // cheapest overall state at each cell; see the `best` selection below.
    let mut back = vec![vec![Step::Match(0, 0); m + 1]; n + 1];

    mat[0][0] = 0.0;
    // First column: baseline[0..i] against an empty candidate prefix is
    // one gap open plus (i - 1) extends.
    for i in 1..=n {
        yg[i][0] = GAP_OPEN + (i as f64 - 1.0) * GAP_EXTEND;
        mat[i][0] = yg[i][0];
        back[i][0] = Step::DeleteBaseline(i - 1);
    }
    // First row: symmetric, for an empty baseline prefix.
    for j in 1..=m {
        xg[0][j] = GAP_OPEN + (j as f64 - 1.0) * GAP_EXTEND;
        mat[0][j] = xg[0][j];
        back[0][j] = Step::InsertCandidate(j - 1);
    }

    for i in 1..=n {
        for j in 1..=m {
            let c = pair_cost(baseline[i - 1], candidate[j - 1]);
            // Match path: best of (prev-match, prev-xgap, prev-ygap) + pair cost.
            let m_cost = mat[i - 1][j - 1]
                .min(xg[i - 1][j - 1])
                .min(yg[i - 1][j - 1])
                + c;
            // Open a horizontal gap (insertion on candidate) or extend one.
            let xg_cost = (mat[i][j - 1] + GAP_OPEN).min(xg[i][j - 1] + GAP_EXTEND);
            // Open a vertical gap (deletion on baseline) or extend one.
            let yg_cost = (mat[i - 1][j] + GAP_OPEN).min(yg[i - 1][j] + GAP_EXTEND);
            mat[i][j] = m_cost;
            xg[i][j] = xg_cost;
            yg[i][j] = yg_cost;
            // Record the back-pointer for the cell's *minimum* overall
            // reachable cost — we walk the cheapest path. Epsilon
            // comparison (1e-12) absorbs float rounding; ties prefer
            // Match over InsertCandidate over DeleteBaseline.
            let best = m_cost.min(xg_cost).min(yg_cost);
            back[i][j] = if (best - m_cost).abs() < 1e-12 {
                Step::Match(i - 1, j - 1)
            } else if (best - xg_cost).abs() < 1e-12 {
                Step::InsertCandidate(j - 1)
            } else {
                Step::DeleteBaseline(i - 1)
            };
        }
    }

    // Traceback from (n, m) to (0, 0), then reverse so the path reads
    // left-to-right for the walk.
    let mut steps = Vec::new();
    let mut i = n;
    let mut j = m;
    while i > 0 || j > 0 {
        let s = back[i][j];
        steps.push(s);
        match s {
            Step::Match(_, _) => {
                i -= 1;
                j -= 1;
            }
            Step::InsertCandidate(_) => {
                j -= 1;
            }
            Step::DeleteBaseline(_) => {
                i -= 1;
            }
        }
    }
    steps.reverse();
    Alignment { steps }
}
358
/// Banded Needleman-Wunsch — identical recurrence to `align_full` but
/// only computes cells within `|i - j| <= band`. Memory + work drop
/// from O(N*M) to O((N+M) * band). For well-behaved traces (baseline
/// and candidate similar length with limited local drift) this yields
/// the same optimal alignment as the full matrix; for adversarial
/// inputs that drift further than `band`, the result is best-effort
/// within the band and falls back to a forced diagonal outside.
///
/// Cells outside the band are clamped to INF on read and never written
/// to (they stay INF), so the recurrence naturally refuses to route
/// through them.
fn align_banded(baseline: &[&Record], candidate: &[&Record], band: usize) -> Alignment {
    let n = baseline.len();
    let m = candidate.len();
    const INF: f64 = 1e18;
    // Full (n+1)×(m+1) allocation even in the banded case — only the
    // banded cells are ever computed, but allocation stays O(N*M).
    let mut mat = vec![vec![INF; m + 1]; n + 1];
    let mut xg = vec![vec![INF; m + 1]; n + 1];
    let mut yg = vec![vec![INF; m + 1]; n + 1];
    let mut back = vec![vec![Step::Match(0, 0); m + 1]; n + 1];

    mat[0][0] = 0.0;
    // Boundary initialisation limited to the band.
    for i in 1..=n.min(band) {
        yg[i][0] = GAP_OPEN + (i as f64 - 1.0) * GAP_EXTEND;
        mat[i][0] = yg[i][0];
        back[i][0] = Step::DeleteBaseline(i - 1);
    }
    for j in 1..=m.min(band) {
        xg[0][j] = GAP_OPEN + (j as f64 - 1.0) * GAP_EXTEND;
        mat[0][j] = xg[0][j];
        back[0][j] = Step::InsertCandidate(j - 1);
    }

    for i in 1..=n {
        // Clamp the inner loop to the band around the diagonal.
        let j_lo = i.saturating_sub(band).max(1);
        let j_hi = (i + band).min(m);
        for j in j_lo..=j_hi {
            let c = pair_cost(baseline[i - 1], candidate[j - 1]);
            let m_cost = mat[i - 1][j - 1]
                .min(xg[i - 1][j - 1])
                .min(yg[i - 1][j - 1])
                + c;
            let xg_cost = (mat[i][j - 1] + GAP_OPEN).min(xg[i][j - 1] + GAP_EXTEND);
            let yg_cost = (mat[i - 1][j] + GAP_OPEN).min(yg[i - 1][j] + GAP_EXTEND);
            mat[i][j] = m_cost;
            xg[i][j] = xg_cost;
            yg[i][j] = yg_cost;
            let best = m_cost.min(xg_cost).min(yg_cost);
            back[i][j] = if (best - m_cost).abs() < 1e-12 {
                Step::Match(i - 1, j - 1)
            } else if (best - xg_cost).abs() < 1e-12 {
                Step::InsertCandidate(j - 1)
            } else {
                Step::DeleteBaseline(i - 1)
            };
        }
    }

    // Traceback from (n, m) to (0, 0). Out-of-band cells were never
    // populated so the backpointer at (n, m) is valid if the path
    // stayed within the band; otherwise we forcibly walk the diagonal
    // (an out-of-band cell still holds the default Match(0, 0), whose
    // Match arm below decrements both cursors).
    let mut steps = Vec::new();
    let mut i = n;
    let mut j = m;
    while i > 0 || j > 0 {
        // If the recorded backpointer is the default Match(0, 0)
        // at an out-of-band cell, force a diagonal or edge step.
        let s = if i > 0 && j > 0 {
            back[i][j]
        } else if j == 0 {
            Step::DeleteBaseline(i - 1)
        } else {
            Step::InsertCandidate(j - 1)
        };
        steps.push(s);
        match s {
            Step::Match(_, _) => {
                i -= 1;
                j -= 1;
            }
            Step::InsertCandidate(_) => {
                j -= 1;
            }
            Step::DeleteBaseline(_) => {
                i -= 1;
            }
        }
    }
    steps.reverse();
    Alignment { steps }
}

// ---------------------------------------------------------------------------
// Per-cell cost
// ---------------------------------------------------------------------------

455/// Cost of aligning one baseline response with one candidate response.
456/// Always returns a value in [0, 1].
457fn pair_cost(a: &Record, b: &Record) -> f64 {
458    let tool_shape_a = tool_shape(a);
459    let tool_shape_b = tool_shape(b);
460    // Structural component: set Jaccard on tool shapes, BUT also penalise
461    // count mismatches. Without the count check, duplicate tool calls
462    // ("candidate called `lookup_order` twice where baseline called it
463    // once") are invisible because sets collapse duplicates.
464    let shape_dist = 1.0 - jaccard(&tool_shape_a, &tool_shape_b);
465    let count_a = count_tool_use(a);
466    let count_b = count_tool_use(b);
467    let count_dist = if count_a == count_b {
468        0.0
469    } else {
470        let diff = (count_a as f64 - count_b as f64).abs();
471        let denom = count_a.max(count_b) as f64;
472        if denom == 0.0 {
473            0.0
474        } else {
475            (diff / denom).min(1.0)
476        }
477    };
478    let structural = shape_dist.max(count_dist);
479
480    let text_a = response_text(a);
481    let text_b = response_text(b);
482    let semantic = 1.0 - text_similarity(&text_a, &text_b);
483
484    let stop_a = stop_reason(a);
485    let stop_b = stop_reason(b);
486    let stop = if stop_a != stop_b { 1.0 } else { 0.0 };
487
488    // Arg-value divergence: same tool name AND same arg-key set on
489    // both sides, but different arg VALUES. Without this component
490    // we'd miss the "`search(limit=10)` → `search(limit=50)`" case
491    // because structural + stop + (empty-text) semantic are all 0.
492    let args = if tool_shape_a == tool_shape_b && !tool_shape_a.is_empty() {
493        if arg_value_diff(a, b).is_some() {
494            1.0
495        } else {
496            0.0
497        }
498    } else {
499        0.0
500    };
501
502    W_STRUCT * structural + W_SEM * semantic + W_STOP * stop + W_ARGS * args
503}
504
505/// Extract a canonical tool-shape token per tool_use block. The token
506/// is `"<tool_name>(<sorted-comma-arg-keys>)"` — captures both the
507/// tool called and the KEYS (not values) of its input. Values are
508/// compared separately by `arg_values_differ`.
509fn tool_shape(r: &Record) -> BTreeSet<String> {
510    let mut out = BTreeSet::new();
511    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
512        return out;
513    };
514    for part in arr {
515        if part.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
516            continue;
517        }
518        let name = part.get("name").and_then(|n| n.as_str()).unwrap_or("_");
519        let mut keys: Vec<String> = part
520            .get("input")
521            .and_then(|i| i.as_object())
522            .map(|o| o.keys().cloned().collect())
523            .unwrap_or_default();
524        keys.sort();
525        out.insert(format!("{name}({})", keys.join(",")));
526    }
527    out
528}
529
530/// Count the number of tool_use blocks in a response. Useful as a
531/// structural signal in addition to the set-based tool_shape, because
532/// a set collapses duplicates — calling `lookup_order` twice looks
533/// identical to calling it once if we only compare shapes.
534fn count_tool_use(r: &Record) -> usize {
535    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
536        return 0;
537    };
538    arr.iter()
539        .filter(|p| p.get("type").and_then(|t| t.as_str()) == Some("tool_use"))
540        .count()
541}
542
543fn response_text(r: &Record) -> String {
544    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
545        return String::new();
546    };
547    arr.iter()
548        .filter_map(|p| {
549            if p.get("type").and_then(|t| t.as_str()) == Some("text") {
550                p.get("text")
551                    .and_then(|t| t.as_str())
552                    .map(ToString::to_string)
553            } else {
554                None
555            }
556        })
557        .collect::<Vec<_>>()
558        .join(" ")
559}
560
561fn stop_reason(r: &Record) -> String {
562    r.payload
563        .get("stop_reason")
564        .and_then(|v| v.as_str())
565        .unwrap_or("")
566        .to_string()
567}
568
/// Jaccard similarity on two string sets. Two empty sets score 1.0
/// (both sides produced no tool calls — they agree structurally).
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
    let union_size = a.union(b).count();
    // The union is empty exactly when both inputs are empty — vacuous
    // agreement, and also guards the division below.
    if union_size == 0 {
        return 1.0;
    }
    a.intersection(b).count() as f64 / union_size as f64
}
583
584/// Lightweight text-similarity proxy: character-shingle Jaccard over
585/// whitespace-normalised text. We avoid bringing in embeddings here
586/// because this module is in the Rust core and must not take a heavy
587/// ML dep. The Python layer can upgrade this via a similarity callback
588/// in v0.2.
589///
590/// Whitespace is normalised (collapsed and trimmed) before shingling:
591/// `"ok"` and `"o k"` should be treated as identical, not as totally
592/// different strings — whitespace-only diffs are the canonical style-
593/// drift signal and must not survive into the similarity score.
594///
595/// For empty (post-normalisation) strings, returns 1.0.
596fn text_similarity(a: &str, b: &str) -> f64 {
597    let na = normalise_whitespace(a);
598    let nb = normalise_whitespace(b);
599    if na.is_empty() && nb.is_empty() {
600        return 1.0;
601    }
602    if na == nb {
603        return 1.0;
604    }
605    let sa = shingles(&na, 4);
606    let sb = shingles(&nb, 4);
607    jaccard(&sa, &sb)
608}
609
/// Collapse runs of whitespace into a single space and trim both edges.
/// Whitespace-only differences aren't meaningful semantic signal for
/// the alignment cost function, e.g. `"a \t b"` normalises to `"a b"`.
fn normalise_whitespace(s: &str) -> String {
    // `split_whitespace` skips leading/trailing whitespace and treats
    // any run of Unicode whitespace (same `char::is_whitespace`
    // predicate as the previous hand-rolled loop) as one separator, so
    // joining with a single space is exactly equivalent and simpler.
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
632
/// Character k-shingles of `s` as a set. A non-empty string shorter
/// than `k` contributes itself as a single shingle; the empty string
/// yields the empty set.
fn shingles(s: &str, k: usize) -> BTreeSet<String> {
    let chars: Vec<char> = s.chars().collect();
    if chars.len() < k {
        let mut short = BTreeSet::new();
        if !s.is_empty() {
            short.insert(s.to_string());
        }
        return short;
    }
    chars
        .windows(k)
        .map(|w| w.iter().collect::<String>())
        .collect()
}

// ---------------------------------------------------------------------------
// Walk the alignment for the first divergence
// ---------------------------------------------------------------------------

/// Walk the alignment and collect up to `limit` above-noise divergences
/// in alignment order. Returns an empty vec when the traces agree
/// end-to-end. Cursor tracking lets gap steps report the correct
/// baseline / candidate positions even after previous gaps.
///
/// Gap steps (insertions / deletions) are recorded unconditionally at
/// confidence 1.0 — a missing or extra turn is structural by definition
/// and never subject to the noise floor. Matched pairs are gated on
/// `NOISE_FLOOR` and classified via `classify`.
fn walk_collecting(
    alignment: &Alignment,
    baseline: &[&Record],
    candidate: &[&Record],
    limit: usize,
) -> Vec<FirstDivergence> {
    let mut out: Vec<FirstDivergence> = Vec::new();
    if limit == 0 {
        return out;
    }
    // Track cursors through the walk so that gap steps can report the
    // baseline / candidate position correctly — without this, an
    // insertion on the candidate side can't tell which baseline turn
    // it lived BETWEEN.
    let mut b_cursor: usize = 0;
    let mut c_cursor: usize = 0;
    for step in &alignment.steps {
        if out.len() >= limit {
            return out;
        }
        match *step {
            Step::InsertCandidate(j) => {
                // Candidate inserted a turn the baseline didn't have —
                // structural by definition. The insertion sits between
                // `b_cursor - 1` and `b_cursor` on the baseline side.
                let cand = candidate[j];
                let insertion_point = b_cursor;
                let n_tools = tool_shape(cand).len();
                let detail = if n_tools == 0 {
                    "an extra response turn with no tool calls".to_string()
                } else if n_tools == 1 {
                    "an extra turn with 1 tool call".to_string()
                } else {
                    format!("an extra turn with {n_tools} tool calls")
                };
                out.push(FirstDivergence {
                    baseline_turn: insertion_point,
                    candidate_turn: j,
                    kind: DivergenceKind::Structural,
                    primary_axis: Axis::Trajectory,
                    explanation: format!(
                        "candidate inserted {detail} between baseline turns #{prev} and #{insertion_point}",
                        prev = insertion_point.saturating_sub(1),
                    ),
                    confidence: 1.0,
                });
                // Only the candidate cursor advances: an insertion
                // consumes no baseline turn.
                c_cursor = c_cursor.saturating_add(1);
            }
            Step::DeleteBaseline(i) => {
                // Baseline turn `i` has no counterpart on the candidate
                // side — the candidate dropped it.
                let b = baseline[i];
                let deletion_point = c_cursor;
                let n_tools = tool_shape(b).len();
                let detail = if n_tools == 0 {
                    "a response turn with no tool calls".to_string()
                } else if n_tools == 1 {
                    "a turn with 1 tool call".to_string()
                } else {
                    format!("a turn with {n_tools} tool calls")
                };
                out.push(FirstDivergence {
                    baseline_turn: i,
                    candidate_turn: deletion_point,
                    kind: DivergenceKind::Structural,
                    primary_axis: Axis::Trajectory,
                    explanation: format!(
                        "candidate dropped {detail} (baseline turn #{i} has no counterpart)",
                    ),
                    confidence: 1.0,
                });
                // Only the baseline cursor advances for a deletion.
                b_cursor = b_cursor.saturating_add(1);
            }
            Step::Match(i, j) => {
                let b = baseline[i];
                let c = candidate[j];
                let cost = pair_cost(b, c);
                // A match consumes one turn on each side.
                b_cursor = i.saturating_add(1);
                c_cursor = j.saturating_add(1);
                if cost <= NOISE_FLOOR {
                    continue;
                }
                // Above noise floor — classify and record. Confidence is
                // the cost's headroom over the floor, rescaled to [0, 1].
                let (kind, axis, explanation) = classify(b, c, cost);
                let confidence = ((cost - NOISE_FLOOR) / (1.0 - NOISE_FLOOR)).clamp(0.0, 1.0);
                out.push(FirstDivergence {
                    baseline_turn: i,
                    candidate_turn: j,
                    kind,
                    primary_axis: axis,
                    explanation,
                    confidence,
                });
            }
        }
    }
    out
}
752
753/// Classify a significant (above-noise-floor) matched pair.
754fn classify(b: &Record, c: &Record, cost: f64) -> (DivergenceKind, Axis, String) {
755    let shape_b = tool_shape(b);
756    let shape_c = tool_shape(c);
757    let text_b = response_text(b);
758    let text_c = response_text(c);
759    let stop_b = stop_reason(b);
760    let stop_c = stop_reason(c);
761    let sem_sim = text_similarity(&text_b, &text_c);
762
763    // Structural: tool shapes differ (name or arg-key set).
764    if shape_b != shape_c {
765        let explanation = describe_tool_diff(&shape_b, &shape_c);
766        return (DivergenceKind::Structural, Axis::Trajectory, explanation);
767    }
768    // Structural: same tool shapes but different COUNTS (duplicated calls).
769    // Shape is a set so `{lookup_order(id)}` matches itself regardless of
770    // call count; we detect duplicates via an explicit count comparison.
771    let count_b = count_tool_use(b);
772    let count_c = count_tool_use(c);
773    if count_b != count_c {
774        let tool_names: Vec<&String> = shape_b.iter().collect();
775        let tools_summary = if tool_names.len() == 1 {
776            format!("`{}`", tool_names[0])
777        } else {
778            format!("{} tool(s)", tool_names.len())
779        };
780        let explanation = if count_c > count_b {
781            format!(
782                "candidate called {tools_summary} {count_c} time(s) vs baseline's {count_b} \
783                — duplicate tool invocation"
784            )
785        } else {
786            format!(
787                "candidate called {tools_summary} {count_c} time(s) vs baseline's {count_b} \
788                — dropped one or more repeat invocations"
789            )
790        };
791        return (DivergenceKind::Structural, Axis::Trajectory, explanation);
792    }
793
794    // Stop reason flipped — often signals refusal / filter.
795    if stop_b != stop_c {
796        return (
797            DivergenceKind::Decision,
798            Axis::Safety,
799            format!("stop_reason changed: `{stop_b}` → `{stop_c}`"),
800        );
801    }
802
803    // Tool shape matches and stop_reason matches but something still
804    // drives a divergence. Two sub-cases:
805    //   - tool_use.input values differ (same keys, different values)
806    //   - response text diverged
807    if let Some(arg_diff) = arg_value_diff(b, c) {
808        return (
809            DivergenceKind::Decision,
810            Axis::Trajectory,
811            format!("tool arg value changed: {arg_diff}"),
812        );
813    }
814
815    // Pure text divergence. Style vs decision depends on similarity.
816    if sem_sim >= 0.90 && cost <= STYLE_MAX_COST {
817        (
818            DivergenceKind::Style,
819            Axis::Semantic,
820            "cosmetic wording change — tool sequence and semantics preserved".to_string(),
821        )
822    } else {
823        (
824            DivergenceKind::Decision,
825            Axis::Semantic,
826            format!(
827                "response text diverged (text similarity {:.2}); same tool sequence",
828                sem_sim
829            ),
830        )
831    }
832}
833
/// Human-readable summary of how two tool-shape sets differ: dropped
/// calls, added calls, a mixed removal/addition, or (when the sets are
/// equal) an ordering-only difference.
fn describe_tool_diff(a: &BTreeSet<String>, b: &BTreeSet<String>) -> String {
    // Backtick-quote each entry and join with ", ".
    let quote = |items: &[&String]| -> String {
        items
            .iter()
            .map(|s| format!("`{s}`"))
            .collect::<Vec<_>>()
            .join(", ")
    };
    let removed: Vec<&String> = a.difference(b).collect();
    let added: Vec<&String> = b.difference(a).collect();
    match (removed.is_empty(), added.is_empty()) {
        (false, true) => format!("candidate dropped tool call(s): {}", quote(&removed)),
        (true, false) => format!("candidate added tool call(s): {}", quote(&added)),
        (false, false) => format!(
            "tool set changed: removed {}, added {}",
            quote(&removed),
            quote(&added)
        ),
        (true, true) => "tool ordering differs".to_string(),
    }
}
851
/// Render items as backtick-quoted names separated by ", ".
fn list(items: &[&String]) -> String {
    let mut out = String::new();
    for (idx, item) in items.iter().enumerate() {
        if idx > 0 {
            out.push_str(", ");
        }
        out.push('`');
        out.push_str(item);
        out.push('`');
    }
    out
}
859
860/// Compare arg values for tools that have the same name and arg keys.
861/// Returns `Some(summary)` if any tool's values differ, `None` if every
862/// tool's values match.
863fn arg_value_diff(a: &Record, b: &Record) -> Option<String> {
864    let ta = tool_use_inputs(a);
865    let tb = tool_use_inputs(b);
866    for (name, va) in &ta {
867        if let Some(vb) = tb.get(name) {
868            if va != vb {
869                // Find the first differing key.
870                if let (Some(oa), Some(ob)) = (va.as_object(), vb.as_object()) {
871                    for (k, v) in oa {
872                        if ob.get(k) != Some(v) {
873                            let other = ob
874                                .get(k)
875                                .map(|x| x.to_string())
876                                .unwrap_or("<missing>".to_string());
877                            return Some(format!("`{name}({k})`: `{v}` → `{other}`"));
878                        }
879                    }
880                }
881                return Some(format!("`{name}`: input changed"));
882            }
883        }
884    }
885    None
886}
887
888/// Index a chat_response's tool_use blocks by tool_name → input value.
889/// First occurrence wins if a tool is called twice in the same turn.
890fn tool_use_inputs(r: &Record) -> std::collections::BTreeMap<String, serde_json::Value> {
891    let mut out = std::collections::BTreeMap::new();
892    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
893        return out;
894    };
895    for part in arr {
896        if part.get("type").and_then(|t| t.as_str()) != Some("tool_use") {
897            continue;
898        }
899        let name = part
900            .get("name")
901            .and_then(|n| n.as_str())
902            .unwrap_or("_")
903            .to_string();
904        let input = part
905            .get("input")
906            .cloned()
907            .unwrap_or(serde_json::Value::Null);
908        out.entry(name).or_insert(input);
909    }
910    out
911}
912
913// ---------------------------------------------------------------------------
914// Tests
915// ---------------------------------------------------------------------------
916
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    /// Fixture: a chat_response record whose content is a single text part.
    fn response_text_only(text: &str, stop: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{"type": "text", "text": text}],
                "stop_reason": stop,
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    /// Fixture: a chat_response record whose content is a single tool_use part.
    fn response_with_tool(name: &str, input: serde_json::Value, stop: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": [{
                    "type": "tool_use",
                    "id": "t1",
                    "name": name,
                    "input": input,
                }],
                "stop_reason": stop,
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    /// Fixture: a non-chat_response (metadata) record the detector must skip.
    fn meta() -> Record {
        Record::new(
            Kind::Metadata,
            json!({"sdk": {"name": "shadow"}}),
            "2026-04-23T00:00:00Z",
            None,
        )
    }

    #[test]
    fn identical_traces_return_none() {
        let r = response_text_only("Paris is the capital of France.", "end_turn");
        let baseline = vec![meta(), r.clone(), r.clone()];
        let candidate = vec![meta(), r.clone(), r.clone()];
        assert_eq!(detect(&baseline, &candidate), None);
    }

    #[test]
    fn whitespace_only_diff_is_style() {
        let b = response_text_only("Paris is the capital of France.", "end_turn");
        let c = response_text_only("Paris is  the capital of France.", "end_turn");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        // At shingle-level, whitespace variance is tiny but nonzero.
        // Classification depends on cost; this case is typically below
        // NOISE_FLOOR in practice. The key assertion is that if ANY
        // divergence is reported, it's Style, not Structural/Decision.
        if let Some(d) = detect(&baseline, &candidate) {
            assert_eq!(d.kind, DivergenceKind::Style);
            assert_eq!(d.primary_axis, Axis::Semantic);
        }
    }

    #[test]
    fn different_tool_name_is_structural_on_trajectory_axis() {
        let b = response_with_tool("search", json!({"q": "cats"}), "tool_use");
        let c = response_with_tool("lookup", json!({"q": "cats"}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
        assert_eq!(d.primary_axis, Axis::Trajectory);
        assert_eq!(d.baseline_turn, 0);
        assert_eq!(d.candidate_turn, 0);
        assert!(d.explanation.contains("search") || d.explanation.contains("lookup"));
    }

    #[test]
    fn same_tool_different_arg_value_is_decision() {
        let b = response_with_tool("search", json!({"q": "cats", "limit": 10}), "tool_use");
        let c = response_with_tool("search", json!({"q": "cats", "limit": 50}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Trajectory);
        assert!(d.explanation.contains("limit"));
    }

    #[test]
    fn stop_reason_flip_is_decision_on_safety() {
        let b = response_text_only("Here is the answer.", "end_turn");
        let c = response_text_only("I can't help with that.", "content_filter");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Safety);
        assert!(d.explanation.contains("end_turn"));
        assert!(d.explanation.contains("content_filter"));
    }

    #[test]
    fn candidate_drops_a_turn_is_structural() {
        let r1 = response_text_only("first turn", "end_turn");
        let r2 = response_text_only("second turn", "end_turn");
        let baseline = vec![meta(), r1.clone(), r2];
        let candidate = vec![meta(), r1]; // dropped the second
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
        assert_eq!(d.primary_axis, Axis::Trajectory);
    }

    #[test]
    fn candidate_inserts_a_turn_is_structural() {
        let r1 = response_text_only("turn one", "end_turn");
        let r2 = response_text_only("inserted", "end_turn");
        let r3 = response_text_only("turn two", "end_turn");
        let baseline = vec![meta(), r1.clone(), r3.clone()];
        let candidate = vec![meta(), r1, r2, r3];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Structural);
    }

    #[test]
    fn significant_text_shift_is_decision_on_semantic() {
        // Different topics entirely — semantic similarity low, no tools.
        let b = response_text_only(
            "Photosynthesis is the process by which plants convert sunlight.",
            "end_turn",
        );
        let c = response_text_only(
            "The stock market closed higher on Thursday after strong earnings.",
            "end_turn",
        );
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.kind, DivergenceKind::Decision);
        assert_eq!(d.primary_axis, Axis::Semantic);
    }

    #[test]
    fn empty_traces_return_none() {
        assert_eq!(detect(&[meta()], &[meta()]), None);
        assert_eq!(detect(&[], &[]), None);
    }

    #[test]
    fn first_divergence_is_truly_first() {
        // Three turns; only the SECOND differs. Detector must locate
        // turn index 1, not turn 2.
        let r1 = response_text_only("same", "end_turn");
        let r2b = response_text_only("baseline version of turn two with lots of text", "end_turn");
        let r2c = response_text_only(
            "CANDIDATE SAID SOMETHING COMPLETELY DIFFERENT HERE",
            "end_turn",
        );
        let r3 = response_text_only("also same", "end_turn");
        let baseline = vec![meta(), r1.clone(), r2b, r3.clone()];
        let candidate = vec![meta(), r1, r2c, r3];
        let d = detect(&baseline, &candidate).expect("divergence expected");
        assert_eq!(d.baseline_turn, 1);
        assert_eq!(d.candidate_turn, 1);
    }

    #[test]
    fn confidence_is_in_valid_range() {
        let b = response_with_tool("search", json!({"q": "a"}), "tool_use");
        let c = response_with_tool("other", json!({"q": "a"}), "tool_use");
        let baseline = vec![meta(), b];
        let candidate = vec![meta(), c];
        let d = detect(&baseline, &candidate).unwrap();
        assert!((0.0..=1.0).contains(&d.confidence));
    }

    #[test]
    fn tool_shape_captures_name_and_arg_keys() {
        let r = response_with_tool("search", json!({"q": "a", "limit": 10}), "tool_use");
        let shape = tool_shape(&r);
        assert_eq!(shape.len(), 1);
        let entry = shape.iter().next().unwrap();
        assert!(entry.starts_with("search("));
        assert!(entry.contains("limit"));
        assert!(entry.contains("q"));
    }

    #[test]
    fn jaccard_on_empty_sets_is_one() {
        let empty = BTreeSet::new();
        assert_eq!(jaccard(&empty, &empty), 1.0);
    }

    #[test]
    fn alignment_prefers_matches_over_gaps_when_both_cheap() {
        // Two identical turns. NW should produce two Match steps and
        // no gaps.
        let r = response_text_only("same", "end_turn");
        let alignment = align(&[&r, &r], &[&r, &r]);
        let matches = alignment
            .steps
            .iter()
            .filter(|s| matches!(s, Step::Match(..)))
            .count();
        assert_eq!(matches, 2);
        let gaps = alignment.steps.len() - matches;
        assert_eq!(gaps, 0);
    }

    // -----------------------------------------------------------------
    // Top-K tests
    // -----------------------------------------------------------------

    #[test]
    fn top_k_with_zero_returns_empty() {
        let r1 = response_text_only("same", "end_turn");
        let r2 = response_text_only("different", "end_turn");
        let out = detect_top_k(&[meta(), r1], &[meta(), r2], 0);
        assert_eq!(out.len(), 0);
    }

    #[test]
    fn top_k_with_identical_returns_empty() {
        let r = response_text_only("same", "end_turn");
        let out = detect_top_k(&[meta(), r.clone(), r.clone()], &[meta(), r.clone(), r], 3);
        assert_eq!(out.len(), 0);
    }

    #[test]
    fn top_k_orders_structural_before_decision_before_style() {
        // Construct a candidate with one divergence of each kind, in
        // order: Style @ turn 0, Decision @ turn 1 (refusal), Structural
        // @ turn 2 (tool change). Top-K must rerank: Structural #1,
        // Decision #2, Style #3 — NOT walk order.
        let b0 = response_text_only(
            "Hello, here is a detailed answer explaining the topic in full.",
            "end_turn",
        );
        let b1 = response_text_only("The answer is 42.", "end_turn");
        let b2 = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c0 = response_text_only(
            "Hello, here is a detailed answer explaining the topic in full!",
            "end_turn",
        ); // cosmetic punctuation → style
        let c1 = response_text_only("I cannot answer that.", "content_filter"); // refusal → decision (safety)
        let c2 = response_with_tool("lookup", json!({"q": "x"}), "tool_use"); // tool name change → structural
        let baseline = vec![meta(), b0, b1, b2];
        let candidate = vec![meta(), c0, c1, c2];
        let out = detect_top_k(&baseline, &candidate, 5);
        assert!(
            out.len() >= 2,
            "expected at least 2 divergences, got {}",
            out.len()
        );
        // #1 must be structural
        assert_eq!(
            out[0].kind,
            DivergenceKind::Structural,
            "rank 1 should be Structural, got {:?}",
            out[0].kind
        );
        // If we have a rank 2, it must be Decision (Style is lowest priority)
        if out.len() >= 2 {
            assert_eq!(
                out[1].kind,
                DivergenceKind::Decision,
                "rank 2 should be Decision, got {:?}",
                out[1].kind
            );
        }
    }

    #[test]
    fn top_k_truncates_at_k() {
        // 5 divergent turns, ask for top 2.
        let same = response_text_only("unchanged", "end_turn");
        let _ = same.clone(); // avoid unused_assignments warning pattern
        let baseline = vec![
            meta(),
            response_with_tool("a", json!({}), "tool_use"),
            response_with_tool("b", json!({}), "tool_use"),
            response_with_tool("c", json!({}), "tool_use"),
            response_with_tool("d", json!({}), "tool_use"),
            response_with_tool("e", json!({}), "tool_use"),
        ];
        let candidate = vec![
            meta(),
            response_with_tool("A", json!({}), "tool_use"),
            response_with_tool("B", json!({}), "tool_use"),
            response_with_tool("C", json!({}), "tool_use"),
            response_with_tool("D", json!({}), "tool_use"),
            response_with_tool("E", json!({}), "tool_use"),
        ];
        let out = detect_top_k(&baseline, &candidate, 2);
        assert_eq!(out.len(), 2);
        // All should be Structural (tool name differs)
        for dv in &out {
            assert_eq!(dv.kind, DivergenceKind::Structural);
        }
    }

    #[test]
    fn top_k_preserves_walk_order_within_same_severity_and_confidence() {
        // Three Structural divergences with identical confidence → ties
        // broken by walk order (earlier turns before later ones).
        let baseline = vec![
            meta(),
            response_with_tool("a", json!({}), "tool_use"),
            response_with_tool("b", json!({}), "tool_use"),
            response_with_tool("c", json!({}), "tool_use"),
        ];
        let candidate = vec![
            meta(),
            response_with_tool("A", json!({}), "tool_use"),
            response_with_tool("B", json!({}), "tool_use"),
            response_with_tool("C", json!({}), "tool_use"),
        ];
        let out = detect_top_k(&baseline, &candidate, 3);
        assert_eq!(out.len(), 3);
        // Stable sort preserves walk order for equal keys
        assert_eq!(out[0].baseline_turn, 0);
        assert_eq!(out[1].baseline_turn, 1);
        assert_eq!(out[2].baseline_turn, 2);
    }

    #[test]
    fn top_k_of_1_matches_first_divergence_classifier() {
        // detect_top_k(.., 1) and detect() should name the same KIND
        // for simple single-divergence traces (walk order preserved
        // within the same kind rank).
        let b = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c = response_with_tool("search", json!({"q": "y"}), "tool_use");
        let first = detect(&[meta(), b.clone()], &[meta(), c.clone()]).unwrap();
        let top = detect_top_k(&[meta(), b], &[meta(), c], 1);
        assert_eq!(top.len(), 1);
        assert_eq!(top[0].kind, first.kind);
        assert_eq!(top[0].baseline_turn, first.baseline_turn);
    }

    #[test]
    fn first_divergence_is_alignment_order_not_importance_rank() {
        // Explicit guarantee: `detect()` returns divergence by WALK
        // order (earliest above-noise cell), not importance. This
        // preserves backward compat with v1's semantic.
        let b0 = response_text_only("same across both", "end_turn");
        let b1 = response_with_tool("search", json!({"q": "x"}), "tool_use");
        let c0 = response_text_only("completely different response here", "end_turn");
        let c1 = response_with_tool("lookup", json!({"q": "x"}), "tool_use");
        // Turn 0 has Decision (text shift); turn 1 has Structural.
        // top_k will rank Structural #1 (higher class). first() must
        // still return the turn 0 divergence (walk order).
        let baseline = vec![meta(), b0, b1];
        let candidate = vec![meta(), c0, c1];
        let first = detect(&baseline, &candidate).unwrap();
        let top = detect_top_k(&baseline, &candidate, 3);
        assert_eq!(first.baseline_turn, 0);
        assert_eq!(top[0].baseline_turn, 1); // re-ranked Structural first
        assert_eq!(top[0].kind, DivergenceKind::Structural);
    }
}