Skip to main content

difflib_fast/
gestalt.rs

1//! Gestalt-Chain: fast **exact** Ratcliff–Obershelp via a suffix automaton.
2//!
3//! difflib's `ratio` is slow because `find_longest_match` is O(|a|·|b|) (it rescans every
4//! occurrence of popular characters). The matched-block total M is the recursive
5//! longest-common-substring decomposition; a **suffix automaton** finds the longest common
6//! substring in O(|a|+|b|) regardless of character frequency, and recursing on the left/
7//! right remainders reproduces difflib's M with the *same* tie-break (longest; earliest in
8//! a; earliest in b). So `gestalt_ratio` equals `difflib.SequenceMatcher.ratio()` exactly.
9//!
10//! Hot-path engineering for the all-pairs join:
11//!   * transitions are stored **CSR** (per-state sorted slice) → O(log deg) lookup, O(len)
12//!     memory, no hashing — and crucially no O(deg) scan at the high-degree root, which the
13//!     scan hammers when strings are dissimilar;
14//!   * the b-side automaton is **prebuilt once per string** (`build_sam`) and reused for
15//!     every pair, so the all-pairs cost is n builds + n² scans, not n² builds.
16//!   * **prefetch hints attempted on the per-iteration `node[state]` load** — the SAM walk is a
17//!     data-dependent pointer chase the hardware prefetcher cannot anticipate. A `prfm pldl1keep`
18//!     (`AArch64`) / `_mm_prefetch` (x86) experiment was MEASURED on M3 Pro and did NOT pay off
19//!     (-10% to -2% across mypy/sympy/django) — the SAMs fit in L2 already, and the prefetch
20//!     instruction burns execution slots without producing a hit-rate improvement. See the tombstone
21//!     near `matching_stats_into` for details.
22//!
23//! Operates on `char` (code points), so it is bit-identical to difflib on non-ASCII.
24
25// Note: a software-prefetch experiment (prfm pldl1keep / _mm_prefetch for the next iteration's
26// node[state] load) was tried and DID NOT pay off on M3 Pro — the prefetch instruction itself
27// burned execution slots without producing a measurable hit-rate improvement, because the SAM
28// pages fit comfortably in L2 for the per-pair working set (~100KB for the two SAMs in play)
29// and the hardware prefetcher already keeps the hot stretches warm. Net: -10% to -2% across
30// mypy/sympy/django. Left here as a tombstone so future readers don't waste a day on it.
31
32/// Size of the root's direct transition table — covers ASCII (the canonical text's alphabet).
33const ROOT_TBL: usize = 128;
34
35/// `#[cfg(feature = "instrument")]` per-call counters for data-driven perf decisions. Compiles to
36/// no-op in default builds (no atomic ops, no codegen). Enabled via `cargo build --features
37/// instrument`. The counters use `Ordering::Relaxed` because we only care about totals at
38/// end-of-workload (no causal ordering needed) — the relaxed cost is one atomic add per inc.
39///
40/// Cross-thread aggregation: every rayon worker writes into the same global atomics; we read them
41/// after the workload completes via `instrument::dump()`. Use `instrument::reset()` between
42/// successive workload runs in the same process so the numbers stay per-workload.
43#[cfg(feature = "instrument")]
44pub mod instrument {
45    use std::sync::atomic::{AtomicU64, Ordering};
46
47    /// Histogram bucket count for chain walk + recursion depth. Anything deeper than 63 falls in
48    /// the last bucket — measured tail on canonical Python is < 30 in practice, so 64 is plenty.
49    pub const HIST_BUCKETS: usize = 64;
50
51    pub static LONGEST_IN_CALLS: AtomicU64 = AtomicU64::new(0);
52    pub static PAIRS_PROCESSED: AtomicU64 = AtomicU64::new(0);
53    pub static MAX_LE_CALLS: AtomicU64 = AtomicU64::new(0);
54    pub static MAX_LE_FAST_PATH: AtomicU64 = AtomicU64::new(0);   // lastpos<=x or firstpos>x shortcut
55    pub static MAX_LE_LINEAR: AtomicU64 = AtomicU64::new(0);      // cnt <= LINEAR_MAX scan
56    pub static MAX_LE_LINEAR_LEN_SUM: AtomicU64 = AtomicU64::new(0); // total entries scanned
57    pub static MAX_LE_SEGTREE: AtomicU64 = AtomicU64::new(0);     // merge-sort-tree walk
58    pub static MIN_IN_CALLS: AtomicU64 = AtomicU64::new(0);
59    pub static MIN_IN_FAST_PATH: AtomicU64 = AtomicU64::new(0);
60    pub static MIN_IN_LINEAR: AtomicU64 = AtomicU64::new(0);
61    pub static MIN_IN_LINEAR_LEN_SUM: AtomicU64 = AtomicU64::new(0);
62    pub static MIN_IN_SEGTREE: AtomicU64 = AtomicU64::new(0);
63    pub static FMATCH_ZERO: AtomicU64 = AtomicU64::new(0);
64    pub static FMATCH_NONZERO: AtomicU64 = AtomicU64::new(0);
65    pub static FMATCH_SUM: AtomicU64 = AtomicU64::new(0); // sum of all non-zero fmatch values
66
67    pub static CHAIN_DEPTHS: [AtomicU64; HIST_BUCKETS] = {
68        // const init via repeated AtomicU64::new(0) — array initializer.
69        const ZERO: AtomicU64 = AtomicU64::new(0);
70        [ZERO; HIST_BUCKETS]
71    };
72    pub static RECURSION_DEPTHS: [AtomicU64; HIST_BUCKETS] = {
73        const ZERO: AtomicU64 = AtomicU64::new(0);
74        [ZERO; HIST_BUCKETS]
75    };
76
77    pub fn reset() {
78        LONGEST_IN_CALLS.store(0, Ordering::Relaxed);
79        PAIRS_PROCESSED.store(0, Ordering::Relaxed);
80        MAX_LE_CALLS.store(0, Ordering::Relaxed);
81        MAX_LE_FAST_PATH.store(0, Ordering::Relaxed);
82        MAX_LE_LINEAR.store(0, Ordering::Relaxed);
83        MAX_LE_LINEAR_LEN_SUM.store(0, Ordering::Relaxed);
84        MAX_LE_SEGTREE.store(0, Ordering::Relaxed);
85        MIN_IN_CALLS.store(0, Ordering::Relaxed);
86        MIN_IN_FAST_PATH.store(0, Ordering::Relaxed);
87        MIN_IN_LINEAR.store(0, Ordering::Relaxed);
88        MIN_IN_LINEAR_LEN_SUM.store(0, Ordering::Relaxed);
89        MIN_IN_SEGTREE.store(0, Ordering::Relaxed);
90        FMATCH_ZERO.store(0, Ordering::Relaxed);
91        FMATCH_NONZERO.store(0, Ordering::Relaxed);
92        FMATCH_SUM.store(0, Ordering::Relaxed);
93        for b in &CHAIN_DEPTHS {
94            b.store(0, Ordering::Relaxed);
95        }
96        for b in &RECURSION_DEPTHS {
97            b.store(0, Ordering::Relaxed);
98        }
99    }
100
101    /// Dump all counters as a human-readable multi-line string. Includes derived stats
102    /// (avg per pair, % linear vs seg-tree, percentile of chain depth, etc.).
103    #[must_use]
104    pub fn dump() -> String {
105        let mut s = String::with_capacity(2048);
106        let pairs = PAIRS_PROCESSED.load(Ordering::Relaxed).max(1);
107        let li = LONGEST_IN_CALLS.load(Ordering::Relaxed);
108        let mxc = MAX_LE_CALLS.load(Ordering::Relaxed).max(1);
109        let mxfp = MAX_LE_FAST_PATH.load(Ordering::Relaxed);
110        let mxln = MAX_LE_LINEAR.load(Ordering::Relaxed);
111        let mxlnsum = MAX_LE_LINEAR_LEN_SUM.load(Ordering::Relaxed);
112        let mxsg = MAX_LE_SEGTREE.load(Ordering::Relaxed);
113        let mic = MIN_IN_CALLS.load(Ordering::Relaxed);
114        let micfp = MIN_IN_FAST_PATH.load(Ordering::Relaxed);
115        let micln = MIN_IN_LINEAR.load(Ordering::Relaxed);
116        let miclnsum = MIN_IN_LINEAR_LEN_SUM.load(Ordering::Relaxed);
117        let micsg = MIN_IN_SEGTREE.load(Ordering::Relaxed);
118        let fz = FMATCH_ZERO.load(Ordering::Relaxed);
119        let fnz = FMATCH_NONZERO.load(Ordering::Relaxed);
120        let fsum = FMATCH_SUM.load(Ordering::Relaxed);
121        let ftotal = (fz + fnz).max(1);
122
123        s.push_str(&format!("=== gestalt::instrument dump ===\n"));
124        s.push_str(&format!(
125            "pairs processed:  {}\n",
126            PAIRS_PROCESSED.load(Ordering::Relaxed),
127        ));
128        s.push_str(&format!(
129            "longest_in calls: {} ({:.1} per pair)\n",
130            li,
131            li as f64 / pairs as f64,
132        ));
133        s.push_str(&format!(
134            "max_le calls:     {} ({:.1} per pair, {:.1} per longest_in)\n",
135            mxc,
136            mxc as f64 / pairs as f64,
137            mxc as f64 / li.max(1) as f64,
138        ));
139        s.push_str(&format!(
140            "  fast-path:      {} ({:.1}%)\n",
141            mxfp,
142            mxfp as f64 / mxc as f64 * 100.0,
143        ));
144        s.push_str(&format!(
145            "  linear scan:    {} ({:.1}%)   avg scan len: {:.1}\n",
146            mxln,
147            mxln as f64 / mxc as f64 * 100.0,
148            mxlnsum as f64 / mxln.max(1) as f64,
149        ));
150        s.push_str(&format!(
151            "  seg-tree:       {} ({:.1}%)\n",
152            mxsg,
153            mxsg as f64 / mxc as f64 * 100.0,
154        ));
155        s.push_str(&format!(
156            "min_in calls:     {} ({:.1} per pair)\n",
157            mic,
158            mic as f64 / pairs as f64,
159        ));
160        s.push_str(&format!(
161            "  fast-path:      {} ({:.1}%)\n",
162            micfp,
163            micfp as f64 / mic.max(1) as f64 * 100.0,
164        ));
165        s.push_str(&format!(
166            "  linear scan:    {} ({:.1}%)   avg scan len: {:.1}\n",
167            micln,
168            micln as f64 / mic.max(1) as f64 * 100.0,
169            miclnsum as f64 / micln.max(1) as f64,
170        ));
171        s.push_str(&format!(
172            "  seg-tree:       {} ({:.1}%)\n",
173            micsg,
174            micsg as f64 / mic.max(1) as f64 * 100.0,
175        ));
176        s.push_str(&format!(
177            "fmatch values:    {:.1}% zero  ({} z / {} nz)  avg-nz {:.2}\n",
178            fz as f64 / ftotal as f64 * 100.0,
179            fz,
180            fnz,
181            fsum as f64 / fnz.max(1) as f64,
182        ));
183
184        let mut chain_total = 0u64;
185        let chain: Vec<u64> = CHAIN_DEPTHS.iter().map(|a| a.load(Ordering::Relaxed)).collect();
186        for &v in &chain {
187            chain_total += v;
188        }
189        s.push_str(&format!(
190            "chain walk depths ({} total walks):\n",
191            chain_total,
192        ));
193        let pct = |target: f64| -> usize {
194            let mut acc = 0u64;
195            for (i, &v) in chain.iter().enumerate() {
196                acc += v;
197                if acc as f64 >= target * chain_total as f64 {
198                    return i;
199                }
200            }
201            chain.len() - 1
202        };
203        s.push_str(&format!(
204            "  p50={} p90={} p95={} p99={} max-bucket={}\n",
205            pct(0.50),
206            pct(0.90),
207            pct(0.95),
208            pct(0.99),
209            chain.iter().rposition(|&v| v > 0).unwrap_or(0),
210        ));
211        // Compact histogram preview (first 16 buckets)
212        s.push_str("  hist[0..16]: ");
213        for &v in chain.iter().take(16) {
214            s.push_str(&format!("{} ", v));
215        }
216        s.push('\n');
217
218        let mut rec_total = 0u64;
219        let rec: Vec<u64> = RECURSION_DEPTHS.iter().map(|a| a.load(Ordering::Relaxed)).collect();
220        for &v in &rec {
221            rec_total += v;
222        }
223        s.push_str(&format!(
224            "recursion depths ({} stack pushes):\n",
225            rec_total,
226        ));
227        s.push_str("  hist[0..16]: ");
228        for &v in rec.iter().take(16) {
229            s.push_str(&format!("{} ", v));
230        }
231        s.push('\n');
232
233        s
234    }
235}
236
237/// Helper that the instrument hooks call when the feature is enabled; no-op otherwise.
238#[cfg(feature = "instrument")]
239#[inline(always)]
240fn instr_inc(c: &std::sync::atomic::AtomicU64, by: u64) {
241    c.fetch_add(by, std::sync::atomic::Ordering::Relaxed);
242}
243#[cfg(feature = "instrument")]
244#[inline(always)]
245fn instr_hist(buckets: &[std::sync::atomic::AtomicU64; instrument::HIST_BUCKETS], depth: usize) {
246    buckets[depth.min(instrument::HIST_BUCKETS - 1)]
247        .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
248}
249
250
251/// Suffix automaton with CSR (sorted-per-state) transitions — built once, queried by scans.
252///
253/// For the range-restricted recursion (fix b), each state also carries its **endpos** as a
254/// contiguous slice `[dfs_in, dfs_in+dfs_cnt)` of `epos` (the end-positions in b, laid out by
255/// a DFS of the suffix-link tree so a subtree is contiguous). A merge-sort tree over `epos`
256/// answers "is there an end-position in [lo,hi] within this state's subtree, and the min/max
257/// such" — so the whole RO recursion runs on this one prebuilt SAM, with **no sub-builds**.
258#[derive(Clone)]
259pub struct Sam {
260    // Per-state hot fields packed into one cache-line-friendly struct `[len, link, edge_lo, edge_hi]`
261    // so a state visit in the scan/recursion loads len + link + transition-range with a SINGLE cache
262    // miss (these were 5 separate arrays — under multi-thread DRAM-bandwidth pressure, fewer distinct
263    // lines per visit is the dominant win). `link` is u32: the root's link (-1) is stored as 0 and
264    // never read, since both the scan and `longest_in` stop at state 0 before following its link.
265    node: Vec<[u32; 4]>,
266    // Transitions packed as `(char as u64) << 32 | to`, sorted by char within each state's range
267    // [edge_lo, edge_hi). Co-locating char+target means the binary-search key and the taken edge's
268    // target live on the same cache line (was two parallel arrays `csr_char`/`csr_to`).
269    edges: Vec<u64>,
270    firstpos: Vec<u32>,  // per state: *smallest* end-position (build-time; folded into epmeta)
271    lastpos: Vec<u32>,   // per state: *largest* end-position (build-time; folded into epmeta)
272    // Direct lookup for the root's ASCII transitions (`root_next[c] = state`, or -1). The root is
273    // the high-degree state hit after every match reset; this makes its transition O(1) instead
274    // of a binary search. Non-ASCII root chars (rare) fall back to the edge search, so it's general.
275    root_next: Vec<i32>,
276    dfs_in: Vec<u32>,    // per state: start index into the endpos array of its endpos range
277    dfs_cnt: Vec<u32>,   // per state: number of end-positions in its subtree
278    // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style: node `i`'s sorted
279    // range is seg_data[seg_off[i]..seg_off[i+1]] (node 1 = root). A state's endpos is the leaf
280    // range [dfs_in, dfs_in+dfs_cnt); the tree answers range predecessor/successor adaptively in
281    // O(log²·slice) — and `firstpos`/`lastpos` give an O(1) fast path that skips it when the query
282    // bound doesn't split the state's endpos span (the common case for wide windows).
283    seg_data: Vec<u32>,
284    seg_off: Vec<u32>,
285    seg_n: usize,        // number of leaves (power of two) in the merge-sort tree
286    // Precomputed `len(link[s])` per state — used by `longest_in`'s chain walk's `band_min` check.
287    // Without this, every chain step does TWO node loads: `node[cur]` for (len, link, ...), then
288    // `node[node[cur].link]` for its `len`. Storing the link's length in a parallel array lets the
289    // hot loop fold both into one cache line. Saves ~1 memory access per chain step on M3 — chain
290    // walks are pointer-chase-bound, so this is a measurable win on `longest_in` (the 70%-of-CPU
291    // function on prepared ratio_many).
292    link_len: Vec<u32>,
293    // DFS-ordered end-positions (each state's endpos = contiguous slice [dfs_in, dfs_in+dfs_cnt)).
294    // For small endpos sets a cache-friendly linear scan of this beats the scattered tree descent.
295    epos: Vec<u32>,
296    // Query-hot per-state fields packed into one cache line `[firstpos, lastpos, dfs_in, dfs_cnt]`
297    // so `max_le`/`min_in` load all four with a single cache miss (they were 4 separate arrays).
298    epmeta: Vec<[u32; 4]>,
299    // Optimization B (Phase 4): chain-walk hot slot. Layout:
300    //   [0]   = len (same as node[s][0])
301    //   [1]   = link (same as node[s][1])
302    //   [2]   = link_len[s]
303    //   [3]   = padding
304    //   [4..8] = epmeta[s] = [firstpos, lastpos, dfs_in, dfs_cnt]
305    //
306    // 32 bytes / state — one cache-line-aligned load supplies everything `longest_in`'s chain
307    // walk + `max_le`/`min_in`'s fast path need for one state. Replaces three separate scattered
308    // accesses (`node[cur]`, `link_len[cur]`, `epmeta[cur]`) on the chain-walk hot path; PMU said
309    // L1D miss rate was 0.51 % and accounted for 18 % of cycles, distributed across these three
310    // per-state arrays. Co-locating folds 3 cache-line misses per chain step into 1.
311    //
312    // Memory cost: +32 bytes / state ≈ +1.6 MB on a 50 k-state SAM. Total `Sam` size grows ~20 %.
313    // Bench `bench_new delta-sweep` validates a wall-time reduction on the exact path.
314    chain_slot: Vec<[u32; 8]>,
315}
316
317/// Below this endpos-set size, `max_le`/`min_in` linear-scan the contiguous DFS-ordered endpos
318/// (cache-friendly) instead of the merge-sort tree (scattered) — most queried sets are this small.
319const LINEAR_MAX: usize = 256;
320
321impl Sam {
322    fn empty() -> Self {
323        Sam {
324            node: Vec::new(),
325            edges: Vec::new(),
326            firstpos: Vec::new(),
327            lastpos: Vec::new(),
328            root_next: Vec::new(),
329            dfs_in: Vec::new(),
330            dfs_cnt: Vec::new(),
331            seg_data: Vec::new(),
332            seg_off: Vec::new(),
333            seg_n: 0,
334            epos: Vec::new(),
335            epmeta: Vec::new(),
336            link_len: Vec::new(),
337            chain_slot: Vec::new(),
338        }
339    }
340
341    /// Node `i`'s sorted endpos range in the flattened merge-sort tree.
342    fn seg_node(&self, i: usize) -> &[u32] {
343        &self.seg_data[self.seg_off[i] as usize..self.seg_off[i + 1] as usize]
344    }
345
346    /// Read-only view of the packed `[len, link, edge_lo, edge_hi]` per state — needed by the
347    /// GPU port (`gpu::matching_stats_gpu`) to serialize the SAM into a Metal buffer. The kernel
348    /// reads this slice via index calculations, so we expose it raw (one `[u32; 4]` per state).
349    #[must_use]
350    pub fn nodes(&self) -> &[[u32; 4]] {
351        &self.node
352    }
353
354    /// Read-only view of the packed edge slice: `(char << 32) | target_state`, sorted by char
355    /// within each state's `[edge_lo, edge_hi)` range. The GPU kernel does binary search over
356    /// this slice exactly as `csr_lookup` does on the CPU.
357    #[must_use]
358    pub fn edges_packed(&self) -> &[u64] {
359        &self.edges
360    }
361
362    /// Read-only view of the root's direct ASCII transition table (`root_next[c] = state`, or
363    /// `-1` for missing). 128 entries per SAM. The GPU kernel uses this to skip the binary
364    /// search at the root state, exactly as the CPU does.
365    #[must_use]
366    pub fn root_next_table(&self) -> &[i32] {
367        &self.root_next
368    }
369
370    /// `state`'s endpos slice as the merge-sort-tree leaf range [l, r), from packed `epmeta`.
371    fn endpos_range_m(m: &[u32; 4], seg_n: usize) -> (usize, usize) {
372        let s = m[2] as usize; // dfs_in
373        (s + seg_n, s + m[3] as usize + seg_n) // dfs_cnt
374    }
375
376    /// Largest end-position `<= x` among `state`'s endpos, with pre-loaded metadata. Used by
377    /// `longest_in`'s chain walk where `m` was already pulled from `chain_slot[cur]` (Optimization
378    /// B) — saves an `epmeta[state]` load that would hit a separate cache line.
379    fn max_le_with_meta(&self, m: &[u32; 4], x: u32) -> Option<u32> {
380        #[cfg(feature = "instrument")]
381        instr_inc(&instrument::MAX_LE_CALLS, 1);
382        // O(1) fast path: x doesn't split the state's [firstpos, lastpos] span.
383        if m[1] <= x {
384            #[cfg(feature = "instrument")]
385            instr_inc(&instrument::MAX_LE_FAST_PATH, 1);
386            return Some(m[1]);
387        }
388        if m[0] > x {
389            #[cfg(feature = "instrument")]
390            instr_inc(&instrument::MAX_LE_FAST_PATH, 1);
391            return None;
392        }
393        let cnt = m[3] as usize;
394        if cnt <= LINEAR_MAX {
395            #[cfg(feature = "instrument")]
396            {
397                instr_inc(&instrument::MAX_LE_LINEAR, 1);
398                instr_inc(&instrument::MAX_LE_LINEAR_LEN_SUM, cnt as u64);
399            }
400            let lo = m[2] as usize;
401            let mut best = 0u32;
402            // SAFETY: `m` comes from a valid SAM state's epmeta or chain_slot, so the slice
403            // [m[2], m[2]+m[3]) is within `self.epos` (built that way at SAM construction).
404            #[allow(clippy::undocumented_unsafe_blocks)]
405            for &v in unsafe { self.epos.get_unchecked(lo..lo + cnt) } {
406                best = best.max(if v <= x { v } else { 0 });
407            }
408            return Some(best);
409        }
410        #[cfg(feature = "instrument")]
411        instr_inc(&instrument::MAX_LE_SEGTREE, 1);
412        let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
413        let mut best: Option<u32> = None;
414        while l < r {
415            if l & 1 == 1 {
416                best = best.max(node_max_le(self.seg_node(l), x));
417                l += 1;
418            }
419            if r & 1 == 1 {
420                r -= 1;
421                best = best.max(node_max_le(self.seg_node(r), x));
422            }
423            l >>= 1;
424            r >>= 1;
425        }
426        best
427    }
428
429    /// Smallest end-position in `[lo, hi]` with pre-loaded metadata. See `max_le_with_meta`.
430    fn min_in_with_meta(&self, m: &[u32; 4], lo: u32, hi: u32) -> Option<u32> {
431        #[cfg(feature = "instrument")]
432        instr_inc(&instrument::MIN_IN_CALLS, 1);
433        if m[0] >= lo {
434            #[cfg(feature = "instrument")]
435            instr_inc(&instrument::MIN_IN_FAST_PATH, 1);
436            return (m[0] <= hi).then_some(m[0]);
437        }
438        if m[1] < lo {
439            #[cfg(feature = "instrument")]
440            instr_inc(&instrument::MIN_IN_FAST_PATH, 1);
441            return None;
442        }
443        let cnt = m[3] as usize;
444        if cnt <= LINEAR_MAX {
445            #[cfg(feature = "instrument")]
446            {
447                instr_inc(&instrument::MIN_IN_LINEAR, 1);
448                instr_inc(&instrument::MIN_IN_LINEAR_LEN_SUM, cnt as u64);
449            }
450            let off = m[2] as usize;
451            let mut best = u32::MAX;
452            // SAFETY: `m` is from a valid SAM state; endpos slice within bounds (same as max_le).
453            #[allow(clippy::undocumented_unsafe_blocks)]
454            for &v in unsafe { self.epos.get_unchecked(off..off + cnt) } {
455                best = best.min(if v >= lo && v <= hi { v } else { u32::MAX });
456            }
457            return (best != u32::MAX).then_some(best);
458        }
459        #[cfg(feature = "instrument")]
460        instr_inc(&instrument::MIN_IN_SEGTREE, 1);
461        let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
462        let mut best: Option<u32> = None;
463        while l < r {
464            if l & 1 == 1 {
465                best = merge_min(best, node_min_in(self.seg_node(l), lo, hi));
466                l += 1;
467            }
468            if r & 1 == 1 {
469                r -= 1;
470                best = merge_min(best, node_min_in(self.seg_node(r), lo, hi));
471            }
472            l >>= 1;
473            r >>= 1;
474        }
475        best
476    }
477
478}
479
480/// Largest value `<= x` in a sorted slice, or None.
481fn node_max_le(sorted: &[u32], x: u32) -> Option<u32> {
482    let k = sorted.partition_point(|&v| v <= x);
483    (k > 0).then(|| sorted[k - 1])
484}
485
486/// Smallest value in `[lo, hi]` in a sorted slice, or None.
487fn node_min_in(sorted: &[u32], lo: u32, hi: u32) -> Option<u32> {
488    let k = sorted.partition_point(|&v| v < lo);
489    sorted.get(k).copied().filter(|&v| v <= hi)
490}
491
492fn merge_min(a: Option<u32>, b: Option<u32>) -> Option<u32> {
493    match (a, b) {
494        (Some(x), Some(y)) => Some(x.min(y)),
495        (x, None) => x,
496        (None, y) => y,
497    }
498}
499
500/// Transient builder (Ukkonen online SAM with a linked-list transition arena).
501struct Builder {
502    edge_char: Vec<char>,
503    edge_to: Vec<u32>,
504    edge_next: Vec<i32>,
505    head: Vec<i32>,
506    link: Vec<i32>,
507    len: Vec<u32>,
508    firstpos: Vec<u32>,
509    primary: Vec<bool>, // true for the per-position state (its firstpos is a real end-position)
510    last: u32,
511}
512
513impl Builder {
514    fn empty() -> Self {
515        Builder {
516            edge_char: Vec::new(),
517            edge_to: Vec::new(),
518            edge_next: Vec::new(),
519            head: Vec::new(),
520            link: Vec::new(),
521            len: Vec::new(),
522            firstpos: Vec::new(),
523            primary: Vec::new(),
524            last: 0,
525        }
526    }
527
528    fn new(cap: usize) -> Self {
529        let mut b = Builder::empty();
530        b.reset(cap);
531        b
532    }
533
534    /// Clear the arenas (keeping capacity) and re-seed the root — for buffer reuse.
535    fn reset(&mut self, cap: usize) {
536        self.edge_char.clear();
537        self.edge_to.clear();
538        self.edge_next.clear();
539        self.head.clear();
540        self.link.clear();
541        self.len.clear();
542        self.firstpos.clear();
543        self.primary.clear();
544        self.edge_char.reserve(3 * cap);
545        self.edge_to.reserve(3 * cap);
546        self.edge_next.reserve(3 * cap);
547        self.primary.push(false); // root state 0 is not a per-position state
548        self.head.push(-1);
549        self.link.push(-1);
550        self.len.push(0);
551        self.firstpos.push(0);
552        self.last = 0;
553    }
554
555    #[allow(clippy::cast_sign_loss)]
556    fn find(&self, state: u32, c: char) -> Option<u32> {
557        let mut e = self.head[state as usize];
558        while e != -1 {
559            let idx = e as usize;
560            if self.edge_char[idx] == c {
561                return Some(self.edge_to[idx]);
562            }
563            e = self.edge_next[idx];
564        }
565        None
566    }
567
568    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
569    fn add_edge(&mut self, state: u32, c: char, to: u32) {
570        self.edge_char.push(c);
571        self.edge_to.push(to);
572        self.edge_next.push(self.head[state as usize]);
573        self.head[state as usize] = (self.edge_char.len() - 1) as i32;
574    }
575
576    #[allow(clippy::cast_sign_loss)]
577    fn set_edge(&mut self, state: u32, c: char, to: u32) {
578        let mut e = self.head[state as usize];
579        while e != -1 {
580            let idx = e as usize;
581            if self.edge_char[idx] == c {
582                self.edge_to[idx] = to;
583                return;
584            }
585            e = self.edge_next[idx];
586        }
587        self.add_edge(state, c, to);
588    }
589
590    #[allow(clippy::cast_possible_truncation)]
591    fn new_state(&mut self, len: u32, link: i32, firstpos: u32, primary: bool) -> u32 {
592        self.head.push(-1);
593        self.len.push(len);
594        self.link.push(link);
595        self.firstpos.push(firstpos);
596        self.primary.push(primary);
597        (self.head.len() - 1) as u32
598    }
599
600    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_sign_loss)]
601    fn extend(&mut self, c: char, pos: usize) {
602        let cur = self.new_state(self.len[self.last as usize] + 1, -1, pos as u32, true);
603        let mut p = self.last as i32;
604        while p != -1 && self.find(p as u32, c).is_none() {
605            self.add_edge(p as u32, c, cur);
606            p = self.link[p as usize];
607        }
608        if p == -1 {
609            self.link[cur as usize] = 0;
610        } else {
611            let q = self.find(p as u32, c).unwrap();
612            if self.len[p as usize] + 1 == self.len[q as usize] {
613                self.link[cur as usize] = q as i32;
614            } else {
615                let clone = self.new_state(self.len[p as usize] + 1, self.link[q as usize], self.firstpos[q as usize], false);
616                let mut e = self.head[q as usize];
617                while e != -1 {
618                    let idx = e as usize;
619                    self.add_edge(clone, self.edge_char[idx], self.edge_to[idx]);
620                    e = self.edge_next[idx];
621                }
622                while p != -1 && self.find(p as u32, c) == Some(q) {
623                    self.set_edge(p as u32, c, clone);
624                    p = self.link[p as usize];
625                }
626                self.link[q as usize] = clone as i32;
627                self.link[cur as usize] = clone as i32;
628            }
629        }
630        self.last = cur;
631    }
632
633    /// Convert the linked-list transitions into the packed `node`/`edges` layout (per-state edges
634    /// sorted by char, co-located char+target), reusing `out`'s allocations (no per-build malloc churn).
635    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::needless_range_loop)]
636    fn finalize_into(&self, out: &mut Sam) {
637        let nstates = self.head.len();
638        let nedges = self.edge_char.len();
639        out.firstpos.clear();
640        out.firstpos.extend_from_slice(&self.firstpos);
641        // per-state edge counts → exclusive prefix-sum offsets (local; folded into `node` below)
642        let mut off = vec![0u32; nstates + 1];
643        for state in 0..nstates {
644            let mut e = self.head[state];
645            while e != -1 {
646                off[state + 1] += 1;
647                e = self.edge_next[e as usize];
648            }
649        }
650        for state in 0..nstates {
651            off[state + 1] += off[state];
652        }
653        // edges: (char << 32 | to), sorted by char within each state's [off[s], off[s+1]) range.
654        // sorting the packed u64 sorts by char (high bits) since a state's chars are distinct.
655        out.edges.clear();
656        out.edges.resize(nedges, 0);
657        let mut scratch: Vec<u64> = Vec::new(); // reused across states
658        for state in 0..nstates {
659            scratch.clear();
660            let mut e = self.head[state];
661            while e != -1 {
662                let idx = e as usize;
663                scratch.push((u64::from(self.edge_char[idx] as u32) << 32) | u64::from(self.edge_to[idx]));
664                e = self.edge_next[idx];
665            }
666            scratch.sort_unstable();
667            let base = off[state] as usize;
668            for (k, &packed) in scratch.iter().enumerate() {
669                out.edges[base + k] = packed;
670            }
671        }
672        // per-state packed node [len, link(clamped: root -1 → 0, never read), edge_lo, edge_hi]
673        out.node.clear();
674        out.node.reserve(nstates);
675        out.link_len.clear();
676        out.link_len.reserve(nstates);
677        for s in 0..nstates {
678            let link_idx_signed = self.link[s];
679            let link = if link_idx_signed < 0 { 0 } else { link_idx_signed as u32 };
680            out.node.push([self.len[s], link, off[s], off[s + 1]]);
681            // Precompute len(link[s]) — saves the second node-load in `longest_in`'s chain walk.
682            // Root (link = -1) is never read here because the chain walk breaks before entering it.
683            let llen = if link_idx_signed < 0 { 0 } else { self.len[link_idx_signed as usize] };
684            out.link_len.push(llen);
685        }
686        // root direct transition table (ASCII): fill from the root's edges
687        out.root_next.clear();
688        out.root_next.resize(ROOT_TBL, -1);
689        for k in off[0] as usize..off[1] as usize {
690            let e = out.edges[k];
691            let ci = (e >> 32) as usize;
692            if ci < ROOT_TBL {
693                out.root_next[ci] = i32::try_from((e & 0xFFFF_FFFF) as u32).expect("state fits i32");
694            }
695        }
696        self.build_endpos(out, nstates);
697    }
698
699    /// Build the endpos range structure (fix b): suffix-link children → DFS-order end-positions
700    /// (each state's endpos = a contiguous array range) → wavelet matrix over those positions.
701    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
702    fn build_endpos(&self, out: &mut Sam, nstates: usize) {
703        // suffix-link children, CSR
704        let mut child_head = vec![0u32; nstates + 1];
705        for s in 1..nstates {
706            child_head[self.link[s] as usize + 1] += 1;
707        }
708        for s in 0..nstates {
709            child_head[s + 1] += child_head[s];
710        }
711        let mut child_arr = vec![0u32; nstates.saturating_sub(1)];
712        let mut fill = child_head.clone();
713        for s in 1..nstates {
714            let p = self.link[s] as usize;
715            child_arr[fill[p] as usize] = s as u32;
716            fill[p] += 1;
717        }
718        // iterative DFS (enter/exit) from root → dfs_in / dfs_cnt + DFS-ordered end-positions;
719        // on exit (post-order, children done) accumulate lastpos = max end-position in subtree.
720        out.dfs_in.clear();
721        out.dfs_in.resize(nstates, 0);
722        out.dfs_cnt.clear();
723        out.dfs_cnt.resize(nstates, 0);
724        out.lastpos.clear();
725        out.lastpos.resize(nstates, 0);
726        out.epos.clear();
727        let mut stack: Vec<(u32, bool)> = vec![(0, false)];
728        while let Some((s, exit)) = stack.pop() {
729            let su = s as usize;
730            if exit {
731                out.dfs_cnt[su] = out.epos.len() as u32 - out.dfs_in[su];
732                let mut lp = if self.primary[su] { self.firstpos[su] } else { 0 };
733                for k in child_head[su]..child_head[su + 1] {
734                    lp = lp.max(out.lastpos[child_arr[k as usize] as usize]);
735                }
736                out.lastpos[su] = lp;
737                continue;
738            }
739            out.dfs_in[su] = out.epos.len() as u32;
740            if self.primary[su] {
741                out.epos.push(self.firstpos[su]);
742            }
743            stack.push((s, true));
744            for k in child_head[su]..child_head[su + 1] {
745                stack.push((child_arr[k as usize], false));
746            }
747        }
748        // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style (build is a
749        // negligible fraction of runtime; queries hit the contiguous `seg_data` layout).
750        let n = out.epos.len();
751        let mut seg_n = 1usize;
752        while seg_n < n.max(1) {
753            seg_n <<= 1;
754        }
755        out.seg_n = seg_n;
756        let mut tree: Vec<Vec<u32>> = vec![Vec::new(); 2 * seg_n];
757        for i in 0..n {
758            tree[seg_n + i] = vec![out.epos[i]];
759        }
760        for i in (1..seg_n).rev() {
761            let (lhs, rhs) = (&tree[2 * i], &tree[2 * i + 1]);
762            let mut acc = Vec::with_capacity(lhs.len() + rhs.len());
763            let (mut x, mut y) = (0usize, 0usize);
764            while x < lhs.len() && y < rhs.len() {
765                if lhs[x] <= rhs[y] {
766                    acc.push(lhs[x]);
767                    x += 1;
768                } else {
769                    acc.push(rhs[y]);
770                    y += 1;
771                }
772            }
773            acc.extend_from_slice(&lhs[x..]);
774            acc.extend_from_slice(&rhs[y..]);
775            tree[i] = acc;
776        }
777        out.seg_off.clear();
778        out.seg_off.reserve(2 * seg_n + 1);
779        out.seg_data.clear();
780        let mut off = 0u32;
781        out.seg_off.push(0);
782        for node in &tree {
783            out.seg_data.extend_from_slice(node);
784            off += u32::try_from(node.len()).expect("epos fits u32");
785            out.seg_off.push(off);
786        }
787        // pack the query-hot per-state fields into one cache-line-friendly array
788        out.epmeta.clear();
789        out.epmeta.reserve(nstates);
790        for s in 0..nstates {
791            out.epmeta.push([out.firstpos[s], out.lastpos[s], out.dfs_in[s], out.dfs_cnt[s]]);
792        }
793        // Optimization B: interleave (node.len, node.link, link_len, epmeta) into one 32-byte slot
794        // per state so the chain walk's per-state data lands on a single cache line. Built lazily
795        // after epmeta + link_len; safe to do here because all the source arrays are finalized.
796        out.chain_slot.clear();
797        out.chain_slot.reserve(nstates);
798        for s in 0..nstates {
799            let nd = out.node[s];
800            let m = out.epmeta[s];
801            out.chain_slot.push([
802                nd[0],            // len
803                nd[1],            // link
804                out.link_len[s],  // band_min source
805                0,                // padding for 32-byte alignment
806                m[0],             // firstpos
807                m[1],             // lastpos
808                m[2],             // dfs_in
809                m[3],             // dfs_cnt
810            ]);
811        }
812    }
813
814    fn finalize(self) -> Sam {
815        let mut out = Sam::empty();
816        self.finalize_into(&mut out);
817        out
818    }
819}
820
821/// Build (and finalize) the suffix automaton of `b` — prebuild once, reuse across pairs.
822#[must_use]
823pub fn build_sam(b: &[char]) -> Sam {
824    let mut bld = Builder::new(b.len());
825    for (i, &c) in b.iter().enumerate() {
826        bld.extend(c, i);
827    }
828    bld.finalize()
829}
830
831/// Longest substring of `a[al..ar]` that occurs in `b[bl..br)`, using the **precomputed**
832/// window-independent match (`fstate`/`fmatch` = the SAM state and match length ending at each
833/// a-position, from one full-`a` scan). For each position the chain walk starts at the precomputed
834/// state, capping the usable length to the a-fragment (`i-al+1`) and the b-window via endpos
835/// queries. Returns (`a_start`, absolute `b_start`, len) with difflib's tie-break. No re-scanning.
836///
837/// `chain_cap` (Phase 3 approximation knob) bounds the suffix-link chain walk to `chain_cap`
838/// states ascended; deeper chains return whatever match was already found. With the empirically
839/// measured distribution (p99 depth 7, p95 depth 5; see `gestalt::instrument`'s histograms), a
840/// cap of 7 keeps ~99% of chains intact, capping at 5 keeps ~95%, etc. Setting `chain_cap =
841/// u32::MAX` recovers the exact behaviour. The caller (`gestalt_edge_with_ms` and friends)
842/// derives `chain_cap` from the user-supplied `delta` parameter via `delta_to_chain_cap`.
843#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments)]
844fn longest_in(
845    sam: &Sam,
846    fstate: &[u32],
847    fmatch: &[u32],
848    al: usize,
849    ar: usize,
850    bl: usize,
851    br: usize,
852    chain_cap: u32,
853) -> (usize, usize, usize) {
854    #[cfg(feature = "instrument")]
855    instr_inc(&instrument::LONGEST_IN_CALLS, 1);
856    let blo = bl as u32;
857    let hi = br as u32 - 1; // caller guarantees bl < br, so br >= 1
858    let (mut best_len, mut best_a, mut best_b) = (0usize, 0usize, 0usize);
859    // SAFETY: i ∈ [al, ar) ⊆ [0, n) = fstate.len() = fmatch.len(); `cur` is always a valid SAM
860    // state (fstate entry or a suffix link), so chain_slot[cur] is in bounds (= nstates entries).
861    #[allow(clippy::undocumented_unsafe_blocks)]
862    unsafe {
863        for i in al..ar {
864            let cap = i - al + 1; // a-match ending at i can't start before `al`
865            let eff = (*fmatch.get_unchecked(i) as usize).min(cap);
866            if eff <= best_len {
867                continue; // can't beat the best — skip (dominant pruning)
868            }
869            // walk the suffix-link chain from the precomputed state up; `curlen` is the usable length
870            // at the current state (capped to the a-fragment), shrinking as we ascend.
871            let mut cur = *fstate.get_unchecked(i);
872            // Optimization K1: `cur != 0` is an invariant here. `cur == 0` (the SAM root) is only
873            // stored in `fstate[i]` when `fmatch[i] == 0`, and in that case `eff == 0 <= best_len`
874            // already pruned this iteration via the `continue` above. From the second chain step
875            // on, `cur` came from `link != 0` (we check before assigning). Hoisting the
876            // entry-time `if cur == 0 { break }` out saves ≈4 k arm64 instructions per pair on
877            // the bench corpus (-0.9 % retired) — no cycle change on this single-thread profile
878            // because the chain walk is latency-bound by the `node[cur] → link → node[link]`
879            // pointer chase, but it frees front-end issue bandwidth for the threaded path.
880            let mut chain_depth: u32 = 0;
881            loop {
882                chain_depth += 1;
883                // Phase 3 approximate-RO cap: stop walking the chain past `chain_cap` ascended
884                // states. With cap=u32::MAX (default delta=0) this never fires; smaller caps
885                // trade tail accuracy for fewer pointer chases. Measurement on canonical Python:
886                // p95 chain depth = 5, p99 = 7 — capping at 5/7 truncates <5%/<1% of chains.
887                if chain_depth > chain_cap {
888                    break;
889                }
890                // OPTIMIZATION B: single 32-byte load of `chain_slot[cur]` brings the per-state
891                // hot fields into registers in ONE cache-line touch:
892                //   [0] = len   [1] = link   [2] = link_len   [3] = padding
893                //   [4..8] = epmeta (firstpos, lastpos, dfs_in, dfs_cnt) for max_le/min_in
894                // Replaces the three scattered loads (node[cur], link_len[cur], epmeta[cur]) that
895                // the chain walk used to do, each on its own cache line — PMU attribution showed
896                // L1D misses at 18 % of cycle budget split between those three arrays.
897                let cs = *sam.chain_slot.get_unchecked(cur as usize);
898                let curlen = eff.min(cs[0] as usize);
899                if curlen <= best_len {
900                    break;
901                }
902                let band_min = cs[2] as usize + 1;
903                if curlen >= band_min {
904                    // Reuse the epmeta bytes already in `cs` — no extra load needed.
905                    let m = [cs[4], cs[5], cs[6], cs[7]];
906                    if let Some(pmax) = sam.max_le_with_meta(&m, hi) {
907                        if pmax >= blo {
908                            let l_window = (pmax - blo) as usize + 1; // window-cap on match len
909                            let l = curlen.min(l_window); // = min(chain-cap, window-cap)
910                            if l >= band_min {
911                                // earliest-b (min_in) only when we beat the best — rare, off the path.
912                                if l > best_len {
913                                    // OPTIMIZATION A: when the window cap binds (l == l_window),
914                                    // lo_q = blo + l - 1 = pmax. Since pmax is THE max v <= hi in
915                                    // this state's endpos, the set ∩ [pmax, hi] is exactly {pmax},
916                                    // so pmin = pmax trivially — skip the linear scan in `min_in`.
917                                    // Only the chain-cap branch (l < l_window) needs a real scan.
918                                    let pmin = if l == l_window {
919                                        Some(pmax)
920                                    } else {
921                                        sam.min_in_with_meta(&m, blo + l as u32 - 1, hi)
922                                    };
923                                    if let Some(pmin) = pmin {
924                                        best_len = l;
925                                        best_a = i + 1 - l;
926                                        best_b = pmin as usize + 1 - l;
927                                    }
928                                }
929                                break; // deepest qualifying state ⇒ longest in-window match here
930                            }
931                        }
932                    }
933                }
934                let link = cs[1];
935                if link == 0 {
936                    break; // reached the root (state 0) — no shorter qualifying state above
937                }
938                cur = link;
939            }
940            #[cfg(feature = "instrument")]
941            instr_hist(&instrument::CHAIN_DEPTHS, chain_depth as usize);
942        }
943    }
944    (best_a, best_b, best_len)
945}
946
947/// Map a user-facing `delta` (max acceptable RO ratio loss, in absolute units) to a chain-walk
948/// depth cap for `longest_in`. Empirically calibrated against `gestalt::instrument`'s chain
949/// depth histogram on the mypy/sympy corpora:
950///
951/// | delta | cap | covers |
952/// |---:|---:|---:|
953/// | 0.00 | `u32::MAX` | exact (default) |
954/// | 0.01 | 12 | p99.5+ |
955/// | 0.05 | 7  | p99   |
956/// | 0.10 | 5  | p95   |
957/// | 0.20 | 3  | p85   |
958/// | 0.50 | 2  | p70   |
959/// | 1.00 | 1  | only fstate, no walk |
960///
961/// Formula: `cap = ceil(1 / sqrt(delta))` clamped to ≥1 for delta > 0. Pure heuristic — the
962/// property test in `tests/approx_ro.rs` verifies the actual loss stays below `delta` on a
963/// representative corpus.
964#[must_use]
965#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
966pub fn delta_to_chain_cap(delta: f64) -> u32 {
967    if delta <= 0.0 {
968        return u32::MAX;
969    }
970    if delta >= 1.0 {
971        return 1;
972    }
973    let cap = (1.0_f64 / delta.sqrt()).ceil() as u32;
974    cap.max(1)
975}
976
977/// Window-independent matching statistics of `a` vs `sam_b`, filled into reused buffers: for each
978/// i, `(state, matched)` where `matched` = longest suffix of `a[..=i]` occurring anywhere in b.
979/// One O(|a|) scan, reused by every recursion node (no per-node re-scan). Reusing the caller's
980/// buffers avoids a per-pair allocation (was ~10% of the all-pairs join).
981#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
982fn matching_stats_into(a: &[char], sam_b: &Sam, fstate: &mut Vec<u32>, fmatch: &mut Vec<u32>) {
983    let n = a.len();
984    // Size the buffers WITHOUT zero-filling: the scan below writes every index [0, n) before any read,
985    // so the resize(n, 0) zero-pass was pure waste (it was ~half the scan's store traffic).
986    // SAFETY: u32 has no invalid bit patterns, and fstate[i]/fmatch[i] are written for every i in 0..n
987    // (the loop below) strictly before longest_in ever reads them.
988    fstate.clear();
989    fstate.reserve(n);
990    fmatch.clear();
991    fmatch.reserve(n);
992    #[allow(clippy::undocumented_unsafe_blocks)]
993    unsafe {
994        fstate.set_len(n);
995        fmatch.set_len(n);
996    }
997    // Hoist the SAM arrays into locals so the compiler keeps base pointers in registers and
998    // doesn't reload them through `sam_b` each iteration; transition is hand-inlined.
999    let node = sam_b.node.as_slice();
1000    let edges = sam_b.edges.as_slice();
1001    let root = sam_b.root_next.as_slice();
1002    let mut state = 0u32;
1003    let mut matched = 0u32;
1004    // SAFETY: `state` is always a valid SAM state index (< nstates = node.len()): it starts at the
1005    // root (0) and only ever becomes a transition target (an edge's low bits, a valid state) or a
1006    // suffix link (node[..][1], a valid state). So node[state] and node[link] are in bounds; the edge
1007    // range [edge_lo, edge_hi) ⊆ [0, edges.len()); `ci < ROOT_TBL == root.len()`; `i < n == fstate.len()`.
1008    #[allow(clippy::undocumented_unsafe_blocks)]
1009    unsafe {
1010        for i in 0..n {
1011            let c = *a.get_unchecked(i);
1012            loop {
1013                // inline `transition(state, c)` → next state, or -1. The non-root branch loads
1014                // node[state] ONCE for both the edge range and (on miss) the suffix link — one cache line.
1015                let nx: i64 = if state == 0 {
1016                    let ci = c as usize;
1017                    if ci < ROOT_TBL {
1018                        i64::from(*root.get_unchecked(ci))
1019                    } else {
1020                        let nd = node.get_unchecked(0);
1021                        csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
1022                    }
1023                } else {
1024                    let nd = node.get_unchecked(state as usize);
1025                    csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
1026                };
1027                if nx >= 0 {
1028                    state = nx as u32;
1029                    matched += 1;
1030                    break;
1031                }
1032                if state == 0 {
1033                    matched = 0;
1034                    break;
1035                }
1036                let link = node.get_unchecked(state as usize)[1]; // same line as the edge-range load
1037                state = link;
1038                matched = node.get_unchecked(link as usize)[0]; // len(link)
1039            }
1040            *fstate.get_unchecked_mut(i) = state;
1041            *fmatch.get_unchecked_mut(i) = matched;
1042        }
1043    }
1044}
1045
1046/// Binary search the packed edge slice `edges[lo..hi]` (sorted by char in the high 32 bits) for `c`;
1047/// returns the target state (low 32 bits) as i64, or -1. Inlined into the scan.
1048#[inline]
1049#[allow(clippy::cast_possible_truncation)]
1050fn csr_lookup(edges: &[u64], mut lo: usize, hi: usize, c: char) -> i64 {
1051    let mut hi = hi;
1052    let key = c as u32;
1053    while lo < hi {
1054        let mid = lo + (hi - lo) / 2;
1055        // SAFETY: callers pass lo,hi from a state's edge range ⊆ [0, edges.len()]; `mid ∈ [lo, hi)`.
1056        let e = unsafe { *edges.get_unchecked(mid) };
1057        let mc = (e >> 32) as u32; // char (code point) in the high 32 bits
1058        if mc == key {
1059            return i64::from((e & 0xFFFF_FFFF) as u32); // target state in the low 32 bits
1060        }
1061        if mc < key {
1062            lo = mid + 1;
1063        } else {
1064            hi = mid;
1065        }
1066    }
1067    -1
1068}
1069
1070thread_local! {
1071    /// Reused (fstate, fmatch) buffers for the per-pair matching statistics — no per-pair alloc.
1072    static MS_BUF: std::cell::RefCell<(Vec<u32>, Vec<u32>)> =
1073        const { std::cell::RefCell::new((Vec::new(), Vec::new())) };
1074    /// Reused recursion stack of (al, ar, bl, br) windows — retains capacity across pairs so the
1075    /// RO recursion never heap-allocates or reallocs per pair (was `__rust_alloc` + `grow_one`).
1076    static STACK_BUF: std::cell::RefCell<Vec<(usize, usize, usize, usize)>> =
1077        const { std::cell::RefCell::new(Vec::new()) };
1078    /// Reused DP buffer for `ub_from_fmatch` (currently unused — kept for re-enabling H later).
1079    #[allow(dead_code)]
1080    static UB_BUF: std::cell::RefCell<Vec<u32>> = const { std::cell::RefCell::new(Vec::new()) };
1081}
1082
1083/// Tight upper bound on the RO matched-length M, computed from the per-position `fmatch` array
1084/// via O(na) weighted-interval-scheduling DP. Each `fmatch[i]` records the longest substring of
1085/// `b` ending at `a[i]`; an actual RO decomposition picks non-overlapping such matches, so M is
1086/// bounded above by the max non-overlapping sum we can extract from `fmatch`.
1087///
1088/// `dp[i]` = best sum using only positions `0..=i`. Recurrence:
1089///   * not taking position i: `dp[i-1]`
1090///   * taking the fmatch[i]-length block ending at i (a[i+1-l..=i]): `dp[i-l] + l` where
1091///     `l = fmatch[i].min(i+1)` (clamped to what fits in `a[..=i]`)
1092///
1093/// **CURRENTLY UNUSED** — tried in `gestalt_edge_with_ms` (optimization H) and reverted: on
1094/// the `cluster_canonicals` threshold path the O(na) DP cost (+ thread-local buffer access) was
1095/// larger than the wall savings from skipping recursion, because the existing
1096/// `m + pending < need` check inside the recursion already bails cheaply for non-edges. Kept
1097/// as a tombstone in case a future call shape (e.g., larger threshold + denser matches) makes
1098/// it worth re-trying. To re-enable: insert the call before `STACK_BUF.with_borrow_mut` in
1099/// `gestalt_edge_with_ms` and gate on `need > 0`.
1100#[allow(dead_code)]
1101#[inline]
1102#[allow(clippy::cast_possible_truncation)]
1103fn ub_from_fmatch(fmatch: &[u32], buf: &mut Vec<u32>) -> u32 {
1104    let n = fmatch.len();
1105    if n == 0 {
1106        return 0;
1107    }
1108    buf.clear();
1109    buf.resize(n, 0);
1110    // SAFETY: buf and fmatch are both len = n; indices below stay in [0, n).
1111    #[allow(clippy::undocumented_unsafe_blocks)]
1112    unsafe {
1113        let mut prev: u32 = 0;
1114        for i in 0..n {
1115            let l = (*fmatch.get_unchecked(i)).min(i as u32 + 1);
1116            let with_take = if l == 0 {
1117                0
1118            } else if (l as usize) > i {
1119                l // entire prefix [0..=i] is the block, no `dp[i-l]` term
1120            } else {
1121                buf.get_unchecked(i - l as usize).saturating_add(l)
1122            };
1123            let v = prev.max(with_take);
1124            *buf.get_unchecked_mut(i) = v;
1125            prev = v;
1126        }
1127        prev
1128    }
1129}
1130
1131/// Test-only access to `matching_stats_into` — used by `corpus_sa` tests to verify the SA-based
1132/// fmatch is byte-for-byte identical to the SAM-based fmatch.
1133#[doc(hidden)]
1134pub fn matching_stats_for_test(a: &[char], sam_b: &Sam, fstate: &mut Vec<u32>, fmatch: &mut Vec<u32>) {
1135    matching_stats_into(a, sam_b, fstate, fmatch);
1136}
1137
1138/// Cost probe: run only the matching-statistics scan of `a` vs `sam_b` (the unavoidable per-pair
1139/// floor — RO's first block is an LCS, Θ(|a|)) and return a checksum so it isn't optimized out.
1140/// Measures the pure scan throughput separate from the RO recursion.
1141#[must_use]
1142pub fn matching_stats_cost(a: &[char], sam_b: &Sam) -> u64 {
1143    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1144        matching_stats_into(a, sam_b, fstate, fmatch);
1145        fmatch.iter().map(|&x| u64::from(x)).sum()
1146    })
1147}
1148
1149fn gestalt_m_with(a: &[char], b: &[char], sam_b: &Sam) -> usize {
1150    let n = a.len();
1151    if n == 0 {
1152        return 0;
1153    }
1154    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1155        matching_stats_into(a, sam_b, fstate, fmatch);
1156        gestalt_m_recur(a, b, sam_b, fstate, fmatch)
1157    })
1158}
1159
1160#[allow(clippy::cast_sign_loss, clippy::many_single_char_names)]
1161fn gestalt_m_recur(a: &[char], b: &[char], sam_b: &Sam, fstate: &[u32], fmatch: &[u32]) -> usize {
1162    let n = a.len();
1163    let mut total = 0usize;
1164    STACK_BUF.with_borrow_mut(|stack| {
1165        stack.clear();
1166        stack.push((0, n, 0, b.len()));
1167        while let Some((al, ar, bl, br)) = stack.pop() {
1168            if al >= ar || bl >= br {
1169                continue;
1170            }
1171            let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1172            if l == 0 {
1173                continue;
1174            }
1175            total += l;
1176            stack.push((al, i, bl, j));
1177            stack.push((i + l, ar, j + l, br));
1178        }
1179    });
1180    total
1181}
1182
1183/// Threshold-aware exact decision: does `RO(a,b) ≥ threshold`? Computes the matched total M
1184/// with **two-sided early-exit** — accept the instant `M ≥ need`, reject the instant the upper
1185/// bound `M + Σ min(window lengths) < need`. Exact (the bound never drops a qualifying pair),
1186/// and dissimilar pairs abort long before the full decomposition. `need = ⌈threshold·(|a|+|b|)/2⌉`.
1187#[allow(
1188    clippy::cast_precision_loss,
1189    clippy::cast_sign_loss,
1190    clippy::cast_possible_truncation,
1191    clippy::many_single_char_names
1192)]
1193#[must_use]
1194pub fn gestalt_qualifies(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> bool {
1195    let nb = b.len();
1196    let total = a.len() + nb;
1197    if total == 0 {
1198        return true;
1199    }
1200    let need = (threshold * total as f64 / 2.0).ceil() as usize;
1201    if need == 0 {
1202        return true;
1203    }
1204    let n = a.len();
1205    if n == 0 || nb == 0 {
1206        return false; // M = 0 < need
1207    }
1208    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1209        matching_stats_into(a, sam_b, fstate, fmatch);
1210        gestalt_qualifies_ms(n, nb, sam_b, threshold, fstate, fmatch)
1211    })
1212}
1213
1214/// The threshold early-exit recursion given **precomputed** matching statistics (`fstate`/`fmatch`
1215/// for `a` of length `na` vs `sam_b`). Split out so the scan can be done in an MLP batch separately.
1216#[allow(
1217    clippy::cast_precision_loss,
1218    clippy::cast_sign_loss,
1219    clippy::cast_possible_truncation,
1220    clippy::many_single_char_names
1221)]
1222#[must_use]
1223pub fn gestalt_qualifies_ms(na: usize, nb: usize, sam_b: &Sam, threshold: f64, fstate: &[u32], fmatch: &[u32]) -> bool {
1224    let total = na + nb;
1225    if total == 0 {
1226        return true;
1227    }
1228    let need = (threshold * total as f64 / 2.0).ceil() as usize;
1229    if need == 0 {
1230        return true;
1231    }
1232    if na == 0 || nb == 0 {
1233        return false;
1234    }
1235    STACK_BUF.with_borrow_mut(|stack| {
1236        let mut m = 0usize;
1237        let mut pending = na.min(nb);
1238        stack.clear();
1239        stack.push((0, na, 0, nb));
1240        while let Some((al, ar, bl, br)) = stack.pop() {
1241            pending -= (ar - al).min(br - bl);
1242            if al >= ar || bl >= br {
1243                continue;
1244            }
1245            let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1246            if l == 0 {
1247                continue;
1248            }
1249            m += l;
1250            if m >= need {
1251                return true;
1252            }
1253            pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
1254            if m + pending < need {
1255                return false;
1256            }
1257            stack.push((al, i, bl, j));
1258            stack.push((i + l, ar, j + l, br));
1259        }
1260        m >= need
1261    })
1262}
1263
1264
1265/// Edge test that also yields the exact ratio: `Some(ratio)` iff `RO(a,b) >= threshold`, else `None`.
1266/// Keeps the **reject** early-exit (abort the instant the upper bound `M + Σ min(window) < need`) but
1267/// computes full M on the qualifying branch so the cached ratio feeds the cluster `min_sim` — caching
1268/// edge ratios here means `min_sim` only recomputes the rare non-edge (chained) intra-cluster pairs,
1269/// not the whole dense blob. Bit-identical to `(let r = ratio; (r >= threshold).then_some(r))`.
1270#[allow(
1271    clippy::cast_precision_loss,
1272    clippy::cast_sign_loss,
1273    clippy::cast_possible_truncation,
1274    clippy::many_single_char_names
1275)]
1276#[must_use]
1277pub fn gestalt_edge(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> Option<f64> {
1278    let na = a.len();
1279    let nb = b.len();
1280    let total = na + nb;
1281    if total == 0 {
1282        return Some(1.0);
1283    }
1284    let need = (threshold * total as f64 / 2.0).ceil() as usize;
1285    if na == 0 || nb == 0 {
1286        return (need == 0).then_some(0.0);
1287    }
1288    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1289        matching_stats_into(a, sam_b, fstate, fmatch);
1290        gestalt_edge_with_ms(a, b, sam_b, fstate, fmatch, threshold)
1291    })
1292}
1293
1294/// Stage-4b helper: same recursion + early-exit as `gestalt_edge`, but operates on a
1295/// **caller-provided** `(fstate, fmatch)` instead of running `matching_stats_into` inline.
1296///
1297/// The GPU dispatch produces these arrays in batch for many pairs at once; the CPU side then
1298/// does the small stack walk per pair via this entry point. Keeping the recursion on the CPU is
1299/// the right split because `longest_in`'s suffix-link walk has data-dependent depth (poor GPU
1300/// fit), while `matching_stats_into` is a wide independent per-pair walk (great GPU fit).
1301///
1302/// `fstate` / `fmatch` must be `a.len()` long and have been filled for THIS exact `(a, sam_b)`
1303/// pair — passing arrays computed for a different `a` or `b` is a logic bug, no runtime check.
1304#[must_use]
1305#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
1306pub fn gestalt_edge_with_ms(
1307    a: &[char],
1308    b: &[char],
1309    sam_b: &Sam,
1310    fstate: &[u32],
1311    fmatch: &[u32],
1312    threshold: f64,
1313) -> Option<f64> {
1314    // Backwards-compatible exact wrapper. New callers wanting approximation pass through
1315    // `gestalt_edge_with_ms_delta` directly with their delta.
1316    gestalt_edge_with_ms_delta(a, b, sam_b, fstate, fmatch, threshold, 0.0)
1317}
1318
1319/// Approximate-RO variant: same as [`gestalt_edge_with_ms`] but caps the suffix-link chain walk
1320/// inside `longest_in` to roughly `1/√delta` ascents. `delta = 0.0` means exact (no cap, default).
1321/// `delta ∈ (0, 1]` caps the depth; the returned ratio's worst-case absolute deviation from the
1322/// exact RO is bounded by ~delta (empirically verified on canonical-Python corpora by
1323/// `tests/approx_ro.rs`). The chain depth distribution on real workloads is heavy-headed
1324/// (p99 ≈ 7), so even small delta values rarely actually fire the cap.
1325#[must_use]
1326#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
1327pub fn gestalt_edge_with_ms_delta(
1328    a: &[char],
1329    b: &[char],
1330    sam_b: &Sam,
1331    fstate: &[u32],
1332    fmatch: &[u32],
1333    threshold: f64,
1334    delta: f64,
1335) -> Option<f64> {
1336    let chain_cap = delta_to_chain_cap(delta);
1337    gestalt_edge_with_ms_inner(a, b, sam_b, fstate, fmatch, threshold, chain_cap)
1338}
1339
1340#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::many_single_char_names)]
1341fn gestalt_edge_with_ms_inner(
1342    a: &[char],
1343    b: &[char],
1344    sam_b: &Sam,
1345    fstate: &[u32],
1346    fmatch: &[u32],
1347    threshold: f64,
1348    chain_cap: u32,
1349) -> Option<f64> {
1350    let na = a.len();
1351    let nb = b.len();
1352    let total = na + nb;
1353    if total == 0 {
1354        return Some(1.0);
1355    }
1356    let need = (threshold * total as f64 / 2.0).ceil() as usize;
1357    if na == 0 || nb == 0 {
1358        return (need == 0).then_some(0.0);
1359    }
1360    debug_assert_eq!(fstate.len(), na, "fstate length must equal a.len()");
1361    debug_assert_eq!(fmatch.len(), na, "fmatch length must equal a.len()");
1362    #[cfg(feature = "instrument")]
1363    {
1364        instr_inc(&instrument::PAIRS_PROCESSED, 1);
1365        let (mut zero, mut nz, mut sum) = (0u64, 0u64, 0u64);
1366        for &f in fmatch {
1367            if f == 0 {
1368                zero += 1;
1369            } else {
1370                nz += 1;
1371                sum += f as u64;
1372            }
1373        }
1374        instr_inc(&instrument::FMATCH_ZERO, zero);
1375        instr_inc(&instrument::FMATCH_NONZERO, nz);
1376        instr_inc(&instrument::FMATCH_SUM, sum);
1377    }
1378    // Optimization H (tight fmatch-based UB) was tried and reverted — measured a NET regression
1379    // on cluster_canonicals threshold path: the O(na) weighted-interval-scheduling DP added
1380    // ~25 µs per call across rayon workers, and the bail rate on filter-survivors was too low
1381    // to amortize. The existing `m + pending < need` check inside the recursion loop already
1382    // aborts cheaply for non-edges (after 1-2 longest_in calls in the common case). See
1383    // `src/new/PERF_MAP.md`'s "Tombstones" section and the `ub_from_fmatch` helper just above —
1384    // kept around but unused in case a different call shape makes it worth re-trying.
1385    STACK_BUF.with_borrow_mut(|stack| {
1386        let mut m = 0usize;
1387        let mut pending = na.min(nb);
1388        stack.clear();
1389        stack.push((0, na, 0, nb));
1390        #[cfg(feature = "instrument")]
1391        let mut max_depth: usize = 1;
1392        while let Some((al, ar, bl, br)) = stack.pop() {
1393            #[cfg(feature = "instrument")]
1394            {
1395                if stack.len() + 1 > max_depth {
1396                    max_depth = stack.len() + 1;
1397                }
1398            }
1399            pending -= (ar - al).min(br - bl);
1400            if al >= ar || bl >= br {
1401                continue;
1402            }
1403            let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, chain_cap);
1404            if l == 0 {
1405                continue;
1406            }
1407            m += l;
1408            pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
1409            if m + pending < need {
1410                #[cfg(feature = "instrument")]
1411                instr_hist(&instrument::RECURSION_DEPTHS, max_depth);
1412                return None; // upper bound below the bar ⇒ certified non-edge, abort
1413            }
1414            stack.push((al, i, bl, j));
1415            stack.push((i + l, ar, j + l, br));
1416        }
1417        #[cfg(feature = "instrument")]
1418        instr_hist(&instrument::RECURSION_DEPTHS, max_depth);
1419        (m >= need).then(|| 2.0 * m as f64 / total as f64)
1420    })
1421}
1422
1423/// Cluster `min_sim` helper: returns the **exact** ratio when `RO(a,b) <= cap`, otherwise a value
1424/// `> cap` (it accept-early-exits the instant M exceeds the cap's bound). Used to find a cluster's
1425/// minimum pairwise ratio with `cur = cur.min(gestalt_ratio_capped(a, b, sam, cur))`: in a dense
1426/// cluster the dominant high-ratio pairs blow past `cur` and are pruned after the first block or two,
1427/// so full M is computed only for the genuinely-low pairs. Bit-identical minimum to the full ratio.
1428#[allow(
1429    clippy::cast_precision_loss,
1430    clippy::cast_sign_loss,
1431    clippy::cast_possible_truncation,
1432    clippy::many_single_char_names
1433)]
1434#[must_use]
1435pub fn gestalt_ratio_capped(a: &[char], b: &[char], sam_b: &Sam, cap: f64) -> f64 {
1436    let na = a.len();
1437    let nb = b.len();
1438    let total = na + nb;
1439    if total == 0 {
1440        return 1.0; // two empty strings ⇒ ratio 1.0
1441    }
1442    if na == 0 || nb == 0 {
1443        return 0.0; // M = 0 ⇒ ratio 0 (a valid minimum candidate, <= cap for any cap >= 0)
1444    }
1445    // ratio > cap ⟺ 2M/total > cap ⟺ M > cap·total/2 ⟺ M >= ⌊cap·total/2⌋ + 1.
1446    let exceed = (cap * total as f64 / 2.0).floor() as usize + 1;
1447    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1448        matching_stats_into(a, sam_b, fstate, fmatch);
1449        STACK_BUF.with_borrow_mut(|stack| {
1450            let mut m = 0usize;
1451            stack.clear();
1452            stack.push((0, na, 0, nb));
1453            while let Some((al, ar, bl, br)) = stack.pop() {
1454                if al >= ar || bl >= br {
1455                    continue;
1456                }
1457                let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1458                if l == 0 {
1459                    continue;
1460                }
1461                m += l;
1462                if m >= exceed {
1463                    return 2.0; // ratio > cap ⇒ cannot be the minimum; prune (any value > cap works)
1464                }
1465                stack.push((al, i, bl, j));
1466                stack.push((i + l, ar, j + l, br));
1467            }
1468            2.0 * m as f64 / total as f64 // full exact M ⇒ exact ratio (<= cap)
1469        })
1470    })
1471}
1472
1473/// Ratio of `a` vs the string the prebuilt `sam_b` was built from (= `b`).
1474#[allow(clippy::cast_precision_loss)]
1475#[must_use]
1476pub fn gestalt_ratio_prebuilt(a: &[char], b: &[char], sam_b: &Sam) -> f64 {
1477    let total = a.len() + b.len();
1478    if total == 0 {
1479        return 1.0;
1480    }
1481    2.0 * gestalt_m_with(a, b, sam_b) as f64 / total as f64
1482}
1483
1484#[allow(clippy::cast_precision_loss)]
1485#[must_use]
1486pub fn gestalt_ratio_chars(a: &[char], b: &[char]) -> f64 {
1487    if a.is_empty() && b.is_empty() {
1488        return 1.0;
1489    }
1490    let sam_b = build_sam(b);
1491    gestalt_ratio_prebuilt(a, b, &sam_b)
1492}
1493
1494/// `gestalt_ratio(a, b) -> float`: exact difflib ratio, computed via suffix-automaton LCS.
1495#[must_use]
1496pub fn gestalt_ratio(a: &str, b: &str) -> f64 {
1497    let av: Vec<char> = a.chars().collect();
1498    let bv: Vec<char> = b.chars().collect();
1499    gestalt_ratio_chars(&av, &bv)
1500}
1501
1502#[cfg(test)]
1503mod tests {
1504    #![allow(clippy::float_cmp, clippy::unreadable_literal)]
1505    use super::{build_sam, gestalt_edge, gestalt_qualifies, gestalt_ratio_capped, gestalt_ratio_chars};
1506
1507    fn r(a: &str, b: &str) -> f64 {
1508        gestalt_ratio_chars(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>())
1509    }
1510
1511    #[test]
1512    fn matches_difflib_reference_values() {
1513        assert_eq!(r("", ""), 1.0);
1514        assert_eq!(r("", "x"), 0.0);
1515        assert_eq!(r("abc", "abc"), 1.0);
1516        assert_eq!(r("abc", "abd"), 0.6666666666666666);
1517        assert_eq!(r("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
1518        assert_eq!(r("tide", "diet"), 0.25);
1519        assert_eq!(r("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
1520        assert_eq!(r("aaaaabbbbbccccc", "aaaaaxbbbbbxccccc"), 0.9375);
1521    }
1522
1523    // `gestalt_qualifies` (threshold early-exit) must be byte-for-byte with `ratio >= T`.
1524    #[test]
1525    fn qualifies_matches_ratio_threshold() {
1526        // xorshift PRNG → deterministic pseudo-random ASCII strings over a small alphabet
1527        let mut s: u64 = 0x1234_5678_9abc_def1;
1528        let mut next = || {
1529            s ^= s << 13;
1530            s ^= s >> 7;
1531            s ^= s << 17;
1532            s
1533        };
1534        for _ in 0..3000 {
1535            let la = (next() % 40) as usize;
1536            let lb = (next() % 40) as usize;
1537            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> Vec<char> {
1538                (0..n).map(|_| char::from(b'a' + (rng() % 4) as u8)).collect()
1539            };
1540            let a = mk(la, &mut next);
1541            let b = mk(lb, &mut next);
1542            let sam_b = build_sam(&b);
1543            let ratio = gestalt_ratio_chars(&a, &b);
1544            for &t in &[0.0_f64, 0.25, 0.5, 0.75, 0.9, 1.0] {
1545                assert_eq!(
1546                    gestalt_qualifies(&a, &b, &sam_b, t),
1547                    ratio >= t,
1548                    "a={a:?} b={b:?} t={t} ratio={ratio}"
1549                );
1550                // gestalt_edge(cap=t): Some(exact ratio) iff qualifying; bit-exact ratio when Some.
1551                let edge = gestalt_edge(&a, &b, &sam_b, t);
1552                assert_eq!(edge.is_some(), ratio >= t, "edge.is_some a={a:?} b={b:?} t={t} ratio={ratio}");
1553                if let Some(r) = edge {
1554                    assert_eq!(r, ratio, "edge ratio a={a:?} b={b:?} t={t}");
1555                }
1556                // gestalt_ratio_capped(cap=t): exact ratio when ratio <= t, else a value > t (pruned).
1557                let capped = gestalt_ratio_capped(&a, &b, &sam_b, t);
1558                if ratio <= t {
1559                    assert_eq!(capped, ratio, "capped exact a={a:?} b={b:?} cap={t} ratio={ratio}");
1560                } else {
1561                    assert!(capped > t, "capped prune a={a:?} b={b:?} cap={t} ratio={ratio} got={capped}");
1562                }
1563            }
1564        }
1565    }
1566}