Skip to main content

difflib_fast/
gestalt.rs

1//! Gestalt-Chain: fast **exact** Ratcliff–Obershelp via a suffix automaton.
2//!
3//! difflib's `ratio` is slow because `find_longest_match` is O(|a|·|b|) (it rescans every
4//! occurrence of popular characters). The matched-block total M is the recursive
5//! longest-common-substring decomposition; a **suffix automaton** finds the longest common
6//! substring in O(|a|+|b|) regardless of character frequency, and recursing on the left/
7//! right remainders reproduces difflib's M with the *same* tie-break (longest; earliest in
8//! a; earliest in b). So `gestalt_ratio` equals `difflib.SequenceMatcher.ratio()` exactly.
9//!
10//! Hot-path engineering for the all-pairs join:
11//!   * transitions are stored **CSR** (per-state sorted slice) → O(log deg) lookup, O(len)
12//!     memory, no hashing — and crucially no O(deg) scan at the high-degree root, which the
13//!     scan hammers when strings are dissimilar;
14//!   * the b-side automaton is **prebuilt once per string** (`build_sam`) and reused for
15//!     every pair, so the all-pairs cost is n builds + n² scans, not n² builds.
16//!
17//! Operates on `char` (code points), so it is bit-identical to difflib on non-ASCII.
18
19/// Size of the root's direct transition table — covers ASCII (the canonical text's alphabet).
20const ROOT_TBL: usize = 128;
21
22
23/// Suffix automaton with CSR (sorted-per-state) transitions — built once, queried by scans.
24///
25/// For the range-restricted recursion (fix b), each state also carries its **endpos** as a
26/// contiguous slice `[dfs_in, dfs_in+dfs_cnt)` of `epos` (the end-positions in b, laid out by
27/// a DFS of the suffix-link tree so a subtree is contiguous). A merge-sort tree over `epos`
28/// answers "is there an end-position in [lo,hi] within this state's subtree, and the min/max
29/// such" — so the whole RO recursion runs on this one prebuilt SAM, with **no sub-builds**.
30pub struct Sam {
31    // Per-state hot fields packed into one cache-line-friendly struct `[len, link, edge_lo, edge_hi]`
32    // so a state visit in the scan/recursion loads len + link + transition-range with a SINGLE cache
33    // miss (these were 5 separate arrays — under multi-thread DRAM-bandwidth pressure, fewer distinct
34    // lines per visit is the dominant win). `link` is u32: the root's link (-1) is stored as 0 and
35    // never read, since both the scan and `longest_in` stop at state 0 before following its link.
36    node: Vec<[u32; 4]>,
37    // Transitions packed as `(char as u64) << 32 | to`, sorted by char within each state's range
38    // [edge_lo, edge_hi). Co-locating char+target means the binary-search key and the taken edge's
39    // target live on the same cache line (was two parallel arrays `csr_char`/`csr_to`).
40    edges: Vec<u64>,
41    firstpos: Vec<u32>,  // per state: *smallest* end-position (build-time; folded into epmeta)
42    lastpos: Vec<u32>,   // per state: *largest* end-position (build-time; folded into epmeta)
43    // Direct lookup for the root's ASCII transitions (`root_next[c] = state`, or -1). The root is
44    // the high-degree state hit after every match reset; this makes its transition O(1) instead
45    // of a binary search. Non-ASCII root chars (rare) fall back to the edge search, so it's general.
46    root_next: Vec<i32>,
47    dfs_in: Vec<u32>,    // per state: start index into the endpos array of its endpos range
48    dfs_cnt: Vec<u32>,   // per state: number of end-positions in its subtree
49    // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style: node `i`'s sorted
50    // range is seg_data[seg_off[i]..seg_off[i+1]] (node 1 = root). A state's endpos is the leaf
51    // range [dfs_in, dfs_in+dfs_cnt); the tree answers range predecessor/successor adaptively in
52    // O(log²·slice) — and `firstpos`/`lastpos` give an O(1) fast path that skips it when the query
53    // bound doesn't split the state's endpos span (the common case for wide windows).
54    seg_data: Vec<u32>,
55    seg_off: Vec<u32>,
56    seg_n: usize,        // number of leaves (power of two) in the merge-sort tree
57    // DFS-ordered end-positions (each state's endpos = contiguous slice [dfs_in, dfs_in+dfs_cnt)).
58    // For small endpos sets a cache-friendly linear scan of this beats the scattered tree descent.
59    epos: Vec<u32>,
60    // Query-hot per-state fields packed into one cache line `[firstpos, lastpos, dfs_in, dfs_cnt]`
61    // so `max_le`/`min_in` load all four with a single cache miss (they were 4 separate arrays).
62    epmeta: Vec<[u32; 4]>,
63}
64
65/// Below this endpos-set size, `max_le`/`min_in` linear-scan the contiguous DFS-ordered endpos
66/// (cache-friendly) instead of the merge-sort tree (scattered) — most queried sets are this small.
67const LINEAR_MAX: usize = 256;
68
69impl Sam {
70    fn empty() -> Self {
71        Sam {
72            node: Vec::new(),
73            edges: Vec::new(),
74            firstpos: Vec::new(),
75            lastpos: Vec::new(),
76            root_next: Vec::new(),
77            dfs_in: Vec::new(),
78            dfs_cnt: Vec::new(),
79            seg_data: Vec::new(),
80            seg_off: Vec::new(),
81            seg_n: 0,
82            epos: Vec::new(),
83            epmeta: Vec::new(),
84        }
85    }
86
87    /// Node `i`'s sorted endpos range in the flattened merge-sort tree.
88    fn seg_node(&self, i: usize) -> &[u32] {
89        &self.seg_data[self.seg_off[i] as usize..self.seg_off[i + 1] as usize]
90    }
91
92    /// `state`'s endpos slice as the merge-sort-tree leaf range [l, r), from packed `epmeta`.
93    fn endpos_range_m(m: &[u32; 4], seg_n: usize) -> (usize, usize) {
94        let s = m[2] as usize; // dfs_in
95        (s + seg_n, s + m[3] as usize + seg_n) // dfs_cnt
96    }
97
98    /// Largest end-position `<= x` among `state`'s endpos, or None.
99    fn max_le(&self, state: u32, x: u32) -> Option<u32> {
100        // SAFETY: `state` is a valid SAM state (< nstates = epmeta.len()); the endpos slice
101        // [dfs_in, dfs_in+dfs_cnt) is within epos (built that way). Hot path, called ~30M times.
102        #[allow(clippy::undocumented_unsafe_blocks)]
103        let m = unsafe { self.epmeta.get_unchecked(state as usize) }; // [firstpos,lastpos,dfs_in,dfs_cnt]
104        // O(1) fast path: x doesn't split the state's [firstpos, lastpos] span.
105        if m[1] <= x {
106            return Some(m[1]); // whole set <= x → answer is the max (lastpos)
107        }
108        if m[0] > x {
109            return None; // whole set > x (firstpos) → nothing <= x
110        }
111        let cnt = m[3] as usize;
112        if cnt <= LINEAR_MAX {
113            // small endpos set: branchless (vectorizable) scan of the contiguous slice. firstpos ≤ x
114            // is guaranteed above, so a qualifying element exists and `best` is the true answer.
115            let lo = m[2] as usize;
116            let mut best = 0u32;
117            #[allow(clippy::undocumented_unsafe_blocks)]
118            for &v in unsafe { self.epos.get_unchecked(lo..lo + cnt) } {
119                best = best.max(if v <= x { v } else { 0 });
120            }
121            return Some(best);
122        }
123        // large endpos set: query the merge-sort tree.
124        let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
125        let mut best: Option<u32> = None;
126        while l < r {
127            if l & 1 == 1 {
128                best = best.max(node_max_le(self.seg_node(l), x));
129                l += 1;
130            }
131            if r & 1 == 1 {
132                r -= 1;
133                best = best.max(node_max_le(self.seg_node(r), x));
134            }
135            l >>= 1;
136            r >>= 1;
137        }
138        best
139    }
140
141    /// Smallest end-position in `[lo, hi]` among `state`'s endpos, or None.
142    fn min_in(&self, state: u32, lo: u32, hi: u32) -> Option<u32> {
143        // SAFETY: as in `max_le` — `state` < nstates = epmeta.len(); endpos slice within epos.
144        #[allow(clippy::undocumented_unsafe_blocks)]
145        let m = unsafe { self.epmeta.get_unchecked(state as usize) };
146        // O(1) fast path: the smallest end-position is already >= lo.
147        if m[0] >= lo {
148            return (m[0] <= hi).then_some(m[0]); // firstpos
149        }
150        if m[1] < lo {
151            return None; // whole set < lo (lastpos) → nothing in [lo, hi]
152        }
153        let cnt = m[3] as usize;
154        if cnt <= LINEAR_MAX {
155            // small endpos set: branchless (vectorizable) scan for the smallest value in [lo, hi]
156            let off = m[2] as usize;
157            let mut best = u32::MAX;
158            #[allow(clippy::undocumented_unsafe_blocks)]
159            for &v in unsafe { self.epos.get_unchecked(off..off + cnt) } {
160                best = best.min(if v >= lo && v <= hi { v } else { u32::MAX });
161            }
162            return (best != u32::MAX).then_some(best);
163        }
164        // lo is interior to the span: query the tree for the smallest value in [lo, hi].
165        let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
166        let mut best: Option<u32> = None;
167        while l < r {
168            if l & 1 == 1 {
169                best = merge_min(best, node_min_in(self.seg_node(l), lo, hi));
170                l += 1;
171            }
172            if r & 1 == 1 {
173                r -= 1;
174                best = merge_min(best, node_min_in(self.seg_node(r), lo, hi));
175            }
176            l >>= 1;
177            r >>= 1;
178        }
179        best
180    }
181}
182
183/// Largest value `<= x` in a sorted slice, or None.
184fn node_max_le(sorted: &[u32], x: u32) -> Option<u32> {
185    let k = sorted.partition_point(|&v| v <= x);
186    (k > 0).then(|| sorted[k - 1])
187}
188
189/// Smallest value in `[lo, hi]` in a sorted slice, or None.
190fn node_min_in(sorted: &[u32], lo: u32, hi: u32) -> Option<u32> {
191    let k = sorted.partition_point(|&v| v < lo);
192    sorted.get(k).copied().filter(|&v| v <= hi)
193}
194
195fn merge_min(a: Option<u32>, b: Option<u32>) -> Option<u32> {
196    match (a, b) {
197        (Some(x), Some(y)) => Some(x.min(y)),
198        (x, None) => x,
199        (None, y) => y,
200    }
201}
202
203/// Transient builder (Ukkonen online SAM with a linked-list transition arena).
204struct Builder {
205    edge_char: Vec<char>,
206    edge_to: Vec<u32>,
207    edge_next: Vec<i32>,
208    head: Vec<i32>,
209    link: Vec<i32>,
210    len: Vec<u32>,
211    firstpos: Vec<u32>,
212    primary: Vec<bool>, // true for the per-position state (its firstpos is a real end-position)
213    last: u32,
214}
215
216impl Builder {
217    fn empty() -> Self {
218        Builder {
219            edge_char: Vec::new(),
220            edge_to: Vec::new(),
221            edge_next: Vec::new(),
222            head: Vec::new(),
223            link: Vec::new(),
224            len: Vec::new(),
225            firstpos: Vec::new(),
226            primary: Vec::new(),
227            last: 0,
228        }
229    }
230
231    fn new(cap: usize) -> Self {
232        let mut b = Builder::empty();
233        b.reset(cap);
234        b
235    }
236
237    /// Clear the arenas (keeping capacity) and re-seed the root — for buffer reuse.
238    fn reset(&mut self, cap: usize) {
239        self.edge_char.clear();
240        self.edge_to.clear();
241        self.edge_next.clear();
242        self.head.clear();
243        self.link.clear();
244        self.len.clear();
245        self.firstpos.clear();
246        self.primary.clear();
247        self.edge_char.reserve(3 * cap);
248        self.edge_to.reserve(3 * cap);
249        self.edge_next.reserve(3 * cap);
250        self.primary.push(false); // root state 0 is not a per-position state
251        self.head.push(-1);
252        self.link.push(-1);
253        self.len.push(0);
254        self.firstpos.push(0);
255        self.last = 0;
256    }
257
258    #[allow(clippy::cast_sign_loss)]
259    fn find(&self, state: u32, c: char) -> Option<u32> {
260        let mut e = self.head[state as usize];
261        while e != -1 {
262            let idx = e as usize;
263            if self.edge_char[idx] == c {
264                return Some(self.edge_to[idx]);
265            }
266            e = self.edge_next[idx];
267        }
268        None
269    }
270
271    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
272    fn add_edge(&mut self, state: u32, c: char, to: u32) {
273        self.edge_char.push(c);
274        self.edge_to.push(to);
275        self.edge_next.push(self.head[state as usize]);
276        self.head[state as usize] = (self.edge_char.len() - 1) as i32;
277    }
278
279    #[allow(clippy::cast_sign_loss)]
280    fn set_edge(&mut self, state: u32, c: char, to: u32) {
281        let mut e = self.head[state as usize];
282        while e != -1 {
283            let idx = e as usize;
284            if self.edge_char[idx] == c {
285                self.edge_to[idx] = to;
286                return;
287            }
288            e = self.edge_next[idx];
289        }
290        self.add_edge(state, c, to);
291    }
292
293    #[allow(clippy::cast_possible_truncation)]
294    fn new_state(&mut self, len: u32, link: i32, firstpos: u32, primary: bool) -> u32 {
295        self.head.push(-1);
296        self.len.push(len);
297        self.link.push(link);
298        self.firstpos.push(firstpos);
299        self.primary.push(primary);
300        (self.head.len() - 1) as u32
301    }
302
303    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_sign_loss)]
304    fn extend(&mut self, c: char, pos: usize) {
305        let cur = self.new_state(self.len[self.last as usize] + 1, -1, pos as u32, true);
306        let mut p = self.last as i32;
307        while p != -1 && self.find(p as u32, c).is_none() {
308            self.add_edge(p as u32, c, cur);
309            p = self.link[p as usize];
310        }
311        if p == -1 {
312            self.link[cur as usize] = 0;
313        } else {
314            let q = self.find(p as u32, c).unwrap();
315            if self.len[p as usize] + 1 == self.len[q as usize] {
316                self.link[cur as usize] = q as i32;
317            } else {
318                let clone = self.new_state(self.len[p as usize] + 1, self.link[q as usize], self.firstpos[q as usize], false);
319                let mut e = self.head[q as usize];
320                while e != -1 {
321                    let idx = e as usize;
322                    self.add_edge(clone, self.edge_char[idx], self.edge_to[idx]);
323                    e = self.edge_next[idx];
324                }
325                while p != -1 && self.find(p as u32, c) == Some(q) {
326                    self.set_edge(p as u32, c, clone);
327                    p = self.link[p as usize];
328                }
329                self.link[q as usize] = clone as i32;
330                self.link[cur as usize] = clone as i32;
331            }
332        }
333        self.last = cur;
334    }
335
336    /// Convert the linked-list transitions into the packed `node`/`edges` layout (per-state edges
337    /// sorted by char, co-located char+target), reusing `out`'s allocations (no per-build malloc churn).
338    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::needless_range_loop)]
339    fn finalize_into(&self, out: &mut Sam) {
340        let nstates = self.head.len();
341        let nedges = self.edge_char.len();
342        out.firstpos.clear();
343        out.firstpos.extend_from_slice(&self.firstpos);
344        // per-state edge counts → exclusive prefix-sum offsets (local; folded into `node` below)
345        let mut off = vec![0u32; nstates + 1];
346        for state in 0..nstates {
347            let mut e = self.head[state];
348            while e != -1 {
349                off[state + 1] += 1;
350                e = self.edge_next[e as usize];
351            }
352        }
353        for state in 0..nstates {
354            off[state + 1] += off[state];
355        }
356        // edges: (char << 32 | to), sorted by char within each state's [off[s], off[s+1]) range.
357        // sorting the packed u64 sorts by char (high bits) since a state's chars are distinct.
358        out.edges.clear();
359        out.edges.resize(nedges, 0);
360        let mut scratch: Vec<u64> = Vec::new(); // reused across states
361        for state in 0..nstates {
362            scratch.clear();
363            let mut e = self.head[state];
364            while e != -1 {
365                let idx = e as usize;
366                scratch.push((u64::from(self.edge_char[idx] as u32) << 32) | u64::from(self.edge_to[idx]));
367                e = self.edge_next[idx];
368            }
369            scratch.sort_unstable();
370            let base = off[state] as usize;
371            for (k, &packed) in scratch.iter().enumerate() {
372                out.edges[base + k] = packed;
373            }
374        }
375        // per-state packed node [len, link(clamped: root -1 → 0, never read), edge_lo, edge_hi]
376        out.node.clear();
377        out.node.reserve(nstates);
378        for s in 0..nstates {
379            let link = if self.link[s] < 0 { 0 } else { self.link[s] as u32 };
380            out.node.push([self.len[s], link, off[s], off[s + 1]]);
381        }
382        // root direct transition table (ASCII): fill from the root's edges
383        out.root_next.clear();
384        out.root_next.resize(ROOT_TBL, -1);
385        for k in off[0] as usize..off[1] as usize {
386            let e = out.edges[k];
387            let ci = (e >> 32) as usize;
388            if ci < ROOT_TBL {
389                out.root_next[ci] = i32::try_from((e & 0xFFFF_FFFF) as u32).expect("state fits i32");
390            }
391        }
392        self.build_endpos(out, nstates);
393    }
394
395    /// Build the endpos range structure (fix b): suffix-link children → DFS-order end-positions
396    /// (each state's endpos = a contiguous array range) → wavelet matrix over those positions.
397    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
398    fn build_endpos(&self, out: &mut Sam, nstates: usize) {
399        // suffix-link children, CSR
400        let mut child_head = vec![0u32; nstates + 1];
401        for s in 1..nstates {
402            child_head[self.link[s] as usize + 1] += 1;
403        }
404        for s in 0..nstates {
405            child_head[s + 1] += child_head[s];
406        }
407        let mut child_arr = vec![0u32; nstates.saturating_sub(1)];
408        let mut fill = child_head.clone();
409        for s in 1..nstates {
410            let p = self.link[s] as usize;
411            child_arr[fill[p] as usize] = s as u32;
412            fill[p] += 1;
413        }
414        // iterative DFS (enter/exit) from root → dfs_in / dfs_cnt + DFS-ordered end-positions;
415        // on exit (post-order, children done) accumulate lastpos = max end-position in subtree.
416        out.dfs_in.clear();
417        out.dfs_in.resize(nstates, 0);
418        out.dfs_cnt.clear();
419        out.dfs_cnt.resize(nstates, 0);
420        out.lastpos.clear();
421        out.lastpos.resize(nstates, 0);
422        out.epos.clear();
423        let mut stack: Vec<(u32, bool)> = vec![(0, false)];
424        while let Some((s, exit)) = stack.pop() {
425            let su = s as usize;
426            if exit {
427                out.dfs_cnt[su] = out.epos.len() as u32 - out.dfs_in[su];
428                let mut lp = if self.primary[su] { self.firstpos[su] } else { 0 };
429                for k in child_head[su]..child_head[su + 1] {
430                    lp = lp.max(out.lastpos[child_arr[k as usize] as usize]);
431                }
432                out.lastpos[su] = lp;
433                continue;
434            }
435            out.dfs_in[su] = out.epos.len() as u32;
436            if self.primary[su] {
437                out.epos.push(self.firstpos[su]);
438            }
439            stack.push((s, true));
440            for k in child_head[su]..child_head[su + 1] {
441                stack.push((child_arr[k as usize], false));
442            }
443        }
444        // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style (build is a
445        // negligible fraction of runtime; queries hit the contiguous `seg_data` layout).
446        let n = out.epos.len();
447        let mut seg_n = 1usize;
448        while seg_n < n.max(1) {
449            seg_n <<= 1;
450        }
451        out.seg_n = seg_n;
452        let mut tree: Vec<Vec<u32>> = vec![Vec::new(); 2 * seg_n];
453        for i in 0..n {
454            tree[seg_n + i] = vec![out.epos[i]];
455        }
456        for i in (1..seg_n).rev() {
457            let (lhs, rhs) = (&tree[2 * i], &tree[2 * i + 1]);
458            let mut acc = Vec::with_capacity(lhs.len() + rhs.len());
459            let (mut x, mut y) = (0usize, 0usize);
460            while x < lhs.len() && y < rhs.len() {
461                if lhs[x] <= rhs[y] {
462                    acc.push(lhs[x]);
463                    x += 1;
464                } else {
465                    acc.push(rhs[y]);
466                    y += 1;
467                }
468            }
469            acc.extend_from_slice(&lhs[x..]);
470            acc.extend_from_slice(&rhs[y..]);
471            tree[i] = acc;
472        }
473        out.seg_off.clear();
474        out.seg_off.reserve(2 * seg_n + 1);
475        out.seg_data.clear();
476        let mut off = 0u32;
477        out.seg_off.push(0);
478        for node in &tree {
479            out.seg_data.extend_from_slice(node);
480            off += u32::try_from(node.len()).expect("epos fits u32");
481            out.seg_off.push(off);
482        }
483        // pack the query-hot per-state fields into one cache-line-friendly array
484        out.epmeta.clear();
485        out.epmeta.reserve(nstates);
486        for s in 0..nstates {
487            out.epmeta.push([out.firstpos[s], out.lastpos[s], out.dfs_in[s], out.dfs_cnt[s]]);
488        }
489    }
490
491    fn finalize(self) -> Sam {
492        let mut out = Sam::empty();
493        self.finalize_into(&mut out);
494        out
495    }
496}
497
498/// Build (and finalize) the suffix automaton of `b` — prebuild once, reuse across pairs.
499#[must_use]
500pub fn build_sam(b: &[char]) -> Sam {
501    let mut bld = Builder::new(b.len());
502    for (i, &c) in b.iter().enumerate() {
503        bld.extend(c, i);
504    }
505    bld.finalize()
506}
507
508/// Longest substring of `a[al..ar]` that occurs in `b[bl..br)`, using the **precomputed**
509/// window-independent match (`fstate`/`fmatch` = the SAM state and match length ending at each
510/// a-position, from one full-`a` scan). For each position the chain walk starts at the precomputed
511/// state, capping the usable length to the a-fragment (`i-al+1`) and the b-window via endpos
512/// queries. Returns (`a_start`, absolute `b_start`, len) with difflib's tie-break. No re-scanning.
513#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
514fn longest_in(sam: &Sam, fstate: &[u32], fmatch: &[u32], al: usize, ar: usize, bl: usize, br: usize)
515    -> (usize, usize, usize) {
516    let blo = bl as u32;
517    let hi = br as u32 - 1; // caller guarantees bl < br, so br >= 1
518    let (mut best_len, mut best_a, mut best_b) = (0usize, 0usize, 0usize);
519    let node = sam.node.as_slice();
520    // SAFETY: i ∈ [al, ar) ⊆ [0, n) = fstate.len() = fmatch.len(); `cur` is always a valid SAM
521    // state (fstate entry or a suffix link), so node[cur] and node[link] (link is a state) are in bounds.
522    #[allow(clippy::undocumented_unsafe_blocks)]
523    unsafe {
524        for i in al..ar {
525            let cap = i - al + 1; // a-match ending at i can't start before `al`
526            let eff = (*fmatch.get_unchecked(i) as usize).min(cap);
527            if eff <= best_len {
528                continue; // can't beat the best — skip (dominant pruning)
529            }
530            // walk the suffix-link chain from the precomputed state up; `curlen` is the usable length
531            // at the current state (capped to the a-fragment), shrinking as we ascend.
532            let mut cur = *fstate.get_unchecked(i);
533            loop {
534                if cur == 0 {
535                    break;
536                }
537                let nd = *node.get_unchecked(cur as usize); // [len, link, edge_lo, edge_hi]
538                let curlen = eff.min(nd[0] as usize);
539                if curlen <= best_len {
540                    break;
541                }
542                let band_min = *node.get_unchecked(nd[1] as usize); // node[link]
543                let band_min = band_min[0] as usize + 1; // len(link) + 1
544                if curlen >= band_min {
545                    if let Some(pmax) = sam.max_le(cur, hi) {
546                        if pmax >= blo {
547                            let l = curlen.min((pmax - blo) as usize + 1); // start = p-l+1 >= bl
548                            if l >= band_min {
549                                // earliest-b (min_in) only when we beat the best — rare, off the path.
550                                if l > best_len {
551                                    if let Some(pmin) = sam.min_in(cur, blo + l as u32 - 1, hi) {
552                                        best_len = l;
553                                        best_a = i + 1 - l;
554                                        best_b = pmin as usize + 1 - l;
555                                    }
556                                }
557                                break; // deepest qualifying state ⇒ longest in-window match here
558                            }
559                        }
560                    }
561                }
562                let link = nd[1];
563                if link == 0 {
564                    break; // reached the root (state 0) — no shorter qualifying state above
565                }
566                cur = link;
567            }
568        }
569    }
570    (best_a, best_b, best_len)
571}
572
573/// Window-independent matching statistics of `a` vs `sam_b`, filled into reused buffers: for each
574/// i, `(state, matched)` where `matched` = longest suffix of `a[..=i]` occurring anywhere in b.
575/// One O(|a|) scan, reused by every recursion node (no per-node re-scan). Reusing the caller's
576/// buffers avoids a per-pair allocation (was ~10% of the all-pairs join).
577#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
578fn matching_stats_into(a: &[char], sam_b: &Sam, fstate: &mut Vec<u32>, fmatch: &mut Vec<u32>) {
579    let n = a.len();
580    // Size the buffers WITHOUT zero-filling: the scan below writes every index [0, n) before any read,
581    // so the resize(n, 0) zero-pass was pure waste (it was ~half the scan's store traffic).
582    // SAFETY: u32 has no invalid bit patterns, and fstate[i]/fmatch[i] are written for every i in 0..n
583    // (the loop below) strictly before longest_in ever reads them.
584    fstate.clear();
585    fstate.reserve(n);
586    fmatch.clear();
587    fmatch.reserve(n);
588    #[allow(clippy::undocumented_unsafe_blocks)]
589    unsafe {
590        fstate.set_len(n);
591        fmatch.set_len(n);
592    }
593    // Hoist the SAM arrays into locals so the compiler keeps base pointers in registers and
594    // doesn't reload them through `sam_b` each iteration; transition is hand-inlined.
595    let node = sam_b.node.as_slice();
596    let edges = sam_b.edges.as_slice();
597    let root = sam_b.root_next.as_slice();
598    let mut state = 0u32;
599    let mut matched = 0u32;
600    // SAFETY: `state` is always a valid SAM state index (< nstates = node.len()): it starts at the
601    // root (0) and only ever becomes a transition target (an edge's low bits, a valid state) or a
602    // suffix link (node[..][1], a valid state). So node[state] and node[link] are in bounds; the edge
603    // range [edge_lo, edge_hi) ⊆ [0, edges.len()); `ci < ROOT_TBL == root.len()`; `i < n == fstate.len()`.
604    #[allow(clippy::undocumented_unsafe_blocks)]
605    unsafe {
606        for i in 0..n {
607            let c = *a.get_unchecked(i);
608            loop {
609                // inline `transition(state, c)` → next state, or -1. The non-root branch loads
610                // node[state] ONCE for both the edge range and (on miss) the suffix link — one cache line.
611                let nx: i64 = if state == 0 {
612                    let ci = c as usize;
613                    if ci < ROOT_TBL {
614                        i64::from(*root.get_unchecked(ci))
615                    } else {
616                        let nd = node.get_unchecked(0);
617                        csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
618                    }
619                } else {
620                    let nd = node.get_unchecked(state as usize);
621                    csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
622                };
623                if nx >= 0 {
624                    state = nx as u32;
625                    matched += 1;
626                    break;
627                }
628                if state == 0 {
629                    matched = 0;
630                    break;
631                }
632                let link = node.get_unchecked(state as usize)[1]; // same line as the edge-range load
633                state = link;
634                matched = node.get_unchecked(link as usize)[0]; // len(link)
635            }
636            *fstate.get_unchecked_mut(i) = state;
637            *fmatch.get_unchecked_mut(i) = matched;
638        }
639    }
640}
641
642/// Binary search the packed edge slice `edges[lo..hi]` (sorted by char in the high 32 bits) for `c`;
643/// returns the target state (low 32 bits) as i64, or -1. Inlined into the scan.
644#[inline]
645#[allow(clippy::cast_possible_truncation)]
646fn csr_lookup(edges: &[u64], mut lo: usize, hi: usize, c: char) -> i64 {
647    let mut hi = hi;
648    let key = c as u32;
649    while lo < hi {
650        let mid = lo + (hi - lo) / 2;
651        // SAFETY: callers pass lo,hi from a state's edge range ⊆ [0, edges.len()]; `mid ∈ [lo, hi)`.
652        let e = unsafe { *edges.get_unchecked(mid) };
653        let mc = (e >> 32) as u32; // char (code point) in the high 32 bits
654        if mc == key {
655            return i64::from((e & 0xFFFF_FFFF) as u32); // target state in the low 32 bits
656        }
657        if mc < key {
658            lo = mid + 1;
659        } else {
660            hi = mid;
661        }
662    }
663    -1
664}
665
666thread_local! {
667    /// Reused (fstate, fmatch) buffers for the per-pair matching statistics — no per-pair alloc.
668    static MS_BUF: std::cell::RefCell<(Vec<u32>, Vec<u32>)> =
669        const { std::cell::RefCell::new((Vec::new(), Vec::new())) };
670    /// Reused recursion stack of (al, ar, bl, br) windows — retains capacity across pairs so the
671    /// RO recursion never heap-allocates or reallocs per pair (was `__rust_alloc` + `grow_one`).
672    static STACK_BUF: std::cell::RefCell<Vec<(usize, usize, usize, usize)>> =
673        const { std::cell::RefCell::new(Vec::new()) };
674}
675
676/// Cost probe: run only the matching-statistics scan of `a` vs `sam_b` (the unavoidable per-pair
677/// floor — RO's first block is an LCS, Θ(|a|)) and return a checksum so it isn't optimized out.
678/// Measures the pure scan throughput separate from the RO recursion.
679#[must_use]
680pub fn matching_stats_cost(a: &[char], sam_b: &Sam) -> u64 {
681    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
682        matching_stats_into(a, sam_b, fstate, fmatch);
683        fmatch.iter().map(|&x| u64::from(x)).sum()
684    })
685}
686
687fn gestalt_m_with(a: &[char], b: &[char], sam_b: &Sam) -> usize {
688    let n = a.len();
689    if n == 0 {
690        return 0;
691    }
692    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
693        matching_stats_into(a, sam_b, fstate, fmatch);
694        gestalt_m_recur(a, b, sam_b, fstate, fmatch)
695    })
696}
697
698#[allow(clippy::cast_sign_loss, clippy::many_single_char_names)]
699fn gestalt_m_recur(a: &[char], b: &[char], sam_b: &Sam, fstate: &[u32], fmatch: &[u32]) -> usize {
700    let n = a.len();
701    let mut total = 0usize;
702    STACK_BUF.with_borrow_mut(|stack| {
703        stack.clear();
704        stack.push((0, n, 0, b.len()));
705        while let Some((al, ar, bl, br)) = stack.pop() {
706            if al >= ar || bl >= br {
707                continue;
708            }
709            let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br);
710            if l == 0 {
711                continue;
712            }
713            total += l;
714            stack.push((al, i, bl, j));
715            stack.push((i + l, ar, j + l, br));
716        }
717    });
718    total
719}
720
721/// Threshold-aware exact decision: does `RO(a,b) ≥ threshold`? Computes the matched total M
722/// with **two-sided early-exit** — accept the instant `M ≥ need`, reject the instant the upper
723/// bound `M + Σ min(window lengths) < need`. Exact (the bound never drops a qualifying pair),
724/// and dissimilar pairs abort long before the full decomposition. `need = ⌈threshold·(|a|+|b|)/2⌉`.
725#[allow(
726    clippy::cast_precision_loss,
727    clippy::cast_sign_loss,
728    clippy::cast_possible_truncation,
729    clippy::many_single_char_names
730)]
731#[must_use]
732pub fn gestalt_qualifies(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> bool {
733    let nb = b.len();
734    let total = a.len() + nb;
735    if total == 0 {
736        return true;
737    }
738    let need = (threshold * total as f64 / 2.0).ceil() as usize;
739    if need == 0 {
740        return true;
741    }
742    let n = a.len();
743    if n == 0 || nb == 0 {
744        return false; // M = 0 < need
745    }
746    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
747        matching_stats_into(a, sam_b, fstate, fmatch);
748        gestalt_qualifies_ms(n, nb, sam_b, threshold, fstate, fmatch)
749    })
750}
751
752/// The threshold early-exit recursion given **precomputed** matching statistics (`fstate`/`fmatch`
753/// for `a` of length `na` vs `sam_b`). Split out so the scan can be done in an MLP batch separately.
754#[allow(
755    clippy::cast_precision_loss,
756    clippy::cast_sign_loss,
757    clippy::cast_possible_truncation,
758    clippy::many_single_char_names
759)]
760#[must_use]
761pub fn gestalt_qualifies_ms(na: usize, nb: usize, sam_b: &Sam, threshold: f64, fstate: &[u32], fmatch: &[u32]) -> bool {
762    let total = na + nb;
763    if total == 0 {
764        return true;
765    }
766    let need = (threshold * total as f64 / 2.0).ceil() as usize;
767    if need == 0 {
768        return true;
769    }
770    if na == 0 || nb == 0 {
771        return false;
772    }
773    STACK_BUF.with_borrow_mut(|stack| {
774        let mut m = 0usize;
775        let mut pending = na.min(nb);
776        stack.clear();
777        stack.push((0, na, 0, nb));
778        while let Some((al, ar, bl, br)) = stack.pop() {
779            pending -= (ar - al).min(br - bl);
780            if al >= ar || bl >= br {
781                continue;
782            }
783            let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br);
784            if l == 0 {
785                continue;
786            }
787            m += l;
788            if m >= need {
789                return true;
790            }
791            pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
792            if m + pending < need {
793                return false;
794            }
795            stack.push((al, i, bl, j));
796            stack.push((i + l, ar, j + l, br));
797        }
798        m >= need
799    })
800}
801
802
803/// Edge test that also yields the exact ratio: `Some(ratio)` iff `RO(a,b) >= threshold`, else `None`.
804/// Keeps the **reject** early-exit (abort the instant the upper bound `M + Σ min(window) < need`) but
805/// computes full M on the qualifying branch so the cached ratio feeds the cluster `min_sim` — caching
806/// edge ratios here means `min_sim` only recomputes the rare non-edge (chained) intra-cluster pairs,
807/// not the whole dense blob. Bit-identical to `(let r = ratio; (r >= threshold).then_some(r))`.
808#[allow(
809    clippy::cast_precision_loss,
810    clippy::cast_sign_loss,
811    clippy::cast_possible_truncation,
812    clippy::many_single_char_names
813)]
814#[must_use]
815pub fn gestalt_edge(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> Option<f64> {
816    let na = a.len();
817    let nb = b.len();
818    let total = na + nb;
819    if total == 0 {
820        return Some(1.0);
821    }
822    let need = (threshold * total as f64 / 2.0).ceil() as usize;
823    if na == 0 || nb == 0 {
824        return (need == 0).then_some(0.0);
825    }
826    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
827        matching_stats_into(a, sam_b, fstate, fmatch);
828        STACK_BUF.with_borrow_mut(|stack| {
829            let mut m = 0usize;
830            let mut pending = na.min(nb);
831            stack.clear();
832            stack.push((0, na, 0, nb));
833            while let Some((al, ar, bl, br)) = stack.pop() {
834                pending -= (ar - al).min(br - bl);
835                if al >= ar || bl >= br {
836                    continue;
837                }
838                let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br);
839                if l == 0 {
840                    continue;
841                }
842                m += l;
843                pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
844                if m + pending < need {
845                    return None; // upper bound below the bar ⇒ certified non-edge, abort
846                }
847                stack.push((al, i, bl, j));
848                stack.push((i + l, ar, j + l, br));
849            }
850            (m >= need).then(|| 2.0 * m as f64 / total as f64)
851        })
852    })
853}
854
855/// Cluster `min_sim` helper: returns the **exact** ratio when `RO(a,b) <= cap`, otherwise a value
856/// `> cap` (it accept-early-exits the instant M exceeds the cap's bound). Used to find a cluster's
857/// minimum pairwise ratio with `cur = cur.min(gestalt_ratio_capped(a, b, sam, cur))`: in a dense
858/// cluster the dominant high-ratio pairs blow past `cur` and are pruned after the first block or two,
859/// so full M is computed only for the genuinely-low pairs. Bit-identical minimum to the full ratio.
860#[allow(
861    clippy::cast_precision_loss,
862    clippy::cast_sign_loss,
863    clippy::cast_possible_truncation,
864    clippy::many_single_char_names
865)]
866#[must_use]
867pub fn gestalt_ratio_capped(a: &[char], b: &[char], sam_b: &Sam, cap: f64) -> f64 {
868    let na = a.len();
869    let nb = b.len();
870    let total = na + nb;
871    if total == 0 {
872        return 1.0; // two empty strings ⇒ ratio 1.0
873    }
874    if na == 0 || nb == 0 {
875        return 0.0; // M = 0 ⇒ ratio 0 (a valid minimum candidate, <= cap for any cap >= 0)
876    }
877    // ratio > cap ⟺ 2M/total > cap ⟺ M > cap·total/2 ⟺ M >= ⌊cap·total/2⌋ + 1.
878    let exceed = (cap * total as f64 / 2.0).floor() as usize + 1;
879    MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
880        matching_stats_into(a, sam_b, fstate, fmatch);
881        STACK_BUF.with_borrow_mut(|stack| {
882            let mut m = 0usize;
883            stack.clear();
884            stack.push((0, na, 0, nb));
885            while let Some((al, ar, bl, br)) = stack.pop() {
886                if al >= ar || bl >= br {
887                    continue;
888                }
889                let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br);
890                if l == 0 {
891                    continue;
892                }
893                m += l;
894                if m >= exceed {
895                    return 2.0; // ratio > cap ⇒ cannot be the minimum; prune (any value > cap works)
896                }
897                stack.push((al, i, bl, j));
898                stack.push((i + l, ar, j + l, br));
899            }
900            2.0 * m as f64 / total as f64 // full exact M ⇒ exact ratio (<= cap)
901        })
902    })
903}
904
905/// Ratio of `a` vs the string the prebuilt `sam_b` was built from (= `b`).
906#[allow(clippy::cast_precision_loss)]
907#[must_use]
908pub fn gestalt_ratio_prebuilt(a: &[char], b: &[char], sam_b: &Sam) -> f64 {
909    let total = a.len() + b.len();
910    if total == 0 {
911        return 1.0;
912    }
913    2.0 * gestalt_m_with(a, b, sam_b) as f64 / total as f64
914}
915
916#[allow(clippy::cast_precision_loss)]
917#[must_use]
918pub fn gestalt_ratio_chars(a: &[char], b: &[char]) -> f64 {
919    if a.is_empty() && b.is_empty() {
920        return 1.0;
921    }
922    let sam_b = build_sam(b);
923    gestalt_ratio_prebuilt(a, b, &sam_b)
924}
925
926/// `gestalt_ratio(a, b) -> float`: exact difflib ratio, computed via suffix-automaton LCS.
927#[must_use]
928pub fn gestalt_ratio(a: &str, b: &str) -> f64 {
929    let av: Vec<char> = a.chars().collect();
930    let bv: Vec<char> = b.chars().collect();
931    gestalt_ratio_chars(&av, &bv)
932}
933
934#[cfg(test)]
935mod tests {
936    #![allow(clippy::float_cmp, clippy::unreadable_literal)]
937    use super::{build_sam, gestalt_edge, gestalt_qualifies, gestalt_ratio_capped, gestalt_ratio_chars};
938
939    fn r(a: &str, b: &str) -> f64 {
940        gestalt_ratio_chars(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>())
941    }
942
943    #[test]
944    fn matches_difflib_reference_values() {
945        assert_eq!(r("", ""), 1.0);
946        assert_eq!(r("", "x"), 0.0);
947        assert_eq!(r("abc", "abc"), 1.0);
948        assert_eq!(r("abc", "abd"), 0.6666666666666666);
949        assert_eq!(r("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
950        assert_eq!(r("tide", "diet"), 0.25);
951        assert_eq!(r("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
952        assert_eq!(r("aaaaabbbbbccccc", "aaaaaxbbbbbxccccc"), 0.9375);
953    }
954
955    // `gestalt_qualifies` (threshold early-exit) must be byte-for-byte with `ratio >= T`.
956    #[test]
957    fn qualifies_matches_ratio_threshold() {
958        // xorshift PRNG → deterministic pseudo-random ASCII strings over a small alphabet
959        let mut s: u64 = 0x1234_5678_9abc_def1;
960        let mut next = || {
961            s ^= s << 13;
962            s ^= s >> 7;
963            s ^= s << 17;
964            s
965        };
966        for _ in 0..3000 {
967            let la = (next() % 40) as usize;
968            let lb = (next() % 40) as usize;
969            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> Vec<char> {
970                (0..n).map(|_| char::from(b'a' + (rng() % 4) as u8)).collect()
971            };
972            let a = mk(la, &mut next);
973            let b = mk(lb, &mut next);
974            let sam_b = build_sam(&b);
975            let ratio = gestalt_ratio_chars(&a, &b);
976            for &t in &[0.0_f64, 0.25, 0.5, 0.75, 0.9, 1.0] {
977                assert_eq!(
978                    gestalt_qualifies(&a, &b, &sam_b, t),
979                    ratio >= t,
980                    "a={a:?} b={b:?} t={t} ratio={ratio}"
981                );
982                // gestalt_edge(cap=t): Some(exact ratio) iff qualifying; bit-exact ratio when Some.
983                let edge = gestalt_edge(&a, &b, &sam_b, t);
984                assert_eq!(edge.is_some(), ratio >= t, "edge.is_some a={a:?} b={b:?} t={t} ratio={ratio}");
985                if let Some(r) = edge {
986                    assert_eq!(r, ratio, "edge ratio a={a:?} b={b:?} t={t}");
987                }
988                // gestalt_ratio_capped(cap=t): exact ratio when ratio <= t, else a value > t (pruned).
989                let capped = gestalt_ratio_capped(&a, &b, &sam_b, t);
990                if ratio <= t {
991                    assert_eq!(capped, ratio, "capped exact a={a:?} b={b:?} cap={t} ratio={ratio}");
992                } else {
993                    assert!(capped > t, "capped prune a={a:?} b={b:?} cap={t} ratio={ratio} got={capped}");
994                }
995            }
996        }
997    }
998}