difflib_fast/gestalt.rs
1//! Gestalt-Chain: fast **exact** Ratcliff–Obershelp via a suffix automaton.
2//!
3//! difflib's `ratio` is slow because `find_longest_match` is O(|a|·|b|) (it rescans every
4//! occurrence of popular characters). The matched-block total M is the recursive
5//! longest-common-substring decomposition; a **suffix automaton** finds the longest common
6//! substring in O(|a|+|b|) regardless of character frequency, and recursing on the left/
7//! right remainders reproduces difflib's M with the *same* tie-break (longest; earliest in
8//! a; earliest in b). So `gestalt_ratio` equals `difflib.SequenceMatcher.ratio()` exactly.
9//!
10//! Hot-path engineering for the all-pairs join:
11//! * transitions are stored **CSR** (per-state sorted slice) → O(log deg) lookup, O(len)
12//! memory, no hashing — and crucially no O(deg) scan at the high-degree root, which the
13//! scan hammers when strings are dissimilar;
14//! * the b-side automaton is **prebuilt once per string** (`build_sam`) and reused for
15//! every pair, so the all-pairs cost is n builds + n² scans, not n² builds.
16//! * **prefetch hints attempted on the per-iteration `node[state]` load** — the SAM walk is a
17//! data-dependent pointer chase the hardware prefetcher cannot anticipate. A `prfm pldl1keep`
18//! (`AArch64`) / `_mm_prefetch` (x86) experiment was MEASURED on M3 Pro and did NOT pay off
19//! (-10% to -2% across mypy/sympy/django) — the SAMs fit in L2 already, and the prefetch
20//! instruction burns execution slots without producing a hit-rate improvement. See the tombstone
21//! near `matching_stats_into` for details.
22//!
23//! Operates on `char` (code points), so it is bit-identical to difflib on non-ASCII.
24
25// Note: a software-prefetch experiment (prfm pldl1keep / _mm_prefetch for the next iteration's
26// node[state] load) was tried and DID NOT pay off on M3 Pro — the prefetch instruction itself
27// burned execution slots without producing a measurable hit-rate improvement, because the SAM
28// pages fit comfortably in L2 for the per-pair working set (~100KB for the two SAMs in play)
29// and the hardware prefetcher already keeps the hot stretches warm. Net: -10% to -2% across
30// mypy/sympy/django. Left here as a tombstone so future readers don't waste a day on it.
31
32/// Size of the root's direct transition table — covers ASCII (the canonical text's alphabet).
33const ROOT_TBL: usize = 128;
34
35/// `#[cfg(feature = "instrument")]` per-call counters for data-driven perf decisions. Compiles to
36/// no-op in default builds (no atomic ops, no codegen). Enabled via `cargo build --features
37/// instrument`. The counters use `Ordering::Relaxed` because we only care about totals at
38/// end-of-workload (no causal ordering needed) — the relaxed cost is one atomic add per inc.
39///
40/// Cross-thread aggregation: every rayon worker writes into the same global atomics; we read them
41/// after the workload completes via `instrument::dump()`. Use `instrument::reset()` between
42/// successive workload runs in the same process so the numbers stay per-workload.
43#[cfg(feature = "instrument")]
44pub mod instrument {
45 use std::sync::atomic::{AtomicU64, Ordering};
46
47 /// Histogram bucket count for chain walk + recursion depth. Anything deeper than 63 falls in
48 /// the last bucket — measured tail on canonical Python is < 30 in practice, so 64 is plenty.
49 pub const HIST_BUCKETS: usize = 64;
50
51 pub static LONGEST_IN_CALLS: AtomicU64 = AtomicU64::new(0);
52 pub static PAIRS_PROCESSED: AtomicU64 = AtomicU64::new(0);
53 pub static MAX_LE_CALLS: AtomicU64 = AtomicU64::new(0);
54 pub static MAX_LE_FAST_PATH: AtomicU64 = AtomicU64::new(0); // lastpos<=x or firstpos>x shortcut
55 pub static MAX_LE_LINEAR: AtomicU64 = AtomicU64::new(0); // cnt <= LINEAR_MAX scan
56 pub static MAX_LE_LINEAR_LEN_SUM: AtomicU64 = AtomicU64::new(0); // total entries scanned
57 pub static MAX_LE_SEGTREE: AtomicU64 = AtomicU64::new(0); // merge-sort-tree walk
58 pub static MIN_IN_CALLS: AtomicU64 = AtomicU64::new(0);
59 pub static MIN_IN_FAST_PATH: AtomicU64 = AtomicU64::new(0);
60 pub static MIN_IN_LINEAR: AtomicU64 = AtomicU64::new(0);
61 pub static MIN_IN_LINEAR_LEN_SUM: AtomicU64 = AtomicU64::new(0);
62 pub static MIN_IN_SEGTREE: AtomicU64 = AtomicU64::new(0);
63 pub static FMATCH_ZERO: AtomicU64 = AtomicU64::new(0);
64 pub static FMATCH_NONZERO: AtomicU64 = AtomicU64::new(0);
65 pub static FMATCH_SUM: AtomicU64 = AtomicU64::new(0); // sum of all non-zero fmatch values
66
67 pub static CHAIN_DEPTHS: [AtomicU64; HIST_BUCKETS] = {
68 // const init via repeated AtomicU64::new(0) — array initializer.
69 const ZERO: AtomicU64 = AtomicU64::new(0);
70 [ZERO; HIST_BUCKETS]
71 };
72 pub static RECURSION_DEPTHS: [AtomicU64; HIST_BUCKETS] = {
73 const ZERO: AtomicU64 = AtomicU64::new(0);
74 [ZERO; HIST_BUCKETS]
75 };
76
77 pub fn reset() {
78 LONGEST_IN_CALLS.store(0, Ordering::Relaxed);
79 PAIRS_PROCESSED.store(0, Ordering::Relaxed);
80 MAX_LE_CALLS.store(0, Ordering::Relaxed);
81 MAX_LE_FAST_PATH.store(0, Ordering::Relaxed);
82 MAX_LE_LINEAR.store(0, Ordering::Relaxed);
83 MAX_LE_LINEAR_LEN_SUM.store(0, Ordering::Relaxed);
84 MAX_LE_SEGTREE.store(0, Ordering::Relaxed);
85 MIN_IN_CALLS.store(0, Ordering::Relaxed);
86 MIN_IN_FAST_PATH.store(0, Ordering::Relaxed);
87 MIN_IN_LINEAR.store(0, Ordering::Relaxed);
88 MIN_IN_LINEAR_LEN_SUM.store(0, Ordering::Relaxed);
89 MIN_IN_SEGTREE.store(0, Ordering::Relaxed);
90 FMATCH_ZERO.store(0, Ordering::Relaxed);
91 FMATCH_NONZERO.store(0, Ordering::Relaxed);
92 FMATCH_SUM.store(0, Ordering::Relaxed);
93 for b in &CHAIN_DEPTHS {
94 b.store(0, Ordering::Relaxed);
95 }
96 for b in &RECURSION_DEPTHS {
97 b.store(0, Ordering::Relaxed);
98 }
99 }
100
101 /// Dump all counters as a human-readable multi-line string. Includes derived stats
102 /// (avg per pair, % linear vs seg-tree, percentile of chain depth, etc.).
103 #[must_use]
104 pub fn dump() -> String {
105 let mut s = String::with_capacity(2048);
106 let pairs = PAIRS_PROCESSED.load(Ordering::Relaxed).max(1);
107 let li = LONGEST_IN_CALLS.load(Ordering::Relaxed);
108 let mxc = MAX_LE_CALLS.load(Ordering::Relaxed).max(1);
109 let mxfp = MAX_LE_FAST_PATH.load(Ordering::Relaxed);
110 let mxln = MAX_LE_LINEAR.load(Ordering::Relaxed);
111 let mxlnsum = MAX_LE_LINEAR_LEN_SUM.load(Ordering::Relaxed);
112 let mxsg = MAX_LE_SEGTREE.load(Ordering::Relaxed);
113 let mic = MIN_IN_CALLS.load(Ordering::Relaxed);
114 let micfp = MIN_IN_FAST_PATH.load(Ordering::Relaxed);
115 let micln = MIN_IN_LINEAR.load(Ordering::Relaxed);
116 let miclnsum = MIN_IN_LINEAR_LEN_SUM.load(Ordering::Relaxed);
117 let micsg = MIN_IN_SEGTREE.load(Ordering::Relaxed);
118 let fz = FMATCH_ZERO.load(Ordering::Relaxed);
119 let fnz = FMATCH_NONZERO.load(Ordering::Relaxed);
120 let fsum = FMATCH_SUM.load(Ordering::Relaxed);
121 let ftotal = (fz + fnz).max(1);
122
123 s.push_str(&format!("=== gestalt::instrument dump ===\n"));
124 s.push_str(&format!(
125 "pairs processed: {}\n",
126 PAIRS_PROCESSED.load(Ordering::Relaxed),
127 ));
128 s.push_str(&format!(
129 "longest_in calls: {} ({:.1} per pair)\n",
130 li,
131 li as f64 / pairs as f64,
132 ));
133 s.push_str(&format!(
134 "max_le calls: {} ({:.1} per pair, {:.1} per longest_in)\n",
135 mxc,
136 mxc as f64 / pairs as f64,
137 mxc as f64 / li.max(1) as f64,
138 ));
139 s.push_str(&format!(
140 " fast-path: {} ({:.1}%)\n",
141 mxfp,
142 mxfp as f64 / mxc as f64 * 100.0,
143 ));
144 s.push_str(&format!(
145 " linear scan: {} ({:.1}%) avg scan len: {:.1}\n",
146 mxln,
147 mxln as f64 / mxc as f64 * 100.0,
148 mxlnsum as f64 / mxln.max(1) as f64,
149 ));
150 s.push_str(&format!(
151 " seg-tree: {} ({:.1}%)\n",
152 mxsg,
153 mxsg as f64 / mxc as f64 * 100.0,
154 ));
155 s.push_str(&format!(
156 "min_in calls: {} ({:.1} per pair)\n",
157 mic,
158 mic as f64 / pairs as f64,
159 ));
160 s.push_str(&format!(
161 " fast-path: {} ({:.1}%)\n",
162 micfp,
163 micfp as f64 / mic.max(1) as f64 * 100.0,
164 ));
165 s.push_str(&format!(
166 " linear scan: {} ({:.1}%) avg scan len: {:.1}\n",
167 micln,
168 micln as f64 / mic.max(1) as f64 * 100.0,
169 miclnsum as f64 / micln.max(1) as f64,
170 ));
171 s.push_str(&format!(
172 " seg-tree: {} ({:.1}%)\n",
173 micsg,
174 micsg as f64 / mic.max(1) as f64 * 100.0,
175 ));
176 s.push_str(&format!(
177 "fmatch values: {:.1}% zero ({} z / {} nz) avg-nz {:.2}\n",
178 fz as f64 / ftotal as f64 * 100.0,
179 fz,
180 fnz,
181 fsum as f64 / fnz.max(1) as f64,
182 ));
183
184 let mut chain_total = 0u64;
185 let chain: Vec<u64> = CHAIN_DEPTHS.iter().map(|a| a.load(Ordering::Relaxed)).collect();
186 for &v in &chain {
187 chain_total += v;
188 }
189 s.push_str(&format!(
190 "chain walk depths ({} total walks):\n",
191 chain_total,
192 ));
193 let pct = |target: f64| -> usize {
194 let mut acc = 0u64;
195 for (i, &v) in chain.iter().enumerate() {
196 acc += v;
197 if acc as f64 >= target * chain_total as f64 {
198 return i;
199 }
200 }
201 chain.len() - 1
202 };
203 s.push_str(&format!(
204 " p50={} p90={} p95={} p99={} max-bucket={}\n",
205 pct(0.50),
206 pct(0.90),
207 pct(0.95),
208 pct(0.99),
209 chain.iter().rposition(|&v| v > 0).unwrap_or(0),
210 ));
211 // Compact histogram preview (first 16 buckets)
212 s.push_str(" hist[0..16]: ");
213 for &v in chain.iter().take(16) {
214 s.push_str(&format!("{} ", v));
215 }
216 s.push('\n');
217
218 let mut rec_total = 0u64;
219 let rec: Vec<u64> = RECURSION_DEPTHS.iter().map(|a| a.load(Ordering::Relaxed)).collect();
220 for &v in &rec {
221 rec_total += v;
222 }
223 s.push_str(&format!(
224 "recursion depths ({} stack pushes):\n",
225 rec_total,
226 ));
227 s.push_str(" hist[0..16]: ");
228 for &v in rec.iter().take(16) {
229 s.push_str(&format!("{} ", v));
230 }
231 s.push('\n');
232
233 s
234 }
235}
236
237/// Helper that the instrument hooks call when the feature is enabled; no-op otherwise.
238#[cfg(feature = "instrument")]
239#[inline(always)]
240fn instr_inc(c: &std::sync::atomic::AtomicU64, by: u64) {
241 c.fetch_add(by, std::sync::atomic::Ordering::Relaxed);
242}
243#[cfg(feature = "instrument")]
244#[inline(always)]
245fn instr_hist(buckets: &[std::sync::atomic::AtomicU64; instrument::HIST_BUCKETS], depth: usize) {
246 buckets[depth.min(instrument::HIST_BUCKETS - 1)]
247 .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
248}
249
250
251/// Suffix automaton with CSR (sorted-per-state) transitions — built once, queried by scans.
252///
253/// For the range-restricted recursion (fix b), each state also carries its **endpos** as a
254/// contiguous slice `[dfs_in, dfs_in+dfs_cnt)` of `epos` (the end-positions in b, laid out by
255/// a DFS of the suffix-link tree so a subtree is contiguous). A merge-sort tree over `epos`
256/// answers "is there an end-position in [lo,hi] within this state's subtree, and the min/max
257/// such" — so the whole RO recursion runs on this one prebuilt SAM, with **no sub-builds**.
258#[derive(Clone)]
259pub struct Sam {
260 // Per-state hot fields packed into one cache-line-friendly struct `[len, link, edge_lo, edge_hi]`
261 // so a state visit in the scan/recursion loads len + link + transition-range with a SINGLE cache
262 // miss (these were 5 separate arrays — under multi-thread DRAM-bandwidth pressure, fewer distinct
263 // lines per visit is the dominant win). `link` is u32: the root's link (-1) is stored as 0 and
264 // never read, since both the scan and `longest_in` stop at state 0 before following its link.
265 node: Vec<[u32; 4]>,
266 // Transitions packed as `(char as u64) << 32 | to`, sorted by char within each state's range
267 // [edge_lo, edge_hi). Co-locating char+target means the binary-search key and the taken edge's
268 // target live on the same cache line (was two parallel arrays `csr_char`/`csr_to`).
269 edges: Vec<u64>,
270 firstpos: Vec<u32>, // per state: *smallest* end-position (build-time; folded into epmeta)
271 lastpos: Vec<u32>, // per state: *largest* end-position (build-time; folded into epmeta)
272 // Direct lookup for the root's ASCII transitions (`root_next[c] = state`, or -1). The root is
273 // the high-degree state hit after every match reset; this makes its transition O(1) instead
274 // of a binary search. Non-ASCII root chars (rare) fall back to the edge search, so it's general.
275 root_next: Vec<i32>,
276 dfs_in: Vec<u32>, // per state: start index into the endpos array of its endpos range
277 dfs_cnt: Vec<u32>, // per state: number of end-positions in its subtree
278 // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style: node `i`'s sorted
279 // range is seg_data[seg_off[i]..seg_off[i+1]] (node 1 = root). A state's endpos is the leaf
280 // range [dfs_in, dfs_in+dfs_cnt); the tree answers range predecessor/successor adaptively in
281 // O(log²·slice) — and `firstpos`/`lastpos` give an O(1) fast path that skips it when the query
282 // bound doesn't split the state's endpos span (the common case for wide windows).
283 seg_data: Vec<u32>,
284 seg_off: Vec<u32>,
285 seg_n: usize, // number of leaves (power of two) in the merge-sort tree
286 // Precomputed `len(link[s])` per state — used by `longest_in`'s chain walk's `band_min` check.
287 // Without this, every chain step does TWO node loads: `node[cur]` for (len, link, ...), then
288 // `node[node[cur].link]` for its `len`. Storing the link's length in a parallel array lets the
289 // hot loop fold both into one cache line. Saves ~1 memory access per chain step on M3 — chain
290 // walks are pointer-chase-bound, so this is a measurable win on `longest_in` (the 70%-of-CPU
291 // function on prepared ratio_many).
292 link_len: Vec<u32>,
293 // DFS-ordered end-positions (each state's endpos = contiguous slice [dfs_in, dfs_in+dfs_cnt)).
294 // For small endpos sets a cache-friendly linear scan of this beats the scattered tree descent.
295 epos: Vec<u32>,
296 // Query-hot per-state fields packed into one cache line `[firstpos, lastpos, dfs_in, dfs_cnt]`
297 // so `max_le`/`min_in` load all four with a single cache miss (they were 4 separate arrays).
298 epmeta: Vec<[u32; 4]>,
299 // Optimization B (Phase 4): chain-walk hot slot. Layout:
300 // [0] = len (same as node[s][0])
301 // [1] = link (same as node[s][1])
302 // [2] = link_len[s]
303 // [3] = padding
304 // [4..8] = epmeta[s] = [firstpos, lastpos, dfs_in, dfs_cnt]
305 //
306 // 32 bytes / state — one cache-line-aligned load supplies everything `longest_in`'s chain
307 // walk + `max_le`/`min_in`'s fast path need for one state. Replaces three separate scattered
308 // accesses (`node[cur]`, `link_len[cur]`, `epmeta[cur]`) on the chain-walk hot path; PMU said
309 // L1D miss rate was 0.51 % and accounted for 18 % of cycles, distributed across these three
310 // per-state arrays. Co-locating folds 3 cache-line misses per chain step into 1.
311 //
312 // Memory cost: +32 bytes / state ≈ +1.6 MB on a 50 k-state SAM. Total `Sam` size grows ~20 %.
313 // Bench `bench_new delta-sweep` validates a wall-time reduction on the exact path.
314 chain_slot: Vec<[u32; 8]>,
315}
316
317/// Below this endpos-set size, `max_le`/`min_in` linear-scan the contiguous DFS-ordered endpos
318/// (cache-friendly) instead of the merge-sort tree (scattered) — most queried sets are this small.
319const LINEAR_MAX: usize = 256;
320
321impl Sam {
322 fn empty() -> Self {
323 Sam {
324 node: Vec::new(),
325 edges: Vec::new(),
326 firstpos: Vec::new(),
327 lastpos: Vec::new(),
328 root_next: Vec::new(),
329 dfs_in: Vec::new(),
330 dfs_cnt: Vec::new(),
331 seg_data: Vec::new(),
332 seg_off: Vec::new(),
333 seg_n: 0,
334 epos: Vec::new(),
335 epmeta: Vec::new(),
336 link_len: Vec::new(),
337 chain_slot: Vec::new(),
338 }
339 }
340
341 /// Node `i`'s sorted endpos range in the flattened merge-sort tree.
342 fn seg_node(&self, i: usize) -> &[u32] {
343 &self.seg_data[self.seg_off[i] as usize..self.seg_off[i + 1] as usize]
344 }
345
346 /// Read-only view of the packed `[len, link, edge_lo, edge_hi]` per state — needed by the
347 /// GPU port (`gpu::matching_stats_gpu`) to serialize the SAM into a Metal buffer. The kernel
348 /// reads this slice via index calculations, so we expose it raw (one `[u32; 4]` per state).
349 #[must_use]
350 pub fn nodes(&self) -> &[[u32; 4]] {
351 &self.node
352 }
353
354 /// Read-only view of the packed edge slice: `(char << 32) | target_state`, sorted by char
355 /// within each state's `[edge_lo, edge_hi)` range. The GPU kernel does binary search over
356 /// this slice exactly as `csr_lookup` does on the CPU.
357 #[must_use]
358 pub fn edges_packed(&self) -> &[u64] {
359 &self.edges
360 }
361
362 /// Read-only view of the root's direct ASCII transition table (`root_next[c] = state`, or
363 /// `-1` for missing). 128 entries per SAM. The GPU kernel uses this to skip the binary
364 /// search at the root state, exactly as the CPU does.
365 #[must_use]
366 pub fn root_next_table(&self) -> &[i32] {
367 &self.root_next
368 }
369
370 /// `state`'s endpos slice as the merge-sort-tree leaf range [l, r), from packed `epmeta`.
371 fn endpos_range_m(m: &[u32; 4], seg_n: usize) -> (usize, usize) {
372 let s = m[2] as usize; // dfs_in
373 (s + seg_n, s + m[3] as usize + seg_n) // dfs_cnt
374 }
375
376 /// Largest end-position `<= x` among `state`'s endpos, with pre-loaded metadata. Used by
377 /// `longest_in`'s chain walk where `m` was already pulled from `chain_slot[cur]` (Optimization
378 /// B) — saves an `epmeta[state]` load that would hit a separate cache line.
379 fn max_le_with_meta(&self, m: &[u32; 4], x: u32) -> Option<u32> {
380 #[cfg(feature = "instrument")]
381 instr_inc(&instrument::MAX_LE_CALLS, 1);
382 // O(1) fast path: x doesn't split the state's [firstpos, lastpos] span.
383 if m[1] <= x {
384 #[cfg(feature = "instrument")]
385 instr_inc(&instrument::MAX_LE_FAST_PATH, 1);
386 return Some(m[1]);
387 }
388 if m[0] > x {
389 #[cfg(feature = "instrument")]
390 instr_inc(&instrument::MAX_LE_FAST_PATH, 1);
391 return None;
392 }
393 let cnt = m[3] as usize;
394 if cnt <= LINEAR_MAX {
395 #[cfg(feature = "instrument")]
396 {
397 instr_inc(&instrument::MAX_LE_LINEAR, 1);
398 instr_inc(&instrument::MAX_LE_LINEAR_LEN_SUM, cnt as u64);
399 }
400 let lo = m[2] as usize;
401 let mut best = 0u32;
402 // SAFETY: `m` comes from a valid SAM state's epmeta or chain_slot, so the slice
403 // [m[2], m[2]+m[3]) is within `self.epos` (built that way at SAM construction).
404 #[allow(clippy::undocumented_unsafe_blocks)]
405 for &v in unsafe { self.epos.get_unchecked(lo..lo + cnt) } {
406 best = best.max(if v <= x { v } else { 0 });
407 }
408 return Some(best);
409 }
410 #[cfg(feature = "instrument")]
411 instr_inc(&instrument::MAX_LE_SEGTREE, 1);
412 let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
413 let mut best: Option<u32> = None;
414 while l < r {
415 if l & 1 == 1 {
416 best = best.max(node_max_le(self.seg_node(l), x));
417 l += 1;
418 }
419 if r & 1 == 1 {
420 r -= 1;
421 best = best.max(node_max_le(self.seg_node(r), x));
422 }
423 l >>= 1;
424 r >>= 1;
425 }
426 best
427 }
428
429 /// Smallest end-position in `[lo, hi]` with pre-loaded metadata. See `max_le_with_meta`.
430 fn min_in_with_meta(&self, m: &[u32; 4], lo: u32, hi: u32) -> Option<u32> {
431 #[cfg(feature = "instrument")]
432 instr_inc(&instrument::MIN_IN_CALLS, 1);
433 if m[0] >= lo {
434 #[cfg(feature = "instrument")]
435 instr_inc(&instrument::MIN_IN_FAST_PATH, 1);
436 return (m[0] <= hi).then_some(m[0]);
437 }
438 if m[1] < lo {
439 #[cfg(feature = "instrument")]
440 instr_inc(&instrument::MIN_IN_FAST_PATH, 1);
441 return None;
442 }
443 let cnt = m[3] as usize;
444 if cnt <= LINEAR_MAX {
445 #[cfg(feature = "instrument")]
446 {
447 instr_inc(&instrument::MIN_IN_LINEAR, 1);
448 instr_inc(&instrument::MIN_IN_LINEAR_LEN_SUM, cnt as u64);
449 }
450 let off = m[2] as usize;
451 let mut best = u32::MAX;
452 // SAFETY: `m` is from a valid SAM state; endpos slice within bounds (same as max_le).
453 #[allow(clippy::undocumented_unsafe_blocks)]
454 for &v in unsafe { self.epos.get_unchecked(off..off + cnt) } {
455 best = best.min(if v >= lo && v <= hi { v } else { u32::MAX });
456 }
457 return (best != u32::MAX).then_some(best);
458 }
459 #[cfg(feature = "instrument")]
460 instr_inc(&instrument::MIN_IN_SEGTREE, 1);
461 let (mut l, mut r) = Self::endpos_range_m(m, self.seg_n);
462 let mut best: Option<u32> = None;
463 while l < r {
464 if l & 1 == 1 {
465 best = merge_min(best, node_min_in(self.seg_node(l), lo, hi));
466 l += 1;
467 }
468 if r & 1 == 1 {
469 r -= 1;
470 best = merge_min(best, node_min_in(self.seg_node(r), lo, hi));
471 }
472 l >>= 1;
473 r >>= 1;
474 }
475 best
476 }
477
478}
479
480/// Largest value `<= x` in a sorted slice, or None.
481fn node_max_le(sorted: &[u32], x: u32) -> Option<u32> {
482 let k = sorted.partition_point(|&v| v <= x);
483 (k > 0).then(|| sorted[k - 1])
484}
485
486/// Smallest value in `[lo, hi]` in a sorted slice, or None.
487fn node_min_in(sorted: &[u32], lo: u32, hi: u32) -> Option<u32> {
488 let k = sorted.partition_point(|&v| v < lo);
489 sorted.get(k).copied().filter(|&v| v <= hi)
490}
491
492fn merge_min(a: Option<u32>, b: Option<u32>) -> Option<u32> {
493 match (a, b) {
494 (Some(x), Some(y)) => Some(x.min(y)),
495 (x, None) => x,
496 (None, y) => y,
497 }
498}
499
500/// Transient builder (Ukkonen online SAM with a linked-list transition arena).
501struct Builder {
502 edge_char: Vec<char>,
503 edge_to: Vec<u32>,
504 edge_next: Vec<i32>,
505 head: Vec<i32>,
506 link: Vec<i32>,
507 len: Vec<u32>,
508 firstpos: Vec<u32>,
509 primary: Vec<bool>, // true for the per-position state (its firstpos is a real end-position)
510 last: u32,
511}
512
513impl Builder {
514 fn empty() -> Self {
515 Builder {
516 edge_char: Vec::new(),
517 edge_to: Vec::new(),
518 edge_next: Vec::new(),
519 head: Vec::new(),
520 link: Vec::new(),
521 len: Vec::new(),
522 firstpos: Vec::new(),
523 primary: Vec::new(),
524 last: 0,
525 }
526 }
527
528 fn new(cap: usize) -> Self {
529 let mut b = Builder::empty();
530 b.reset(cap);
531 b
532 }
533
534 /// Clear the arenas (keeping capacity) and re-seed the root — for buffer reuse.
535 fn reset(&mut self, cap: usize) {
536 self.edge_char.clear();
537 self.edge_to.clear();
538 self.edge_next.clear();
539 self.head.clear();
540 self.link.clear();
541 self.len.clear();
542 self.firstpos.clear();
543 self.primary.clear();
544 self.edge_char.reserve(3 * cap);
545 self.edge_to.reserve(3 * cap);
546 self.edge_next.reserve(3 * cap);
547 self.primary.push(false); // root state 0 is not a per-position state
548 self.head.push(-1);
549 self.link.push(-1);
550 self.len.push(0);
551 self.firstpos.push(0);
552 self.last = 0;
553 }
554
555 #[allow(clippy::cast_sign_loss)]
556 fn find(&self, state: u32, c: char) -> Option<u32> {
557 let mut e = self.head[state as usize];
558 while e != -1 {
559 let idx = e as usize;
560 if self.edge_char[idx] == c {
561 return Some(self.edge_to[idx]);
562 }
563 e = self.edge_next[idx];
564 }
565 None
566 }
567
568 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
569 fn add_edge(&mut self, state: u32, c: char, to: u32) {
570 self.edge_char.push(c);
571 self.edge_to.push(to);
572 self.edge_next.push(self.head[state as usize]);
573 self.head[state as usize] = (self.edge_char.len() - 1) as i32;
574 }
575
576 #[allow(clippy::cast_sign_loss)]
577 fn set_edge(&mut self, state: u32, c: char, to: u32) {
578 let mut e = self.head[state as usize];
579 while e != -1 {
580 let idx = e as usize;
581 if self.edge_char[idx] == c {
582 self.edge_to[idx] = to;
583 return;
584 }
585 e = self.edge_next[idx];
586 }
587 self.add_edge(state, c, to);
588 }
589
590 #[allow(clippy::cast_possible_truncation)]
591 fn new_state(&mut self, len: u32, link: i32, firstpos: u32, primary: bool) -> u32 {
592 self.head.push(-1);
593 self.len.push(len);
594 self.link.push(link);
595 self.firstpos.push(firstpos);
596 self.primary.push(primary);
597 (self.head.len() - 1) as u32
598 }
599
600 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_sign_loss)]
601 fn extend(&mut self, c: char, pos: usize) {
602 let cur = self.new_state(self.len[self.last as usize] + 1, -1, pos as u32, true);
603 let mut p = self.last as i32;
604 while p != -1 && self.find(p as u32, c).is_none() {
605 self.add_edge(p as u32, c, cur);
606 p = self.link[p as usize];
607 }
608 if p == -1 {
609 self.link[cur as usize] = 0;
610 } else {
611 let q = self.find(p as u32, c).unwrap();
612 if self.len[p as usize] + 1 == self.len[q as usize] {
613 self.link[cur as usize] = q as i32;
614 } else {
615 let clone = self.new_state(self.len[p as usize] + 1, self.link[q as usize], self.firstpos[q as usize], false);
616 let mut e = self.head[q as usize];
617 while e != -1 {
618 let idx = e as usize;
619 self.add_edge(clone, self.edge_char[idx], self.edge_to[idx]);
620 e = self.edge_next[idx];
621 }
622 while p != -1 && self.find(p as u32, c) == Some(q) {
623 self.set_edge(p as u32, c, clone);
624 p = self.link[p as usize];
625 }
626 self.link[q as usize] = clone as i32;
627 self.link[cur as usize] = clone as i32;
628 }
629 }
630 self.last = cur;
631 }
632
633 /// Convert the linked-list transitions into the packed `node`/`edges` layout (per-state edges
634 /// sorted by char, co-located char+target), reusing `out`'s allocations (no per-build malloc churn).
635 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::needless_range_loop)]
636 fn finalize_into(&self, out: &mut Sam) {
637 let nstates = self.head.len();
638 let nedges = self.edge_char.len();
639 out.firstpos.clear();
640 out.firstpos.extend_from_slice(&self.firstpos);
641 // per-state edge counts → exclusive prefix-sum offsets (local; folded into `node` below)
642 let mut off = vec![0u32; nstates + 1];
643 for state in 0..nstates {
644 let mut e = self.head[state];
645 while e != -1 {
646 off[state + 1] += 1;
647 e = self.edge_next[e as usize];
648 }
649 }
650 for state in 0..nstates {
651 off[state + 1] += off[state];
652 }
653 // edges: (char << 32 | to), sorted by char within each state's [off[s], off[s+1]) range.
654 // sorting the packed u64 sorts by char (high bits) since a state's chars are distinct.
655 out.edges.clear();
656 out.edges.resize(nedges, 0);
657 let mut scratch: Vec<u64> = Vec::new(); // reused across states
658 for state in 0..nstates {
659 scratch.clear();
660 let mut e = self.head[state];
661 while e != -1 {
662 let idx = e as usize;
663 scratch.push((u64::from(self.edge_char[idx] as u32) << 32) | u64::from(self.edge_to[idx]));
664 e = self.edge_next[idx];
665 }
666 scratch.sort_unstable();
667 let base = off[state] as usize;
668 for (k, &packed) in scratch.iter().enumerate() {
669 out.edges[base + k] = packed;
670 }
671 }
672 // per-state packed node [len, link(clamped: root -1 → 0, never read), edge_lo, edge_hi]
673 out.node.clear();
674 out.node.reserve(nstates);
675 out.link_len.clear();
676 out.link_len.reserve(nstates);
677 for s in 0..nstates {
678 let link_idx_signed = self.link[s];
679 let link = if link_idx_signed < 0 { 0 } else { link_idx_signed as u32 };
680 out.node.push([self.len[s], link, off[s], off[s + 1]]);
681 // Precompute len(link[s]) — saves the second node-load in `longest_in`'s chain walk.
682 // Root (link = -1) is never read here because the chain walk breaks before entering it.
683 let llen = if link_idx_signed < 0 { 0 } else { self.len[link_idx_signed as usize] };
684 out.link_len.push(llen);
685 }
686 // root direct transition table (ASCII): fill from the root's edges
687 out.root_next.clear();
688 out.root_next.resize(ROOT_TBL, -1);
689 for k in off[0] as usize..off[1] as usize {
690 let e = out.edges[k];
691 let ci = (e >> 32) as usize;
692 if ci < ROOT_TBL {
693 out.root_next[ci] = i32::try_from((e & 0xFFFF_FFFF) as u32).expect("state fits i32");
694 }
695 }
696 self.build_endpos(out, nstates);
697 }
698
699 /// Build the endpos range structure (fix b): suffix-link children → DFS-order end-positions
700 /// (each state's endpos = a contiguous array range) → wavelet matrix over those positions.
701 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
702 fn build_endpos(&self, out: &mut Sam, nstates: usize) {
703 // suffix-link children, CSR
704 let mut child_head = vec![0u32; nstates + 1];
705 for s in 1..nstates {
706 child_head[self.link[s] as usize + 1] += 1;
707 }
708 for s in 0..nstates {
709 child_head[s + 1] += child_head[s];
710 }
711 let mut child_arr = vec![0u32; nstates.saturating_sub(1)];
712 let mut fill = child_head.clone();
713 for s in 1..nstates {
714 let p = self.link[s] as usize;
715 child_arr[fill[p] as usize] = s as u32;
716 fill[p] += 1;
717 }
718 // iterative DFS (enter/exit) from root → dfs_in / dfs_cnt + DFS-ordered end-positions;
719 // on exit (post-order, children done) accumulate lastpos = max end-position in subtree.
720 out.dfs_in.clear();
721 out.dfs_in.resize(nstates, 0);
722 out.dfs_cnt.clear();
723 out.dfs_cnt.resize(nstates, 0);
724 out.lastpos.clear();
725 out.lastpos.resize(nstates, 0);
726 out.epos.clear();
727 let mut stack: Vec<(u32, bool)> = vec![(0, false)];
728 while let Some((s, exit)) = stack.pop() {
729 let su = s as usize;
730 if exit {
731 out.dfs_cnt[su] = out.epos.len() as u32 - out.dfs_in[su];
732 let mut lp = if self.primary[su] { self.firstpos[su] } else { 0 };
733 for k in child_head[su]..child_head[su + 1] {
734 lp = lp.max(out.lastpos[child_arr[k as usize] as usize]);
735 }
736 out.lastpos[su] = lp;
737 continue;
738 }
739 out.dfs_in[su] = out.epos.len() as u32;
740 if self.primary[su] {
741 out.epos.push(self.firstpos[su]);
742 }
743 stack.push((s, true));
744 for k in child_head[su]..child_head[su + 1] {
745 stack.push((child_arr[k as usize], false));
746 }
747 }
748 // merge-sort tree over the DFS-ordered end-positions, flattened CSR-style (build is a
749 // negligible fraction of runtime; queries hit the contiguous `seg_data` layout).
750 let n = out.epos.len();
751 let mut seg_n = 1usize;
752 while seg_n < n.max(1) {
753 seg_n <<= 1;
754 }
755 out.seg_n = seg_n;
756 let mut tree: Vec<Vec<u32>> = vec![Vec::new(); 2 * seg_n];
757 for i in 0..n {
758 tree[seg_n + i] = vec![out.epos[i]];
759 }
760 for i in (1..seg_n).rev() {
761 let (lhs, rhs) = (&tree[2 * i], &tree[2 * i + 1]);
762 let mut acc = Vec::with_capacity(lhs.len() + rhs.len());
763 let (mut x, mut y) = (0usize, 0usize);
764 while x < lhs.len() && y < rhs.len() {
765 if lhs[x] <= rhs[y] {
766 acc.push(lhs[x]);
767 x += 1;
768 } else {
769 acc.push(rhs[y]);
770 y += 1;
771 }
772 }
773 acc.extend_from_slice(&lhs[x..]);
774 acc.extend_from_slice(&rhs[y..]);
775 tree[i] = acc;
776 }
777 out.seg_off.clear();
778 out.seg_off.reserve(2 * seg_n + 1);
779 out.seg_data.clear();
780 let mut off = 0u32;
781 out.seg_off.push(0);
782 for node in &tree {
783 out.seg_data.extend_from_slice(node);
784 off += u32::try_from(node.len()).expect("epos fits u32");
785 out.seg_off.push(off);
786 }
787 // pack the query-hot per-state fields into one cache-line-friendly array
788 out.epmeta.clear();
789 out.epmeta.reserve(nstates);
790 for s in 0..nstates {
791 out.epmeta.push([out.firstpos[s], out.lastpos[s], out.dfs_in[s], out.dfs_cnt[s]]);
792 }
793 // Optimization B: interleave (node.len, node.link, link_len, epmeta) into one 32-byte slot
794 // per state so the chain walk's per-state data lands on a single cache line. Built lazily
795 // after epmeta + link_len; safe to do here because all the source arrays are finalized.
796 out.chain_slot.clear();
797 out.chain_slot.reserve(nstates);
798 for s in 0..nstates {
799 let nd = out.node[s];
800 let m = out.epmeta[s];
801 out.chain_slot.push([
802 nd[0], // len
803 nd[1], // link
804 out.link_len[s], // band_min source
805 0, // padding for 32-byte alignment
806 m[0], // firstpos
807 m[1], // lastpos
808 m[2], // dfs_in
809 m[3], // dfs_cnt
810 ]);
811 }
812 }
813
814 fn finalize(self) -> Sam {
815 let mut out = Sam::empty();
816 self.finalize_into(&mut out);
817 out
818 }
819}
820
821/// Build (and finalize) the suffix automaton of `b` — prebuild once, reuse across pairs.
822#[must_use]
823pub fn build_sam(b: &[char]) -> Sam {
824 let mut bld = Builder::new(b.len());
825 for (i, &c) in b.iter().enumerate() {
826 bld.extend(c, i);
827 }
828 bld.finalize()
829}
830
831/// Longest substring of `a[al..ar]` that occurs in `b[bl..br)`, using the **precomputed**
832/// window-independent match (`fstate`/`fmatch` = the SAM state and match length ending at each
833/// a-position, from one full-`a` scan). For each position the chain walk starts at the precomputed
834/// state, capping the usable length to the a-fragment (`i-al+1`) and the b-window via endpos
835/// queries. Returns (`a_start`, absolute `b_start`, len) with difflib's tie-break. No re-scanning.
836///
837/// `chain_cap` (Phase 3 approximation knob) bounds the suffix-link chain walk to `chain_cap`
838/// states ascended; deeper chains return whatever match was already found. With the empirically
839/// measured distribution (p99 depth 7, p95 depth 5; see `gestalt::instrument`'s histograms), a
840/// cap of 7 keeps ~99% of chains intact, capping at 5 keeps ~95%, etc. Setting `chain_cap =
841/// u32::MAX` recovers the exact behaviour. The caller (`gestalt_edge_with_ms` and friends)
842/// derives `chain_cap` from the user-supplied `delta` parameter via `delta_to_chain_cap`.
843#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments)]
844fn longest_in(
845 sam: &Sam,
846 fstate: &[u32],
847 fmatch: &[u32],
848 al: usize,
849 ar: usize,
850 bl: usize,
851 br: usize,
852 chain_cap: u32,
853) -> (usize, usize, usize) {
854 #[cfg(feature = "instrument")]
855 instr_inc(&instrument::LONGEST_IN_CALLS, 1);
856 let blo = bl as u32;
857 let hi = br as u32 - 1; // caller guarantees bl < br, so br >= 1
858 let (mut best_len, mut best_a, mut best_b) = (0usize, 0usize, 0usize);
859 // SAFETY: i ∈ [al, ar) ⊆ [0, n) = fstate.len() = fmatch.len(); `cur` is always a valid SAM
860 // state (fstate entry or a suffix link), so chain_slot[cur] is in bounds (= nstates entries).
861 #[allow(clippy::undocumented_unsafe_blocks)]
862 unsafe {
863 for i in al..ar {
864 let cap = i - al + 1; // a-match ending at i can't start before `al`
865 let eff = (*fmatch.get_unchecked(i) as usize).min(cap);
866 if eff <= best_len {
867 continue; // can't beat the best — skip (dominant pruning)
868 }
869 // walk the suffix-link chain from the precomputed state up; `curlen` is the usable length
870 // at the current state (capped to the a-fragment), shrinking as we ascend.
871 let mut cur = *fstate.get_unchecked(i);
872 // Optimization K1: `cur != 0` is an invariant here. `cur == 0` (the SAM root) is only
873 // stored in `fstate[i]` when `fmatch[i] == 0`, and in that case `eff == 0 <= best_len`
874 // already pruned this iteration via the `continue` above. From the second chain step
875 // on, `cur` came from `link != 0` (we check before assigning). Hoisting the
876 // entry-time `if cur == 0 { break }` out saves ≈4 k arm64 instructions per pair on
877 // the bench corpus (-0.9 % retired) — no cycle change on this single-thread profile
878 // because the chain walk is latency-bound by the `node[cur] → link → node[link]`
879 // pointer chase, but it frees front-end issue bandwidth for the threaded path.
880 let mut chain_depth: u32 = 0;
881 loop {
882 chain_depth += 1;
883 // Phase 3 approximate-RO cap: stop walking the chain past `chain_cap` ascended
884 // states. With cap=u32::MAX (default delta=0) this never fires; smaller caps
885 // trade tail accuracy for fewer pointer chases. Measurement on canonical Python:
886 // p95 chain depth = 5, p99 = 7 — capping at 5/7 truncates <5%/<1% of chains.
887 if chain_depth > chain_cap {
888 break;
889 }
890 // OPTIMIZATION B: single 32-byte load of `chain_slot[cur]` brings the per-state
891 // hot fields into registers in ONE cache-line touch:
892 // [0] = len [1] = link [2] = link_len [3] = padding
893 // [4..8] = epmeta (firstpos, lastpos, dfs_in, dfs_cnt) for max_le/min_in
894 // Replaces the three scattered loads (node[cur], link_len[cur], epmeta[cur]) that
895 // the chain walk used to do, each on its own cache line — PMU attribution showed
896 // L1D misses at 18 % of cycle budget split between those three arrays.
897 let cs = *sam.chain_slot.get_unchecked(cur as usize);
898 let curlen = eff.min(cs[0] as usize);
899 if curlen <= best_len {
900 break;
901 }
902 let band_min = cs[2] as usize + 1;
903 if curlen >= band_min {
904 // Reuse the epmeta bytes already in `cs` — no extra load needed.
905 let m = [cs[4], cs[5], cs[6], cs[7]];
906 if let Some(pmax) = sam.max_le_with_meta(&m, hi) {
907 if pmax >= blo {
908 let l_window = (pmax - blo) as usize + 1; // window-cap on match len
909 let l = curlen.min(l_window); // = min(chain-cap, window-cap)
910 if l >= band_min {
911 // earliest-b (min_in) only when we beat the best — rare, off the path.
912 if l > best_len {
913 // OPTIMIZATION A: when the window cap binds (l == l_window),
914 // lo_q = blo + l - 1 = pmax. Since pmax is THE max v <= hi in
915 // this state's endpos, the set ∩ [pmax, hi] is exactly {pmax},
916 // so pmin = pmax trivially — skip the linear scan in `min_in`.
917 // Only the chain-cap branch (l < l_window) needs a real scan.
918 let pmin = if l == l_window {
919 Some(pmax)
920 } else {
921 sam.min_in_with_meta(&m, blo + l as u32 - 1, hi)
922 };
923 if let Some(pmin) = pmin {
924 best_len = l;
925 best_a = i + 1 - l;
926 best_b = pmin as usize + 1 - l;
927 }
928 }
929 break; // deepest qualifying state ⇒ longest in-window match here
930 }
931 }
932 }
933 }
934 let link = cs[1];
935 if link == 0 {
936 break; // reached the root (state 0) — no shorter qualifying state above
937 }
938 cur = link;
939 }
940 #[cfg(feature = "instrument")]
941 instr_hist(&instrument::CHAIN_DEPTHS, chain_depth as usize);
942 }
943 }
944 (best_a, best_b, best_len)
945}
946
947/// Map a user-facing `delta` (max acceptable RO ratio loss, in absolute units) to a chain-walk
948/// depth cap for `longest_in`. Empirically calibrated against `gestalt::instrument`'s chain
949/// depth histogram on the mypy/sympy corpora:
950///
951/// | delta | cap | covers |
952/// |---:|---:|---:|
953/// | 0.00 | `u32::MAX` | exact (default) |
954/// | 0.01 | 12 | p99.5+ |
955/// | 0.05 | 7 | p99 |
956/// | 0.10 | 5 | p95 |
957/// | 0.20 | 3 | p85 |
958/// | 0.50 | 2 | p70 |
959/// | 1.00 | 1 | only fstate, no walk |
960///
961/// Formula: `cap = ceil(1 / sqrt(delta))` clamped to ≥1 for delta > 0. Pure heuristic — the
962/// property test in `tests/approx_ro.rs` verifies the actual loss stays below `delta` on a
963/// representative corpus.
964#[must_use]
965#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
966pub fn delta_to_chain_cap(delta: f64) -> u32 {
967 if delta <= 0.0 {
968 return u32::MAX;
969 }
970 if delta >= 1.0 {
971 return 1;
972 }
973 let cap = (1.0_f64 / delta.sqrt()).ceil() as u32;
974 cap.max(1)
975}
976
977/// Window-independent matching statistics of `a` vs `sam_b`, filled into reused buffers: for each
978/// i, `(state, matched)` where `matched` = longest suffix of `a[..=i]` occurring anywhere in b.
979/// One O(|a|) scan, reused by every recursion node (no per-node re-scan). Reusing the caller's
980/// buffers avoids a per-pair allocation (was ~10% of the all-pairs join).
981#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
982fn matching_stats_into(a: &[char], sam_b: &Sam, fstate: &mut Vec<u32>, fmatch: &mut Vec<u32>) {
983 let n = a.len();
984 // Size the buffers WITHOUT zero-filling: the scan below writes every index [0, n) before any read,
985 // so the resize(n, 0) zero-pass was pure waste (it was ~half the scan's store traffic).
986 // SAFETY: u32 has no invalid bit patterns, and fstate[i]/fmatch[i] are written for every i in 0..n
987 // (the loop below) strictly before longest_in ever reads them.
988 fstate.clear();
989 fstate.reserve(n);
990 fmatch.clear();
991 fmatch.reserve(n);
992 #[allow(clippy::undocumented_unsafe_blocks)]
993 unsafe {
994 fstate.set_len(n);
995 fmatch.set_len(n);
996 }
997 // Hoist the SAM arrays into locals so the compiler keeps base pointers in registers and
998 // doesn't reload them through `sam_b` each iteration; transition is hand-inlined.
999 let node = sam_b.node.as_slice();
1000 let edges = sam_b.edges.as_slice();
1001 let root = sam_b.root_next.as_slice();
1002 let mut state = 0u32;
1003 let mut matched = 0u32;
1004 // SAFETY: `state` is always a valid SAM state index (< nstates = node.len()): it starts at the
1005 // root (0) and only ever becomes a transition target (an edge's low bits, a valid state) or a
1006 // suffix link (node[..][1], a valid state). So node[state] and node[link] are in bounds; the edge
1007 // range [edge_lo, edge_hi) ⊆ [0, edges.len()); `ci < ROOT_TBL == root.len()`; `i < n == fstate.len()`.
1008 #[allow(clippy::undocumented_unsafe_blocks)]
1009 unsafe {
1010 for i in 0..n {
1011 let c = *a.get_unchecked(i);
1012 loop {
1013 // inline `transition(state, c)` → next state, or -1. The non-root branch loads
1014 // node[state] ONCE for both the edge range and (on miss) the suffix link — one cache line.
1015 let nx: i64 = if state == 0 {
1016 let ci = c as usize;
1017 if ci < ROOT_TBL {
1018 i64::from(*root.get_unchecked(ci))
1019 } else {
1020 let nd = node.get_unchecked(0);
1021 csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
1022 }
1023 } else {
1024 let nd = node.get_unchecked(state as usize);
1025 csr_lookup(edges, nd[2] as usize, nd[3] as usize, c)
1026 };
1027 if nx >= 0 {
1028 state = nx as u32;
1029 matched += 1;
1030 break;
1031 }
1032 if state == 0 {
1033 matched = 0;
1034 break;
1035 }
1036 let link = node.get_unchecked(state as usize)[1]; // same line as the edge-range load
1037 state = link;
1038 matched = node.get_unchecked(link as usize)[0]; // len(link)
1039 }
1040 *fstate.get_unchecked_mut(i) = state;
1041 *fmatch.get_unchecked_mut(i) = matched;
1042 }
1043 }
1044}
1045
1046/// Binary search the packed edge slice `edges[lo..hi]` (sorted by char in the high 32 bits) for `c`;
1047/// returns the target state (low 32 bits) as i64, or -1. Inlined into the scan.
1048#[inline]
1049#[allow(clippy::cast_possible_truncation)]
1050fn csr_lookup(edges: &[u64], mut lo: usize, hi: usize, c: char) -> i64 {
1051 let mut hi = hi;
1052 let key = c as u32;
1053 while lo < hi {
1054 let mid = lo + (hi - lo) / 2;
1055 // SAFETY: callers pass lo,hi from a state's edge range ⊆ [0, edges.len()]; `mid ∈ [lo, hi)`.
1056 let e = unsafe { *edges.get_unchecked(mid) };
1057 let mc = (e >> 32) as u32; // char (code point) in the high 32 bits
1058 if mc == key {
1059 return i64::from((e & 0xFFFF_FFFF) as u32); // target state in the low 32 bits
1060 }
1061 if mc < key {
1062 lo = mid + 1;
1063 } else {
1064 hi = mid;
1065 }
1066 }
1067 -1
1068}
1069
1070thread_local! {
1071 /// Reused (fstate, fmatch) buffers for the per-pair matching statistics — no per-pair alloc.
1072 static MS_BUF: std::cell::RefCell<(Vec<u32>, Vec<u32>)> =
1073 const { std::cell::RefCell::new((Vec::new(), Vec::new())) };
1074 /// Reused recursion stack of (al, ar, bl, br) windows — retains capacity across pairs so the
1075 /// RO recursion never heap-allocates or reallocs per pair (was `__rust_alloc` + `grow_one`).
1076 static STACK_BUF: std::cell::RefCell<Vec<(usize, usize, usize, usize)>> =
1077 const { std::cell::RefCell::new(Vec::new()) };
1078 /// Reused DP buffer for `ub_from_fmatch` (currently unused — kept for re-enabling H later).
1079 #[allow(dead_code)]
1080 static UB_BUF: std::cell::RefCell<Vec<u32>> = const { std::cell::RefCell::new(Vec::new()) };
1081}
1082
1083/// Tight upper bound on the RO matched-length M, computed from the per-position `fmatch` array
1084/// via O(na) weighted-interval-scheduling DP. Each `fmatch[i]` records the longest substring of
1085/// `b` ending at `a[i]`; an actual RO decomposition picks non-overlapping such matches, so M is
1086/// bounded above by the max non-overlapping sum we can extract from `fmatch`.
1087///
1088/// `dp[i]` = best sum using only positions `0..=i`. Recurrence:
1089/// * not taking position i: `dp[i-1]`
1090/// * taking the fmatch[i]-length block ending at i (a[i+1-l..=i]): `dp[i-l] + l` where
1091/// `l = fmatch[i].min(i+1)` (clamped to what fits in `a[..=i]`)
1092///
1093/// **CURRENTLY UNUSED** — tried in `gestalt_edge_with_ms` (optimization H) and reverted: on
1094/// the `cluster_canonicals` threshold path the O(na) DP cost (+ thread-local buffer access) was
1095/// larger than the wall savings from skipping recursion, because the existing
1096/// `m + pending < need` check inside the recursion already bails cheaply for non-edges. Kept
1097/// as a tombstone in case a future call shape (e.g., larger threshold + denser matches) makes
1098/// it worth re-trying. To re-enable: insert the call before `STACK_BUF.with_borrow_mut` in
1099/// `gestalt_edge_with_ms` and gate on `need > 0`.
1100#[allow(dead_code)]
1101#[inline]
1102#[allow(clippy::cast_possible_truncation)]
1103fn ub_from_fmatch(fmatch: &[u32], buf: &mut Vec<u32>) -> u32 {
1104 let n = fmatch.len();
1105 if n == 0 {
1106 return 0;
1107 }
1108 buf.clear();
1109 buf.resize(n, 0);
1110 // SAFETY: buf and fmatch are both len = n; indices below stay in [0, n).
1111 #[allow(clippy::undocumented_unsafe_blocks)]
1112 unsafe {
1113 let mut prev: u32 = 0;
1114 for i in 0..n {
1115 let l = (*fmatch.get_unchecked(i)).min(i as u32 + 1);
1116 let with_take = if l == 0 {
1117 0
1118 } else if (l as usize) > i {
1119 l // entire prefix [0..=i] is the block, no `dp[i-l]` term
1120 } else {
1121 buf.get_unchecked(i - l as usize).saturating_add(l)
1122 };
1123 let v = prev.max(with_take);
1124 *buf.get_unchecked_mut(i) = v;
1125 prev = v;
1126 }
1127 prev
1128 }
1129}
1130
1131/// Test-only access to `matching_stats_into` — used by `corpus_sa` tests to verify the SA-based
1132/// fmatch is byte-for-byte identical to the SAM-based fmatch.
1133#[doc(hidden)]
1134pub fn matching_stats_for_test(a: &[char], sam_b: &Sam, fstate: &mut Vec<u32>, fmatch: &mut Vec<u32>) {
1135 matching_stats_into(a, sam_b, fstate, fmatch);
1136}
1137
1138/// Cost probe: run only the matching-statistics scan of `a` vs `sam_b` (the unavoidable per-pair
1139/// floor — RO's first block is an LCS, Θ(|a|)) and return a checksum so it isn't optimized out.
1140/// Measures the pure scan throughput separate from the RO recursion.
1141#[must_use]
1142pub fn matching_stats_cost(a: &[char], sam_b: &Sam) -> u64 {
1143 MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1144 matching_stats_into(a, sam_b, fstate, fmatch);
1145 fmatch.iter().map(|&x| u64::from(x)).sum()
1146 })
1147}
1148
1149fn gestalt_m_with(a: &[char], b: &[char], sam_b: &Sam) -> usize {
1150 let n = a.len();
1151 if n == 0 {
1152 return 0;
1153 }
1154 MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1155 matching_stats_into(a, sam_b, fstate, fmatch);
1156 gestalt_m_recur(a, b, sam_b, fstate, fmatch)
1157 })
1158}
1159
1160#[allow(clippy::cast_sign_loss, clippy::many_single_char_names)]
1161fn gestalt_m_recur(a: &[char], b: &[char], sam_b: &Sam, fstate: &[u32], fmatch: &[u32]) -> usize {
1162 let n = a.len();
1163 let mut total = 0usize;
1164 STACK_BUF.with_borrow_mut(|stack| {
1165 stack.clear();
1166 stack.push((0, n, 0, b.len()));
1167 while let Some((al, ar, bl, br)) = stack.pop() {
1168 if al >= ar || bl >= br {
1169 continue;
1170 }
1171 let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1172 if l == 0 {
1173 continue;
1174 }
1175 total += l;
1176 stack.push((al, i, bl, j));
1177 stack.push((i + l, ar, j + l, br));
1178 }
1179 });
1180 total
1181}
1182
1183/// Threshold-aware exact decision: does `RO(a,b) ≥ threshold`? Computes the matched total M
1184/// with **two-sided early-exit** — accept the instant `M ≥ need`, reject the instant the upper
1185/// bound `M + Σ min(window lengths) < need`. Exact (the bound never drops a qualifying pair),
1186/// and dissimilar pairs abort long before the full decomposition. `need = ⌈threshold·(|a|+|b|)/2⌉`.
1187#[allow(
1188 clippy::cast_precision_loss,
1189 clippy::cast_sign_loss,
1190 clippy::cast_possible_truncation,
1191 clippy::many_single_char_names
1192)]
1193#[must_use]
1194pub fn gestalt_qualifies(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> bool {
1195 let nb = b.len();
1196 let total = a.len() + nb;
1197 if total == 0 {
1198 return true;
1199 }
1200 let need = (threshold * total as f64 / 2.0).ceil() as usize;
1201 if need == 0 {
1202 return true;
1203 }
1204 let n = a.len();
1205 if n == 0 || nb == 0 {
1206 return false; // M = 0 < need
1207 }
1208 MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1209 matching_stats_into(a, sam_b, fstate, fmatch);
1210 gestalt_qualifies_ms(n, nb, sam_b, threshold, fstate, fmatch)
1211 })
1212}
1213
1214/// The threshold early-exit recursion given **precomputed** matching statistics (`fstate`/`fmatch`
1215/// for `a` of length `na` vs `sam_b`). Split out so the scan can be done in an MLP batch separately.
1216#[allow(
1217 clippy::cast_precision_loss,
1218 clippy::cast_sign_loss,
1219 clippy::cast_possible_truncation,
1220 clippy::many_single_char_names
1221)]
1222#[must_use]
1223pub fn gestalt_qualifies_ms(na: usize, nb: usize, sam_b: &Sam, threshold: f64, fstate: &[u32], fmatch: &[u32]) -> bool {
1224 let total = na + nb;
1225 if total == 0 {
1226 return true;
1227 }
1228 let need = (threshold * total as f64 / 2.0).ceil() as usize;
1229 if need == 0 {
1230 return true;
1231 }
1232 if na == 0 || nb == 0 {
1233 return false;
1234 }
1235 STACK_BUF.with_borrow_mut(|stack| {
1236 let mut m = 0usize;
1237 let mut pending = na.min(nb);
1238 stack.clear();
1239 stack.push((0, na, 0, nb));
1240 while let Some((al, ar, bl, br)) = stack.pop() {
1241 pending -= (ar - al).min(br - bl);
1242 if al >= ar || bl >= br {
1243 continue;
1244 }
1245 let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1246 if l == 0 {
1247 continue;
1248 }
1249 m += l;
1250 if m >= need {
1251 return true;
1252 }
1253 pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
1254 if m + pending < need {
1255 return false;
1256 }
1257 stack.push((al, i, bl, j));
1258 stack.push((i + l, ar, j + l, br));
1259 }
1260 m >= need
1261 })
1262}
1263
1264
1265/// Edge test that also yields the exact ratio: `Some(ratio)` iff `RO(a,b) >= threshold`, else `None`.
1266/// Keeps the **reject** early-exit (abort the instant the upper bound `M + Σ min(window) < need`) but
1267/// computes full M on the qualifying branch so the cached ratio feeds the cluster `min_sim` — caching
1268/// edge ratios here means `min_sim` only recomputes the rare non-edge (chained) intra-cluster pairs,
1269/// not the whole dense blob. Bit-identical to `(let r = ratio; (r >= threshold).then_some(r))`.
1270#[allow(
1271 clippy::cast_precision_loss,
1272 clippy::cast_sign_loss,
1273 clippy::cast_possible_truncation,
1274 clippy::many_single_char_names
1275)]
1276#[must_use]
1277pub fn gestalt_edge(a: &[char], b: &[char], sam_b: &Sam, threshold: f64) -> Option<f64> {
1278 let na = a.len();
1279 let nb = b.len();
1280 let total = na + nb;
1281 if total == 0 {
1282 return Some(1.0);
1283 }
1284 let need = (threshold * total as f64 / 2.0).ceil() as usize;
1285 if na == 0 || nb == 0 {
1286 return (need == 0).then_some(0.0);
1287 }
1288 MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1289 matching_stats_into(a, sam_b, fstate, fmatch);
1290 gestalt_edge_with_ms(a, b, sam_b, fstate, fmatch, threshold)
1291 })
1292}
1293
1294/// Stage-4b helper: same recursion + early-exit as `gestalt_edge`, but operates on a
1295/// **caller-provided** `(fstate, fmatch)` instead of running `matching_stats_into` inline.
1296///
1297/// The GPU dispatch produces these arrays in batch for many pairs at once; the CPU side then
1298/// does the small stack walk per pair via this entry point. Keeping the recursion on the CPU is
1299/// the right split because `longest_in`'s suffix-link walk has data-dependent depth (poor GPU
1300/// fit), while `matching_stats_into` is a wide independent per-pair walk (great GPU fit).
1301///
1302/// `fstate` / `fmatch` must be `a.len()` long and have been filled for THIS exact `(a, sam_b)`
1303/// pair — passing arrays computed for a different `a` or `b` is a logic bug, no runtime check.
1304#[must_use]
1305#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
1306pub fn gestalt_edge_with_ms(
1307 a: &[char],
1308 b: &[char],
1309 sam_b: &Sam,
1310 fstate: &[u32],
1311 fmatch: &[u32],
1312 threshold: f64,
1313) -> Option<f64> {
1314 // Backwards-compatible exact wrapper. New callers wanting approximation pass through
1315 // `gestalt_edge_with_ms_delta` directly with their delta.
1316 gestalt_edge_with_ms_delta(a, b, sam_b, fstate, fmatch, threshold, 0.0)
1317}
1318
1319/// Approximate-RO variant: same as [`gestalt_edge_with_ms`] but caps the suffix-link chain walk
1320/// inside `longest_in` to roughly `1/√delta` ascents. `delta = 0.0` means exact (no cap, default).
1321/// `delta ∈ (0, 1]` caps the depth; the returned ratio's worst-case absolute deviation from the
1322/// exact RO is bounded by ~delta (empirically verified on canonical-Python corpora by
1323/// `tests/approx_ro.rs`). The chain depth distribution on real workloads is heavy-headed
1324/// (p99 ≈ 7), so even small delta values rarely actually fire the cap.
1325#[must_use]
1326#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
1327pub fn gestalt_edge_with_ms_delta(
1328 a: &[char],
1329 b: &[char],
1330 sam_b: &Sam,
1331 fstate: &[u32],
1332 fmatch: &[u32],
1333 threshold: f64,
1334 delta: f64,
1335) -> Option<f64> {
1336 let chain_cap = delta_to_chain_cap(delta);
1337 gestalt_edge_with_ms_inner(a, b, sam_b, fstate, fmatch, threshold, chain_cap)
1338}
1339
1340#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::many_single_char_names)]
1341fn gestalt_edge_with_ms_inner(
1342 a: &[char],
1343 b: &[char],
1344 sam_b: &Sam,
1345 fstate: &[u32],
1346 fmatch: &[u32],
1347 threshold: f64,
1348 chain_cap: u32,
1349) -> Option<f64> {
1350 let na = a.len();
1351 let nb = b.len();
1352 let total = na + nb;
1353 if total == 0 {
1354 return Some(1.0);
1355 }
1356 let need = (threshold * total as f64 / 2.0).ceil() as usize;
1357 if na == 0 || nb == 0 {
1358 return (need == 0).then_some(0.0);
1359 }
1360 debug_assert_eq!(fstate.len(), na, "fstate length must equal a.len()");
1361 debug_assert_eq!(fmatch.len(), na, "fmatch length must equal a.len()");
1362 #[cfg(feature = "instrument")]
1363 {
1364 instr_inc(&instrument::PAIRS_PROCESSED, 1);
1365 let (mut zero, mut nz, mut sum) = (0u64, 0u64, 0u64);
1366 for &f in fmatch {
1367 if f == 0 {
1368 zero += 1;
1369 } else {
1370 nz += 1;
1371 sum += f as u64;
1372 }
1373 }
1374 instr_inc(&instrument::FMATCH_ZERO, zero);
1375 instr_inc(&instrument::FMATCH_NONZERO, nz);
1376 instr_inc(&instrument::FMATCH_SUM, sum);
1377 }
1378 // Optimization H (tight fmatch-based UB) was tried and reverted — measured a NET regression
1379 // on cluster_canonicals threshold path: the O(na) weighted-interval-scheduling DP added
1380 // ~25 µs per call across rayon workers, and the bail rate on filter-survivors was too low
1381 // to amortize. The existing `m + pending < need` check inside the recursion loop already
1382 // aborts cheaply for non-edges (after 1-2 longest_in calls in the common case). See
1383 // `src/new/PERF_MAP.md`'s "Tombstones" section and the `ub_from_fmatch` helper just above —
1384 // kept around but unused in case a different call shape makes it worth re-trying.
1385 STACK_BUF.with_borrow_mut(|stack| {
1386 let mut m = 0usize;
1387 let mut pending = na.min(nb);
1388 stack.clear();
1389 stack.push((0, na, 0, nb));
1390 #[cfg(feature = "instrument")]
1391 let mut max_depth: usize = 1;
1392 while let Some((al, ar, bl, br)) = stack.pop() {
1393 #[cfg(feature = "instrument")]
1394 {
1395 if stack.len() + 1 > max_depth {
1396 max_depth = stack.len() + 1;
1397 }
1398 }
1399 pending -= (ar - al).min(br - bl);
1400 if al >= ar || bl >= br {
1401 continue;
1402 }
1403 let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, chain_cap);
1404 if l == 0 {
1405 continue;
1406 }
1407 m += l;
1408 pending += (i - al).min(j - bl) + (ar - i - l).min(br - j - l);
1409 if m + pending < need {
1410 #[cfg(feature = "instrument")]
1411 instr_hist(&instrument::RECURSION_DEPTHS, max_depth);
1412 return None; // upper bound below the bar ⇒ certified non-edge, abort
1413 }
1414 stack.push((al, i, bl, j));
1415 stack.push((i + l, ar, j + l, br));
1416 }
1417 #[cfg(feature = "instrument")]
1418 instr_hist(&instrument::RECURSION_DEPTHS, max_depth);
1419 (m >= need).then(|| 2.0 * m as f64 / total as f64)
1420 })
1421}
1422
1423/// Cluster `min_sim` helper: returns the **exact** ratio when `RO(a,b) <= cap`, otherwise a value
1424/// `> cap` (it accept-early-exits the instant M exceeds the cap's bound). Used to find a cluster's
1425/// minimum pairwise ratio with `cur = cur.min(gestalt_ratio_capped(a, b, sam, cur))`: in a dense
1426/// cluster the dominant high-ratio pairs blow past `cur` and are pruned after the first block or two,
1427/// so full M is computed only for the genuinely-low pairs. Bit-identical minimum to the full ratio.
1428#[allow(
1429 clippy::cast_precision_loss,
1430 clippy::cast_sign_loss,
1431 clippy::cast_possible_truncation,
1432 clippy::many_single_char_names
1433)]
1434#[must_use]
1435pub fn gestalt_ratio_capped(a: &[char], b: &[char], sam_b: &Sam, cap: f64) -> f64 {
1436 let na = a.len();
1437 let nb = b.len();
1438 let total = na + nb;
1439 if total == 0 {
1440 return 1.0; // two empty strings ⇒ ratio 1.0
1441 }
1442 if na == 0 || nb == 0 {
1443 return 0.0; // M = 0 ⇒ ratio 0 (a valid minimum candidate, <= cap for any cap >= 0)
1444 }
1445 // ratio > cap ⟺ 2M/total > cap ⟺ M > cap·total/2 ⟺ M >= ⌊cap·total/2⌋ + 1.
1446 let exceed = (cap * total as f64 / 2.0).floor() as usize + 1;
1447 MS_BUF.with_borrow_mut(|(fstate, fmatch)| {
1448 matching_stats_into(a, sam_b, fstate, fmatch);
1449 STACK_BUF.with_borrow_mut(|stack| {
1450 let mut m = 0usize;
1451 stack.clear();
1452 stack.push((0, na, 0, nb));
1453 while let Some((al, ar, bl, br)) = stack.pop() {
1454 if al >= ar || bl >= br {
1455 continue;
1456 }
1457 let (i, j, l) = longest_in(sam_b, fstate, fmatch, al, ar, bl, br, u32::MAX);
1458 if l == 0 {
1459 continue;
1460 }
1461 m += l;
1462 if m >= exceed {
1463 return 2.0; // ratio > cap ⇒ cannot be the minimum; prune (any value > cap works)
1464 }
1465 stack.push((al, i, bl, j));
1466 stack.push((i + l, ar, j + l, br));
1467 }
1468 2.0 * m as f64 / total as f64 // full exact M ⇒ exact ratio (<= cap)
1469 })
1470 })
1471}
1472
1473/// Ratio of `a` vs the string the prebuilt `sam_b` was built from (= `b`).
1474#[allow(clippy::cast_precision_loss)]
1475#[must_use]
1476pub fn gestalt_ratio_prebuilt(a: &[char], b: &[char], sam_b: &Sam) -> f64 {
1477 let total = a.len() + b.len();
1478 if total == 0 {
1479 return 1.0;
1480 }
1481 2.0 * gestalt_m_with(a, b, sam_b) as f64 / total as f64
1482}
1483
1484#[allow(clippy::cast_precision_loss)]
1485#[must_use]
1486pub fn gestalt_ratio_chars(a: &[char], b: &[char]) -> f64 {
1487 if a.is_empty() && b.is_empty() {
1488 return 1.0;
1489 }
1490 let sam_b = build_sam(b);
1491 gestalt_ratio_prebuilt(a, b, &sam_b)
1492}
1493
1494/// `gestalt_ratio(a, b) -> float`: exact difflib ratio, computed via suffix-automaton LCS.
1495#[must_use]
1496pub fn gestalt_ratio(a: &str, b: &str) -> f64 {
1497 let av: Vec<char> = a.chars().collect();
1498 let bv: Vec<char> = b.chars().collect();
1499 gestalt_ratio_chars(&av, &bv)
1500}
1501
1502#[cfg(test)]
1503mod tests {
1504 #![allow(clippy::float_cmp, clippy::unreadable_literal)]
1505 use super::{build_sam, gestalt_edge, gestalt_qualifies, gestalt_ratio_capped, gestalt_ratio_chars};
1506
1507 fn r(a: &str, b: &str) -> f64 {
1508 gestalt_ratio_chars(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>())
1509 }
1510
1511 #[test]
1512 fn matches_difflib_reference_values() {
1513 assert_eq!(r("", ""), 1.0);
1514 assert_eq!(r("", "x"), 0.0);
1515 assert_eq!(r("abc", "abc"), 1.0);
1516 assert_eq!(r("abc", "abd"), 0.6666666666666666);
1517 assert_eq!(r("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
1518 assert_eq!(r("tide", "diet"), 0.25);
1519 assert_eq!(r("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
1520 assert_eq!(r("aaaaabbbbbccccc", "aaaaaxbbbbbxccccc"), 0.9375);
1521 }
1522
1523 // `gestalt_qualifies` (threshold early-exit) must be byte-for-byte with `ratio >= T`.
1524 #[test]
1525 fn qualifies_matches_ratio_threshold() {
1526 // xorshift PRNG → deterministic pseudo-random ASCII strings over a small alphabet
1527 let mut s: u64 = 0x1234_5678_9abc_def1;
1528 let mut next = || {
1529 s ^= s << 13;
1530 s ^= s >> 7;
1531 s ^= s << 17;
1532 s
1533 };
1534 for _ in 0..3000 {
1535 let la = (next() % 40) as usize;
1536 let lb = (next() % 40) as usize;
1537 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> Vec<char> {
1538 (0..n).map(|_| char::from(b'a' + (rng() % 4) as u8)).collect()
1539 };
1540 let a = mk(la, &mut next);
1541 let b = mk(lb, &mut next);
1542 let sam_b = build_sam(&b);
1543 let ratio = gestalt_ratio_chars(&a, &b);
1544 for &t in &[0.0_f64, 0.25, 0.5, 0.75, 0.9, 1.0] {
1545 assert_eq!(
1546 gestalt_qualifies(&a, &b, &sam_b, t),
1547 ratio >= t,
1548 "a={a:?} b={b:?} t={t} ratio={ratio}"
1549 );
1550 // gestalt_edge(cap=t): Some(exact ratio) iff qualifying; bit-exact ratio when Some.
1551 let edge = gestalt_edge(&a, &b, &sam_b, t);
1552 assert_eq!(edge.is_some(), ratio >= t, "edge.is_some a={a:?} b={b:?} t={t} ratio={ratio}");
1553 if let Some(r) = edge {
1554 assert_eq!(r, ratio, "edge ratio a={a:?} b={b:?} t={t}");
1555 }
1556 // gestalt_ratio_capped(cap=t): exact ratio when ratio <= t, else a value > t (pruned).
1557 let capped = gestalt_ratio_capped(&a, &b, &sam_b, t);
1558 if ratio <= t {
1559 assert_eq!(capped, ratio, "capped exact a={a:?} b={b:?} cap={t} ratio={ratio}");
1560 } else {
1561 assert!(capped > t, "capped prune a={a:?} b={b:?} cap={t} ratio={ratio} got={capped}");
1562 }
1563 }
1564 }
1565 }
1566}