difflib_fast/
lib.rs

1//! `difflib-fast` — fast, **byte-for-byte exact** difflib Ratcliff–Obershelp ("gestalt") similarity.
2//!
3//! [`ratio`] / [`gestalt::gestalt_ratio`] are a drop-in for
4//! `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`:
5//!   `ratio = 2·M / (len(a)+len(b))`, where `M` is the total size of the Ratcliff–Obershelp matching
6//! blocks. The result — including difflib's tie-break (longest; earliest-a; earliest-b) and its
7//! argument-order asymmetry — is reproduced exactly, but `M` is computed via a **suffix automaton**
8//! (LCS in O(|a|+|b|) regardless of character frequency) instead of difflib's popular-character
9//! `b2j` rescans. On long, small-alphabet text (e.g. canonicalized source code) this is the
10//! difference between difflib's pathological case and a linear scan.
11//!
12//! Beyond the per-pair ratio, [`cluster_canonicals`] does an exact single-linkage **clustering** of a
13//! corpus at a similarity threshold — prebuild each string's automaton once, then an early-exit
14//! all-pairs join (length blocking + `quick_ratio` filter + threshold-aware RO), in parallel via
15//! rayon — and reports each cluster with its exact minimum pairwise ratio. [`cluster_canonicals_lsh`]
16//! is the scalable `MinHash`-LSH variant (candidate generation + exact verification) for very large
17//! corpora past the O(n²) wall.
18//!
19//! Two independent implementations of `M` live here: the suffix-automaton path ([`gestalt`]) and a
20//! straight port of difflib's `b2j` recursion ([`ratio_reference`]); the test suite asserts they are
21//! bit-identical, which is the crate's core correctness gate.
22
23use std::cell::RefCell;
24use std::collections::{HashMap, HashSet};
25
26use rayon::prelude::*;
27
28pub mod gestalt;
29pub use gestalt::gestalt_ratio;
30
31/// Dispatch threshold: take the `b2j` path while its estimated work per element
32/// (`Σ_c count_a·count_b / (|a|+|b|)`) stays at/below this; above it the automaton wins. Tuned on real
33/// canonicalized-code corpora. Override at runtime with `DF_WORK_FACTOR`. (The clustering join always
34/// uses the automaton — there it's prebuilt once and reused across all n² scans, so it always wins.)
35const B2J_WORK_FACTOR: u64 = 34;
36
37/// Fast exact difflib ratio. Bit-identical to
38/// `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`.
39///
40/// Dispatches by length: short inputs take the lightweight difflib `b2j` recursion (cheap to set up),
41/// long inputs take the suffix-automaton LCS (frequency-independent, so it doesn't degrade on
42/// repetitive text). Both paths are exact and agree bit-for-bit.
43#[must_use]
44pub fn ratio(a: &str, b: &str) -> f64 {
45    let av: Vec<char> = a.chars().collect();
46    let bv: Vec<char> = b.chars().collect();
47    ratio_chars(&av, &bv)
48}
49
50/// ASCII char histogram (canonical code is ~all ASCII; non-ASCII folded into one overflow bucket).
51fn ascii_counts(s: &[char]) -> ([u32; 128], u32) {
52    let mut c = [0u32; 128];
53    let mut other = 0u32;
54    for &ch in s {
55        let u = ch as u32;
56        if u < 128 {
57            c[u as usize] += 1;
58        } else {
59            other += 1;
60        }
61    }
62    (c, other)
63}
64
65fn work_factor() -> u64 {
66    use std::sync::OnceLock;
67    static F: OnceLock<u64> = OnceLock::new();
68    *F.get_or_init(|| std::env::var("DF_WORK_FACTOR").ok().and_then(|s| s.parse().ok()).unwrap_or(B2J_WORK_FACTOR))
69}
70
71#[allow(clippy::cast_precision_loss)]
72#[must_use]
73fn ratio_chars(a: &[char], b: &[char]) -> f64 {
74    let total = a.len() + b.len();
75    if total == 0 {
76        return 1.0;
77    }
78    let (ca, oa) = ascii_counts(a);
79    let (cb, ob) = ascii_counts(b);
80    // Non-ASCII present (rare in canonical code): the b2j fast path is ASCII-only, so use the
81    // automaton, which compares arbitrary code points.
82    if oa > 0 || ob > 0 {
83        return gestalt::gestalt_ratio_chars(a, b);
84    }
85    // Dispatch on b2j's estimated work `W = Σ_c count_a(c)·count_b(c)` (exactly the positions the
86    // first-block scan visits) per element. Below the threshold b2j is cheaper to set up; above it the
87    // repetitive case makes b2j's recursion blow up and the automaton wins. Committing to one path
88    // (rather than speculatively running b2j and aborting) avoids wasting work on the clear-automaton
89    // cases. The histograms just computed double as the b2j build's counts, so dispatch is near-free.
90    let mut w = 0u64;
91    for i in 0..128 {
92        w += u64::from(ca[i]) * u64::from(cb[i]);
93    }
94    if w <= work_factor() * total as u64 {
95        ratio_b2j_chars(a, b, &cb)
96    } else {
97        gestalt::gestalt_ratio_chars(a, b)
98    }
99}
100
101/// The difflib `b2j` ratio path directly (bypasses the length dispatch) — exposed for benchmarking
102/// and as a second, structurally-distinct exact implementation. Prefer [`ratio`] for real use.
103#[must_use]
104pub fn ratio_b2j(a: &str, b: &str) -> f64 {
105    let av: Vec<char> = a.chars().collect();
106    let bv: Vec<char> = b.chars().collect();
107    let (cb, ob) = ascii_counts(&bv);
108    if ob > 0 {
109        return gestalt::gestalt_ratio_chars(&av, &bv); // b2j fast path is ASCII-only
110    }
111    ratio_b2j_chars(&av, &bv, &cb)
112}
113
114/// Exact difflib [`ratio`] for many `(a, b)` pairs at once, computed **in parallel across all cores**
115/// (rayon). `ratio_many(pairs)[i]` equals `ratio(&pairs[i].0, &pairs[i].1)`, bit-for-bit.
116///
117/// This is the batch primitive: hand it the whole workload and the fan-out happens inside Rust — from
118/// Python it runs with the GIL released, so it saturates every core with no `ThreadPoolExecutor` and
119/// no per-call Python overhead.
120#[must_use]
121pub fn ratio_many(pairs: &[(String, String)]) -> Vec<f64> {
122    pairs.par_iter().map(|(a, b)| ratio(a, b)).collect()
123}
124
125// ───────────────────────── reference b2j path (independent oracle) ─────────────────────────
126// A faithful port of difflib's own algorithm (popular-character `b2j` index + the
127// `find_longest_match` recursion). Slower (this is what the suffix automaton replaces), kept as a
128// second, structurally-different implementation so the tests can assert the fast path matches it.
129
130/// Code point → ascending positions in `b`.
131fn build_b2j(b: &[char]) -> HashMap<char, Vec<usize>> {
132    let mut b2j: HashMap<char, Vec<usize>> = HashMap::new();
133    for (j, &c) in b.iter().enumerate() {
134        b2j.entry(c).or_default().push(j);
135    }
136    b2j
137}
138
139/// difflib `find_longest_match` over `a[alo:ahi] × b[blo:bhi]`; returns `(i, j, k)`.
140#[allow(clippy::similar_names)]
141fn find_longest(a: &[char], b2j: &HashMap<char, Vec<usize>>, alo: usize, ahi: usize, blo: usize, bhi: usize) -> (usize, usize, usize) {
142    let mut besti = alo;
143    let mut bestj = blo;
144    let mut bestsize = 0usize;
145    let mut j2_prev: HashMap<usize, usize> = HashMap::new();
146    for (i, ch) in a.iter().enumerate().take(ahi).skip(alo) {
147        let mut j2_cur: HashMap<usize, usize> = HashMap::new();
148        if let Some(positions) = b2j.get(ch) {
149            for &j in positions {
150                if j < blo {
151                    continue;
152                }
153                if j >= bhi {
154                    break;
155                }
156                let prev = if j > blo { *j2_prev.get(&(j - 1)).unwrap_or(&0) } else { 0 };
157                let k = prev + 1;
158                j2_cur.insert(j, k);
159                if k > bestsize {
160                    besti = i + 1 - k;
161                    bestj = j + 1 - k;
162                    bestsize = k;
163                }
164            }
165        }
166        j2_prev = j2_cur;
167    }
168    (besti, bestj, bestsize)
169}
170
171/// Total size of the Ratcliff–Obershelp matching blocks (difflib `get_matching_blocks`).
172#[allow(clippy::many_single_char_names)]
173fn matching_count(a: &[char], b: &[char], b2j: &HashMap<char, Vec<usize>>) -> usize {
174    let mut total = 0usize;
175    let mut stack: Vec<(usize, usize, usize, usize)> = vec![(0, a.len(), 0, b.len())];
176    while let Some((alo, ahi, blo, bhi)) = stack.pop() {
177        let (i, j, k) = find_longest(a, b2j, alo, ahi, blo, bhi);
178        if k > 0 {
179            total += k;
180            if alo < i && blo < j {
181                stack.push((alo, i, blo, j));
182            }
183            if i + k < ahi && j + k < bhi {
184                stack.push((i + k, ahi, j + k, bhi));
185            }
186        }
187    }
188    total
189}
190
191/// Reference (b2j) difflib ratio — structurally distinct from the suffix-automaton path; the test
192/// suite asserts [`gestalt_ratio`] equals this exactly. Prefer [`ratio`] for real use (far faster).
193#[allow(clippy::cast_precision_loss)]
194#[must_use]
195pub fn ratio_reference(a: &str, b: &str) -> f64 {
196    let av: Vec<char> = a.chars().collect();
197    let bv: Vec<char> = b.chars().collect();
198    let total = av.len() + bv.len();
199    if total == 0 {
200        return 1.0;
201    }
202    let b2j = build_b2j(&bv);
203    2.0 * (matching_count(&av, &bv, &b2j) as f64) / (total as f64)
204}
205
206// ───────────────────────── optimized b2j path (ASCII, short strings) ─────────────────────────
207// difflib's own algorithm with CPython's vector `j2len` + touched-index clearing (O(matches)
208// per row, not a per-row HashMap), and a **count-sort** b2j index (offsets[128] + a flat positions
209// array) instead of a per-char `HashMap<char, Vec>`. All buffers are reused thread-locals → ZERO
210// per-pair heap allocation, which is what lets it scale across threads (the HashMap version churned
211// the allocator). ASCII-only (the caller routes non-ASCII to the automaton). Exact — equals the SAM
212// path byte-for-byte (tested).
213
214#[derive(Default)]
215struct B2jScratch {
216    offsets: Vec<u32>,   // [129] prefix sums: char c's positions are positions[offsets[c]..offsets[c+1])
217    positions: Vec<u32>, // b positions grouped by char, ascending within each char (count-sort order)
218    cursor: Vec<u32>,    // write cursors during the count-sort fill
219    j2len: Vec<u32>,
220    erase: Vec<(u32, u32)>,  // (position, value) set in the previous row → reset to 0 this row
221    affect: Vec<(u32, u32)>, // (position, value) to set this row
222    stack: Vec<(usize, usize, usize, usize)>,
223}
224
225thread_local! {
226    /// Reused b2j scratch — no per-pair allocation in the short-string path.
227    static B2J: RefCell<B2jScratch> = RefCell::new(B2jScratch::default());
228}
229
230/// Estimated b2j inner-loop work `Σ_c count_a(c)·count_b(c)` — exactly the positions the first-block
231/// scan visits; the dispatch signal, also exposed for benchmarking. (`oa·ob` folds the non-ASCII tail.)
232#[must_use]
233pub fn b2j_work(a: &[char], b: &[char]) -> u64 {
234    let (ca, oa) = ascii_counts(a);
235    let (cb, ob) = ascii_counts(b);
236    let mut w = u64::from(oa) * u64::from(ob);
237    for i in 0..128 {
238        w += u64::from(ca[i]) * u64::from(cb[i]);
239    }
240    w
241}
242
243/// difflib `b2j` ratio: count-sort index (offsets + positions, all buffers reused → ZERO per-pair
244/// allocation) + difflib's `find_longest_match` recursion with a reused vector `j2len`. `cb` = ASCII
245/// counts of `b` (computed by the dispatcher, reused here as the count-sort counts); `b` must be ASCII.
246/// Second exact implementation; the tests assert it equals the automaton path byte-for-byte.
247#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
248fn ratio_b2j_chars(a: &[char], b: &[char], cb: &[u32; 128]) -> f64 {
249    let total = a.len() + b.len();
250    if total == 0 {
251        return 1.0;
252    }
253    if a.is_empty() || b.is_empty() {
254        return 0.0;
255    }
256    B2J.with_borrow_mut(|s| {
257        let B2jScratch { offsets, positions, cursor, j2len, erase, affect, stack } = s;
258        // count-sort b2j: prefix-sum cb → offsets, then place each b position into its char bucket.
259        offsets.clear();
260        offsets.resize(129, 0);
261        for c in 0..128 {
262            offsets[c + 1] = offsets[c] + cb[c];
263        }
264        cursor.clear();
265        cursor.extend_from_slice(&offsets[..128]);
266        // size `positions` WITHOUT zeroing: the count-sort below writes all b.len() slots before any
267        // read (each b position lands in exactly one bucket), so a resize(_, 0) memset would be pure
268        // bandwidth waste — and under many threads that waste is what otherwise caps b2j's scaling.
269        positions.clear();
270        positions.reserve(b.len());
271        #[allow(clippy::uninit_vec)]
272        // SAFETY: u32 has no invalid bit patterns; every index 0..b.len() is written by the
273        // count-sort below (one write per b position) before find_longest_b2j reads `positions`.
274        unsafe {
275            positions.set_len(b.len());
276        }
277        for (j, &ch) in b.iter().enumerate() {
278            let c = ch as usize; // ASCII guaranteed
279            positions[cursor[c] as usize] = j as u32;
280            cursor[c] += 1;
281        }
282        j2len.clear();
283        j2len.resize(b.len() + 1, 0);
284        // at most one (key, value) per b-position per row → reserve once so `push` never reallocates.
285        erase.reserve(b.len() + 1);
286        affect.reserve(b.len() + 1);
287        stack.clear();
288        stack.push((0, a.len(), 0, b.len()));
289        let mut m = 0usize;
290        while let Some((alo, ahi, blo, bhi)) = stack.pop() {
291            let (bi, bj, bk) = find_longest_b2j(a, offsets, positions, j2len, erase, affect, alo, ahi, blo, bhi);
292            if bk > 0 {
293                m += bk;
294                if alo < bi && blo < bj {
295                    stack.push((alo, bi, blo, bj));
296                }
297                if bi + bk < ahi && bj + bk < bhi {
298                    stack.push((bi + bk, ahi, bj + bk, bhi));
299                }
300            }
301        }
302        2.0 * m as f64 / total as f64
303    })
304}
305
306/// difflib `find_longest_match` with a reused vector `j2len` (cleared via the touched-index lists) and
307/// the count-sort b2j index (char `c`'s positions = `positions[offsets[c]..offsets[c+1])`).
308///
309/// The inner loop runs `M` times (the match count), so it is the whole cost — `get_unchecked` there
310/// removes the per-iteration `j2len[j]` bounds check the optimizer can't elide (≈10% on b2j, per the
311/// disassembly). `affect`/`erase` are pre-reserved by the caller so `push` never reallocates in the
312/// loop. A plain `for i in alo..ahi` (not `take().skip()`) keeps the iterator out of the hot path.
313#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation, clippy::similar_names)]
314fn find_longest_b2j(
315    a: &[char],
316    offsets: &[u32],
317    positions: &[u32],
318    j2len: &mut [u32],
319    erase: &mut Vec<(u32, u32)>,
320    affect: &mut Vec<(u32, u32)>,
321    alo: usize,
322    ahi: usize,
323    blo: usize,
324    bhi: usize,
325) -> (usize, usize, usize) {
326    let (mut bi, mut bj, mut bk) = (alo, blo, 0usize);
327    erase.clear();
328    // SAFETY (whole loop): `i ∈ [alo, ahi) ⊆ [0, a.len())`. `c < 128 = offsets.len()-1` is guarded, so
329    // `offsets[c]`/`offsets[c+1]` are in bounds and `[lo, hi) ⊆ [0, positions.len())`. Every match
330    // position `j` satisfies `blo ≤ j < bhi ≤ b.len()`, and written keys are `j+1 ≤ b.len()`, both
331    // `< j2len.len() = b.len()+1`. `erase`/`affect` hold only such keys, so their clears are in bounds.
332    #[allow(clippy::undocumented_unsafe_blocks)]
333    unsafe {
334        for i in alo..ahi {
335            affect.clear();
336            let c = *a.get_unchecked(i) as usize;
337            if c < 128 {
338                let lo = *offsets.get_unchecked(c) as usize;
339                let hi = *offsets.get_unchecked(c + 1) as usize;
340                for &jj in positions.get_unchecked(lo..hi) {
341                    let j = jj as usize;
342                    if j < blo {
343                        continue;
344                    }
345                    if j >= bhi {
346                        break;
347                    }
348                    let k = *j2len.get_unchecked(j) as usize + 1;
349                    affect.push((j as u32 + 1, k as u32));
350                    if k > bk {
351                        bi = i + 1 - k;
352                        bj = j + 1 - k;
353                        bk = k;
354                    }
355                }
356            }
357            for &(p, _) in erase.iter() {
358                *j2len.get_unchecked_mut(p as usize) = 0;
359            }
360            for &(p, v) in affect.iter() {
361                *j2len.get_unchecked_mut(p as usize) = v;
362            }
363            std::mem::swap(erase, affect);
364        }
365        for &(p, _) in erase.iter() {
366            *j2len.get_unchecked_mut(p as usize) = 0;
367        }
368    }
369    (bi, bj, bk)
370}
371
372// ───────────────────────────── cheap exact upper-bound filters ─────────────────────────────
373
374/// difflib `real_quick_ratio`: a length-only upper bound on `ratio` (cheap skip).
375#[allow(clippy::cast_precision_loss)]
376fn real_quick_ratio(a: &[char], b: &[char]) -> f64 {
377    let total = a.len() + b.len();
378    if total == 0 {
379        return 1.0;
380    }
381    2.0 * (a.len().min(b.len()) as f64) / (total as f64)
382}
383
384/// Sorted `(char, count)` multiset of `a` — precomputed once per string so the `quick_ratio`
385/// upper-bound filter is a linear merge over the (small) alphabet instead of a per-pair `HashMap`.
386fn char_counts(a: &[char]) -> Vec<(char, u32)> {
387    let mut v = a.to_vec();
388    v.sort_unstable();
389    let mut out: Vec<(char, u32)> = Vec::new();
390    for c in v {
391        match out.last_mut() {
392            Some(last) if last.0 == c => last.1 += 1,
393            _ => out.push((c, 1)),
394        }
395    }
396    out
397}
398
399/// difflib `quick_ratio` from precomputed sorted char-counts: `2·Σ min(count_a, count_b)/(|a|+|b|)`,
400/// an exact upper bound on `ratio`. Merge of two sorted multisets — O(distinct chars), no hashing.
401#[allow(clippy::cast_precision_loss)]
402fn quick_ratio_counts(ca: &[(char, u32)], cb: &[(char, u32)], total: usize) -> f64 {
403    if total == 0 {
404        return 1.0;
405    }
406    let (mut x, mut y, mut matches) = (0usize, 0usize, 0u32);
407    while x < ca.len() && y < cb.len() {
408        match ca[x].0.cmp(&cb[y].0) {
409            std::cmp::Ordering::Less => x += 1,
410            std::cmp::Ordering::Greater => y += 1,
411            std::cmp::Ordering::Equal => {
412                matches += ca[x].1.min(cb[y].1);
413                x += 1;
414                y += 1;
415            }
416        }
417    }
418    2.0 * f64::from(matches) / total as f64
419}
420
421fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
422    while parent[x] != x {
423        parent[x] = parent[parent[x]];
424        x = parent[x];
425    }
426    x
427}
428
429// Env-gated diagnostics: set DIFFLIB_FAST_PROGRESS=1 to stream phase timings + progress to stderr
430// from inside the Rust hot path (off in production — zero output, ~zero cost).
431fn progress_on() -> bool {
432    std::env::var_os("DIFFLIB_FAST_PROGRESS").is_some()
433}
434
435// ───────────────────────────────────── exact clustering ─────────────────────────────────────
436
437/// Qualifying pairs `(i<j, ratio)` with `ratio >= threshold`, in parallel. The exact upper-bound
438/// early-exits (`real_quick_ratio`/`quick_ratio`) skip most pairs without the full O(len²) RO;
439/// survivors go through `gestalt_edge` — reject early-exit for non-edges, exact ratio for edges. The
440/// edge ratio is kept so `min_sim` reuses it (a dense cluster's intra pairs are ~all edges, so the
441/// `min_sim` pass recomputes almost nothing — removing the redundant second scan over the same pairs).
442#[allow(clippy::cast_precision_loss, clippy::many_single_char_names)]
443fn qualifying_pairs(chars: &[Vec<char>], sams: &[gestalt::Sam], threshold: f64) -> Vec<(usize, usize, f64)> {
444    use std::sync::atomic::{AtomicUsize, Ordering};
445
446    let n = chars.len();
447    let rows = AtomicUsize::new(0);
448    std::thread::scope(|scope| {
449        if progress_on() {
450            let rows = &rows;
451            scope.spawn(move || {
452                while rows.load(Ordering::Relaxed) < n {
453                    std::thread::sleep(std::time::Duration::from_secs(1));
454                    let done = rows.load(Ordering::Relaxed);
455                    eprintln!("    [difflib-fast] qualifying_pairs: row {done}/{n} ({:.0}%)", done as f64 / n as f64 * 100.0);
456                }
457            });
458        }
459        // Length blocking: ratio>=T ⟹ |short|/|long| >= T/(2-T), so in length-sorted order each
460        // string only reaches a contiguous run of (not-too-much-longer) strings — break the inner
461        // loop as soon as `real_quick_ratio` drops below T. Turns the O(n²) enumeration into
462        // O(n·window); exact (never drops a qualifying pair). `counts` feeds the cheap quick filter.
463        let mut order: Vec<usize> = (0..n).collect();
464        order.sort_by_key(|&i| chars[i].len());
465        let counts: Vec<Vec<(char, u32)>> = chars.par_iter().map(|c| char_counts(c)).collect();
466        let pairs = (0..n)
467            .into_par_iter()
468            .flat_map_iter(|p| {
469                let i = order[p];
470                let a = &chars[i];
471                let mut local: Vec<(usize, usize, f64)> = Vec::new();
472                for &j in &order[p + 1..] {
473                    let b = &chars[j];
474                    if real_quick_ratio(a, b) < threshold {
475                        break; // lengths only grow ⇒ all remaining partners also fail the bound
476                    }
477                    if quick_ratio_counts(&counts[i], &counts[j], a.len() + b.len()) < threshold {
478                        continue;
479                    }
480                    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
481                    // Reject early-exit for non-edges; exact ratio (cached for min_sim) for edges.
482                    if let Some(r) = gestalt::gestalt_edge(&chars[lo], &chars[hi], &sams[hi], threshold) {
483                        local.push((lo, hi, r));
484                    }
485                }
486                rows.fetch_add(1, Ordering::Relaxed);
487                local
488            })
489            .collect();
490        rows.store(n, Ordering::Relaxed); // unblock the progress thread
491        pairs
492    })
493}
494
495/// Min intra-cluster pairwise ratio (single-linkage's conservative figure), exact. Edge pairs are
496/// already in `ratios` (cached from the qualifying pass) — for a dense cluster that is ~every intra
497/// pair, so almost nothing is recomputed. The rare missing (non-edge, ratio < threshold) pair is
498/// computed with the pruned `gestalt_ratio_capped` (accept-exits any pair above the running min).
499/// Parallel over members; each task's cap is its own running min (>= the global min ⇒ exact min kept).
500fn cluster_min_sim(members: &[usize], chars: &[Vec<char>], sams: &[gestalt::Sam], ratios: &HashMap<(usize, usize), f64>) -> f64 {
501    members
502        .par_iter()
503        .enumerate()
504        .map(|(pos, &i)| {
505            let mut local = 1.0_f64;
506            for &j in &members[pos + 1..] {
507                let key = if i < j { (i, j) } else { (j, i) };
508                let r = match ratios.get(&key) {
509                    Some(&r) => r, // edge ratio cached by the qualifying pass — no recompute
510                    None => gestalt::gestalt_ratio_capped(&chars[key.0], &chars[key.1], &sams[key.1], local),
511                };
512                local = local.min(r);
513            }
514            local
515        })
516        .reduce(|| 1.0_f64, f64::min)
517}
518
519/// Union-find over qualifying edge pairs → clusters (size >= 2), each with its exact min intra-pair
520/// ratio. The qualifying pass's edge ratios are cached in `ratios` and reused by `cluster_min_sim`.
521fn assemble(n: usize, pairs: Vec<(usize, usize, f64)>, chars: &[Vec<char>], sams: &[gestalt::Sam]) -> Vec<(Vec<usize>, f64)> {
522    let mut parent: Vec<usize> = (0..n).collect();
523    let mut ratios: HashMap<(usize, usize), f64> = HashMap::with_capacity(pairs.len());
524    for (i, j, r) in pairs {
525        ratios.insert((i, j), r);
526        let (ri, rj) = (uf_find(&mut parent, i), uf_find(&mut parent, j));
527        if ri != rj {
528            parent[ri] = rj;
529        }
530    }
531    let mut comps: HashMap<usize, Vec<usize>> = HashMap::new();
532    for i in 0..n {
533        let root = uf_find(&mut parent, i);
534        comps.entry(root).or_default().push(i);
535    }
536    let mut out: Vec<(Vec<usize>, f64)> = Vec::new();
537    for members in comps.values() {
538        if members.len() < 2 {
539            continue;
540        }
541        let min_sim = cluster_min_sim(members, chars, sams, &ratios);
542        let mut sorted = members.clone();
543        sorted.sort_unstable();
544        out.push((sorted, min_sim));
545    }
546    out.sort_by(|a, b| a.0[0].cmp(&b.0[0]));
547    out
548}
549
550/// Exact single-linkage clustering over pre-collected `char` vectors: returns each cluster (member
551/// indices, sorted) with its exact minimum pairwise ratio. O(n²) early-exit join, rayon-parallel.
552#[must_use]
553pub fn cluster_canonicals_chars(chars: &[Vec<char>], threshold: f64) -> Vec<(Vec<usize>, f64)> {
554    let n = chars.len();
555    // Prebuild each string's suffix automaton ONCE (n builds), reused as the b-side for all n²
556    // pairs — the all-pairs cost becomes n builds + n² scans, not n² builds.
557    let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
558    let pairs = qualifying_pairs(chars, &sams, threshold);
559    assemble(n, pairs, chars, &sams)
560}
561
562/// `cluster_canonicals(canonicals, threshold)` → `[(member indices, min pairwise ratio)]`.
563///
564/// Exact single-linkage clustering: `ratio >= threshold` joins two strings; each returned cluster
565/// (size >= 2) carries its exact minimum intra-cluster ratio. Bit-identical to the reference
566/// pairwise clustering — just far faster (suffix automaton + early-exit + rayon).
567#[must_use]
568pub fn cluster_canonicals(canonicals: &[String], threshold: f64) -> Vec<(Vec<usize>, f64)> {
569    let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
570    cluster_canonicals_chars(&chars, threshold)
571}
572
573// ─────────────────────────────── scalable MinHash-LSH variant ───────────────────────────────
574
575const SHINGLE_K: usize = 9; // char-k-gram length for MinHash shingles (calibrated on real code)
576
577fn fnv1a_bytes(data: &[u8]) -> u64 {
578    let mut h = 0xcbf2_9ce4_8422_2325u64;
579    for &b in data {
580        h ^= u64::from(b);
581        h = h.wrapping_mul(0x0000_0100_0000_01b3);
582    }
583    h
584}
585
586fn fnv1a_u64s(values: &[u64]) -> u64 {
587    let mut h = 0xcbf2_9ce4_8422_2325u64;
588    for &v in values {
589        h ^= v;
590        h = h.wrapping_mul(0x0000_0100_0000_01b3);
591    }
592    h
593}
594
595/// Distinct char-k-gram shingle hashes of `s` (the set `MinHash` estimates Jaccard over).
596fn shingle_hashes(s: &str) -> Vec<u64> {
597    let bytes = s.as_bytes();
598    if bytes.len() <= SHINGLE_K {
599        return vec![fnv1a_bytes(bytes)];
600    }
601    let mut set: HashSet<u64> = HashSet::new();
602    for window in bytes.windows(SHINGLE_K) {
603        set.insert(fnv1a_bytes(window));
604    }
605    set.into_iter().collect()
606}
607
608/// `num` deterministic `(a, b)` hash permutations (fixed seed → reproducible signatures).
609fn make_perms(num: usize) -> Vec<(u64, u64)> {
610    let mut state = 0x9e37_79b9_7f4a_7c15u64;
611    let mut next = move || {
612        state ^= state << 13;
613        state ^= state >> 7;
614        state ^= state << 17;
615        state
616    };
617    (0..num).map(|_| (next() | 1, next())).collect()
618}
619
620fn minhash(shingles: &[u64], perms: &[(u64, u64)]) -> Vec<u64> {
621    perms
622        .iter()
623        .map(|&(a, b)| shingles.iter().map(|&h| a.wrapping_mul(h).wrapping_add(b)).min().unwrap_or(u64::MAX))
624        .collect()
625}
626
627/// LSH candidate pairs: documents that share a full band signature in any band (an O(n)-ish proxy
628/// for "Jaccard above the band threshold" — recall tuned via `band_rows`).
629fn lsh_candidates(sigs: &[Vec<u64>], band_rows: usize) -> HashSet<(usize, usize)> {
630    let bands = sigs.first().map_or(0, Vec::len).checked_div(band_rows).unwrap_or(0);
631    let mut candidates: HashSet<(usize, usize)> = HashSet::new();
632    for band in 0..bands {
633        let lo = band * band_rows;
634        let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
635        for (d, sig) in sigs.iter().enumerate() {
636            buckets.entry(fnv1a_u64s(&sig[lo..lo + band_rows])).or_default().push(d);
637        }
638        for docs in buckets.values() {
639            for a in 0..docs.len() {
640                for b in (a + 1)..docs.len() {
641                    candidates.insert((docs[a].min(docs[b]), docs[a].max(docs[b])));
642                }
643            }
644        }
645    }
646    candidates
647}
648
649/// `cluster_canonicals_lsh(canonicals, threshold, num_perm, band_rows)`: the scalable path.
650///
651/// `MinHash`-LSH generates candidate pairs in ~O(n) (skipping the O(n²) dissimilar pairs); each
652/// candidate is then **verified with the exact ratio**, so clusters + `min_sim` match the exact path
653/// (modulo LSH recall, tuned high via `band_rows`). Filter-verification, in the `BayesLSH`-Lite /
654/// `SourcererCC` lineage. Use past the O(n²) wall (>100k strings); for exact recall use
655/// [`cluster_canonicals`].
656#[must_use]
657pub fn cluster_canonicals_lsh(canonicals: &[String], threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
658    let debug = progress_on();
659    let start = std::time::Instant::now();
660    let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
661    let n = chars.len();
662    let perms = make_perms(num_perm);
663    let sigs: Vec<Vec<u64>> = canonicals.par_iter().map(|s| minhash(&shingle_hashes(s), &perms)).collect();
664    if debug {
665        eprintln!("    [difflib-fast] lsh: {n} signatures in {:.2}s", start.elapsed().as_secs_f64());
666    }
667    let candidates = lsh_candidates(&sigs, band_rows);
668    if debug {
669        eprintln!("    [difflib-fast] lsh: {} candidate pairs in {:.2}s", candidates.len(), start.elapsed().as_secs_f64());
670    }
671    let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
672    let cand: Vec<(usize, usize)> = candidates.into_iter().collect();
673    let pairs: Vec<(usize, usize, f64)> = cand
674        .par_iter()
675        .filter_map(|&(i, j)| {
676            let (a, b) = if i < j { (i, j) } else { (j, i) };
677            gestalt::gestalt_edge(&chars[a], &chars[b], &sams[b], threshold).map(|r| (a, b, r))
678        })
679        .collect();
680    if debug {
681        eprintln!("    [difflib-fast] lsh: {} verified pairs in {:.2}s", pairs.len(), start.elapsed().as_secs_f64());
682    }
683    assemble(n, pairs, &chars, &sams)
684}
685
686// ───────────────────────────────── optional Python bindings ─────────────────────────────────
687
688#[cfg(feature = "python")]
689mod python {
690    use pyo3::prelude::*;
691
692    /// `ratio(a, b)` — fast exact `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`.
693    ///
694    /// Releases the GIL for the compute (the inputs are copied to owned `String`s first), so calling
695    /// this from many Python threads actually scales across cores instead of serializing on the GIL.
696    /// Backs the scalar form of the public `ratio`; for a batch the package routes to `ratio_many`.
697    #[pyfunction]
698    fn ratio(py: Python<'_>, a: &str, b: &str) -> f64 {
699        let (a, b) = (a.to_owned(), b.to_owned());
700        py.detach(|| super::ratio(&a, &b))
701    }
702
703    /// `ratio_many(pairs)` → one exact ratio per `(a, b)` pair, **computed across all cores inside
704    /// Rust** (rayon, GIL released). Backs the list form of the public `ratio` — the contention-free
705    /// batch path, no `ThreadPoolExecutor`.
706    #[pyfunction]
707    #[allow(clippy::needless_pass_by_value)]
708    fn ratio_many(py: Python<'_>, pairs: Vec<(String, String)>) -> Vec<f64> {
709        py.detach(|| super::ratio_many(&pairs))
710    }
711
712    /// `cluster_canonicals(canonicals, threshold)` → `[(member indices, min pairwise ratio)]`.
713    ///
714    /// Fans the all-pairs join out across every core internally (rayon) — one call, full multicore,
715    /// no Python threads needed. The GIL is released during the compute, so it never blocks the rest
716    /// of your program. This is the contention-free way to use all cores from Python.
717    #[pyfunction]
718    #[allow(clippy::needless_pass_by_value)]
719    fn cluster_canonicals(py: Python<'_>, canonicals: Vec<String>, threshold: f64) -> Vec<(Vec<usize>, f64)> {
720        py.detach(|| super::cluster_canonicals(&canonicals, threshold))
721    }
722
723    /// `cluster_canonicals_lsh(canonicals, threshold, num_perm, band_rows)` — scalable LSH variant.
724    #[pyfunction]
725    #[allow(clippy::needless_pass_by_value)]
726    fn cluster_canonicals_lsh(py: Python<'_>, canonicals: Vec<String>, threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
727        py.detach(|| super::cluster_canonicals_lsh(&canonicals, threshold, num_perm, band_rows))
728    }
729
730    /// Compiled core of the `difflib_fast` Python package (re-exported by `difflib_fast/__init__.py`).
731    #[pymodule]
732    fn _difflib_fast(m: &Bound<'_, PyModule>) -> PyResult<()> {
733        m.add_function(wrap_pyfunction!(ratio, m)?)?;
734        m.add_function(wrap_pyfunction!(ratio_many, m)?)?;
735        m.add_function(wrap_pyfunction!(cluster_canonicals, m)?)?;
736        m.add_function(wrap_pyfunction!(cluster_canonicals_lsh, m)?)?;
737        Ok(())
738    }
739}
740
741#[cfg(test)]
742mod tests {
743    #![allow(clippy::float_cmp, clippy::unreadable_literal)]
744    use super::{cluster_canonicals, gestalt_ratio, ratio, ratio_b2j, ratio_reference};
745
746    #[test]
747    fn matches_known_difflib_values() {
748        // Cross-checked against difflib.SequenceMatcher(None, a, b, autojunk=False).ratio().
749        assert_eq!(gestalt_ratio("", ""), 1.0);
750        assert_eq!(gestalt_ratio("", "x"), 0.0);
751        assert_eq!(gestalt_ratio("abc", "abc"), 1.0);
752        assert_eq!(gestalt_ratio("abc", "abd"), 0.6666666666666666);
753        assert_eq!(gestalt_ratio("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
754        assert_eq!(gestalt_ratio("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
755    }
756
757    // The fast suffix-automaton path must equal the structurally-distinct b2j reference exactly.
758    #[test]
759    fn fast_matches_reference() {
760        let mut s: u64 = 0x1234_5678_9abc_def1;
761        let mut next = || {
762            s ^= s << 13;
763            s ^= s >> 7;
764            s ^= s << 17;
765            s
766        };
767        for _ in 0..2000 {
768            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
769                (0..n).map(|_| char::from(b'a' + (rng() % 5) as u8)).collect()
770            };
771            let (la, lb) = ((next() % 50) as usize, (next() % 50) as usize);
772            let a = mk(la, &mut next);
773            let b = mk(lb, &mut next);
774            let r = ratio_reference(&a, &b);
775            assert_eq!(gestalt_ratio(&a, &b), r, "SAM a={a:?} b={b:?}");
776            assert_eq!(ratio_b2j(&a, &b), r, "b2j a={a:?} b={b:?}");
777        }
778    }
779
780    // Both dispatch branches (b2j for short, SAM for long) must agree with the reference on long,
781    // repetitive strings that cross B2J_CROSSOVER.
782    #[test]
783    fn long_strings_all_paths_agree() {
784        let mut s: u64 = 0xdead_beef_cafe_1234;
785        let mut next = || {
786            s ^= s << 13;
787            s ^= s >> 7;
788            s ^= s << 17;
789            s
790        };
791        for _ in 0..40 {
792            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
793                (0..n).map(|_| char::from(b'a' + (rng() % 6) as u8)).collect()
794            };
795            let a = mk(1400 + (next() % 600) as usize, &mut next); // > B2J_CROSSOVER ⇒ SAM branch
796            let b = mk(1400 + (next() % 600) as usize, &mut next);
797            let r = ratio_reference(&a, &b);
798            assert_eq!(gestalt_ratio(&a, &b), r);
799            assert_eq!(ratio_b2j(&a, &b), r);
800            assert_eq!(ratio(&a, &b), r); // dispatched (SAM here)
801        }
802    }
803
804    #[test]
805    fn clusters_obvious_duplicates() {
806        let corpus: Vec<String> = vec![
807            "def add(a, b): return a + b".into(),
808            "def add(x, y): return x + y".into(),
809            "completely unrelated text here".into(),
810        ];
811        let clusters = cluster_canonicals(&corpus, 0.5);
812        assert_eq!(clusters.len(), 1, "the two add() variants should cluster");
813        assert_eq!(clusters[0].0, vec![0, 1]);
814        assert!(clusters[0].1 >= 0.5);
815    }
816}
difflib_fast/lib.rs

difflib_fast/
lib.rs