Skip to main content

difflib_fast/
lib.rs

1//! `difflib-fast` — fast, **byte-for-byte exact** difflib Ratcliff–Obershelp ("gestalt") similarity.
2//!
3//! [`ratio`] / [`gestalt::gestalt_ratio`] are a drop-in for
4//! `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`:
5//!   `ratio = 2·M / (len(a)+len(b))`, where `M` is the total size of the Ratcliff–Obershelp matching
6//! blocks. The result — including difflib's tie-break (longest; earliest-a; earliest-b) and its
7//! argument-order asymmetry — is reproduced exactly, but `M` is computed via a **suffix automaton**
8//! (LCS in O(|a|+|b|) regardless of character frequency) instead of difflib's popular-character
9//! `b2j` rescans. On long, small-alphabet text (e.g. canonicalized source code) this is the
10//! difference between difflib's pathological case and a linear scan.
11//!
12//! Beyond the per-pair ratio, [`cluster_canonicals`] does an exact single-linkage **clustering** of a
13//! corpus at a similarity threshold — prebuild each string's automaton once, then an early-exit
14//! all-pairs join (length blocking + `quick_ratio` filter + threshold-aware RO), in parallel via
15//! rayon — and reports each cluster with its exact minimum pairwise ratio. [`cluster_canonicals_lsh`]
16//! is the scalable `MinHash`-LSH variant (candidate generation + exact verification) for very large
17//! corpora past the O(n²) wall.
18//!
19//! Two independent implementations of `M` back this: the suffix-automaton path ([`gestalt`]) and a
20//! straight port of difflib's `b2j` recursion (a test-only reference oracle); the test suite asserts
21//! they are bit-identical, which is the crate's core correctness gate.
22
23use std::cell::RefCell;
24use std::collections::{HashMap, HashSet};
25
26use rayon::prelude::*;
27
28pub mod gestalt;
29pub use gestalt::gestalt_ratio;
30
31// Exact all-pairs weighted-cosine similarity join (AllPairs/L2AP) — inverted index + prefix
32// filtering. The principled replacement for shingle-candidate near-duplicate detection.
33pub mod simjoin;
34
35// Heterogeneous CPU+GPU exact RO — Metal compute backend (Apple Silicon). Behind `feature = "gpu"`
36// + `cfg(target_os = "macos")`; the rest of the crate falls back to the CPU path when it's off or
37// no Metal device is available at runtime.
38#[cfg(all(feature = "gpu", target_os = "macos"))]
39pub mod gpu;
40
41// GPU batched sparse-cosine — the offload experiment for `simjoin`'s bandwidth-bound verify step.
42#[cfg(all(feature = "gpu", target_os = "macos"))]
43pub mod simjoin_gpu;
44
45// `Rationer` — the stateful, GPU-accelerated similarity/clustering handle. Always available;
46// degrades to CPU when the `gpu` feature is off or no Metal device can be acquired.
47pub mod rationer;
48pub use rationer::{Concurrency, PreparedRationer, Rationer, RationerBuilder};
49
50/// Dispatch threshold: take the `b2j` path while its estimated work per element
51/// (`Σ_c count_a·count_b / (|a|+|b|)`) stays at/below this; above it the automaton wins. Tuned on real
52/// canonicalized-code corpora. Override at runtime with `DF_WORK_FACTOR`. (The clustering join always
53/// uses the automaton — there it's prebuilt once and reused across all n² scans, so it always wins.)
54const B2J_WORK_FACTOR: u64 = 34;
55
56/// Fast exact difflib ratio. Bit-identical to
57/// `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`.
58///
59/// Dispatches by length: short inputs take the lightweight difflib `b2j` recursion (cheap to set up),
60/// long inputs take the suffix-automaton LCS (frequency-independent, so it doesn't degrade on
61/// repetitive text). Both paths are exact and agree bit-for-bit.
62#[must_use]
63pub fn ratio(a: &str, b: &str) -> f64 {
64    let av: Vec<char> = a.chars().collect();
65    let bv: Vec<char> = b.chars().collect();
66    ratio_chars(&av, &bv)
67}
68
69/// ASCII char histogram (canonical code is ~all ASCII; non-ASCII folded into one overflow bucket).
70fn ascii_counts(s: &[char]) -> ([u32; 128], u32) {
71    let mut c = [0u32; 128];
72    let mut other = 0u32;
73    for &ch in s {
74        let u = ch as u32;
75        if u < 128 {
76            c[u as usize] += 1;
77        } else {
78            other += 1;
79        }
80    }
81    (c, other)
82}
83
84fn work_factor() -> u64 {
85    use std::sync::OnceLock;
86    static F: OnceLock<u64> = OnceLock::new();
87    *F.get_or_init(|| std::env::var("DF_WORK_FACTOR").ok().and_then(|s| s.parse().ok()).unwrap_or(B2J_WORK_FACTOR))
88}
89
90#[allow(clippy::cast_precision_loss)]
91#[must_use]
92fn ratio_chars(a: &[char], b: &[char]) -> f64 {
93    let total = a.len() + b.len();
94    if total == 0 {
95        return 1.0;
96    }
97    let (ca, oa) = ascii_counts(a);
98    let (cb, ob) = ascii_counts(b);
99    // Non-ASCII present (rare in canonical code): the b2j fast path is ASCII-only, so use the
100    // automaton, which compares arbitrary code points.
101    if oa > 0 || ob > 0 {
102        return gestalt::gestalt_ratio_chars(a, b);
103    }
104    // Dispatch on b2j's estimated work `W = Σ_c count_a(c)·count_b(c)` (exactly the positions the
105    // first-block scan visits) per element. Below the threshold b2j is cheaper to set up; above it the
106    // repetitive case makes b2j's recursion blow up and the automaton wins. Committing to one path
107    // (rather than speculatively running b2j and aborting) avoids wasting work on the clear-automaton
108    // cases. The histograms just computed double as the b2j build's counts, so dispatch is near-free.
109    let mut w = 0u64;
110    for i in 0..128 {
111        w += u64::from(ca[i]) * u64::from(cb[i]);
112    }
113    if w <= work_factor() * total as u64 {
114        ratio_b2j_chars(a, b, &cb)
115    } else {
116        gestalt::gestalt_ratio_chars(a, b)
117    }
118}
119
120/// The difflib `b2j` ratio path directly (bypasses the length dispatch) — a second,
121/// structurally-distinct exact implementation kept as a test oracle for the suffix-automaton path.
122#[cfg(test)]
123#[must_use]
124fn ratio_b2j(a: &str, b: &str) -> f64 {
125    let av: Vec<char> = a.chars().collect();
126    let bv: Vec<char> = b.chars().collect();
127    let (cb, ob) = ascii_counts(&bv);
128    if ob > 0 {
129        return gestalt::gestalt_ratio_chars(&av, &bv); // b2j fast path is ASCII-only
130    }
131    ratio_b2j_chars(&av, &bv, &cb)
132}
133
134/// Exact difflib [`ratio`] for many `(a, b)` pairs at once, computed **in parallel across all cores**
135/// (rayon). `ratio_many(pairs)[i]` equals `ratio(&pairs[i].0, &pairs[i].1)`, bit-for-bit.
136///
137/// This is the batch primitive: hand it the whole workload and the fan-out happens inside Rust — from
138/// Python it runs with the GIL released, so it saturates every core with no `ThreadPoolExecutor` and
139/// no per-call Python overhead.
140#[must_use]
141pub fn ratio_many(pairs: &[(String, String)]) -> Vec<f64> {
142    pairs.par_iter().map(|(a, b)| ratio(a, b)).collect()
143}
144
145// ───────────────────────── reference b2j path (independent oracle) ─────────────────────────
146// A faithful port of difflib's own algorithm (popular-character `b2j` index + the
147// `find_longest_match` recursion). Slower (this is what the suffix automaton replaces), kept as a
148// second, structurally-different implementation so the tests can assert the fast path matches it.
149// Test-only: not part of the public API.
150
151/// Code point → ascending positions in `b`.
152#[cfg(test)]
153fn build_b2j(b: &[char]) -> HashMap<char, Vec<usize>> {
154    let mut b2j: HashMap<char, Vec<usize>> = HashMap::new();
155    for (j, &c) in b.iter().enumerate() {
156        b2j.entry(c).or_default().push(j);
157    }
158    b2j
159}
160
161/// difflib `find_longest_match` over `a[alo:ahi] × b[blo:bhi]`; returns `(i, j, k)`.
162#[cfg(test)]
163#[allow(clippy::similar_names)]
164fn find_longest(a: &[char], b2j: &HashMap<char, Vec<usize>>, alo: usize, ahi: usize, blo: usize, bhi: usize) -> (usize, usize, usize) {
165    let mut besti = alo;
166    let mut bestj = blo;
167    let mut bestsize = 0usize;
168    let mut j2_prev: HashMap<usize, usize> = HashMap::new();
169    for (i, ch) in a.iter().enumerate().take(ahi).skip(alo) {
170        let mut j2_cur: HashMap<usize, usize> = HashMap::new();
171        if let Some(positions) = b2j.get(ch) {
172            for &j in positions {
173                if j < blo {
174                    continue;
175                }
176                if j >= bhi {
177                    break;
178                }
179                let prev = if j > blo { *j2_prev.get(&(j - 1)).unwrap_or(&0) } else { 0 };
180                let k = prev + 1;
181                j2_cur.insert(j, k);
182                if k > bestsize {
183                    besti = i + 1 - k;
184                    bestj = j + 1 - k;
185                    bestsize = k;
186                }
187            }
188        }
189        j2_prev = j2_cur;
190    }
191    (besti, bestj, bestsize)
192}
193
194/// Total size of the Ratcliff–Obershelp matching blocks (difflib `get_matching_blocks`).
195#[cfg(test)]
196#[allow(clippy::many_single_char_names)]
197fn matching_count(a: &[char], b: &[char], b2j: &HashMap<char, Vec<usize>>) -> usize {
198    let mut total = 0usize;
199    let mut stack: Vec<(usize, usize, usize, usize)> = vec![(0, a.len(), 0, b.len())];
200    while let Some((alo, ahi, blo, bhi)) = stack.pop() {
201        let (i, j, k) = find_longest(a, b2j, alo, ahi, blo, bhi);
202        if k > 0 {
203            total += k;
204            if alo < i && blo < j {
205                stack.push((alo, i, blo, j));
206            }
207            if i + k < ahi && j + k < bhi {
208                stack.push((i + k, ahi, j + k, bhi));
209            }
210        }
211    }
212    total
213}
214
215/// Reference (b2j) difflib ratio — structurally distinct from the suffix-automaton path; the test
216/// suite asserts [`gestalt_ratio`] equals this exactly. Test oracle only (not public API).
217#[cfg(test)]
218#[allow(clippy::cast_precision_loss)]
219#[must_use]
220fn ratio_reference(a: &str, b: &str) -> f64 {
221    let av: Vec<char> = a.chars().collect();
222    let bv: Vec<char> = b.chars().collect();
223    let total = av.len() + bv.len();
224    if total == 0 {
225        return 1.0;
226    }
227    let b2j = build_b2j(&bv);
228    2.0 * (matching_count(&av, &bv, &b2j) as f64) / (total as f64)
229}
230
231// ───────────────────────── optimized b2j path (ASCII, short strings) ─────────────────────────
232// difflib's own algorithm with CPython's vector `j2len` + touched-index clearing (O(matches)
233// per row, not a per-row HashMap), and a **count-sort** b2j index (offsets[128] + a flat positions
234// array) instead of a per-char `HashMap<char, Vec>`. All buffers are reused thread-locals → ZERO
235// per-pair heap allocation, which is what lets it scale across threads (the HashMap version churned
236// the allocator). ASCII-only (the caller routes non-ASCII to the automaton). Exact — equals the SAM
237// path byte-for-byte (tested).
238
239#[derive(Default)]
240struct B2jScratch {
241    offsets: Vec<u32>,   // [129] prefix sums: char c's positions are positions[offsets[c]..offsets[c+1])
242    positions: Vec<u32>, // b positions grouped by char, ascending within each char (count-sort order)
243    cursor: Vec<u32>,    // write cursors during the count-sort fill
244    j2len: Vec<u32>,
245    erase: Vec<(u32, u32)>,  // (position, value) set in the previous row → reset to 0 this row
246    affect: Vec<(u32, u32)>, // (position, value) to set this row
247    stack: Vec<(usize, usize, usize, usize)>,
248}
249
250thread_local! {
251    /// Reused b2j scratch — no per-pair allocation in the short-string path.
252    static B2J: RefCell<B2jScratch> = RefCell::new(B2jScratch::default());
253}
254
255/// difflib `b2j` ratio: count-sort index (offsets + positions, all buffers reused → ZERO per-pair
256/// allocation) + difflib's `find_longest_match` recursion with a reused vector `j2len`. `cb` = ASCII
257/// counts of `b` (computed by the dispatcher, reused here as the count-sort counts); `b` must be ASCII.
258/// Second exact implementation; the tests assert it equals the automaton path byte-for-byte.
259#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
260fn ratio_b2j_chars(a: &[char], b: &[char], cb: &[u32; 128]) -> f64 {
261    let total = a.len() + b.len();
262    if total == 0 {
263        return 1.0;
264    }
265    if a.is_empty() || b.is_empty() {
266        return 0.0;
267    }
268    B2J.with_borrow_mut(|s| {
269        let B2jScratch { offsets, positions, cursor, j2len, erase, affect, stack } = s;
270        // count-sort b2j: prefix-sum cb → offsets, then place each b position into its char bucket.
271        offsets.clear();
272        offsets.resize(129, 0);
273        for c in 0..128 {
274            offsets[c + 1] = offsets[c] + cb[c];
275        }
276        cursor.clear();
277        cursor.extend_from_slice(&offsets[..128]);
278        // size `positions` WITHOUT zeroing: the count-sort below writes all b.len() slots before any
279        // read (each b position lands in exactly one bucket), so a resize(_, 0) memset would be pure
280        // bandwidth waste — and under many threads that waste is what otherwise caps b2j's scaling.
281        positions.clear();
282        positions.reserve(b.len());
283        #[allow(clippy::uninit_vec)]
284        // SAFETY: u32 has no invalid bit patterns; every index 0..b.len() is written by the
285        // count-sort below (one write per b position) before find_longest_b2j reads `positions`.
286        unsafe {
287            positions.set_len(b.len());
288        }
289        for (j, &ch) in b.iter().enumerate() {
290            let c = ch as usize; // ASCII guaranteed
291            positions[cursor[c] as usize] = j as u32;
292            cursor[c] += 1;
293        }
294        j2len.clear();
295        j2len.resize(b.len() + 1, 0);
296        // at most one (key, value) per b-position per row → reserve once so `push` never reallocates.
297        erase.reserve(b.len() + 1);
298        affect.reserve(b.len() + 1);
299        stack.clear();
300        stack.push((0, a.len(), 0, b.len()));
301        let mut m = 0usize;
302        while let Some((alo, ahi, blo, bhi)) = stack.pop() {
303            let (bi, bj, bk) = find_longest_b2j(a, offsets, positions, j2len, erase, affect, alo, ahi, blo, bhi);
304            if bk > 0 {
305                m += bk;
306                if alo < bi && blo < bj {
307                    stack.push((alo, bi, blo, bj));
308                }
309                if bi + bk < ahi && bj + bk < bhi {
310                    stack.push((bi + bk, ahi, bj + bk, bhi));
311                }
312            }
313        }
314        2.0 * m as f64 / total as f64
315    })
316}
317
318/// difflib `find_longest_match` with a reused vector `j2len` (cleared via the touched-index lists) and
319/// the count-sort b2j index (char `c`'s positions = `positions[offsets[c]..offsets[c+1])`).
320///
321/// The inner loop runs `M` times (the match count), so it is the whole cost — `get_unchecked` there
322/// removes the per-iteration `j2len[j]` bounds check the optimizer can't elide (≈10% on b2j, per the
323/// disassembly). `affect`/`erase` are pre-reserved by the caller so `push` never reallocates in the
324/// loop. A plain `for i in alo..ahi` (not `take().skip()`) keeps the iterator out of the hot path.
325#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation, clippy::similar_names)]
326fn find_longest_b2j(
327    a: &[char],
328    offsets: &[u32],
329    positions: &[u32],
330    j2len: &mut [u32],
331    erase: &mut Vec<(u32, u32)>,
332    affect: &mut Vec<(u32, u32)>,
333    alo: usize,
334    ahi: usize,
335    blo: usize,
336    bhi: usize,
337) -> (usize, usize, usize) {
338    let (mut bi, mut bj, mut bk) = (alo, blo, 0usize);
339    erase.clear();
340    // SAFETY (whole loop): `i ∈ [alo, ahi) ⊆ [0, a.len())`. `c < 128 = offsets.len()-1` is guarded, so
341    // `offsets[c]`/`offsets[c+1]` are in bounds and `[lo, hi) ⊆ [0, positions.len())`. Every match
342    // position `j` satisfies `blo ≤ j < bhi ≤ b.len()`, and written keys are `j+1 ≤ b.len()`, both
343    // `< j2len.len() = b.len()+1`. `erase`/`affect` hold only such keys, so their clears are in bounds.
344    #[allow(clippy::undocumented_unsafe_blocks)]
345    unsafe {
346        for i in alo..ahi {
347            affect.clear();
348            let c = *a.get_unchecked(i) as usize;
349            if c < 128 {
350                let lo = *offsets.get_unchecked(c) as usize;
351                let hi = *offsets.get_unchecked(c + 1) as usize;
352                for &jj in positions.get_unchecked(lo..hi) {
353                    let j = jj as usize;
354                    if j < blo {
355                        continue;
356                    }
357                    if j >= bhi {
358                        break;
359                    }
360                    let k = *j2len.get_unchecked(j) as usize + 1;
361                    affect.push((j as u32 + 1, k as u32));
362                    if k > bk {
363                        bi = i + 1 - k;
364                        bj = j + 1 - k;
365                        bk = k;
366                    }
367                }
368            }
369            for &(p, _) in erase.iter() {
370                *j2len.get_unchecked_mut(p as usize) = 0;
371            }
372            for &(p, v) in affect.iter() {
373                *j2len.get_unchecked_mut(p as usize) = v;
374            }
375            std::mem::swap(erase, affect);
376        }
377        for &(p, _) in erase.iter() {
378            *j2len.get_unchecked_mut(p as usize) = 0;
379        }
380    }
381    (bi, bj, bk)
382}
383
384// ───────────────────────────── cheap exact upper-bound filters ─────────────────────────────
385
386/// difflib `real_quick_ratio`: a length-only upper bound on `ratio` (cheap skip).
387#[allow(clippy::cast_precision_loss)]
388pub(crate) fn real_quick_ratio(a: &[char], b: &[char]) -> f64 {
389    let total = a.len() + b.len();
390    if total == 0 {
391        return 1.0;
392    }
393    2.0 * (a.len().min(b.len()) as f64) / (total as f64)
394}
395
396/// Sorted `(char, count)` multiset of `a` — precomputed once per string so the `quick_ratio`
397/// upper-bound filter is a linear merge over the (small) alphabet instead of a per-pair `HashMap`.
398pub(crate) fn char_counts(a: &[char]) -> Vec<(char, u32)> {
399    let mut v = a.to_vec();
400    v.sort_unstable();
401    let mut out: Vec<(char, u32)> = Vec::new();
402    for c in v {
403        match out.last_mut() {
404            Some(last) if last.0 == c => last.1 += 1,
405            _ => out.push((c, 1)),
406        }
407    }
408    out
409}
410
411/// difflib `quick_ratio` from precomputed sorted char-counts: `2·Σ min(count_a, count_b)/(|a|+|b|)`,
412/// an exact upper bound on `ratio`. Merge of two sorted multisets — O(distinct chars), no hashing.
413#[allow(clippy::cast_precision_loss)]
414pub(crate) fn quick_ratio_counts(ca: &[(char, u32)], cb: &[(char, u32)], total: usize) -> f64 {
415    if total == 0 {
416        return 1.0;
417    }
418    let (mut x, mut y, mut matches) = (0usize, 0usize, 0u32);
419    while x < ca.len() && y < cb.len() {
420        match ca[x].0.cmp(&cb[y].0) {
421            std::cmp::Ordering::Less => x += 1,
422            std::cmp::Ordering::Greater => y += 1,
423            std::cmp::Ordering::Equal => {
424                matches += ca[x].1.min(cb[y].1);
425                x += 1;
426                y += 1;
427            }
428        }
429    }
430    2.0 * f64::from(matches) / total as f64
431}
432
433fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
434    while parent[x] != x {
435        parent[x] = parent[parent[x]];
436        x = parent[x];
437    }
438    x
439}
440
441// Env-gated diagnostics: set DIFFLIB_FAST_PROGRESS=1 to stream phase timings + progress to stderr
442// from inside the Rust hot path (off in production — zero output, ~zero cost).
443fn progress_on() -> bool {
444    std::env::var_os("DIFFLIB_FAST_PROGRESS").is_some()
445}
446
447// ───────────────────────────────────── exact clustering ─────────────────────────────────────
448
449/// Qualifying pairs `(i<j, ratio)` with `ratio >= threshold`, in parallel. The exact upper-bound
450/// early-exits (`real_quick_ratio`/`quick_ratio`) skip most pairs without the full O(len²) RO;
451/// survivors go through `gestalt_edge` — reject early-exit for non-edges, exact ratio for edges. The
452/// edge ratio is kept so `min_sim` reuses it (a dense cluster's intra pairs are ~all edges, so the
453/// `min_sim` pass recomputes almost nothing — removing the redundant second scan over the same pairs).
454#[allow(clippy::cast_precision_loss, clippy::many_single_char_names)]
455fn qualifying_pairs(chars: &[Vec<char>], sams: &[gestalt::Sam], threshold: f64) -> Vec<(usize, usize, f64)> {
456    use std::sync::atomic::{AtomicUsize, Ordering};
457
458    let n = chars.len();
459    let rows = AtomicUsize::new(0);
460    std::thread::scope(|scope| {
461        if progress_on() {
462            let rows = &rows;
463            scope.spawn(move || {
464                while rows.load(Ordering::Relaxed) < n {
465                    std::thread::sleep(std::time::Duration::from_secs(1));
466                    let done = rows.load(Ordering::Relaxed);
467                    eprintln!("    [difflib-fast] qualifying_pairs: row {done}/{n} ({:.0}%)", done as f64 / n as f64 * 100.0);
468                }
469            });
470        }
471        // Length blocking: ratio>=T ⟹ |short|/|long| >= T/(2-T), so in length-sorted order each
472        // string only reaches a contiguous run of (not-too-much-longer) strings — break the inner
473        // loop as soon as `real_quick_ratio` drops below T. Turns the O(n²) enumeration into
474        // O(n·window); exact (never drops a qualifying pair). `counts` feeds the cheap quick filter.
475        let mut order: Vec<usize> = (0..n).collect();
476        order.sort_by_key(|&i| chars[i].len());
477        let counts: Vec<Vec<(char, u32)>> = chars.par_iter().map(|c| char_counts(c)).collect();
478        let pairs = (0..n)
479            .into_par_iter()
480            .flat_map_iter(|p| {
481                let i = order[p];
482                let a = &chars[i];
483                let mut local: Vec<(usize, usize, f64)> = Vec::new();
484                for &j in &order[p + 1..] {
485                    let b = &chars[j];
486                    if real_quick_ratio(a, b) < threshold {
487                        break; // lengths only grow ⇒ all remaining partners also fail the bound
488                    }
489                    if quick_ratio_counts(&counts[i], &counts[j], a.len() + b.len()) < threshold {
490                        continue;
491                    }
492                    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
493                    // Reject early-exit for non-edges; exact ratio (cached for min_sim) for edges.
494                    if let Some(r) = gestalt::gestalt_edge(&chars[lo], &chars[hi], &sams[hi], threshold) {
495                        local.push((lo, hi, r));
496                    }
497                }
498                rows.fetch_add(1, Ordering::Relaxed);
499                local
500            })
501            .collect();
502        rows.store(n, Ordering::Relaxed); // unblock the progress thread
503        pairs
504    })
505}
506
507/// Min intra-cluster pairwise ratio (single-linkage's conservative figure), exact. Edge pairs are
508/// already in `ratios` (cached from the qualifying pass) — for a dense cluster that is ~every intra
509/// pair, so almost nothing is recomputed. The rare missing (non-edge, ratio < threshold) pair is
510/// computed with the pruned `gestalt_ratio_capped` (accept-exits any pair above the running min).
511/// Parallel over members; each task's cap is its own running min (>= the global min ⇒ exact min kept).
512fn cluster_min_sim(members: &[usize], chars: &[Vec<char>], sams: &[gestalt::Sam], ratios: &HashMap<(usize, usize), f64>) -> f64 {
513    members
514        .par_iter()
515        .enumerate()
516        .map(|(pos, &i)| {
517            let mut local = 1.0_f64;
518            for &j in &members[pos + 1..] {
519                let key = if i < j { (i, j) } else { (j, i) };
520                let r = match ratios.get(&key) {
521                    Some(&r) => r, // edge ratio cached by the qualifying pass — no recompute
522                    None => gestalt::gestalt_ratio_capped(&chars[key.0], &chars[key.1], &sams[key.1], local),
523                };
524                local = local.min(r);
525            }
526            local
527        })
528        .reduce(|| 1.0_f64, f64::min)
529}
530
531/// Union-find over qualifying edge pairs → clusters (size >= 2), each with its exact min intra-pair
532/// ratio. The qualifying pass's edge ratios are cached in `ratios` and reused by `cluster_min_sim`.
533pub(crate) fn assemble(n: usize, pairs: Vec<(usize, usize, f64)>, chars: &[Vec<char>], sams: &[gestalt::Sam]) -> Vec<(Vec<usize>, f64)> {
534    let mut parent: Vec<usize> = (0..n).collect();
535    let mut ratios: HashMap<(usize, usize), f64> = HashMap::with_capacity(pairs.len());
536    for (i, j, r) in pairs {
537        ratios.insert((i, j), r);
538        let (ri, rj) = (uf_find(&mut parent, i), uf_find(&mut parent, j));
539        if ri != rj {
540            parent[ri] = rj;
541        }
542    }
543    let mut comps: HashMap<usize, Vec<usize>> = HashMap::new();
544    for i in 0..n {
545        let root = uf_find(&mut parent, i);
546        comps.entry(root).or_default().push(i);
547    }
548    let mut out: Vec<(Vec<usize>, f64)> = Vec::new();
549    for members in comps.values() {
550        if members.len() < 2 {
551            continue;
552        }
553        let min_sim = cluster_min_sim(members, chars, sams, &ratios);
554        let mut sorted = members.clone();
555        sorted.sort_unstable();
556        out.push((sorted, min_sim));
557    }
558    out.sort_by(|a, b| a.0[0].cmp(&b.0[0]));
559    out
560}
561
562/// Exact single-linkage clustering over pre-collected `char` vectors: returns each cluster (member
563/// indices, sorted) with its exact minimum pairwise ratio. O(n²) early-exit join, rayon-parallel.
564#[must_use]
565#[doc(hidden)] // low-level Vec<char> entry — used by the bench bin + `Rationer`; prefer `cluster_canonicals`.
566pub fn cluster_canonicals_chars(chars: &[Vec<char>], threshold: f64) -> Vec<(Vec<usize>, f64)> {
567    let n = chars.len();
568    // Prebuild each string's suffix automaton ONCE (n builds), reused as the b-side for all n²
569    // pairs — the all-pairs cost becomes n builds + n² scans, not n² builds.
570    let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
571    let pairs = qualifying_pairs(chars, &sams, threshold);
572    assemble(n, pairs, chars, &sams)
573}
574
575/// `cluster_canonicals(canonicals, threshold)` → `[(member indices, min pairwise ratio)]`.
576///
577/// Exact single-linkage clustering: `ratio >= threshold` joins two strings; each returned cluster
578/// (size >= 2) carries its exact minimum intra-cluster ratio. Bit-identical to the reference
579/// pairwise clustering — just far faster (suffix automaton + early-exit + rayon).
580#[must_use]
581pub fn cluster_canonicals(canonicals: &[String], threshold: f64) -> Vec<(Vec<usize>, f64)> {
582    let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
583    cluster_canonicals_chars(&chars, threshold)
584}
585
586// ─────────────────────────────── scalable MinHash-LSH variant ───────────────────────────────
587
588const SHINGLE_K: usize = 9; // char-k-gram length for MinHash shingles (calibrated on real code)
589
590fn fnv1a_bytes(data: &[u8]) -> u64 {
591    let mut h = 0xcbf2_9ce4_8422_2325u64;
592    for &b in data {
593        h ^= u64::from(b);
594        h = h.wrapping_mul(0x0000_0100_0000_01b3);
595    }
596    h
597}
598
599fn fnv1a_u64s(values: &[u64]) -> u64 {
600    let mut h = 0xcbf2_9ce4_8422_2325u64;
601    for &v in values {
602        h ^= v;
603        h = h.wrapping_mul(0x0000_0100_0000_01b3);
604    }
605    h
606}
607
608/// Distinct char-k-gram shingle hashes of `s` (the set `MinHash` estimates Jaccard over).
609fn shingle_hashes(s: &str) -> Vec<u64> {
610    let bytes = s.as_bytes();
611    if bytes.len() <= SHINGLE_K {
612        return vec![fnv1a_bytes(bytes)];
613    }
614    let mut set: HashSet<u64> = HashSet::new();
615    for window in bytes.windows(SHINGLE_K) {
616        set.insert(fnv1a_bytes(window));
617    }
618    set.into_iter().collect()
619}
620
621/// `num` deterministic `(a, b)` hash permutations (fixed seed → reproducible signatures).
622fn make_perms(num: usize) -> Vec<(u64, u64)> {
623    let mut state = 0x9e37_79b9_7f4a_7c15u64;
624    let mut next = move || {
625        state ^= state << 13;
626        state ^= state >> 7;
627        state ^= state << 17;
628        state
629    };
630    (0..num).map(|_| (next() | 1, next())).collect()
631}
632
633fn minhash(shingles: &[u64], perms: &[(u64, u64)]) -> Vec<u64> {
634    perms
635        .iter()
636        .map(|&(a, b)| shingles.iter().map(|&h| a.wrapping_mul(h).wrapping_add(b)).min().unwrap_or(u64::MAX))
637        .collect()
638}
639
640/// LSH candidate pairs: documents that share a full band signature in any band (an O(n)-ish proxy
641/// for "Jaccard above the band threshold" — recall tuned via `band_rows`).
642fn lsh_candidates(sigs: &[Vec<u64>], band_rows: usize) -> HashSet<(usize, usize)> {
643    let bands = sigs.first().map_or(0, Vec::len).checked_div(band_rows).unwrap_or(0);
644    let mut candidates: HashSet<(usize, usize)> = HashSet::new();
645    for band in 0..bands {
646        let lo = band * band_rows;
647        let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
648        for (d, sig) in sigs.iter().enumerate() {
649            buckets.entry(fnv1a_u64s(&sig[lo..lo + band_rows])).or_default().push(d);
650        }
651        for docs in buckets.values() {
652            for a in 0..docs.len() {
653                for b in (a + 1)..docs.len() {
654                    candidates.insert((docs[a].min(docs[b]), docs[a].max(docs[b])));
655                }
656            }
657        }
658    }
659    candidates
660}
661
662/// `cluster_canonicals_lsh(canonicals, threshold, num_perm, band_rows)`: the scalable path.
663///
664/// `MinHash`-LSH generates candidate pairs in ~O(n) (skipping the O(n²) dissimilar pairs); each
665/// candidate is then **verified with the exact ratio**, so clusters + `min_sim` match the exact path
666/// (modulo LSH recall, tuned high via `band_rows`). Filter-verification, in the `BayesLSH`-Lite /
667/// `SourcererCC` lineage. Use past the O(n²) wall (>100k strings); for exact recall use
668/// [`cluster_canonicals`].
669#[must_use]
670pub fn cluster_canonicals_lsh(canonicals: &[String], threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
671    let debug = progress_on();
672    let start = std::time::Instant::now();
673    let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
674    let n = chars.len();
675    let perms = make_perms(num_perm);
676    let sigs: Vec<Vec<u64>> = canonicals.par_iter().map(|s| minhash(&shingle_hashes(s), &perms)).collect();
677    if debug {
678        eprintln!("    [difflib-fast] lsh: {n} signatures in {:.2}s", start.elapsed().as_secs_f64());
679    }
680    let candidates = lsh_candidates(&sigs, band_rows);
681    if debug {
682        eprintln!("    [difflib-fast] lsh: {} candidate pairs in {:.2}s", candidates.len(), start.elapsed().as_secs_f64());
683    }
684    let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
685    let cand: Vec<(usize, usize)> = candidates.into_iter().collect();
686    let pairs: Vec<(usize, usize, f64)> = cand
687        .par_iter()
688        .filter_map(|&(i, j)| {
689            let (a, b) = if i < j { (i, j) } else { (j, i) };
690            gestalt::gestalt_edge(&chars[a], &chars[b], &sams[b], threshold).map(|r| (a, b, r))
691        })
692        .collect();
693    if debug {
694        eprintln!("    [difflib-fast] lsh: {} verified pairs in {:.2}s", pairs.len(), start.elapsed().as_secs_f64());
695    }
696    assemble(n, pairs, &chars, &sams)
697}
698
699// ───────────────────────────────── optional Python bindings ─────────────────────────────────
700
701#[cfg(feature = "python")]
702mod python {
703    use pyo3::prelude::*;
704
705    /// Run `f` on a rayon pool of `threads` workers; `threads == 0` uses the global pool (all cores,
706    /// itself tunable process-wide via `RAYON_NUM_THREADS`). A bad pool build falls back to global.
707    fn run_on_threads<T: Send>(threads: usize, f: impl FnOnce() -> T + Send) -> T {
708        if threads == 0 {
709            return f();
710        }
711        match rayon::ThreadPoolBuilder::new().num_threads(threads).build() {
712            Ok(pool) => pool.install(f),
713            Err(_) => f(),
714        }
715    }
716
717    /// Parse a `"cpu" | "gpu" | "gpu+cpu"` backend string into a [`Concurrency`](super::Concurrency).
718    fn parse_concurrency(s: &str) -> PyResult<super::Concurrency> {
719        use pyo3::exceptions::PyValueError;
720        match s.to_ascii_lowercase().as_str() {
721            "cpu" => Ok(super::Concurrency::Cpu),
722            "gpu" => Ok(super::Concurrency::Gpu),
723            "gpu+cpu" | "gpucpu" | "gpu_cpu" => Ok(super::Concurrency::GpuPlusCpu),
724            other => Err(PyValueError::new_err(format!(
725                "unknown concurrency {other:?}; expected \"cpu\", \"gpu\", or \"gpu+cpu\""
726            ))),
727        }
728    }
729
730    /// `ratio(a, b)` — fast exact `difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()`.
731    ///
732    /// Releases the GIL for the compute (the inputs are copied to owned `String`s first), so calling
733    /// this from many Python threads actually scales across cores instead of serializing on the GIL.
734    /// Backs the scalar form of the public `ratio`; for a batch the package routes to `ratio_many`.
735    #[pyfunction]
736    fn ratio(py: Python<'_>, a: &str, b: &str) -> f64 {
737        let (a, b) = (a.to_owned(), b.to_owned());
738        py.detach(|| super::ratio(&a, &b))
739    }
740
741    /// `ratio_many(pairs, threads=0)` → one exact ratio per `(a, b)` pair, **computed across cores
742    /// inside Rust** (rayon, GIL released). Backs the list form of the public `ratio` — the
743    /// contention-free batch path, no `ThreadPoolExecutor`. `threads=0` = all cores; `threads=N` caps
744    /// it to N for this call.
745    #[pyfunction]
746    #[pyo3(signature = (pairs, threads=0))]
747    #[allow(clippy::needless_pass_by_value)]
748    fn ratio_many(py: Python<'_>, pairs: Vec<(String, String)>, threads: usize) -> Vec<f64> {
749        py.detach(|| run_on_threads(threads, || super::ratio_many(&pairs)))
750    }
751
752    /// `cluster_canonicals(canonicals, threshold, threads=0)` → `[(member indices, min pairwise
753    /// ratio)]`.
754    ///
755    /// Fans the all-pairs join out across cores internally (rayon) — one call, full multicore, no
756    /// Python threads needed. The GIL is released during the compute, so it never blocks the rest of
757    /// your program. `threads=0` = all cores; `threads=N` caps it to N for this call.
758    #[pyfunction]
759    #[pyo3(signature = (canonicals, threshold, threads=0))]
760    #[allow(clippy::needless_pass_by_value)]
761    fn cluster_canonicals(py: Python<'_>, canonicals: Vec<String>, threshold: f64, threads: usize) -> Vec<(Vec<usize>, f64)> {
762        py.detach(|| run_on_threads(threads, || super::cluster_canonicals(&canonicals, threshold)))
763    }
764
765    /// `cluster_canonicals_lsh(canonicals, threshold, num_perm, band_rows, threads=0)` — scalable LSH
766    /// variant. `threads=0` = all cores; `threads=N` caps it to N for this call.
767    #[pyfunction]
768    #[pyo3(signature = (canonicals, threshold, num_perm, band_rows, threads=0))]
769    #[allow(clippy::needless_pass_by_value)]
770    fn cluster_canonicals_lsh(py: Python<'_>, canonicals: Vec<String>, threshold: f64, num_perm: usize, band_rows: usize, threads: usize) -> Vec<(Vec<usize>, f64)> {
771        py.detach(|| run_on_threads(threads, || super::cluster_canonicals_lsh(&canonicals, threshold, num_perm, band_rows)))
772    }
773
774    /// `Rationer(concurrency="gpu+cpu", threads=0, delta=0.0)` — stateful handle that owns the
775    /// long-lived backend resources (on macOS+`gpu`: Metal device + power-boost assertion) once and
776    /// reuses them across calls. The free functions rebuild per-call state each time; a `Rationer`
777    /// pays it once.
778    ///
779    /// `concurrency` ∈ `"cpu" | "gpu" | "gpu+cpu"`. The GPU is only engaged where it measured a net
780    /// win — `cluster_canonicals` on a single large group (~1.1–1.4× on Apple Silicon). `ratio` /
781    /// `ratio_many` stay on CPU. On a wheel built without the `gpu` feature (the default Linux/Windows
782    /// wheels), or with no Metal device, every call quietly runs on CPU with identical output.
783    #[pyclass(name = "Rationer")]
784    struct PyRationer {
785        inner: super::Rationer,
786    }
787
788    #[pymethods]
789    impl PyRationer {
790        #[new]
791        #[pyo3(signature = (concurrency="gpu+cpu", threads=0, delta=0.0))]
792        fn new(concurrency: &str, threads: usize, delta: f64) -> PyResult<Self> {
793            let c = parse_concurrency(concurrency)?;
794            let mut b = super::Rationer::builder().concurrency(c).delta(delta);
795            if threads > 0 {
796                b = b.threads(threads);
797            }
798            Ok(Self { inner: b.build() })
799        }
800
801        /// The active backend after construction-time fallback: `"cpu"`, `"gpu"`, or `"gpu+cpu"`.
802        /// A handle requested as `"gpu"` on a non-Metal build/host reports `"cpu"` here.
803        #[getter]
804        fn concurrency(&self) -> &'static str {
805            match self.inner.concurrency() {
806                super::Concurrency::Cpu => "cpu",
807                super::Concurrency::Gpu => "gpu",
808                super::Concurrency::GpuPlusCpu => "gpu+cpu",
809            }
810        }
811
812        /// Active approximate-RO `delta` (0.0 = exact).
813        #[getter]
814        fn delta(&self) -> f64 {
815            self.inner.delta()
816        }
817
818        /// Single-pair exact ratio (always CPU; one pair offers no GPU win).
819        fn ratio(&self, py: Python<'_>, a: &str, b: &str) -> f64 {
820            let (a, b) = (a.to_owned(), b.to_owned());
821            py.detach(|| self.inner.ratio(&a, &b))
822        }
823
824        /// Batched exact ratio over `(a, b)` pairs (CPU rayon; GIL released).
825        #[allow(clippy::needless_pass_by_value)]
826        fn ratio_many(&self, py: Python<'_>, pairs: Vec<(String, String)>) -> Vec<f64> {
827            py.detach(|| self.inner.ratio_many(&pairs))
828        }
829
830        /// Exact single-linkage clustering at `threshold`. Routes through the GPU on macOS+`gpu`
831        /// when the group is large enough to amortize dispatch; otherwise CPU. Same output as the
832        /// free `cluster_canonicals`.
833        #[allow(clippy::needless_pass_by_value)]
834        fn cluster_canonicals(&self, py: Python<'_>, canonicals: Vec<String>, threshold: f64) -> Vec<(Vec<usize>, f64)> {
835            py.detach(|| self.inner.cluster_canonicals(&canonicals, threshold))
836        }
837    }
838
839    /// `cosine_join(docs, threshold, concurrency="cpu", threads=0)` — exact all-pairs weighted-cosine
840    /// similarity join over **token documents**. Each `doc` is a list of string tokens (e.g. a
841    /// function's canonical lines); they're turned into TF-IDF sparse vectors in Rust and every pair
842    /// with cosine `>= threshold` is returned as `(j, i, cos)` with `j < i`.
843    ///
844    /// `concurrency` ∈ `"cpu" | "gpu" | "gpu+cpu"`: `"cpu"` is exact f64 everywhere; `"gpu+cpu"` is the
845    /// exact f64 GPU-accelerated hybrid (byte-identical to `"cpu"`); `"gpu"` runs the dot on the GPU in
846    /// f32 (fastest; differs from exact only on pairs within ~1e-6 of the threshold). On a wheel built
847    /// without the `gpu` feature, or with no Metal device, the GPU modes quietly fall back to CPU. The
848    /// GIL is released for the whole build+join. For repeated joins on one corpus, use `CosineJoiner`.
849    #[pyfunction]
850    #[pyo3(signature = (docs, threshold, concurrency="cpu", threads=0))]
851    #[allow(clippy::needless_pass_by_value)]
852    fn cosine_join(
853        py: Python<'_>,
854        docs: Vec<Vec<String>>,
855        threshold: f64,
856        concurrency: &str,
857        threads: usize,
858    ) -> PyResult<Vec<(usize, usize, f64)>> {
859        let mode = parse_concurrency(concurrency)?;
860        Ok(py.detach(|| {
861            run_on_threads(threads, || {
862                let corpus = super::simjoin::Corpus::from_token_docs(&docs);
863                super::simjoin::cosine_join_with(&corpus, threshold, mode)
864            })
865        }))
866    }
867
868    /// `CosineJoiner(docs)` — stateful similarity-join handle that builds the TF-IDF corpus and (on a
869    /// macOS `gpu` wheel) acquires the Metal device + uploads the corpus **once**, then answers
870    /// repeated `join(threshold, concurrency=...)` calls reusing them. The free `cosine_join` rebuilds
871    /// everything per call; a `CosineJoiner` pays it once — use it to sweep thresholds.
872    #[pyclass(name = "CosineJoiner")]
873    struct PyCosineJoiner {
874        inner: super::simjoin::CosineJoiner,
875    }
876
877    #[pymethods]
878    impl PyCosineJoiner {
879        #[new]
880        #[allow(clippy::needless_pass_by_value)]
881        fn new(py: Python<'_>, docs: Vec<Vec<String>>) -> Self {
882            let inner = py.detach(|| {
883                super::simjoin::CosineJoiner::new(super::simjoin::Corpus::from_token_docs(&docs))
884            });
885            Self { inner }
886        }
887
888        /// Number of documents in the corpus.
889        fn __len__(&self) -> usize {
890            self.inner.corpus().len()
891        }
892
893        /// Whether a Metal GPU backend was acquired (always `False` off a macOS `gpu` wheel). When
894        /// `False`, every `join` runs on CPU regardless of the `concurrency` argument.
895        #[getter]
896        fn has_gpu(&self) -> bool {
897            self.inner.has_gpu()
898        }
899
900        /// Join at `threshold` under `concurrency` (`"cpu" | "gpu" | "gpu+cpu"`), reusing the handle's
901        /// resources. Returns `(j, i, cos)` pairs with `j < i`. GIL released for the compute.
902        #[pyo3(signature = (threshold, concurrency="cpu", threads=0))]
903        fn join(
904            &self,
905            py: Python<'_>,
906            threshold: f64,
907            concurrency: &str,
908            threads: usize,
909        ) -> PyResult<Vec<(usize, usize, f64)>> {
910            let mode = parse_concurrency(concurrency)?;
911            Ok(py.detach(|| run_on_threads(threads, || self.inner.join(threshold, mode))))
912        }
913    }
914
915    /// Compiled core of the `difflib_fast` Python package (re-exported by `difflib_fast/__init__.py`).
916    #[pymodule]
917    fn _difflib_fast(m: &Bound<'_, PyModule>) -> PyResult<()> {
918        m.add_function(wrap_pyfunction!(ratio, m)?)?;
919        m.add_function(wrap_pyfunction!(ratio_many, m)?)?;
920        m.add_function(wrap_pyfunction!(cluster_canonicals, m)?)?;
921        m.add_function(wrap_pyfunction!(cluster_canonicals_lsh, m)?)?;
922        m.add_function(wrap_pyfunction!(cosine_join, m)?)?;
923        m.add_class::<PyRationer>()?;
924        m.add_class::<PyCosineJoiner>()?;
925        Ok(())
926    }
927}
928
929#[cfg(test)]
930mod tests {
931    #![allow(clippy::float_cmp, clippy::unreadable_literal)]
932    use super::{cluster_canonicals, gestalt_ratio, ratio, ratio_b2j, ratio_reference};
933
934    #[test]
935    fn matches_known_difflib_values() {
936        // Cross-checked against difflib.SequenceMatcher(None, a, b, autojunk=False).ratio().
937        assert_eq!(gestalt_ratio("", ""), 1.0);
938        assert_eq!(gestalt_ratio("", "x"), 0.0);
939        assert_eq!(gestalt_ratio("abc", "abc"), 1.0);
940        assert_eq!(gestalt_ratio("abc", "abd"), 0.6666666666666666);
941        assert_eq!(gestalt_ratio("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
942        assert_eq!(gestalt_ratio("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
943    }
944
945    // The fast suffix-automaton path must equal the structurally-distinct b2j reference exactly.
946    #[test]
947    fn fast_matches_reference() {
948        let mut s: u64 = 0x1234_5678_9abc_def1;
949        let mut next = || {
950            s ^= s << 13;
951            s ^= s >> 7;
952            s ^= s << 17;
953            s
954        };
955        for _ in 0..2000 {
956            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
957                (0..n).map(|_| char::from(b'a' + (rng() % 5) as u8)).collect()
958            };
959            let (la, lb) = ((next() % 50) as usize, (next() % 50) as usize);
960            let a = mk(la, &mut next);
961            let b = mk(lb, &mut next);
962            let r = ratio_reference(&a, &b);
963            assert_eq!(gestalt_ratio(&a, &b), r, "SAM a={a:?} b={b:?}");
964            assert_eq!(ratio_b2j(&a, &b), r, "b2j a={a:?} b={b:?}");
965        }
966    }
967
968    // Both dispatch branches (b2j for short, SAM for long) must agree with the reference on long,
969    // repetitive strings that cross B2J_CROSSOVER.
970    #[test]
971    fn long_strings_all_paths_agree() {
972        let mut s: u64 = 0xdead_beef_cafe_1234;
973        let mut next = || {
974            s ^= s << 13;
975            s ^= s >> 7;
976            s ^= s << 17;
977            s
978        };
979        for _ in 0..40 {
980            let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
981                (0..n).map(|_| char::from(b'a' + (rng() % 6) as u8)).collect()
982            };
983            let a = mk(1400 + (next() % 600) as usize, &mut next); // > B2J_CROSSOVER ⇒ SAM branch
984            let b = mk(1400 + (next() % 600) as usize, &mut next);
985            let r = ratio_reference(&a, &b);
986            assert_eq!(gestalt_ratio(&a, &b), r);
987            assert_eq!(ratio_b2j(&a, &b), r);
988            assert_eq!(ratio(&a, &b), r); // dispatched (SAM here)
989        }
990    }
991
992    #[test]
993    fn clusters_obvious_duplicates() {
994        let corpus: Vec<String> = vec![
995            "def add(a, b): return a + b".into(),
996            "def add(x, y): return x + y".into(),
997            "completely unrelated text here".into(),
998        ];
999        let clusters = cluster_canonicals(&corpus, 0.5);
1000        assert_eq!(clusters.len(), 1, "the two add() variants should cluster");
1001        assert_eq!(clusters[0].0, vec![0, 1]);
1002        assert!(clusters[0].1 >= 0.5);
1003    }
1004}