1use std::cell::RefCell;
24use std::collections::{HashMap, HashSet};
25
26use rayon::prelude::*;
27
28pub mod gestalt;
29pub use gestalt::gestalt_ratio;
30
31#[cfg(all(feature = "gpu", target_os = "macos"))]
35pub mod gpu;
36
37pub mod rationer;
40pub use rationer::{Concurrency, PreparedRationer, Rationer, RationerBuilder};
41
42const B2J_WORK_FACTOR: u64 = 34;
47
48#[must_use]
55pub fn ratio(a: &str, b: &str) -> f64 {
56 let av: Vec<char> = a.chars().collect();
57 let bv: Vec<char> = b.chars().collect();
58 ratio_chars(&av, &bv)
59}
60
61fn ascii_counts(s: &[char]) -> ([u32; 128], u32) {
63 let mut c = [0u32; 128];
64 let mut other = 0u32;
65 for &ch in s {
66 let u = ch as u32;
67 if u < 128 {
68 c[u as usize] += 1;
69 } else {
70 other += 1;
71 }
72 }
73 (c, other)
74}
75
76fn work_factor() -> u64 {
77 use std::sync::OnceLock;
78 static F: OnceLock<u64> = OnceLock::new();
79 *F.get_or_init(|| std::env::var("DF_WORK_FACTOR").ok().and_then(|s| s.parse().ok()).unwrap_or(B2J_WORK_FACTOR))
80}
81
82#[allow(clippy::cast_precision_loss)]
83#[must_use]
84fn ratio_chars(a: &[char], b: &[char]) -> f64 {
85 let total = a.len() + b.len();
86 if total == 0 {
87 return 1.0;
88 }
89 let (ca, oa) = ascii_counts(a);
90 let (cb, ob) = ascii_counts(b);
91 if oa > 0 || ob > 0 {
94 return gestalt::gestalt_ratio_chars(a, b);
95 }
96 let mut w = 0u64;
102 for i in 0..128 {
103 w += u64::from(ca[i]) * u64::from(cb[i]);
104 }
105 if w <= work_factor() * total as u64 {
106 ratio_b2j_chars(a, b, &cb)
107 } else {
108 gestalt::gestalt_ratio_chars(a, b)
109 }
110}
111
112#[cfg(test)]
115#[must_use]
116fn ratio_b2j(a: &str, b: &str) -> f64 {
117 let av: Vec<char> = a.chars().collect();
118 let bv: Vec<char> = b.chars().collect();
119 let (cb, ob) = ascii_counts(&bv);
120 if ob > 0 {
121 return gestalt::gestalt_ratio_chars(&av, &bv); }
123 ratio_b2j_chars(&av, &bv, &cb)
124}
125
126#[must_use]
133pub fn ratio_many(pairs: &[(String, String)]) -> Vec<f64> {
134 pairs.par_iter().map(|(a, b)| ratio(a, b)).collect()
135}
136
137#[cfg(test)]
145fn build_b2j(b: &[char]) -> HashMap<char, Vec<usize>> {
146 let mut b2j: HashMap<char, Vec<usize>> = HashMap::new();
147 for (j, &c) in b.iter().enumerate() {
148 b2j.entry(c).or_default().push(j);
149 }
150 b2j
151}
152
153#[cfg(test)]
155#[allow(clippy::similar_names)]
156fn find_longest(a: &[char], b2j: &HashMap<char, Vec<usize>>, alo: usize, ahi: usize, blo: usize, bhi: usize) -> (usize, usize, usize) {
157 let mut besti = alo;
158 let mut bestj = blo;
159 let mut bestsize = 0usize;
160 let mut j2_prev: HashMap<usize, usize> = HashMap::new();
161 for (i, ch) in a.iter().enumerate().take(ahi).skip(alo) {
162 let mut j2_cur: HashMap<usize, usize> = HashMap::new();
163 if let Some(positions) = b2j.get(ch) {
164 for &j in positions {
165 if j < blo {
166 continue;
167 }
168 if j >= bhi {
169 break;
170 }
171 let prev = if j > blo { *j2_prev.get(&(j - 1)).unwrap_or(&0) } else { 0 };
172 let k = prev + 1;
173 j2_cur.insert(j, k);
174 if k > bestsize {
175 besti = i + 1 - k;
176 bestj = j + 1 - k;
177 bestsize = k;
178 }
179 }
180 }
181 j2_prev = j2_cur;
182 }
183 (besti, bestj, bestsize)
184}
185
186#[cfg(test)]
188#[allow(clippy::many_single_char_names)]
189fn matching_count(a: &[char], b: &[char], b2j: &HashMap<char, Vec<usize>>) -> usize {
190 let mut total = 0usize;
191 let mut stack: Vec<(usize, usize, usize, usize)> = vec![(0, a.len(), 0, b.len())];
192 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
193 let (i, j, k) = find_longest(a, b2j, alo, ahi, blo, bhi);
194 if k > 0 {
195 total += k;
196 if alo < i && blo < j {
197 stack.push((alo, i, blo, j));
198 }
199 if i + k < ahi && j + k < bhi {
200 stack.push((i + k, ahi, j + k, bhi));
201 }
202 }
203 }
204 total
205}
206
207#[cfg(test)]
210#[allow(clippy::cast_precision_loss)]
211#[must_use]
212fn ratio_reference(a: &str, b: &str) -> f64 {
213 let av: Vec<char> = a.chars().collect();
214 let bv: Vec<char> = b.chars().collect();
215 let total = av.len() + bv.len();
216 if total == 0 {
217 return 1.0;
218 }
219 let b2j = build_b2j(&bv);
220 2.0 * (matching_count(&av, &bv, &b2j) as f64) / (total as f64)
221}
222
223#[derive(Default)]
232struct B2jScratch {
233 offsets: Vec<u32>, positions: Vec<u32>, cursor: Vec<u32>, j2len: Vec<u32>,
237 erase: Vec<(u32, u32)>, affect: Vec<(u32, u32)>, stack: Vec<(usize, usize, usize, usize)>,
240}
241
242thread_local! {
243 static B2J: RefCell<B2jScratch> = RefCell::new(B2jScratch::default());
245}
246
247#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
252fn ratio_b2j_chars(a: &[char], b: &[char], cb: &[u32; 128]) -> f64 {
253 let total = a.len() + b.len();
254 if total == 0 {
255 return 1.0;
256 }
257 if a.is_empty() || b.is_empty() {
258 return 0.0;
259 }
260 B2J.with_borrow_mut(|s| {
261 let B2jScratch { offsets, positions, cursor, j2len, erase, affect, stack } = s;
262 offsets.clear();
264 offsets.resize(129, 0);
265 for c in 0..128 {
266 offsets[c + 1] = offsets[c] + cb[c];
267 }
268 cursor.clear();
269 cursor.extend_from_slice(&offsets[..128]);
270 positions.clear();
274 positions.reserve(b.len());
275 #[allow(clippy::uninit_vec)]
276 unsafe {
279 positions.set_len(b.len());
280 }
281 for (j, &ch) in b.iter().enumerate() {
282 let c = ch as usize; positions[cursor[c] as usize] = j as u32;
284 cursor[c] += 1;
285 }
286 j2len.clear();
287 j2len.resize(b.len() + 1, 0);
288 erase.reserve(b.len() + 1);
290 affect.reserve(b.len() + 1);
291 stack.clear();
292 stack.push((0, a.len(), 0, b.len()));
293 let mut m = 0usize;
294 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
295 let (bi, bj, bk) = find_longest_b2j(a, offsets, positions, j2len, erase, affect, alo, ahi, blo, bhi);
296 if bk > 0 {
297 m += bk;
298 if alo < bi && blo < bj {
299 stack.push((alo, bi, blo, bj));
300 }
301 if bi + bk < ahi && bj + bk < bhi {
302 stack.push((bi + bk, ahi, bj + bk, bhi));
303 }
304 }
305 }
306 2.0 * m as f64 / total as f64
307 })
308}
309
310#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation, clippy::similar_names)]
318fn find_longest_b2j(
319 a: &[char],
320 offsets: &[u32],
321 positions: &[u32],
322 j2len: &mut [u32],
323 erase: &mut Vec<(u32, u32)>,
324 affect: &mut Vec<(u32, u32)>,
325 alo: usize,
326 ahi: usize,
327 blo: usize,
328 bhi: usize,
329) -> (usize, usize, usize) {
330 let (mut bi, mut bj, mut bk) = (alo, blo, 0usize);
331 erase.clear();
332 #[allow(clippy::undocumented_unsafe_blocks)]
337 unsafe {
338 for i in alo..ahi {
339 affect.clear();
340 let c = *a.get_unchecked(i) as usize;
341 if c < 128 {
342 let lo = *offsets.get_unchecked(c) as usize;
343 let hi = *offsets.get_unchecked(c + 1) as usize;
344 for &jj in positions.get_unchecked(lo..hi) {
345 let j = jj as usize;
346 if j < blo {
347 continue;
348 }
349 if j >= bhi {
350 break;
351 }
352 let k = *j2len.get_unchecked(j) as usize + 1;
353 affect.push((j as u32 + 1, k as u32));
354 if k > bk {
355 bi = i + 1 - k;
356 bj = j + 1 - k;
357 bk = k;
358 }
359 }
360 }
361 for &(p, _) in erase.iter() {
362 *j2len.get_unchecked_mut(p as usize) = 0;
363 }
364 for &(p, v) in affect.iter() {
365 *j2len.get_unchecked_mut(p as usize) = v;
366 }
367 std::mem::swap(erase, affect);
368 }
369 for &(p, _) in erase.iter() {
370 *j2len.get_unchecked_mut(p as usize) = 0;
371 }
372 }
373 (bi, bj, bk)
374}
375
376#[allow(clippy::cast_precision_loss)]
380pub(crate) fn real_quick_ratio(a: &[char], b: &[char]) -> f64 {
381 let total = a.len() + b.len();
382 if total == 0 {
383 return 1.0;
384 }
385 2.0 * (a.len().min(b.len()) as f64) / (total as f64)
386}
387
388pub(crate) fn char_counts(a: &[char]) -> Vec<(char, u32)> {
391 let mut v = a.to_vec();
392 v.sort_unstable();
393 let mut out: Vec<(char, u32)> = Vec::new();
394 for c in v {
395 match out.last_mut() {
396 Some(last) if last.0 == c => last.1 += 1,
397 _ => out.push((c, 1)),
398 }
399 }
400 out
401}
402
403#[allow(clippy::cast_precision_loss)]
406pub(crate) fn quick_ratio_counts(ca: &[(char, u32)], cb: &[(char, u32)], total: usize) -> f64 {
407 if total == 0 {
408 return 1.0;
409 }
410 let (mut x, mut y, mut matches) = (0usize, 0usize, 0u32);
411 while x < ca.len() && y < cb.len() {
412 match ca[x].0.cmp(&cb[y].0) {
413 std::cmp::Ordering::Less => x += 1,
414 std::cmp::Ordering::Greater => y += 1,
415 std::cmp::Ordering::Equal => {
416 matches += ca[x].1.min(cb[y].1);
417 x += 1;
418 y += 1;
419 }
420 }
421 }
422 2.0 * f64::from(matches) / total as f64
423}
424
425fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
426 while parent[x] != x {
427 parent[x] = parent[parent[x]];
428 x = parent[x];
429 }
430 x
431}
432
433fn progress_on() -> bool {
436 std::env::var_os("DIFFLIB_FAST_PROGRESS").is_some()
437}
438
439#[allow(clippy::cast_precision_loss, clippy::many_single_char_names)]
447fn qualifying_pairs(chars: &[Vec<char>], sams: &[gestalt::Sam], threshold: f64) -> Vec<(usize, usize, f64)> {
448 use std::sync::atomic::{AtomicUsize, Ordering};
449
450 let n = chars.len();
451 let rows = AtomicUsize::new(0);
452 std::thread::scope(|scope| {
453 if progress_on() {
454 let rows = &rows;
455 scope.spawn(move || {
456 while rows.load(Ordering::Relaxed) < n {
457 std::thread::sleep(std::time::Duration::from_secs(1));
458 let done = rows.load(Ordering::Relaxed);
459 eprintln!(" [difflib-fast] qualifying_pairs: row {done}/{n} ({:.0}%)", done as f64 / n as f64 * 100.0);
460 }
461 });
462 }
463 let mut order: Vec<usize> = (0..n).collect();
468 order.sort_by_key(|&i| chars[i].len());
469 let counts: Vec<Vec<(char, u32)>> = chars.par_iter().map(|c| char_counts(c)).collect();
470 let pairs = (0..n)
471 .into_par_iter()
472 .flat_map_iter(|p| {
473 let i = order[p];
474 let a = &chars[i];
475 let mut local: Vec<(usize, usize, f64)> = Vec::new();
476 for &j in &order[p + 1..] {
477 let b = &chars[j];
478 if real_quick_ratio(a, b) < threshold {
479 break; }
481 if quick_ratio_counts(&counts[i], &counts[j], a.len() + b.len()) < threshold {
482 continue;
483 }
484 let (lo, hi) = if i < j { (i, j) } else { (j, i) };
485 if let Some(r) = gestalt::gestalt_edge(&chars[lo], &chars[hi], &sams[hi], threshold) {
487 local.push((lo, hi, r));
488 }
489 }
490 rows.fetch_add(1, Ordering::Relaxed);
491 local
492 })
493 .collect();
494 rows.store(n, Ordering::Relaxed); pairs
496 })
497}
498
499fn cluster_min_sim(members: &[usize], chars: &[Vec<char>], sams: &[gestalt::Sam], ratios: &HashMap<(usize, usize), f64>) -> f64 {
505 members
506 .par_iter()
507 .enumerate()
508 .map(|(pos, &i)| {
509 let mut local = 1.0_f64;
510 for &j in &members[pos + 1..] {
511 let key = if i < j { (i, j) } else { (j, i) };
512 let r = match ratios.get(&key) {
513 Some(&r) => r, None => gestalt::gestalt_ratio_capped(&chars[key.0], &chars[key.1], &sams[key.1], local),
515 };
516 local = local.min(r);
517 }
518 local
519 })
520 .reduce(|| 1.0_f64, f64::min)
521}
522
523pub(crate) fn assemble(n: usize, pairs: Vec<(usize, usize, f64)>, chars: &[Vec<char>], sams: &[gestalt::Sam]) -> Vec<(Vec<usize>, f64)> {
526 let mut parent: Vec<usize> = (0..n).collect();
527 let mut ratios: HashMap<(usize, usize), f64> = HashMap::with_capacity(pairs.len());
528 for (i, j, r) in pairs {
529 ratios.insert((i, j), r);
530 let (ri, rj) = (uf_find(&mut parent, i), uf_find(&mut parent, j));
531 if ri != rj {
532 parent[ri] = rj;
533 }
534 }
535 let mut comps: HashMap<usize, Vec<usize>> = HashMap::new();
536 for i in 0..n {
537 let root = uf_find(&mut parent, i);
538 comps.entry(root).or_default().push(i);
539 }
540 let mut out: Vec<(Vec<usize>, f64)> = Vec::new();
541 for members in comps.values() {
542 if members.len() < 2 {
543 continue;
544 }
545 let min_sim = cluster_min_sim(members, chars, sams, &ratios);
546 let mut sorted = members.clone();
547 sorted.sort_unstable();
548 out.push((sorted, min_sim));
549 }
550 out.sort_by(|a, b| a.0[0].cmp(&b.0[0]));
551 out
552}
553
554#[must_use]
557#[doc(hidden)] pub fn cluster_canonicals_chars(chars: &[Vec<char>], threshold: f64) -> Vec<(Vec<usize>, f64)> {
559 let n = chars.len();
560 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
563 let pairs = qualifying_pairs(chars, &sams, threshold);
564 assemble(n, pairs, chars, &sams)
565}
566
567#[must_use]
573pub fn cluster_canonicals(canonicals: &[String], threshold: f64) -> Vec<(Vec<usize>, f64)> {
574 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
575 cluster_canonicals_chars(&chars, threshold)
576}
577
578const SHINGLE_K: usize = 9; fn fnv1a_bytes(data: &[u8]) -> u64 {
583 let mut h = 0xcbf2_9ce4_8422_2325u64;
584 for &b in data {
585 h ^= u64::from(b);
586 h = h.wrapping_mul(0x0000_0100_0000_01b3);
587 }
588 h
589}
590
591fn fnv1a_u64s(values: &[u64]) -> u64 {
592 let mut h = 0xcbf2_9ce4_8422_2325u64;
593 for &v in values {
594 h ^= v;
595 h = h.wrapping_mul(0x0000_0100_0000_01b3);
596 }
597 h
598}
599
600fn shingle_hashes(s: &str) -> Vec<u64> {
602 let bytes = s.as_bytes();
603 if bytes.len() <= SHINGLE_K {
604 return vec![fnv1a_bytes(bytes)];
605 }
606 let mut set: HashSet<u64> = HashSet::new();
607 for window in bytes.windows(SHINGLE_K) {
608 set.insert(fnv1a_bytes(window));
609 }
610 set.into_iter().collect()
611}
612
613fn make_perms(num: usize) -> Vec<(u64, u64)> {
615 let mut state = 0x9e37_79b9_7f4a_7c15u64;
616 let mut next = move || {
617 state ^= state << 13;
618 state ^= state >> 7;
619 state ^= state << 17;
620 state
621 };
622 (0..num).map(|_| (next() | 1, next())).collect()
623}
624
625fn minhash(shingles: &[u64], perms: &[(u64, u64)]) -> Vec<u64> {
626 perms
627 .iter()
628 .map(|&(a, b)| shingles.iter().map(|&h| a.wrapping_mul(h).wrapping_add(b)).min().unwrap_or(u64::MAX))
629 .collect()
630}
631
632fn lsh_candidates(sigs: &[Vec<u64>], band_rows: usize) -> HashSet<(usize, usize)> {
635 let bands = sigs.first().map_or(0, Vec::len).checked_div(band_rows).unwrap_or(0);
636 let mut candidates: HashSet<(usize, usize)> = HashSet::new();
637 for band in 0..bands {
638 let lo = band * band_rows;
639 let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
640 for (d, sig) in sigs.iter().enumerate() {
641 buckets.entry(fnv1a_u64s(&sig[lo..lo + band_rows])).or_default().push(d);
642 }
643 for docs in buckets.values() {
644 for a in 0..docs.len() {
645 for b in (a + 1)..docs.len() {
646 candidates.insert((docs[a].min(docs[b]), docs[a].max(docs[b])));
647 }
648 }
649 }
650 }
651 candidates
652}
653
654#[must_use]
662pub fn cluster_canonicals_lsh(canonicals: &[String], threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
663 let debug = progress_on();
664 let start = std::time::Instant::now();
665 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
666 let n = chars.len();
667 let perms = make_perms(num_perm);
668 let sigs: Vec<Vec<u64>> = canonicals.par_iter().map(|s| minhash(&shingle_hashes(s), &perms)).collect();
669 if debug {
670 eprintln!(" [difflib-fast] lsh: {n} signatures in {:.2}s", start.elapsed().as_secs_f64());
671 }
672 let candidates = lsh_candidates(&sigs, band_rows);
673 if debug {
674 eprintln!(" [difflib-fast] lsh: {} candidate pairs in {:.2}s", candidates.len(), start.elapsed().as_secs_f64());
675 }
676 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
677 let cand: Vec<(usize, usize)> = candidates.into_iter().collect();
678 let pairs: Vec<(usize, usize, f64)> = cand
679 .par_iter()
680 .filter_map(|&(i, j)| {
681 let (a, b) = if i < j { (i, j) } else { (j, i) };
682 gestalt::gestalt_edge(&chars[a], &chars[b], &sams[b], threshold).map(|r| (a, b, r))
683 })
684 .collect();
685 if debug {
686 eprintln!(" [difflib-fast] lsh: {} verified pairs in {:.2}s", pairs.len(), start.elapsed().as_secs_f64());
687 }
688 assemble(n, pairs, &chars, &sams)
689}
690
691#[cfg(feature = "python")]
694mod python {
695 use pyo3::prelude::*;
696
697 fn run_on_threads<T: Send>(threads: usize, f: impl FnOnce() -> T + Send) -> T {
700 if threads == 0 {
701 return f();
702 }
703 match rayon::ThreadPoolBuilder::new().num_threads(threads).build() {
704 Ok(pool) => pool.install(f),
705 Err(_) => f(),
706 }
707 }
708
709 #[pyfunction]
715 fn ratio(py: Python<'_>, a: &str, b: &str) -> f64 {
716 let (a, b) = (a.to_owned(), b.to_owned());
717 py.detach(|| super::ratio(&a, &b))
718 }
719
720 #[pyfunction]
725 #[pyo3(signature = (pairs, threads=0))]
726 #[allow(clippy::needless_pass_by_value)]
727 fn ratio_many(py: Python<'_>, pairs: Vec<(String, String)>, threads: usize) -> Vec<f64> {
728 py.detach(|| run_on_threads(threads, || super::ratio_many(&pairs)))
729 }
730
731 #[pyfunction]
738 #[pyo3(signature = (canonicals, threshold, threads=0))]
739 #[allow(clippy::needless_pass_by_value)]
740 fn cluster_canonicals(py: Python<'_>, canonicals: Vec<String>, threshold: f64, threads: usize) -> Vec<(Vec<usize>, f64)> {
741 py.detach(|| run_on_threads(threads, || super::cluster_canonicals(&canonicals, threshold)))
742 }
743
744 #[pyfunction]
747 #[pyo3(signature = (canonicals, threshold, num_perm, band_rows, threads=0))]
748 #[allow(clippy::needless_pass_by_value)]
749 fn cluster_canonicals_lsh(py: Python<'_>, canonicals: Vec<String>, threshold: f64, num_perm: usize, band_rows: usize, threads: usize) -> Vec<(Vec<usize>, f64)> {
750 py.detach(|| run_on_threads(threads, || super::cluster_canonicals_lsh(&canonicals, threshold, num_perm, band_rows)))
751 }
752
753 #[pyclass(name = "Rationer")]
763 struct PyRationer {
764 inner: super::Rationer,
765 }
766
767 #[pymethods]
768 impl PyRationer {
769 #[new]
770 #[pyo3(signature = (concurrency="gpu+cpu", threads=0, delta=0.0))]
771 fn new(concurrency: &str, threads: usize, delta: f64) -> PyResult<Self> {
772 use pyo3::exceptions::PyValueError;
773 let c = match concurrency.to_ascii_lowercase().as_str() {
774 "cpu" => super::Concurrency::Cpu,
775 "gpu" => super::Concurrency::Gpu,
776 "gpu+cpu" | "gpucpu" | "gpu_cpu" => super::Concurrency::GpuPlusCpu,
777 other => {
778 return Err(PyValueError::new_err(format!(
779 "unknown concurrency {other:?}; expected \"cpu\", \"gpu\", or \"gpu+cpu\""
780 )))
781 }
782 };
783 let mut b = super::Rationer::builder().concurrency(c).delta(delta);
784 if threads > 0 {
785 b = b.threads(threads);
786 }
787 Ok(Self { inner: b.build() })
788 }
789
790 #[getter]
793 fn concurrency(&self) -> &'static str {
794 match self.inner.concurrency() {
795 super::Concurrency::Cpu => "cpu",
796 super::Concurrency::Gpu => "gpu",
797 super::Concurrency::GpuPlusCpu => "gpu+cpu",
798 }
799 }
800
801 #[getter]
803 fn delta(&self) -> f64 {
804 self.inner.delta()
805 }
806
807 fn ratio(&self, py: Python<'_>, a: &str, b: &str) -> f64 {
809 let (a, b) = (a.to_owned(), b.to_owned());
810 py.detach(|| self.inner.ratio(&a, &b))
811 }
812
813 #[allow(clippy::needless_pass_by_value)]
815 fn ratio_many(&self, py: Python<'_>, pairs: Vec<(String, String)>) -> Vec<f64> {
816 py.detach(|| self.inner.ratio_many(&pairs))
817 }
818
819 #[allow(clippy::needless_pass_by_value)]
823 fn cluster_canonicals(&self, py: Python<'_>, canonicals: Vec<String>, threshold: f64) -> Vec<(Vec<usize>, f64)> {
824 py.detach(|| self.inner.cluster_canonicals(&canonicals, threshold))
825 }
826 }
827
828 #[pymodule]
830 fn _difflib_fast(m: &Bound<'_, PyModule>) -> PyResult<()> {
831 m.add_function(wrap_pyfunction!(ratio, m)?)?;
832 m.add_function(wrap_pyfunction!(ratio_many, m)?)?;
833 m.add_function(wrap_pyfunction!(cluster_canonicals, m)?)?;
834 m.add_function(wrap_pyfunction!(cluster_canonicals_lsh, m)?)?;
835 m.add_class::<PyRationer>()?;
836 Ok(())
837 }
838}
839
840#[cfg(test)]
841mod tests {
842 #![allow(clippy::float_cmp, clippy::unreadable_literal)]
843 use super::{cluster_canonicals, gestalt_ratio, ratio, ratio_b2j, ratio_reference};
844
845 #[test]
846 fn matches_known_difflib_values() {
847 assert_eq!(gestalt_ratio("", ""), 1.0);
849 assert_eq!(gestalt_ratio("", "x"), 0.0);
850 assert_eq!(gestalt_ratio("abc", "abc"), 1.0);
851 assert_eq!(gestalt_ratio("abc", "abd"), 0.6666666666666666);
852 assert_eq!(gestalt_ratio("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
853 assert_eq!(gestalt_ratio("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
854 }
855
856 #[test]
858 fn fast_matches_reference() {
859 let mut s: u64 = 0x1234_5678_9abc_def1;
860 let mut next = || {
861 s ^= s << 13;
862 s ^= s >> 7;
863 s ^= s << 17;
864 s
865 };
866 for _ in 0..2000 {
867 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
868 (0..n).map(|_| char::from(b'a' + (rng() % 5) as u8)).collect()
869 };
870 let (la, lb) = ((next() % 50) as usize, (next() % 50) as usize);
871 let a = mk(la, &mut next);
872 let b = mk(lb, &mut next);
873 let r = ratio_reference(&a, &b);
874 assert_eq!(gestalt_ratio(&a, &b), r, "SAM a={a:?} b={b:?}");
875 assert_eq!(ratio_b2j(&a, &b), r, "b2j a={a:?} b={b:?}");
876 }
877 }
878
879 #[test]
882 fn long_strings_all_paths_agree() {
883 let mut s: u64 = 0xdead_beef_cafe_1234;
884 let mut next = || {
885 s ^= s << 13;
886 s ^= s >> 7;
887 s ^= s << 17;
888 s
889 };
890 for _ in 0..40 {
891 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
892 (0..n).map(|_| char::from(b'a' + (rng() % 6) as u8)).collect()
893 };
894 let a = mk(1400 + (next() % 600) as usize, &mut next); let b = mk(1400 + (next() % 600) as usize, &mut next);
896 let r = ratio_reference(&a, &b);
897 assert_eq!(gestalt_ratio(&a, &b), r);
898 assert_eq!(ratio_b2j(&a, &b), r);
899 assert_eq!(ratio(&a, &b), r); }
901 }
902
903 #[test]
904 fn clusters_obvious_duplicates() {
905 let corpus: Vec<String> = vec![
906 "def add(a, b): return a + b".into(),
907 "def add(x, y): return x + y".into(),
908 "completely unrelated text here".into(),
909 ];
910 let clusters = cluster_canonicals(&corpus, 0.5);
911 assert_eq!(clusters.len(), 1, "the two add() variants should cluster");
912 assert_eq!(clusters[0].0, vec![0, 1]);
913 assert!(clusters[0].1 >= 0.5);
914 }
915}