1use std::cell::RefCell;
24use std::collections::{HashMap, HashSet};
25
26use rayon::prelude::*;
27
28pub mod gestalt;
29pub use gestalt::gestalt_ratio;
30
31const B2J_WORK_FACTOR: u64 = 34;
36
37#[must_use]
44pub fn ratio(a: &str, b: &str) -> f64 {
45 let av: Vec<char> = a.chars().collect();
46 let bv: Vec<char> = b.chars().collect();
47 ratio_chars(&av, &bv)
48}
49
50fn ascii_counts(s: &[char]) -> ([u32; 128], u32) {
52 let mut c = [0u32; 128];
53 let mut other = 0u32;
54 for &ch in s {
55 let u = ch as u32;
56 if u < 128 {
57 c[u as usize] += 1;
58 } else {
59 other += 1;
60 }
61 }
62 (c, other)
63}
64
65fn work_factor() -> u64 {
66 use std::sync::OnceLock;
67 static F: OnceLock<u64> = OnceLock::new();
68 *F.get_or_init(|| std::env::var("DF_WORK_FACTOR").ok().and_then(|s| s.parse().ok()).unwrap_or(B2J_WORK_FACTOR))
69}
70
71#[allow(clippy::cast_precision_loss)]
72#[must_use]
73fn ratio_chars(a: &[char], b: &[char]) -> f64 {
74 let total = a.len() + b.len();
75 if total == 0 {
76 return 1.0;
77 }
78 let (ca, oa) = ascii_counts(a);
79 let (cb, ob) = ascii_counts(b);
80 if oa > 0 || ob > 0 {
83 return gestalt::gestalt_ratio_chars(a, b);
84 }
85 let mut w = 0u64;
91 for i in 0..128 {
92 w += u64::from(ca[i]) * u64::from(cb[i]);
93 }
94 if w <= work_factor() * total as u64 {
95 ratio_b2j_chars(a, b, &cb)
96 } else {
97 gestalt::gestalt_ratio_chars(a, b)
98 }
99}
100
101#[must_use]
104pub fn ratio_b2j(a: &str, b: &str) -> f64 {
105 let av: Vec<char> = a.chars().collect();
106 let bv: Vec<char> = b.chars().collect();
107 let (cb, ob) = ascii_counts(&bv);
108 if ob > 0 {
109 return gestalt::gestalt_ratio_chars(&av, &bv); }
111 ratio_b2j_chars(&av, &bv, &cb)
112}
113
114#[must_use]
121pub fn ratio_many(pairs: &[(String, String)]) -> Vec<f64> {
122 pairs.par_iter().map(|(a, b)| ratio(a, b)).collect()
123}
124
125fn build_b2j(b: &[char]) -> HashMap<char, Vec<usize>> {
132 let mut b2j: HashMap<char, Vec<usize>> = HashMap::new();
133 for (j, &c) in b.iter().enumerate() {
134 b2j.entry(c).or_default().push(j);
135 }
136 b2j
137}
138
139#[allow(clippy::similar_names)]
141fn find_longest(a: &[char], b2j: &HashMap<char, Vec<usize>>, alo: usize, ahi: usize, blo: usize, bhi: usize) -> (usize, usize, usize) {
142 let mut besti = alo;
143 let mut bestj = blo;
144 let mut bestsize = 0usize;
145 let mut j2_prev: HashMap<usize, usize> = HashMap::new();
146 for (i, ch) in a.iter().enumerate().take(ahi).skip(alo) {
147 let mut j2_cur: HashMap<usize, usize> = HashMap::new();
148 if let Some(positions) = b2j.get(ch) {
149 for &j in positions {
150 if j < blo {
151 continue;
152 }
153 if j >= bhi {
154 break;
155 }
156 let prev = if j > blo { *j2_prev.get(&(j - 1)).unwrap_or(&0) } else { 0 };
157 let k = prev + 1;
158 j2_cur.insert(j, k);
159 if k > bestsize {
160 besti = i + 1 - k;
161 bestj = j + 1 - k;
162 bestsize = k;
163 }
164 }
165 }
166 j2_prev = j2_cur;
167 }
168 (besti, bestj, bestsize)
169}
170
171#[allow(clippy::many_single_char_names)]
173fn matching_count(a: &[char], b: &[char], b2j: &HashMap<char, Vec<usize>>) -> usize {
174 let mut total = 0usize;
175 let mut stack: Vec<(usize, usize, usize, usize)> = vec![(0, a.len(), 0, b.len())];
176 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
177 let (i, j, k) = find_longest(a, b2j, alo, ahi, blo, bhi);
178 if k > 0 {
179 total += k;
180 if alo < i && blo < j {
181 stack.push((alo, i, blo, j));
182 }
183 if i + k < ahi && j + k < bhi {
184 stack.push((i + k, ahi, j + k, bhi));
185 }
186 }
187 }
188 total
189}
190
191#[allow(clippy::cast_precision_loss)]
194#[must_use]
195pub fn ratio_reference(a: &str, b: &str) -> f64 {
196 let av: Vec<char> = a.chars().collect();
197 let bv: Vec<char> = b.chars().collect();
198 let total = av.len() + bv.len();
199 if total == 0 {
200 return 1.0;
201 }
202 let b2j = build_b2j(&bv);
203 2.0 * (matching_count(&av, &bv, &b2j) as f64) / (total as f64)
204}
205
206#[derive(Default)]
215struct B2jScratch {
216 offsets: Vec<u32>, positions: Vec<u32>, cursor: Vec<u32>, j2len: Vec<u32>,
220 erase: Vec<(u32, u32)>, affect: Vec<(u32, u32)>, stack: Vec<(usize, usize, usize, usize)>,
223}
224
225thread_local! {
226 static B2J: RefCell<B2jScratch> = RefCell::new(B2jScratch::default());
228}
229
230#[must_use]
233pub fn b2j_work(a: &[char], b: &[char]) -> u64 {
234 let (ca, oa) = ascii_counts(a);
235 let (cb, ob) = ascii_counts(b);
236 let mut w = u64::from(oa) * u64::from(ob);
237 for i in 0..128 {
238 w += u64::from(ca[i]) * u64::from(cb[i]);
239 }
240 w
241}
242
243#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
248fn ratio_b2j_chars(a: &[char], b: &[char], cb: &[u32; 128]) -> f64 {
249 let total = a.len() + b.len();
250 if total == 0 {
251 return 1.0;
252 }
253 if a.is_empty() || b.is_empty() {
254 return 0.0;
255 }
256 B2J.with_borrow_mut(|s| {
257 let B2jScratch { offsets, positions, cursor, j2len, erase, affect, stack } = s;
258 offsets.clear();
260 offsets.resize(129, 0);
261 for c in 0..128 {
262 offsets[c + 1] = offsets[c] + cb[c];
263 }
264 cursor.clear();
265 cursor.extend_from_slice(&offsets[..128]);
266 positions.clear();
270 positions.reserve(b.len());
271 #[allow(clippy::uninit_vec)]
272 unsafe {
275 positions.set_len(b.len());
276 }
277 for (j, &ch) in b.iter().enumerate() {
278 let c = ch as usize; positions[cursor[c] as usize] = j as u32;
280 cursor[c] += 1;
281 }
282 j2len.clear();
283 j2len.resize(b.len() + 1, 0);
284 erase.reserve(b.len() + 1);
286 affect.reserve(b.len() + 1);
287 stack.clear();
288 stack.push((0, a.len(), 0, b.len()));
289 let mut m = 0usize;
290 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
291 let (bi, bj, bk) = find_longest_b2j(a, offsets, positions, j2len, erase, affect, alo, ahi, blo, bhi);
292 if bk > 0 {
293 m += bk;
294 if alo < bi && blo < bj {
295 stack.push((alo, bi, blo, bj));
296 }
297 if bi + bk < ahi && bj + bk < bhi {
298 stack.push((bi + bk, ahi, bj + bk, bhi));
299 }
300 }
301 }
302 2.0 * m as f64 / total as f64
303 })
304}
305
306#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation, clippy::similar_names)]
314fn find_longest_b2j(
315 a: &[char],
316 offsets: &[u32],
317 positions: &[u32],
318 j2len: &mut [u32],
319 erase: &mut Vec<(u32, u32)>,
320 affect: &mut Vec<(u32, u32)>,
321 alo: usize,
322 ahi: usize,
323 blo: usize,
324 bhi: usize,
325) -> (usize, usize, usize) {
326 let (mut bi, mut bj, mut bk) = (alo, blo, 0usize);
327 erase.clear();
328 #[allow(clippy::undocumented_unsafe_blocks)]
333 unsafe {
334 for i in alo..ahi {
335 affect.clear();
336 let c = *a.get_unchecked(i) as usize;
337 if c < 128 {
338 let lo = *offsets.get_unchecked(c) as usize;
339 let hi = *offsets.get_unchecked(c + 1) as usize;
340 for &jj in positions.get_unchecked(lo..hi) {
341 let j = jj as usize;
342 if j < blo {
343 continue;
344 }
345 if j >= bhi {
346 break;
347 }
348 let k = *j2len.get_unchecked(j) as usize + 1;
349 affect.push((j as u32 + 1, k as u32));
350 if k > bk {
351 bi = i + 1 - k;
352 bj = j + 1 - k;
353 bk = k;
354 }
355 }
356 }
357 for &(p, _) in erase.iter() {
358 *j2len.get_unchecked_mut(p as usize) = 0;
359 }
360 for &(p, v) in affect.iter() {
361 *j2len.get_unchecked_mut(p as usize) = v;
362 }
363 std::mem::swap(erase, affect);
364 }
365 for &(p, _) in erase.iter() {
366 *j2len.get_unchecked_mut(p as usize) = 0;
367 }
368 }
369 (bi, bj, bk)
370}
371
372#[allow(clippy::cast_precision_loss)]
376fn real_quick_ratio(a: &[char], b: &[char]) -> f64 {
377 let total = a.len() + b.len();
378 if total == 0 {
379 return 1.0;
380 }
381 2.0 * (a.len().min(b.len()) as f64) / (total as f64)
382}
383
384fn char_counts(a: &[char]) -> Vec<(char, u32)> {
387 let mut v = a.to_vec();
388 v.sort_unstable();
389 let mut out: Vec<(char, u32)> = Vec::new();
390 for c in v {
391 match out.last_mut() {
392 Some(last) if last.0 == c => last.1 += 1,
393 _ => out.push((c, 1)),
394 }
395 }
396 out
397}
398
399#[allow(clippy::cast_precision_loss)]
402fn quick_ratio_counts(ca: &[(char, u32)], cb: &[(char, u32)], total: usize) -> f64 {
403 if total == 0 {
404 return 1.0;
405 }
406 let (mut x, mut y, mut matches) = (0usize, 0usize, 0u32);
407 while x < ca.len() && y < cb.len() {
408 match ca[x].0.cmp(&cb[y].0) {
409 std::cmp::Ordering::Less => x += 1,
410 std::cmp::Ordering::Greater => y += 1,
411 std::cmp::Ordering::Equal => {
412 matches += ca[x].1.min(cb[y].1);
413 x += 1;
414 y += 1;
415 }
416 }
417 }
418 2.0 * f64::from(matches) / total as f64
419}
420
421fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
422 while parent[x] != x {
423 parent[x] = parent[parent[x]];
424 x = parent[x];
425 }
426 x
427}
428
429fn progress_on() -> bool {
432 std::env::var_os("DIFFLIB_FAST_PROGRESS").is_some()
433}
434
435#[allow(clippy::cast_precision_loss, clippy::many_single_char_names)]
443fn qualifying_pairs(chars: &[Vec<char>], sams: &[gestalt::Sam], threshold: f64) -> Vec<(usize, usize, f64)> {
444 use std::sync::atomic::{AtomicUsize, Ordering};
445
446 let n = chars.len();
447 let rows = AtomicUsize::new(0);
448 std::thread::scope(|scope| {
449 if progress_on() {
450 let rows = &rows;
451 scope.spawn(move || {
452 while rows.load(Ordering::Relaxed) < n {
453 std::thread::sleep(std::time::Duration::from_secs(1));
454 let done = rows.load(Ordering::Relaxed);
455 eprintln!(" [difflib-fast] qualifying_pairs: row {done}/{n} ({:.0}%)", done as f64 / n as f64 * 100.0);
456 }
457 });
458 }
459 let mut order: Vec<usize> = (0..n).collect();
464 order.sort_by_key(|&i| chars[i].len());
465 let counts: Vec<Vec<(char, u32)>> = chars.par_iter().map(|c| char_counts(c)).collect();
466 let pairs = (0..n)
467 .into_par_iter()
468 .flat_map_iter(|p| {
469 let i = order[p];
470 let a = &chars[i];
471 let mut local: Vec<(usize, usize, f64)> = Vec::new();
472 for &j in &order[p + 1..] {
473 let b = &chars[j];
474 if real_quick_ratio(a, b) < threshold {
475 break; }
477 if quick_ratio_counts(&counts[i], &counts[j], a.len() + b.len()) < threshold {
478 continue;
479 }
480 let (lo, hi) = if i < j { (i, j) } else { (j, i) };
481 if let Some(r) = gestalt::gestalt_edge(&chars[lo], &chars[hi], &sams[hi], threshold) {
483 local.push((lo, hi, r));
484 }
485 }
486 rows.fetch_add(1, Ordering::Relaxed);
487 local
488 })
489 .collect();
490 rows.store(n, Ordering::Relaxed); pairs
492 })
493}
494
495fn cluster_min_sim(members: &[usize], chars: &[Vec<char>], sams: &[gestalt::Sam], ratios: &HashMap<(usize, usize), f64>) -> f64 {
501 members
502 .par_iter()
503 .enumerate()
504 .map(|(pos, &i)| {
505 let mut local = 1.0_f64;
506 for &j in &members[pos + 1..] {
507 let key = if i < j { (i, j) } else { (j, i) };
508 let r = match ratios.get(&key) {
509 Some(&r) => r, None => gestalt::gestalt_ratio_capped(&chars[key.0], &chars[key.1], &sams[key.1], local),
511 };
512 local = local.min(r);
513 }
514 local
515 })
516 .reduce(|| 1.0_f64, f64::min)
517}
518
519fn assemble(n: usize, pairs: Vec<(usize, usize, f64)>, chars: &[Vec<char>], sams: &[gestalt::Sam]) -> Vec<(Vec<usize>, f64)> {
522 let mut parent: Vec<usize> = (0..n).collect();
523 let mut ratios: HashMap<(usize, usize), f64> = HashMap::with_capacity(pairs.len());
524 for (i, j, r) in pairs {
525 ratios.insert((i, j), r);
526 let (ri, rj) = (uf_find(&mut parent, i), uf_find(&mut parent, j));
527 if ri != rj {
528 parent[ri] = rj;
529 }
530 }
531 let mut comps: HashMap<usize, Vec<usize>> = HashMap::new();
532 for i in 0..n {
533 let root = uf_find(&mut parent, i);
534 comps.entry(root).or_default().push(i);
535 }
536 let mut out: Vec<(Vec<usize>, f64)> = Vec::new();
537 for members in comps.values() {
538 if members.len() < 2 {
539 continue;
540 }
541 let min_sim = cluster_min_sim(members, chars, sams, &ratios);
542 let mut sorted = members.clone();
543 sorted.sort_unstable();
544 out.push((sorted, min_sim));
545 }
546 out.sort_by(|a, b| a.0[0].cmp(&b.0[0]));
547 out
548}
549
550#[must_use]
553pub fn cluster_canonicals_chars(chars: &[Vec<char>], threshold: f64) -> Vec<(Vec<usize>, f64)> {
554 let n = chars.len();
555 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
558 let pairs = qualifying_pairs(chars, &sams, threshold);
559 assemble(n, pairs, chars, &sams)
560}
561
562#[must_use]
568pub fn cluster_canonicals(canonicals: &[String], threshold: f64) -> Vec<(Vec<usize>, f64)> {
569 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
570 cluster_canonicals_chars(&chars, threshold)
571}
572
573const SHINGLE_K: usize = 9; fn fnv1a_bytes(data: &[u8]) -> u64 {
578 let mut h = 0xcbf2_9ce4_8422_2325u64;
579 for &b in data {
580 h ^= u64::from(b);
581 h = h.wrapping_mul(0x0000_0100_0000_01b3);
582 }
583 h
584}
585
586fn fnv1a_u64s(values: &[u64]) -> u64 {
587 let mut h = 0xcbf2_9ce4_8422_2325u64;
588 for &v in values {
589 h ^= v;
590 h = h.wrapping_mul(0x0000_0100_0000_01b3);
591 }
592 h
593}
594
595fn shingle_hashes(s: &str) -> Vec<u64> {
597 let bytes = s.as_bytes();
598 if bytes.len() <= SHINGLE_K {
599 return vec![fnv1a_bytes(bytes)];
600 }
601 let mut set: HashSet<u64> = HashSet::new();
602 for window in bytes.windows(SHINGLE_K) {
603 set.insert(fnv1a_bytes(window));
604 }
605 set.into_iter().collect()
606}
607
608fn make_perms(num: usize) -> Vec<(u64, u64)> {
610 let mut state = 0x9e37_79b9_7f4a_7c15u64;
611 let mut next = move || {
612 state ^= state << 13;
613 state ^= state >> 7;
614 state ^= state << 17;
615 state
616 };
617 (0..num).map(|_| (next() | 1, next())).collect()
618}
619
620fn minhash(shingles: &[u64], perms: &[(u64, u64)]) -> Vec<u64> {
621 perms
622 .iter()
623 .map(|&(a, b)| shingles.iter().map(|&h| a.wrapping_mul(h).wrapping_add(b)).min().unwrap_or(u64::MAX))
624 .collect()
625}
626
627fn lsh_candidates(sigs: &[Vec<u64>], band_rows: usize) -> HashSet<(usize, usize)> {
630 let bands = sigs.first().map_or(0, Vec::len).checked_div(band_rows).unwrap_or(0);
631 let mut candidates: HashSet<(usize, usize)> = HashSet::new();
632 for band in 0..bands {
633 let lo = band * band_rows;
634 let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
635 for (d, sig) in sigs.iter().enumerate() {
636 buckets.entry(fnv1a_u64s(&sig[lo..lo + band_rows])).or_default().push(d);
637 }
638 for docs in buckets.values() {
639 for a in 0..docs.len() {
640 for b in (a + 1)..docs.len() {
641 candidates.insert((docs[a].min(docs[b]), docs[a].max(docs[b])));
642 }
643 }
644 }
645 }
646 candidates
647}
648
649#[must_use]
657pub fn cluster_canonicals_lsh(canonicals: &[String], threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
658 let debug = progress_on();
659 let start = std::time::Instant::now();
660 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
661 let n = chars.len();
662 let perms = make_perms(num_perm);
663 let sigs: Vec<Vec<u64>> = canonicals.par_iter().map(|s| minhash(&shingle_hashes(s), &perms)).collect();
664 if debug {
665 eprintln!(" [difflib-fast] lsh: {n} signatures in {:.2}s", start.elapsed().as_secs_f64());
666 }
667 let candidates = lsh_candidates(&sigs, band_rows);
668 if debug {
669 eprintln!(" [difflib-fast] lsh: {} candidate pairs in {:.2}s", candidates.len(), start.elapsed().as_secs_f64());
670 }
671 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
672 let cand: Vec<(usize, usize)> = candidates.into_iter().collect();
673 let pairs: Vec<(usize, usize, f64)> = cand
674 .par_iter()
675 .filter_map(|&(i, j)| {
676 let (a, b) = if i < j { (i, j) } else { (j, i) };
677 gestalt::gestalt_edge(&chars[a], &chars[b], &sams[b], threshold).map(|r| (a, b, r))
678 })
679 .collect();
680 if debug {
681 eprintln!(" [difflib-fast] lsh: {} verified pairs in {:.2}s", pairs.len(), start.elapsed().as_secs_f64());
682 }
683 assemble(n, pairs, &chars, &sams)
684}
685
686#[cfg(feature = "python")]
689mod python {
690 use pyo3::prelude::*;
691
692 #[pyfunction]
698 fn ratio(py: Python<'_>, a: &str, b: &str) -> f64 {
699 let (a, b) = (a.to_owned(), b.to_owned());
700 py.detach(|| super::ratio(&a, &b))
701 }
702
703 #[pyfunction]
707 #[allow(clippy::needless_pass_by_value)]
708 fn ratio_many(py: Python<'_>, pairs: Vec<(String, String)>) -> Vec<f64> {
709 py.detach(|| super::ratio_many(&pairs))
710 }
711
712 #[pyfunction]
718 #[allow(clippy::needless_pass_by_value)]
719 fn cluster_canonicals(py: Python<'_>, canonicals: Vec<String>, threshold: f64) -> Vec<(Vec<usize>, f64)> {
720 py.detach(|| super::cluster_canonicals(&canonicals, threshold))
721 }
722
723 #[pyfunction]
725 #[allow(clippy::needless_pass_by_value)]
726 fn cluster_canonicals_lsh(py: Python<'_>, canonicals: Vec<String>, threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
727 py.detach(|| super::cluster_canonicals_lsh(&canonicals, threshold, num_perm, band_rows))
728 }
729
730 #[pymodule]
732 fn _difflib_fast(m: &Bound<'_, PyModule>) -> PyResult<()> {
733 m.add_function(wrap_pyfunction!(ratio, m)?)?;
734 m.add_function(wrap_pyfunction!(ratio_many, m)?)?;
735 m.add_function(wrap_pyfunction!(cluster_canonicals, m)?)?;
736 m.add_function(wrap_pyfunction!(cluster_canonicals_lsh, m)?)?;
737 Ok(())
738 }
739}
740
741#[cfg(test)]
742mod tests {
743 #![allow(clippy::float_cmp, clippy::unreadable_literal)]
744 use super::{cluster_canonicals, gestalt_ratio, ratio, ratio_b2j, ratio_reference};
745
746 #[test]
747 fn matches_known_difflib_values() {
748 assert_eq!(gestalt_ratio("", ""), 1.0);
750 assert_eq!(gestalt_ratio("", "x"), 0.0);
751 assert_eq!(gestalt_ratio("abc", "abc"), 1.0);
752 assert_eq!(gestalt_ratio("abc", "abd"), 0.6666666666666666);
753 assert_eq!(gestalt_ratio("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
754 assert_eq!(gestalt_ratio("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
755 }
756
757 #[test]
759 fn fast_matches_reference() {
760 let mut s: u64 = 0x1234_5678_9abc_def1;
761 let mut next = || {
762 s ^= s << 13;
763 s ^= s >> 7;
764 s ^= s << 17;
765 s
766 };
767 for _ in 0..2000 {
768 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
769 (0..n).map(|_| char::from(b'a' + (rng() % 5) as u8)).collect()
770 };
771 let (la, lb) = ((next() % 50) as usize, (next() % 50) as usize);
772 let a = mk(la, &mut next);
773 let b = mk(lb, &mut next);
774 let r = ratio_reference(&a, &b);
775 assert_eq!(gestalt_ratio(&a, &b), r, "SAM a={a:?} b={b:?}");
776 assert_eq!(ratio_b2j(&a, &b), r, "b2j a={a:?} b={b:?}");
777 }
778 }
779
780 #[test]
783 fn long_strings_all_paths_agree() {
784 let mut s: u64 = 0xdead_beef_cafe_1234;
785 let mut next = || {
786 s ^= s << 13;
787 s ^= s >> 7;
788 s ^= s << 17;
789 s
790 };
791 for _ in 0..40 {
792 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
793 (0..n).map(|_| char::from(b'a' + (rng() % 6) as u8)).collect()
794 };
795 let a = mk(1400 + (next() % 600) as usize, &mut next); let b = mk(1400 + (next() % 600) as usize, &mut next);
797 let r = ratio_reference(&a, &b);
798 assert_eq!(gestalt_ratio(&a, &b), r);
799 assert_eq!(ratio_b2j(&a, &b), r);
800 assert_eq!(ratio(&a, &b), r); }
802 }
803
804 #[test]
805 fn clusters_obvious_duplicates() {
806 let corpus: Vec<String> = vec![
807 "def add(a, b): return a + b".into(),
808 "def add(x, y): return x + y".into(),
809 "completely unrelated text here".into(),
810 ];
811 let clusters = cluster_canonicals(&corpus, 0.5);
812 assert_eq!(clusters.len(), 1, "the two add() variants should cluster");
813 assert_eq!(clusters[0].0, vec![0, 1]);
814 assert!(clusters[0].1 >= 0.5);
815 }
816}