1use std::cell::RefCell;
24use std::collections::{HashMap, HashSet};
25
26use rayon::prelude::*;
27
28pub mod gestalt;
29pub use gestalt::gestalt_ratio;
30
31const B2J_WORK_FACTOR: u64 = 34;
36
37#[must_use]
44pub fn ratio(a: &str, b: &str) -> f64 {
45 let av: Vec<char> = a.chars().collect();
46 let bv: Vec<char> = b.chars().collect();
47 ratio_chars(&av, &bv)
48}
49
50fn ascii_counts(s: &[char]) -> ([u32; 128], u32) {
52 let mut c = [0u32; 128];
53 let mut other = 0u32;
54 for &ch in s {
55 let u = ch as u32;
56 if u < 128 {
57 c[u as usize] += 1;
58 } else {
59 other += 1;
60 }
61 }
62 (c, other)
63}
64
65fn work_factor() -> u64 {
66 use std::sync::OnceLock;
67 static F: OnceLock<u64> = OnceLock::new();
68 *F.get_or_init(|| std::env::var("DF_WORK_FACTOR").ok().and_then(|s| s.parse().ok()).unwrap_or(B2J_WORK_FACTOR))
69}
70
71#[allow(clippy::cast_precision_loss)]
72#[must_use]
73fn ratio_chars(a: &[char], b: &[char]) -> f64 {
74 let total = a.len() + b.len();
75 if total == 0 {
76 return 1.0;
77 }
78 let (ca, oa) = ascii_counts(a);
79 let (cb, ob) = ascii_counts(b);
80 if oa > 0 || ob > 0 {
83 return gestalt::gestalt_ratio_chars(a, b);
84 }
85 let mut w = 0u64;
91 for i in 0..128 {
92 w += u64::from(ca[i]) * u64::from(cb[i]);
93 }
94 if w <= work_factor() * total as u64 {
95 ratio_b2j_chars(a, b, &cb)
96 } else {
97 gestalt::gestalt_ratio_chars(a, b)
98 }
99}
100
101#[must_use]
104pub fn ratio_b2j(a: &str, b: &str) -> f64 {
105 let av: Vec<char> = a.chars().collect();
106 let bv: Vec<char> = b.chars().collect();
107 let (cb, ob) = ascii_counts(&bv);
108 if ob > 0 {
109 return gestalt::gestalt_ratio_chars(&av, &bv); }
111 ratio_b2j_chars(&av, &bv, &cb)
112}
113
114fn build_b2j(b: &[char]) -> HashMap<char, Vec<usize>> {
121 let mut b2j: HashMap<char, Vec<usize>> = HashMap::new();
122 for (j, &c) in b.iter().enumerate() {
123 b2j.entry(c).or_default().push(j);
124 }
125 b2j
126}
127
128#[allow(clippy::similar_names)]
130fn find_longest(a: &[char], b2j: &HashMap<char, Vec<usize>>, alo: usize, ahi: usize, blo: usize, bhi: usize) -> (usize, usize, usize) {
131 let mut besti = alo;
132 let mut bestj = blo;
133 let mut bestsize = 0usize;
134 let mut j2_prev: HashMap<usize, usize> = HashMap::new();
135 for (i, ch) in a.iter().enumerate().take(ahi).skip(alo) {
136 let mut j2_cur: HashMap<usize, usize> = HashMap::new();
137 if let Some(positions) = b2j.get(ch) {
138 for &j in positions {
139 if j < blo {
140 continue;
141 }
142 if j >= bhi {
143 break;
144 }
145 let prev = if j > blo { *j2_prev.get(&(j - 1)).unwrap_or(&0) } else { 0 };
146 let k = prev + 1;
147 j2_cur.insert(j, k);
148 if k > bestsize {
149 besti = i + 1 - k;
150 bestj = j + 1 - k;
151 bestsize = k;
152 }
153 }
154 }
155 j2_prev = j2_cur;
156 }
157 (besti, bestj, bestsize)
158}
159
160#[allow(clippy::many_single_char_names)]
162fn matching_count(a: &[char], b: &[char], b2j: &HashMap<char, Vec<usize>>) -> usize {
163 let mut total = 0usize;
164 let mut stack: Vec<(usize, usize, usize, usize)> = vec![(0, a.len(), 0, b.len())];
165 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
166 let (i, j, k) = find_longest(a, b2j, alo, ahi, blo, bhi);
167 if k > 0 {
168 total += k;
169 if alo < i && blo < j {
170 stack.push((alo, i, blo, j));
171 }
172 if i + k < ahi && j + k < bhi {
173 stack.push((i + k, ahi, j + k, bhi));
174 }
175 }
176 }
177 total
178}
179
180#[allow(clippy::cast_precision_loss)]
183#[must_use]
184pub fn ratio_reference(a: &str, b: &str) -> f64 {
185 let av: Vec<char> = a.chars().collect();
186 let bv: Vec<char> = b.chars().collect();
187 let total = av.len() + bv.len();
188 if total == 0 {
189 return 1.0;
190 }
191 let b2j = build_b2j(&bv);
192 2.0 * (matching_count(&av, &bv, &b2j) as f64) / (total as f64)
193}
194
195#[derive(Default)]
204struct B2jScratch {
205 offsets: Vec<u32>, positions: Vec<u32>, cursor: Vec<u32>, j2len: Vec<u32>,
209 erase: Vec<(u32, u32)>, affect: Vec<(u32, u32)>, stack: Vec<(usize, usize, usize, usize)>,
212}
213
214thread_local! {
215 static B2J: RefCell<B2jScratch> = RefCell::new(B2jScratch::default());
217}
218
219#[must_use]
222pub fn b2j_work(a: &[char], b: &[char]) -> u64 {
223 let (ca, oa) = ascii_counts(a);
224 let (cb, ob) = ascii_counts(b);
225 let mut w = u64::from(oa) * u64::from(ob);
226 for i in 0..128 {
227 w += u64::from(ca[i]) * u64::from(cb[i]);
228 }
229 w
230}
231
232#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
237fn ratio_b2j_chars(a: &[char], b: &[char], cb: &[u32; 128]) -> f64 {
238 let total = a.len() + b.len();
239 if total == 0 {
240 return 1.0;
241 }
242 if a.is_empty() || b.is_empty() {
243 return 0.0;
244 }
245 B2J.with_borrow_mut(|s| {
246 let B2jScratch { offsets, positions, cursor, j2len, erase, affect, stack } = s;
247 offsets.clear();
249 offsets.resize(129, 0);
250 for c in 0..128 {
251 offsets[c + 1] = offsets[c] + cb[c];
252 }
253 cursor.clear();
254 cursor.extend_from_slice(&offsets[..128]);
255 positions.clear();
259 positions.reserve(b.len());
260 #[allow(clippy::uninit_vec)]
261 unsafe {
264 positions.set_len(b.len());
265 }
266 for (j, &ch) in b.iter().enumerate() {
267 let c = ch as usize; positions[cursor[c] as usize] = j as u32;
269 cursor[c] += 1;
270 }
271 j2len.clear();
272 j2len.resize(b.len() + 1, 0);
273 erase.reserve(b.len() + 1);
275 affect.reserve(b.len() + 1);
276 stack.clear();
277 stack.push((0, a.len(), 0, b.len()));
278 let mut m = 0usize;
279 while let Some((alo, ahi, blo, bhi)) = stack.pop() {
280 let (bi, bj, bk) = find_longest_b2j(a, offsets, positions, j2len, erase, affect, alo, ahi, blo, bhi);
281 if bk > 0 {
282 m += bk;
283 if alo < bi && blo < bj {
284 stack.push((alo, bi, blo, bj));
285 }
286 if bi + bk < ahi && bj + bk < bhi {
287 stack.push((bi + bk, ahi, bj + bk, bhi));
288 }
289 }
290 }
291 2.0 * m as f64 / total as f64
292 })
293}
294
295#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation, clippy::similar_names)]
303fn find_longest_b2j(
304 a: &[char],
305 offsets: &[u32],
306 positions: &[u32],
307 j2len: &mut [u32],
308 erase: &mut Vec<(u32, u32)>,
309 affect: &mut Vec<(u32, u32)>,
310 alo: usize,
311 ahi: usize,
312 blo: usize,
313 bhi: usize,
314) -> (usize, usize, usize) {
315 let (mut bi, mut bj, mut bk) = (alo, blo, 0usize);
316 erase.clear();
317 #[allow(clippy::undocumented_unsafe_blocks)]
322 unsafe {
323 for i in alo..ahi {
324 affect.clear();
325 let c = *a.get_unchecked(i) as usize;
326 if c < 128 {
327 let lo = *offsets.get_unchecked(c) as usize;
328 let hi = *offsets.get_unchecked(c + 1) as usize;
329 for &jj in positions.get_unchecked(lo..hi) {
330 let j = jj as usize;
331 if j < blo {
332 continue;
333 }
334 if j >= bhi {
335 break;
336 }
337 let k = *j2len.get_unchecked(j) as usize + 1;
338 affect.push((j as u32 + 1, k as u32));
339 if k > bk {
340 bi = i + 1 - k;
341 bj = j + 1 - k;
342 bk = k;
343 }
344 }
345 }
346 for &(p, _) in erase.iter() {
347 *j2len.get_unchecked_mut(p as usize) = 0;
348 }
349 for &(p, v) in affect.iter() {
350 *j2len.get_unchecked_mut(p as usize) = v;
351 }
352 std::mem::swap(erase, affect);
353 }
354 for &(p, _) in erase.iter() {
355 *j2len.get_unchecked_mut(p as usize) = 0;
356 }
357 }
358 (bi, bj, bk)
359}
360
361#[allow(clippy::cast_precision_loss)]
365fn real_quick_ratio(a: &[char], b: &[char]) -> f64 {
366 let total = a.len() + b.len();
367 if total == 0 {
368 return 1.0;
369 }
370 2.0 * (a.len().min(b.len()) as f64) / (total as f64)
371}
372
373fn char_counts(a: &[char]) -> Vec<(char, u32)> {
376 let mut v = a.to_vec();
377 v.sort_unstable();
378 let mut out: Vec<(char, u32)> = Vec::new();
379 for c in v {
380 match out.last_mut() {
381 Some(last) if last.0 == c => last.1 += 1,
382 _ => out.push((c, 1)),
383 }
384 }
385 out
386}
387
388#[allow(clippy::cast_precision_loss)]
391fn quick_ratio_counts(ca: &[(char, u32)], cb: &[(char, u32)], total: usize) -> f64 {
392 if total == 0 {
393 return 1.0;
394 }
395 let (mut x, mut y, mut matches) = (0usize, 0usize, 0u32);
396 while x < ca.len() && y < cb.len() {
397 match ca[x].0.cmp(&cb[y].0) {
398 std::cmp::Ordering::Less => x += 1,
399 std::cmp::Ordering::Greater => y += 1,
400 std::cmp::Ordering::Equal => {
401 matches += ca[x].1.min(cb[y].1);
402 x += 1;
403 y += 1;
404 }
405 }
406 }
407 2.0 * f64::from(matches) / total as f64
408}
409
410fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
411 while parent[x] != x {
412 parent[x] = parent[parent[x]];
413 x = parent[x];
414 }
415 x
416}
417
418fn progress_on() -> bool {
421 std::env::var_os("DIFFLIB_FAST_PROGRESS").is_some()
422}
423
424#[allow(clippy::cast_precision_loss, clippy::many_single_char_names)]
432fn qualifying_pairs(chars: &[Vec<char>], sams: &[gestalt::Sam], threshold: f64) -> Vec<(usize, usize, f64)> {
433 use std::sync::atomic::{AtomicUsize, Ordering};
434
435 let n = chars.len();
436 let rows = AtomicUsize::new(0);
437 std::thread::scope(|scope| {
438 if progress_on() {
439 let rows = &rows;
440 scope.spawn(move || {
441 while rows.load(Ordering::Relaxed) < n {
442 std::thread::sleep(std::time::Duration::from_millis(1000));
443 let done = rows.load(Ordering::Relaxed);
444 eprintln!(" [difflib-fast] qualifying_pairs: row {done}/{n} ({:.0}%)", done as f64 / n as f64 * 100.0);
445 }
446 });
447 }
448 let mut order: Vec<usize> = (0..n).collect();
453 order.sort_by_key(|&i| chars[i].len());
454 let counts: Vec<Vec<(char, u32)>> = chars.par_iter().map(|c| char_counts(c)).collect();
455 let pairs = (0..n)
456 .into_par_iter()
457 .flat_map_iter(|p| {
458 let i = order[p];
459 let a = &chars[i];
460 let mut local: Vec<(usize, usize, f64)> = Vec::new();
461 for &j in &order[p + 1..] {
462 let b = &chars[j];
463 if real_quick_ratio(a, b) < threshold {
464 break; }
466 if quick_ratio_counts(&counts[i], &counts[j], a.len() + b.len()) < threshold {
467 continue;
468 }
469 let (lo, hi) = if i < j { (i, j) } else { (j, i) };
470 if let Some(r) = gestalt::gestalt_edge(&chars[lo], &chars[hi], &sams[hi], threshold) {
472 local.push((lo, hi, r));
473 }
474 }
475 rows.fetch_add(1, Ordering::Relaxed);
476 local
477 })
478 .collect();
479 rows.store(n, Ordering::Relaxed); pairs
481 })
482}
483
484fn cluster_min_sim(members: &[usize], chars: &[Vec<char>], sams: &[gestalt::Sam], ratios: &HashMap<(usize, usize), f64>) -> f64 {
490 members
491 .par_iter()
492 .enumerate()
493 .map(|(pos, &i)| {
494 let mut local = 1.0_f64;
495 for &j in &members[pos + 1..] {
496 let key = if i < j { (i, j) } else { (j, i) };
497 let r = match ratios.get(&key) {
498 Some(&r) => r, None => gestalt::gestalt_ratio_capped(&chars[key.0], &chars[key.1], &sams[key.1], local),
500 };
501 local = local.min(r);
502 }
503 local
504 })
505 .reduce(|| 1.0_f64, f64::min)
506}
507
508fn assemble(n: usize, pairs: Vec<(usize, usize, f64)>, chars: &[Vec<char>], sams: &[gestalt::Sam]) -> Vec<(Vec<usize>, f64)> {
511 let mut parent: Vec<usize> = (0..n).collect();
512 let mut ratios: HashMap<(usize, usize), f64> = HashMap::with_capacity(pairs.len());
513 for (i, j, r) in pairs {
514 ratios.insert((i, j), r);
515 let (ri, rj) = (uf_find(&mut parent, i), uf_find(&mut parent, j));
516 if ri != rj {
517 parent[ri] = rj;
518 }
519 }
520 let mut comps: HashMap<usize, Vec<usize>> = HashMap::new();
521 for i in 0..n {
522 let root = uf_find(&mut parent, i);
523 comps.entry(root).or_default().push(i);
524 }
525 let mut out: Vec<(Vec<usize>, f64)> = Vec::new();
526 for members in comps.values() {
527 if members.len() < 2 {
528 continue;
529 }
530 let min_sim = cluster_min_sim(members, chars, sams, &ratios);
531 let mut sorted = members.clone();
532 sorted.sort_unstable();
533 out.push((sorted, min_sim));
534 }
535 out.sort_by(|a, b| a.0[0].cmp(&b.0[0]));
536 out
537}
538
539#[must_use]
542pub fn cluster_canonicals_chars(chars: &[Vec<char>], threshold: f64) -> Vec<(Vec<usize>, f64)> {
543 let n = chars.len();
544 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
547 let pairs = qualifying_pairs(chars, &sams, threshold);
548 assemble(n, pairs, chars, &sams)
549}
550
551#[must_use]
557pub fn cluster_canonicals(canonicals: &[String], threshold: f64) -> Vec<(Vec<usize>, f64)> {
558 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
559 cluster_canonicals_chars(&chars, threshold)
560}
561
562const SHINGLE_K: usize = 9; fn fnv1a_bytes(data: &[u8]) -> u64 {
567 let mut h = 0xcbf2_9ce4_8422_2325u64;
568 for &b in data {
569 h ^= u64::from(b);
570 h = h.wrapping_mul(0x0000_0100_0000_01b3);
571 }
572 h
573}
574
575fn fnv1a_u64s(values: &[u64]) -> u64 {
576 let mut h = 0xcbf2_9ce4_8422_2325u64;
577 for &v in values {
578 h ^= v;
579 h = h.wrapping_mul(0x0000_0100_0000_01b3);
580 }
581 h
582}
583
584fn shingle_hashes(s: &str) -> Vec<u64> {
586 let bytes = s.as_bytes();
587 if bytes.len() <= SHINGLE_K {
588 return vec![fnv1a_bytes(bytes)];
589 }
590 let mut set: HashSet<u64> = HashSet::new();
591 for window in bytes.windows(SHINGLE_K) {
592 set.insert(fnv1a_bytes(window));
593 }
594 set.into_iter().collect()
595}
596
597fn make_perms(num: usize) -> Vec<(u64, u64)> {
599 let mut state = 0x9e37_79b9_7f4a_7c15u64;
600 let mut next = move || {
601 state ^= state << 13;
602 state ^= state >> 7;
603 state ^= state << 17;
604 state
605 };
606 (0..num).map(|_| (next() | 1, next())).collect()
607}
608
609fn minhash(shingles: &[u64], perms: &[(u64, u64)]) -> Vec<u64> {
610 perms
611 .iter()
612 .map(|&(a, b)| shingles.iter().map(|&h| a.wrapping_mul(h).wrapping_add(b)).min().unwrap_or(u64::MAX))
613 .collect()
614}
615
616fn lsh_candidates(sigs: &[Vec<u64>], band_rows: usize) -> HashSet<(usize, usize)> {
619 let bands = if band_rows == 0 { 0 } else { sigs.first().map_or(0, Vec::len) / band_rows };
620 let mut candidates: HashSet<(usize, usize)> = HashSet::new();
621 for band in 0..bands {
622 let lo = band * band_rows;
623 let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
624 for (d, sig) in sigs.iter().enumerate() {
625 buckets.entry(fnv1a_u64s(&sig[lo..lo + band_rows])).or_default().push(d);
626 }
627 for docs in buckets.values() {
628 for a in 0..docs.len() {
629 for b in (a + 1)..docs.len() {
630 candidates.insert((docs[a].min(docs[b]), docs[a].max(docs[b])));
631 }
632 }
633 }
634 }
635 candidates
636}
637
638#[must_use]
646pub fn cluster_canonicals_lsh(canonicals: &[String], threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
647 let debug = progress_on();
648 let start = std::time::Instant::now();
649 let chars: Vec<Vec<char>> = canonicals.iter().map(|s| s.chars().collect()).collect();
650 let n = chars.len();
651 let perms = make_perms(num_perm);
652 let sigs: Vec<Vec<u64>> = canonicals.par_iter().map(|s| minhash(&shingle_hashes(s), &perms)).collect();
653 if debug {
654 eprintln!(" [difflib-fast] lsh: {n} signatures in {:.2}s", start.elapsed().as_secs_f64());
655 }
656 let candidates = lsh_candidates(&sigs, band_rows);
657 if debug {
658 eprintln!(" [difflib-fast] lsh: {} candidate pairs in {:.2}s", candidates.len(), start.elapsed().as_secs_f64());
659 }
660 let sams: Vec<gestalt::Sam> = chars.par_iter().map(|c| gestalt::build_sam(c)).collect();
661 let cand: Vec<(usize, usize)> = candidates.into_iter().collect();
662 let pairs: Vec<(usize, usize, f64)> = cand
663 .par_iter()
664 .filter_map(|&(i, j)| {
665 let (a, b) = if i < j { (i, j) } else { (j, i) };
666 gestalt::gestalt_edge(&chars[a], &chars[b], &sams[b], threshold).map(|r| (a, b, r))
667 })
668 .collect();
669 if debug {
670 eprintln!(" [difflib-fast] lsh: {} verified pairs in {:.2}s", pairs.len(), start.elapsed().as_secs_f64());
671 }
672 assemble(n, pairs, &chars, &sams)
673}
674
675#[cfg(feature = "python")]
678mod python {
679 use pyo3::prelude::*;
680
681 #[pyfunction]
683 fn ratio(a: &str, b: &str) -> f64 {
684 super::gestalt_ratio(a, b)
685 }
686
687 #[pyfunction]
689 #[allow(clippy::needless_pass_by_value)]
690 fn cluster_canonicals(canonicals: Vec<String>, threshold: f64) -> Vec<(Vec<usize>, f64)> {
691 super::cluster_canonicals(&canonicals, threshold)
692 }
693
694 #[pyfunction]
696 #[allow(clippy::needless_pass_by_value)]
697 fn cluster_canonicals_lsh(canonicals: Vec<String>, threshold: f64, num_perm: usize, band_rows: usize) -> Vec<(Vec<usize>, f64)> {
698 super::cluster_canonicals_lsh(&canonicals, threshold, num_perm, band_rows)
699 }
700
701 #[pymodule]
703 fn _difflib_fast(m: &Bound<'_, PyModule>) -> PyResult<()> {
704 m.add_function(wrap_pyfunction!(ratio, m)?)?;
705 m.add_function(wrap_pyfunction!(cluster_canonicals, m)?)?;
706 m.add_function(wrap_pyfunction!(cluster_canonicals_lsh, m)?)?;
707 Ok(())
708 }
709}
710
711#[cfg(test)]
712mod tests {
713 #![allow(clippy::float_cmp, clippy::unreadable_literal)]
714 use super::{cluster_canonicals, gestalt_ratio, ratio, ratio_b2j, ratio_reference};
715
716 #[test]
717 fn matches_known_difflib_values() {
718 assert_eq!(gestalt_ratio("", ""), 1.0);
720 assert_eq!(gestalt_ratio("", "x"), 0.0);
721 assert_eq!(gestalt_ratio("abc", "abc"), 1.0);
722 assert_eq!(gestalt_ratio("abc", "abd"), 0.6666666666666666);
723 assert_eq!(gestalt_ratio("the quick brown fox", "the quick brown dog"), 0.8947368421052632);
724 assert_eq!(gestalt_ratio("ПриветМир", "ПриветМирЪ"), 0.9473684210526315);
725 }
726
727 #[test]
729 fn fast_matches_reference() {
730 let mut s: u64 = 0x1234_5678_9abc_def1;
731 let mut next = || {
732 s ^= s << 13;
733 s ^= s >> 7;
734 s ^= s << 17;
735 s
736 };
737 for _ in 0..2000 {
738 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
739 (0..n).map(|_| char::from(b'a' + (rng() % 5) as u8)).collect()
740 };
741 let (la, lb) = ((next() % 50) as usize, (next() % 50) as usize);
742 let a = mk(la, &mut next);
743 let b = mk(lb, &mut next);
744 let r = ratio_reference(&a, &b);
745 assert_eq!(gestalt_ratio(&a, &b), r, "SAM a={a:?} b={b:?}");
746 assert_eq!(ratio_b2j(&a, &b), r, "b2j a={a:?} b={b:?}");
747 }
748 }
749
750 #[test]
753 fn long_strings_all_paths_agree() {
754 let mut s: u64 = 0xdead_beef_cafe_1234;
755 let mut next = || {
756 s ^= s << 13;
757 s ^= s >> 7;
758 s ^= s << 17;
759 s
760 };
761 for _ in 0..40 {
762 let mk = |n: usize, rng: &mut dyn FnMut() -> u64| -> String {
763 (0..n).map(|_| char::from(b'a' + (rng() % 6) as u8)).collect()
764 };
765 let a = mk(1400 + (next() % 600) as usize, &mut next); let b = mk(1400 + (next() % 600) as usize, &mut next);
767 let r = ratio_reference(&a, &b);
768 assert_eq!(gestalt_ratio(&a, &b), r);
769 assert_eq!(ratio_b2j(&a, &b), r);
770 assert_eq!(ratio(&a, &b), r); }
772 }
773
774 #[test]
775 fn clusters_obvious_duplicates() {
776 let corpus: Vec<String> = vec![
777 "def add(a, b): return a + b".into(),
778 "def add(x, y): return x + y".into(),
779 "completely unrelated text here".into(),
780 ];
781 let clusters = cluster_canonicals(&corpus, 0.5);
782 assert_eq!(clusters.len(), 1, "the two add() variants should cluster");
783 assert_eq!(clusters[0].0, vec![0, 1]);
784 assert!(clusters[0].1 >= 0.5);
785 }
786}