1#![deny(unsafe_code)]
20#![warn(missing_docs)]
21#![warn(rust_2018_idioms)]
22
23use std::collections::HashMap;
24use std::hash::{BuildHasher, Hasher};
25
26use ahash::RandomState;
27use rand::{Rng, SeedableRng};
28use serde::{Deserialize, Serialize};
29use thiserror::Error;
30
31pub type Result<T> = std::result::Result<T, DedupError>;
33
34#[derive(Error, Debug)]
36pub enum DedupError {
37 #[error("invalid config: {0}")]
39 InvalidConfig(String),
40}
41
42#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
44#[serde(rename_all = "snake_case")]
45pub enum ShingleUnit {
46 #[default]
48 Char,
49 Word,
51}
52
53#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55pub struct Config {
56 pub num_hashes: usize,
58 pub bands: usize,
60 pub shingle_size: usize,
62 pub shingle_unit: ShingleUnit,
64 pub seed: u64,
66}
67
68impl Default for Config {
69 fn default() -> Self {
70 Self {
71 num_hashes: 128,
72 bands: 32,
73 shingle_size: 5,
74 shingle_unit: ShingleUnit::Char,
75 seed: 0,
76 }
77 }
78}
79
80#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
82pub struct Hit {
83 pub id: String,
85 pub similarity: f64,
87}
88
89pub struct Index {
91 cfg: Config,
92 a: Vec<u32>, b: Vec<u32>, band_hasher: RandomState,
95 docs: Vec<DocEntry>,
96 bands: HashMap<(usize, u64), Vec<usize>>, }
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100struct DocEntry {
101 id: String,
102 signature: Vec<u32>,
103}
104
105#[derive(Serialize, Deserialize)]
108struct SerializedIndex {
109 cfg: Config,
110 a: Vec<u32>,
111 b: Vec<u32>,
112 docs: Vec<DocEntry>,
113}
114
115impl Index {
116 pub fn new(cfg: Config) -> Result<Self> {
118 if cfg.num_hashes == 0 {
119 return Err(DedupError::InvalidConfig("num_hashes must be > 0".into()));
120 }
121 if cfg.bands == 0 {
122 return Err(DedupError::InvalidConfig("bands must be > 0".into()));
123 }
124 if cfg.num_hashes % cfg.bands != 0 {
125 return Err(DedupError::InvalidConfig(format!(
126 "num_hashes ({}) must be a multiple of bands ({})",
127 cfg.num_hashes, cfg.bands
128 )));
129 }
130 if cfg.shingle_size == 0 {
131 return Err(DedupError::InvalidConfig("shingle_size must be > 0".into()));
132 }
133 let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.seed);
134 let mut a = Vec::with_capacity(cfg.num_hashes);
137 let mut b = Vec::with_capacity(cfg.num_hashes);
138 for _ in 0..cfg.num_hashes {
139 let mut ai: u32 = rng.random();
140 ai |= 1; a.push(ai);
142 b.push(rng.random::<u32>());
143 }
144 let band_hasher = RandomState::with_seeds(
147 cfg.seed,
148 cfg.seed.wrapping_add(1),
149 cfg.seed.wrapping_add(2),
150 cfg.seed.wrapping_add(3),
151 );
152 Ok(Self {
153 cfg,
154 a,
155 b,
156 band_hasher,
157 docs: Vec::new(),
158 bands: HashMap::new(),
159 })
160 }
161
162 pub fn config(&self) -> &Config {
164 &self.cfg
165 }
166
167 pub fn len(&self) -> usize {
169 self.docs.len()
170 }
171
172 pub fn is_empty(&self) -> bool {
174 self.docs.is_empty()
175 }
176
177 pub fn insert(&mut self, id: impl Into<String>, text: &str) -> Result<()> {
180 let signature = self.signature(text);
181 let doc_idx = self.docs.len();
182 let band_hashes: Vec<u64> = self.band_hashes(&signature).collect();
185 for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
186 self.bands
187 .entry((band_idx, band_hash))
188 .or_default()
189 .push(doc_idx);
190 }
191 self.docs.push(DocEntry {
192 id: id.into(),
193 signature,
194 });
195 Ok(())
196 }
197
198 pub fn signature(&self, text: &str) -> Vec<u32> {
201 let shingles = shingle(text, self.cfg.shingle_size, self.cfg.shingle_unit);
202 if shingles.is_empty() {
203 return vec![u32::MAX; self.cfg.num_hashes];
204 }
205 let mut sig = vec![u32::MAX; self.cfg.num_hashes];
206 for s in &shingles {
207 let h = stable_hash(s);
208 for (i, slot) in sig.iter_mut().enumerate() {
211 let v = self.a[i].wrapping_mul(h).wrapping_add(self.b[i]);
212 if v < *slot {
213 *slot = v;
214 }
215 }
216 }
217 sig
218 }
219
220 pub fn jaccard(sig_a: &[u32], sig_b: &[u32]) -> f64 {
222 if sig_a.is_empty() || sig_a.len() != sig_b.len() {
223 return 0.0;
224 }
225 let eq = sig_a.iter().zip(sig_b).filter(|(x, y)| x == y).count();
226 eq as f64 / sig_a.len() as f64
227 }
228
229 pub fn near_duplicates(&self, text: &str, min_similarity: f64) -> Vec<Hit> {
232 let sig = self.signature(text);
233 let mut candidates: std::collections::HashSet<usize> = std::collections::HashSet::new();
235 for (band_idx, band_hash) in self.band_hashes(&sig).enumerate() {
236 if let Some(ids) = self.bands.get(&(band_idx, band_hash)) {
237 for &i in ids {
238 candidates.insert(i);
239 }
240 }
241 }
242 let mut hits: Vec<Hit> = Vec::with_capacity(candidates.len());
243 for i in candidates {
244 let sim = Self::jaccard(&sig, &self.docs[i].signature);
245 if sim >= min_similarity {
246 hits.push(Hit {
247 id: self.docs[i].id.clone(),
248 similarity: sim,
249 });
250 }
251 }
252 hits.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
253 hits
254 }
255
256 fn band_hashes<'a>(&'a self, sig: &'a [u32]) -> impl Iterator<Item = u64> + 'a {
257 let rows = self.cfg.num_hashes / self.cfg.bands;
258 (0..self.cfg.bands).map(move |b| {
259 let mut hasher = self.band_hasher.build_hasher();
260 for r in 0..rows {
261 hasher.write_u32(sig[b * rows + r]);
262 }
263 hasher.finish()
264 })
265 }
266
267 pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> Result<()> {
272 let s = SerializedIndex {
273 cfg: self.cfg.clone(),
274 a: self.a.clone(),
275 b: self.b.clone(),
276 docs: self.docs.clone(),
277 };
278 let file = std::fs::File::create(path)
279 .map_err(|e| DedupError::InvalidConfig(format!("save: {e}")))?;
280 serde_json::to_writer(std::io::BufWriter::new(file), &s)
281 .map_err(|e| DedupError::InvalidConfig(format!("save serde: {e}")))?;
282 Ok(())
283 }
284
285 pub fn load<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
289 let file = std::fs::File::open(path)
290 .map_err(|e| DedupError::InvalidConfig(format!("load: {e}")))?;
291 let s: SerializedIndex = serde_json::from_reader(std::io::BufReader::new(file))
292 .map_err(|e| DedupError::InvalidConfig(format!("load serde: {e}")))?;
293 let band_hasher = RandomState::with_seeds(
297 s.cfg.seed,
298 s.cfg.seed.wrapping_add(1),
299 s.cfg.seed.wrapping_add(2),
300 s.cfg.seed.wrapping_add(3),
301 );
302 let mut idx = Self {
303 cfg: s.cfg,
304 a: s.a,
305 b: s.b,
306 band_hasher,
307 docs: Vec::new(),
308 bands: HashMap::new(),
309 };
310 for doc in s.docs {
314 let doc_idx = idx.docs.len();
315 let band_hashes: Vec<u64> = idx.band_hashes(&doc.signature).collect();
316 for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
317 idx.bands
318 .entry((band_idx, band_hash))
319 .or_default()
320 .push(doc_idx);
321 }
322 idx.docs.push(doc);
323 }
324 Ok(idx)
325 }
326}
327
328fn stable_hash(shingle: &str) -> u32 {
332 static STATE: RandomState = RandomState::with_seeds(
335 0x5151_5151_5151_5151,
336 0x6262_6262_6262_6262,
337 0x7373_7373_7373_7373,
338 0x8484_8484_8484_8484,
339 );
340 let mut h = STATE.build_hasher();
341 h.write(shingle.as_bytes());
342 (h.finish() & 0xFFFF_FFFF) as u32
343}
344
345fn shingle(text: &str, k: usize, unit: ShingleUnit) -> Vec<String> {
349 match unit {
350 ShingleUnit::Char => char_shingles(text, k),
351 ShingleUnit::Word => word_shingles(text, k),
352 }
353}
354
355fn char_shingles(text: &str, k: usize) -> Vec<String> {
356 let chars: Vec<char> = text.chars().collect();
357 if chars.is_empty() {
358 return Vec::new();
359 }
360 if chars.len() <= k {
361 return vec![chars.iter().collect()];
362 }
363 let mut out = Vec::with_capacity(chars.len() - k + 1);
364 for i in 0..=chars.len() - k {
365 out.push(chars[i..i + k].iter().collect());
366 }
367 out
368}
369
370fn word_shingles(text: &str, k: usize) -> Vec<String> {
371 let words: Vec<&str> = text.split_whitespace().collect();
372 if words.is_empty() {
373 return Vec::new();
374 }
375 if words.len() <= k {
376 return vec![words.join(" ")];
377 }
378 let mut out = Vec::with_capacity(words.len() - k + 1);
379 for i in 0..=words.len() - k {
380 out.push(words[i..i + k].join(" "));
381 }
382 out
383}
384
385#[cfg(test)]
386mod tests {
387 use super::*;
388
389 fn idx() -> Index {
390 Index::new(Config::default()).unwrap()
391 }
392
393 #[test]
394 fn empty_index_returns_no_hits() {
395 let i = idx();
396 assert!(i.near_duplicates("anything", 0.0).is_empty());
397 assert_eq!(i.len(), 0);
398 assert!(i.is_empty());
399 }
400
401 #[test]
402 fn identical_text_hits_with_high_similarity() {
403 let mut i = idx();
404 i.insert("a", "the quick brown fox jumps over the lazy dog")
405 .unwrap();
406 let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
407 assert_eq!(hits.len(), 1);
408 assert!(hits[0].similarity > 0.95, "got {}", hits[0].similarity);
409 }
410
411 #[test]
412 fn near_duplicate_detected() {
413 let mut i = idx();
414 i.insert("a", "the quick brown fox jumps over the lazy dog")
415 .unwrap();
416 i.insert("b", "the quick brown fox jumps over a lazy dog")
417 .unwrap(); let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
419 assert!(!hits.is_empty());
420 assert_eq!(hits[0].id, "a");
422 }
423
424 #[test]
425 fn unrelated_text_below_threshold() {
426 let mut i = idx();
427 i.insert("a", "the quick brown fox jumps over the lazy dog")
428 .unwrap();
429 i.insert(
430 "b",
431 "completely unrelated text about space exploration and rockets",
432 )
433 .unwrap();
434 let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
435 assert!(hits.iter().all(|h| h.id != "b"));
437 }
438
439 #[test]
440 fn jaccard_zero_for_disjoint() {
441 let i = idx();
442 let s1 = i.signature("alpha beta gamma");
443 let s2 = i.signature("delta epsilon zeta");
444 let j = Index::jaccard(&s1, &s2);
445 assert!(j < 0.2, "expected near-zero Jaccard, got {j}");
446 }
447
448 #[test]
449 fn jaccard_one_for_identical() {
450 let i = idx();
451 let s = i.signature("alpha beta gamma delta epsilon");
452 assert_eq!(Index::jaccard(&s, &s), 1.0);
453 }
454
455 #[test]
456 fn signatures_deterministic_across_indices() {
457 let i1 = Index::new(Config {
459 seed: 42,
460 ..Config::default()
461 })
462 .unwrap();
463 let i2 = Index::new(Config {
464 seed: 42,
465 ..Config::default()
466 })
467 .unwrap();
468 let s1 = i1.signature("hello world");
469 let s2 = i2.signature("hello world");
470 assert_eq!(s1, s2);
471 }
472
473 #[test]
474 fn signatures_differ_with_different_seed() {
475 let i1 = Index::new(Config {
476 seed: 1,
477 ..Config::default()
478 })
479 .unwrap();
480 let i2 = Index::new(Config {
481 seed: 2,
482 ..Config::default()
483 })
484 .unwrap();
485 let s1 = i1.signature("hello world");
486 let s2 = i2.signature("hello world");
487 assert_ne!(s1, s2);
488 }
489
490 #[test]
491 fn invalid_config_rejected() {
492 let bad = Config {
494 num_hashes: 100,
495 bands: 33,
496 ..Default::default()
497 };
498 assert!(Index::new(bad).is_err());
499 let bad = Config {
501 num_hashes: 0,
502 bands: 1,
503 ..Default::default()
504 };
505 assert!(Index::new(bad).is_err());
506 let bad = Config {
508 num_hashes: 100,
509 bands: 0,
510 ..Default::default()
511 };
512 assert!(Index::new(bad).is_err());
513 let bad = Config {
515 shingle_size: 0,
516 ..Default::default()
517 };
518 assert!(Index::new(bad).is_err());
519 }
520
521 #[test]
522 fn empty_input_safe_signature() {
523 let i = idx();
524 let s = i.signature("");
525 assert_eq!(s.len(), 128);
527 assert!(s.iter().all(|&x| x == u32::MAX));
528 }
529
530 #[test]
531 fn short_input_still_signs() {
532 let i = idx();
533 let s = i.signature("hi");
534 assert_eq!(s.len(), 128);
535 assert!(s.iter().any(|&x| x != u32::MAX));
536 }
537
538 #[test]
539 fn word_shingles_work() {
540 let i = Index::new(Config {
541 shingle_unit: ShingleUnit::Word,
542 shingle_size: 3,
543 ..Config::default()
544 })
545 .unwrap();
546 let s1 = i.signature("the quick brown fox jumps");
547 let s2 = i.signature("the quick brown fox runs");
548 let j = Index::jaccard(&s1, &s2);
549 assert!(j > 0.3, "expected high Jaccard, got {j}");
551 }
552
553 #[test]
554 fn three_doc_corpus_returns_two_dupes() {
555 let mut i = idx();
556 i.insert(
557 "dup-1",
558 "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
559 )
560 .unwrap();
561 i.insert(
562 "dup-2",
563 "Lorem ipsum dolor sit amet consectetur adipiscing elit",
564 )
565 .unwrap();
566 i.insert(
567 "other",
568 "completely different text on a different subject entirely",
569 )
570 .unwrap();
571 let hits = i.near_duplicates(
572 "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
573 0.4,
574 );
575 let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
576 assert!(ids.contains("dup-1"));
577 assert!(ids.contains("dup-2"));
578 assert!(!ids.contains("other"));
579 }
580
581 #[test]
582 fn hits_sorted_by_similarity_desc() {
583 let mut i = idx();
584 i.insert("identical", "alpha beta gamma delta epsilon zeta")
585 .unwrap();
586 i.insert("partial", "alpha beta gamma OMEGA SIGMA TAU")
587 .unwrap();
588 let hits = i.near_duplicates("alpha beta gamma delta epsilon zeta", 0.0);
589 assert_eq!(hits[0].id, "identical");
590 for win in hits.windows(2) {
591 assert!(win[0].similarity >= win[1].similarity);
592 }
593 }
594
595 fn tempfile() -> std::path::PathBuf {
596 let dir = std::env::temp_dir().join(format!(
597 "lshdedup-test-{}-{}",
598 std::process::id(),
599 std::time::SystemTime::now()
600 .duration_since(std::time::UNIX_EPOCH)
601 .unwrap()
602 .as_nanos()
603 ));
604 std::fs::create_dir_all(&dir).unwrap();
605 dir.join("index.json")
606 }
607
608 #[test]
609 fn save_load_round_trip() {
610 let path = tempfile();
611 let mut i = idx();
612 i.insert("a", "the quick brown fox jumps over the lazy dog")
613 .unwrap();
614 i.insert("b", "the quick brown fox jumps over a lazy dog")
615 .unwrap();
616 i.insert("c", "completely unrelated text on a different subject")
617 .unwrap();
618 i.save(&path).unwrap();
619
620 let loaded = Index::load(&path).unwrap();
621 assert_eq!(loaded.len(), 3);
622 let hits = loaded.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
623 let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
624 assert!(ids.contains("a"));
625 }
626
627 #[test]
628 fn load_missing_file_errors() {
629 let r = Index::load("/no/such/path/should/exist.json");
630 assert!(r.is_err());
631 }
632
633 #[test]
634 fn loaded_signatures_match_originals() {
635 let path = tempfile();
636 let mut i = Index::new(Config {
637 seed: 42,
638 ..Config::default()
639 })
640 .unwrap();
641 i.insert("doc", "lorem ipsum dolor sit amet").unwrap();
642 let original_sig = i.signature("lorem ipsum dolor sit amet");
643 i.save(&path).unwrap();
644
645 let loaded = Index::load(&path).unwrap();
646 let loaded_sig = loaded.signature("lorem ipsum dolor sit amet");
649 assert_eq!(original_sig, loaded_sig);
650 }
651}