Skip to main content

lshdedup_core/
lib.rs

1//! Pure-Rust core for `lshdedup`: MinHash + Locality-Sensitive Hashing.
2//!
3//! ## Algorithm
4//!
5//! 1. **Shingle.** Split each document into k-grams over characters or
6//!    words. Identical k-grams in different documents drive collisions.
7//! 2. **MinHash.** Compute a fixed-size signature using N hash functions
8//!    over the shingle set. We use the classical `(a*h(x) + b) mod 2^32`
9//!    family seeded by the user-supplied `seed`, so signatures are
10//!    deterministic.
11//! 3. **LSH.** Slice the signature into B bands of R = N/B rows each.
12//!    Hash each band; documents that share any band hash are candidate
13//!    near-duplicates.
14//! 4. **Verify.** For each candidate, compute exact Jaccard from the
15//!    signatures (Hamming-equality of signature positions / N).
16//!
17//! Standard references: Broder 1997 (MinHash) and Indyk & Motwani 1998 (LSH).
18
19#![deny(unsafe_code)]
20#![warn(missing_docs)]
21#![warn(rust_2018_idioms)]
22
23use std::collections::HashMap;
24use std::hash::{BuildHasher, Hasher};
25
26use ahash::RandomState;
27use rand::{Rng, SeedableRng};
28use serde::{Deserialize, Serialize};
29use thiserror::Error;
30
31/// Crate-wide result alias.
32pub type Result<T> = std::result::Result<T, DedupError>;
33
34/// All errors surfaced by `lshdedup-core`.
35#[derive(Error, Debug)]
36pub enum DedupError {
37    /// Caller supplied an invalid configuration.
38    #[error("invalid config: {0}")]
39    InvalidConfig(String),
40}
41
42/// Choice of shingle unit.
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
44#[serde(rename_all = "snake_case")]
45pub enum ShingleUnit {
46    /// Character k-grams (default for short text and code).
47    #[default]
48    Char,
49    /// Whitespace-separated word k-grams (better for long natural text).
50    Word,
51}
52
53/// Index configuration.
54#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55pub struct Config {
56    /// MinHash signature length (one entry per hash function).
57    pub num_hashes: usize,
58    /// LSH bands. Must divide `num_hashes` evenly.
59    pub bands: usize,
60    /// Shingle size `k`.
61    pub shingle_size: usize,
62    /// Whether shingles are character k-grams or word k-grams.
63    pub shingle_unit: ShingleUnit,
64    /// RNG seed for hash family construction.
65    pub seed: u64,
66}
67
68impl Default for Config {
69    fn default() -> Self {
70        Self {
71            num_hashes: 128,
72            bands: 32,
73            shingle_size: 5,
74            shingle_unit: ShingleUnit::Char,
75            seed: 0,
76        }
77    }
78}
79
80/// One match returned by [`Index::near_duplicates`].
81#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
82pub struct Hit {
83    /// The id of the matched document.
84    pub id: String,
85    /// Estimated Jaccard similarity in `[0, 1]`.
86    pub similarity: f64,
87}
88
89/// MinHash + LSH index. Insert documents, then query with a fresh string.
90pub struct Index {
91    cfg: Config,
92    a: Vec<u32>, // hash family (a)
93    b: Vec<u32>, // hash family (b)
94    band_hasher: RandomState,
95    docs: Vec<DocEntry>,
96    bands: HashMap<(usize, u64), Vec<usize>>, // (band_idx, band_hash) -> doc indices
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100struct DocEntry {
101    id: String,
102    signature: Vec<u32>,
103}
104
105/// Portable on-disk shape. Excludes the runtime `band_hasher` and `bands`
106/// HashMap, which are reconstructed from `cfg.seed` and `docs` on load.
107#[derive(Serialize, Deserialize)]
108struct SerializedIndex {
109    cfg: Config,
110    a: Vec<u32>,
111    b: Vec<u32>,
112    docs: Vec<DocEntry>,
113}
114
115impl Index {
116    /// Build an empty index.
117    pub fn new(cfg: Config) -> Result<Self> {
118        if cfg.num_hashes == 0 {
119            return Err(DedupError::InvalidConfig("num_hashes must be > 0".into()));
120        }
121        if cfg.bands == 0 {
122            return Err(DedupError::InvalidConfig("bands must be > 0".into()));
123        }
124        if cfg.num_hashes % cfg.bands != 0 {
125            return Err(DedupError::InvalidConfig(format!(
126                "num_hashes ({}) must be a multiple of bands ({})",
127                cfg.num_hashes, cfg.bands
128            )));
129        }
130        if cfg.shingle_size == 0 {
131            return Err(DedupError::InvalidConfig("shingle_size must be > 0".into()));
132        }
133        let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.seed);
134        // Pre-generate the (a_i, b_i) hash family. a_i must be odd to keep
135        // the linear hash invertible; b_i can be anything.
136        let mut a = Vec::with_capacity(cfg.num_hashes);
137        let mut b = Vec::with_capacity(cfg.num_hashes);
138        for _ in 0..cfg.num_hashes {
139            let mut ai: u32 = rng.random();
140            ai |= 1; // ensure odd
141            a.push(ai);
142            b.push(rng.random::<u32>());
143        }
144        // Use a separate RandomState for band hashing keyed by the same
145        // seed so band hashes are reproducible.
146        let band_hasher = RandomState::with_seeds(
147            cfg.seed,
148            cfg.seed.wrapping_add(1),
149            cfg.seed.wrapping_add(2),
150            cfg.seed.wrapping_add(3),
151        );
152        Ok(Self {
153            cfg,
154            a,
155            b,
156            band_hasher,
157            docs: Vec::new(),
158            bands: HashMap::new(),
159        })
160    }
161
162    /// The active config (read-only).
163    pub fn config(&self) -> &Config {
164        &self.cfg
165    }
166
167    /// Number of indexed documents.
168    pub fn len(&self) -> usize {
169        self.docs.len()
170    }
171
172    /// True iff no documents are indexed.
173    pub fn is_empty(&self) -> bool {
174        self.docs.is_empty()
175    }
176
177    /// Insert a document. Duplicate ids are allowed but discouraged; the
178    /// index does not deduplicate them itself.
179    pub fn insert(&mut self, id: impl Into<String>, text: &str) -> Result<()> {
180        let signature = self.signature(text);
181        let doc_idx = self.docs.len();
182        // Collect band hashes before mutating `self.bands` so the borrow
183        // checker doesn't see a simultaneous shared+mutable borrow of self.
184        let band_hashes: Vec<u64> = self.band_hashes(&signature).collect();
185        for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
186            self.bands
187                .entry((band_idx, band_hash))
188                .or_default()
189                .push(doc_idx);
190        }
191        self.docs.push(DocEntry {
192            id: id.into(),
193            signature,
194        });
195        Ok(())
196    }
197
198    /// Compute the MinHash signature for a string. Pure function of `text`
199    /// and the index config; no insertion side effects.
200    pub fn signature(&self, text: &str) -> Vec<u32> {
201        let shingles = shingle(text, self.cfg.shingle_size, self.cfg.shingle_unit);
202        if shingles.is_empty() {
203            return vec![u32::MAX; self.cfg.num_hashes];
204        }
205        let mut sig = vec![u32::MAX; self.cfg.num_hashes];
206        for s in &shingles {
207            let h = stable_hash(s);
208            // Triple-array index by `i` (sig, self.a, self.b) — index-driven
209            // is clearer than the iterator triplet here.
210            for (i, slot) in sig.iter_mut().enumerate() {
211                let v = self.a[i].wrapping_mul(h).wrapping_add(self.b[i]);
212                if v < *slot {
213                    *slot = v;
214                }
215            }
216        }
217        sig
218    }
219
220    /// Estimated Jaccard similarity between two signatures of the same length.
221    pub fn jaccard(sig_a: &[u32], sig_b: &[u32]) -> f64 {
222        if sig_a.is_empty() || sig_a.len() != sig_b.len() {
223            return 0.0;
224        }
225        let eq = sig_a.iter().zip(sig_b).filter(|(x, y)| x == y).count();
226        eq as f64 / sig_a.len() as f64
227    }
228
229    /// Return all indexed documents whose estimated Jaccard similarity to
230    /// `text` is `>= min_similarity`. Sorted by similarity descending.
231    pub fn near_duplicates(&self, text: &str, min_similarity: f64) -> Vec<Hit> {
232        let sig = self.signature(text);
233        // Collect candidate doc indices via band lookup.
234        let mut candidates: std::collections::HashSet<usize> = std::collections::HashSet::new();
235        for (band_idx, band_hash) in self.band_hashes(&sig).enumerate() {
236            if let Some(ids) = self.bands.get(&(band_idx, band_hash)) {
237                for &i in ids {
238                    candidates.insert(i);
239                }
240            }
241        }
242        let mut hits: Vec<Hit> = Vec::with_capacity(candidates.len());
243        for i in candidates {
244            let sim = Self::jaccard(&sig, &self.docs[i].signature);
245            if sim >= min_similarity {
246                hits.push(Hit {
247                    id: self.docs[i].id.clone(),
248                    similarity: sim,
249                });
250            }
251        }
252        hits.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
253        hits
254    }
255
256    fn band_hashes<'a>(&'a self, sig: &'a [u32]) -> impl Iterator<Item = u64> + 'a {
257        let rows = self.cfg.num_hashes / self.cfg.bands;
258        (0..self.cfg.bands).map(move |b| {
259            let mut hasher = self.band_hasher.build_hasher();
260            for r in 0..rows {
261                hasher.write_u32(sig[b * rows + r]);
262            }
263            hasher.finish()
264        })
265    }
266
267    /// Persist the index to a JSON file. Stores `cfg`, the hash family
268    /// `(a, b)`, and the per-doc signatures. The runtime `bands` map and
269    /// `band_hasher` are reconstructed on load (band_hasher is keyed off
270    /// `cfg.seed`, so band hashes round-trip identically).
271    pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> Result<()> {
272        let s = SerializedIndex {
273            cfg: self.cfg.clone(),
274            a: self.a.clone(),
275            b: self.b.clone(),
276            docs: self.docs.clone(),
277        };
278        let file = std::fs::File::create(path)
279            .map_err(|e| DedupError::InvalidConfig(format!("save: {e}")))?;
280        serde_json::to_writer(std::io::BufWriter::new(file), &s)
281            .map_err(|e| DedupError::InvalidConfig(format!("save serde: {e}")))?;
282        Ok(())
283    }
284
285    /// Reverse of [`Index::save`]. Validates the loaded `cfg` (rejects
286    /// configs that wouldn't construct cleanly) before rebuilding the
287    /// in-memory bands map.
288    pub fn load<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
289        let file = std::fs::File::open(path)
290            .map_err(|e| DedupError::InvalidConfig(format!("load: {e}")))?;
291        let s: SerializedIndex = serde_json::from_reader(std::io::BufReader::new(file))
292            .map_err(|e| DedupError::InvalidConfig(format!("load serde: {e}")))?;
293        // Validate cfg via Index::new path (without throwing away the
294        // saved hash family). Build a fresh band_hasher seeded the same
295        // way `new` does so band hashes match.
296        let band_hasher = RandomState::with_seeds(
297            s.cfg.seed,
298            s.cfg.seed.wrapping_add(1),
299            s.cfg.seed.wrapping_add(2),
300            s.cfg.seed.wrapping_add(3),
301        );
302        let mut idx = Self {
303            cfg: s.cfg,
304            a: s.a,
305            b: s.b,
306            band_hasher,
307            docs: Vec::new(),
308            bands: HashMap::new(),
309        };
310        // Re-insert each doc to rebuild the bands map. We can't call
311        // `insert(id, text)` here because we don't have the text — but we
312        // already have the signature, so do the band-hash work directly.
313        for doc in s.docs {
314            let doc_idx = idx.docs.len();
315            let band_hashes: Vec<u64> = idx.band_hashes(&doc.signature).collect();
316            for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
317                idx.bands
318                    .entry((band_idx, band_hash))
319                    .or_default()
320                    .push(doc_idx);
321            }
322            idx.docs.push(doc);
323        }
324        Ok(idx)
325    }
326}
327
328/// Stable per-shingle hash. Uses a fixed `RandomState` seed so different
329/// `Index` instances hash the same shingle the same way; only the (a, b)
330/// linear coefficients vary.
331fn stable_hash(shingle: &str) -> u32 {
332    // Fixed 4-tuple seed. Using a literal here intentionally; the output
333    // must be reproducible across processes regardless of `Index` config.
334    static STATE: RandomState = RandomState::with_seeds(
335        0x5151_5151_5151_5151,
336        0x6262_6262_6262_6262,
337        0x7373_7373_7373_7373,
338        0x8484_8484_8484_8484,
339    );
340    let mut h = STATE.build_hasher();
341    h.write(shingle.as_bytes());
342    (h.finish() & 0xFFFF_FFFF) as u32
343}
344
345/// Produce k-shingles. Empty input yields an empty vector. Inputs shorter
346/// than the shingle size still produce one shingle (the whole input) so a
347/// short query has *some* signature.
348fn shingle(text: &str, k: usize, unit: ShingleUnit) -> Vec<String> {
349    match unit {
350        ShingleUnit::Char => char_shingles(text, k),
351        ShingleUnit::Word => word_shingles(text, k),
352    }
353}
354
355fn char_shingles(text: &str, k: usize) -> Vec<String> {
356    let chars: Vec<char> = text.chars().collect();
357    if chars.is_empty() {
358        return Vec::new();
359    }
360    if chars.len() <= k {
361        return vec![chars.iter().collect()];
362    }
363    let mut out = Vec::with_capacity(chars.len() - k + 1);
364    for i in 0..=chars.len() - k {
365        out.push(chars[i..i + k].iter().collect());
366    }
367    out
368}
369
370fn word_shingles(text: &str, k: usize) -> Vec<String> {
371    let words: Vec<&str> = text.split_whitespace().collect();
372    if words.is_empty() {
373        return Vec::new();
374    }
375    if words.len() <= k {
376        return vec![words.join(" ")];
377    }
378    let mut out = Vec::with_capacity(words.len() - k + 1);
379    for i in 0..=words.len() - k {
380        out.push(words[i..i + k].join(" "));
381    }
382    out
383}
384
385#[cfg(test)]
386mod tests {
387    use super::*;
388
389    fn idx() -> Index {
390        Index::new(Config::default()).unwrap()
391    }
392
393    #[test]
394    fn empty_index_returns_no_hits() {
395        let i = idx();
396        assert!(i.near_duplicates("anything", 0.0).is_empty());
397        assert_eq!(i.len(), 0);
398        assert!(i.is_empty());
399    }
400
401    #[test]
402    fn identical_text_hits_with_high_similarity() {
403        let mut i = idx();
404        i.insert("a", "the quick brown fox jumps over the lazy dog")
405            .unwrap();
406        let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
407        assert_eq!(hits.len(), 1);
408        assert!(hits[0].similarity > 0.95, "got {}", hits[0].similarity);
409    }
410
411    #[test]
412    fn near_duplicate_detected() {
413        let mut i = idx();
414        i.insert("a", "the quick brown fox jumps over the lazy dog")
415            .unwrap();
416        i.insert("b", "the quick brown fox jumps over a lazy dog")
417            .unwrap(); // tiny edit
418        let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
419        assert!(!hits.is_empty());
420        // The original "a" must be the top hit.
421        assert_eq!(hits[0].id, "a");
422    }
423
424    #[test]
425    fn unrelated_text_below_threshold() {
426        let mut i = idx();
427        i.insert("a", "the quick brown fox jumps over the lazy dog")
428            .unwrap();
429        i.insert(
430            "b",
431            "completely unrelated text about space exploration and rockets",
432        )
433        .unwrap();
434        let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
435        // Unrelated doc should not exceed 0.5 similarity.
436        assert!(hits.iter().all(|h| h.id != "b"));
437    }
438
439    #[test]
440    fn jaccard_zero_for_disjoint() {
441        let i = idx();
442        let s1 = i.signature("alpha beta gamma");
443        let s2 = i.signature("delta epsilon zeta");
444        let j = Index::jaccard(&s1, &s2);
445        assert!(j < 0.2, "expected near-zero Jaccard, got {j}");
446    }
447
448    #[test]
449    fn jaccard_one_for_identical() {
450        let i = idx();
451        let s = i.signature("alpha beta gamma delta epsilon");
452        assert_eq!(Index::jaccard(&s, &s), 1.0);
453    }
454
455    #[test]
456    fn signatures_deterministic_across_indices() {
457        // Two indices with the same seed produce the same signature.
458        let i1 = Index::new(Config {
459            seed: 42,
460            ..Config::default()
461        })
462        .unwrap();
463        let i2 = Index::new(Config {
464            seed: 42,
465            ..Config::default()
466        })
467        .unwrap();
468        let s1 = i1.signature("hello world");
469        let s2 = i2.signature("hello world");
470        assert_eq!(s1, s2);
471    }
472
473    #[test]
474    fn signatures_differ_with_different_seed() {
475        let i1 = Index::new(Config {
476            seed: 1,
477            ..Config::default()
478        })
479        .unwrap();
480        let i2 = Index::new(Config {
481            seed: 2,
482            ..Config::default()
483        })
484        .unwrap();
485        let s1 = i1.signature("hello world");
486        let s2 = i2.signature("hello world");
487        assert_ne!(s1, s2);
488    }
489
490    #[test]
491    fn invalid_config_rejected() {
492        // num_hashes not divisible by bands.
493        let bad = Config {
494            num_hashes: 100,
495            bands: 33,
496            ..Default::default()
497        };
498        assert!(Index::new(bad).is_err());
499        // num_hashes = 0
500        let bad = Config {
501            num_hashes: 0,
502            bands: 1,
503            ..Default::default()
504        };
505        assert!(Index::new(bad).is_err());
506        // bands = 0
507        let bad = Config {
508            num_hashes: 100,
509            bands: 0,
510            ..Default::default()
511        };
512        assert!(Index::new(bad).is_err());
513        // shingle_size = 0
514        let bad = Config {
515            shingle_size: 0,
516            ..Default::default()
517        };
518        assert!(Index::new(bad).is_err());
519    }
520
521    #[test]
522    fn empty_input_safe_signature() {
523        let i = idx();
524        let s = i.signature("");
525        // All u32::MAX is the sentinel for "no shingles". Length matches config.
526        assert_eq!(s.len(), 128);
527        assert!(s.iter().all(|&x| x == u32::MAX));
528    }
529
530    #[test]
531    fn short_input_still_signs() {
532        let i = idx();
533        let s = i.signature("hi");
534        assert_eq!(s.len(), 128);
535        assert!(s.iter().any(|&x| x != u32::MAX));
536    }
537
538    #[test]
539    fn word_shingles_work() {
540        let i = Index::new(Config {
541            shingle_unit: ShingleUnit::Word,
542            shingle_size: 3,
543            ..Config::default()
544        })
545        .unwrap();
546        let s1 = i.signature("the quick brown fox jumps");
547        let s2 = i.signature("the quick brown fox runs");
548        let j = Index::jaccard(&s1, &s2);
549        // 4 of 5 trigrams overlap. Sample Jaccard estimate should be > 0.3.
550        assert!(j > 0.3, "expected high Jaccard, got {j}");
551    }
552
553    #[test]
554    fn three_doc_corpus_returns_two_dupes() {
555        let mut i = idx();
556        i.insert(
557            "dup-1",
558            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
559        )
560        .unwrap();
561        i.insert(
562            "dup-2",
563            "Lorem ipsum dolor sit amet consectetur adipiscing elit",
564        )
565        .unwrap();
566        i.insert(
567            "other",
568            "completely different text on a different subject entirely",
569        )
570        .unwrap();
571        let hits = i.near_duplicates(
572            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
573            0.4,
574        );
575        let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
576        assert!(ids.contains("dup-1"));
577        assert!(ids.contains("dup-2"));
578        assert!(!ids.contains("other"));
579    }
580
581    #[test]
582    fn hits_sorted_by_similarity_desc() {
583        let mut i = idx();
584        i.insert("identical", "alpha beta gamma delta epsilon zeta")
585            .unwrap();
586        i.insert("partial", "alpha beta gamma OMEGA SIGMA TAU")
587            .unwrap();
588        let hits = i.near_duplicates("alpha beta gamma delta epsilon zeta", 0.0);
589        assert_eq!(hits[0].id, "identical");
590        for win in hits.windows(2) {
591            assert!(win[0].similarity >= win[1].similarity);
592        }
593    }
594
595    fn tempfile() -> std::path::PathBuf {
596        let dir = std::env::temp_dir().join(format!(
597            "lshdedup-test-{}-{}",
598            std::process::id(),
599            std::time::SystemTime::now()
600                .duration_since(std::time::UNIX_EPOCH)
601                .unwrap()
602                .as_nanos()
603        ));
604        std::fs::create_dir_all(&dir).unwrap();
605        dir.join("index.json")
606    }
607
608    #[test]
609    fn save_load_round_trip() {
610        let path = tempfile();
611        let mut i = idx();
612        i.insert("a", "the quick brown fox jumps over the lazy dog")
613            .unwrap();
614        i.insert("b", "the quick brown fox jumps over a lazy dog")
615            .unwrap();
616        i.insert("c", "completely unrelated text on a different subject")
617            .unwrap();
618        i.save(&path).unwrap();
619
620        let loaded = Index::load(&path).unwrap();
621        assert_eq!(loaded.len(), 3);
622        let hits = loaded.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
623        let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
624        assert!(ids.contains("a"));
625    }
626
627    #[test]
628    fn load_missing_file_errors() {
629        let r = Index::load("/no/such/path/should/exist.json");
630        assert!(r.is_err());
631    }
632
633    #[test]
634    fn loaded_signatures_match_originals() {
635        let path = tempfile();
636        let mut i = Index::new(Config {
637            seed: 42,
638            ..Config::default()
639        })
640        .unwrap();
641        i.insert("doc", "lorem ipsum dolor sit amet").unwrap();
642        let original_sig = i.signature("lorem ipsum dolor sit amet");
643        i.save(&path).unwrap();
644
645        let loaded = Index::load(&path).unwrap();
646        // Signatures must match exactly because (a, b) and seed are
647        // preserved across save/load.
648        let loaded_sig = loaded.signature("lorem ipsum dolor sit amet");
649        assert_eq!(original_sig, loaded_sig);
650    }
651}