#![deny(unsafe_code)]
#![warn(missing_docs)]
#![warn(rust_2018_idioms)]
use std::collections::HashMap;
use std::hash::{BuildHasher, Hasher};
use ahash::RandomState;
use rand::{Rng, SeedableRng};
use serde::{Deserialize, Serialize};
use thiserror::Error;
pub type Result<T> = std::result::Result<T, DedupError>;
#[derive(Error, Debug)]
pub enum DedupError {
#[error("invalid config: {0}")]
InvalidConfig(String),
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ShingleUnit {
#[default]
Char,
Word,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Config {
pub num_hashes: usize,
pub bands: usize,
pub shingle_size: usize,
pub shingle_unit: ShingleUnit,
pub seed: u64,
}
impl Default for Config {
fn default() -> Self {
Self {
num_hashes: 128,
bands: 32,
shingle_size: 5,
shingle_unit: ShingleUnit::Char,
seed: 0,
}
}
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Hit {
pub id: String,
pub similarity: f64,
}
pub struct Index {
cfg: Config,
a: Vec<u32>, b: Vec<u32>, band_hasher: RandomState,
docs: Vec<DocEntry>,
bands: HashMap<(usize, u64), Vec<usize>>, }
#[derive(Debug, Clone, Serialize, Deserialize)]
struct DocEntry {
id: String,
signature: Vec<u32>,
}
#[derive(Serialize, Deserialize)]
struct SerializedIndex {
cfg: Config,
a: Vec<u32>,
b: Vec<u32>,
docs: Vec<DocEntry>,
}
impl Index {
pub fn new(cfg: Config) -> Result<Self> {
if cfg.num_hashes == 0 {
return Err(DedupError::InvalidConfig("num_hashes must be > 0".into()));
}
if cfg.bands == 0 {
return Err(DedupError::InvalidConfig("bands must be > 0".into()));
}
if cfg.num_hashes % cfg.bands != 0 {
return Err(DedupError::InvalidConfig(format!(
"num_hashes ({}) must be a multiple of bands ({})",
cfg.num_hashes, cfg.bands
)));
}
if cfg.shingle_size == 0 {
return Err(DedupError::InvalidConfig("shingle_size must be > 0".into()));
}
let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.seed);
let mut a = Vec::with_capacity(cfg.num_hashes);
let mut b = Vec::with_capacity(cfg.num_hashes);
for _ in 0..cfg.num_hashes {
let mut ai: u32 = rng.random();
ai |= 1; a.push(ai);
b.push(rng.random::<u32>());
}
let band_hasher = RandomState::with_seeds(
cfg.seed,
cfg.seed.wrapping_add(1),
cfg.seed.wrapping_add(2),
cfg.seed.wrapping_add(3),
);
Ok(Self {
cfg,
a,
b,
band_hasher,
docs: Vec::new(),
bands: HashMap::new(),
})
}
pub fn config(&self) -> &Config {
&self.cfg
}
pub fn len(&self) -> usize {
self.docs.len()
}
pub fn is_empty(&self) -> bool {
self.docs.is_empty()
}
pub fn insert(&mut self, id: impl Into<String>, text: &str) -> Result<()> {
let signature = self.signature(text);
let doc_idx = self.docs.len();
let band_hashes: Vec<u64> = self.band_hashes(&signature).collect();
for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
self.bands
.entry((band_idx, band_hash))
.or_default()
.push(doc_idx);
}
self.docs.push(DocEntry {
id: id.into(),
signature,
});
Ok(())
}
pub fn signature(&self, text: &str) -> Vec<u32> {
let shingles = shingle(text, self.cfg.shingle_size, self.cfg.shingle_unit);
if shingles.is_empty() {
return vec![u32::MAX; self.cfg.num_hashes];
}
let mut sig = vec![u32::MAX; self.cfg.num_hashes];
for s in &shingles {
let h = stable_hash(s);
for (i, slot) in sig.iter_mut().enumerate() {
let v = self.a[i].wrapping_mul(h).wrapping_add(self.b[i]);
if v < *slot {
*slot = v;
}
}
}
sig
}
pub fn jaccard(sig_a: &[u32], sig_b: &[u32]) -> f64 {
if sig_a.is_empty() || sig_a.len() != sig_b.len() {
return 0.0;
}
let eq = sig_a.iter().zip(sig_b).filter(|(x, y)| x == y).count();
eq as f64 / sig_a.len() as f64
}
pub fn near_duplicates(&self, text: &str, min_similarity: f64) -> Vec<Hit> {
let sig = self.signature(text);
let mut candidates: std::collections::HashSet<usize> = std::collections::HashSet::new();
for (band_idx, band_hash) in self.band_hashes(&sig).enumerate() {
if let Some(ids) = self.bands.get(&(band_idx, band_hash)) {
for &i in ids {
candidates.insert(i);
}
}
}
let mut hits: Vec<Hit> = Vec::with_capacity(candidates.len());
for i in candidates {
let sim = Self::jaccard(&sig, &self.docs[i].signature);
if sim >= min_similarity {
hits.push(Hit {
id: self.docs[i].id.clone(),
similarity: sim,
});
}
}
hits.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
hits
}
fn band_hashes<'a>(&'a self, sig: &'a [u32]) -> impl Iterator<Item = u64> + 'a {
let rows = self.cfg.num_hashes / self.cfg.bands;
(0..self.cfg.bands).map(move |b| {
let mut hasher = self.band_hasher.build_hasher();
for r in 0..rows {
hasher.write_u32(sig[b * rows + r]);
}
hasher.finish()
})
}
pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> Result<()> {
let s = SerializedIndex {
cfg: self.cfg.clone(),
a: self.a.clone(),
b: self.b.clone(),
docs: self.docs.clone(),
};
let file = std::fs::File::create(path)
.map_err(|e| DedupError::InvalidConfig(format!("save: {e}")))?;
serde_json::to_writer(std::io::BufWriter::new(file), &s)
.map_err(|e| DedupError::InvalidConfig(format!("save serde: {e}")))?;
Ok(())
}
pub fn load<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
let file = std::fs::File::open(path)
.map_err(|e| DedupError::InvalidConfig(format!("load: {e}")))?;
let s: SerializedIndex = serde_json::from_reader(std::io::BufReader::new(file))
.map_err(|e| DedupError::InvalidConfig(format!("load serde: {e}")))?;
let band_hasher = RandomState::with_seeds(
s.cfg.seed,
s.cfg.seed.wrapping_add(1),
s.cfg.seed.wrapping_add(2),
s.cfg.seed.wrapping_add(3),
);
let mut idx = Self {
cfg: s.cfg,
a: s.a,
b: s.b,
band_hasher,
docs: Vec::new(),
bands: HashMap::new(),
};
for doc in s.docs {
let doc_idx = idx.docs.len();
let band_hashes: Vec<u64> = idx.band_hashes(&doc.signature).collect();
for (band_idx, band_hash) in band_hashes.into_iter().enumerate() {
idx.bands
.entry((band_idx, band_hash))
.or_default()
.push(doc_idx);
}
idx.docs.push(doc);
}
Ok(idx)
}
}
fn stable_hash(shingle: &str) -> u32 {
static STATE: RandomState = RandomState::with_seeds(
0x5151_5151_5151_5151,
0x6262_6262_6262_6262,
0x7373_7373_7373_7373,
0x8484_8484_8484_8484,
);
let mut h = STATE.build_hasher();
h.write(shingle.as_bytes());
(h.finish() & 0xFFFF_FFFF) as u32
}
fn shingle(text: &str, k: usize, unit: ShingleUnit) -> Vec<String> {
match unit {
ShingleUnit::Char => char_shingles(text, k),
ShingleUnit::Word => word_shingles(text, k),
}
}
fn char_shingles(text: &str, k: usize) -> Vec<String> {
let chars: Vec<char> = text.chars().collect();
if chars.is_empty() {
return Vec::new();
}
if chars.len() <= k {
return vec![chars.iter().collect()];
}
let mut out = Vec::with_capacity(chars.len() - k + 1);
for i in 0..=chars.len() - k {
out.push(chars[i..i + k].iter().collect());
}
out
}
fn word_shingles(text: &str, k: usize) -> Vec<String> {
let words: Vec<&str> = text.split_whitespace().collect();
if words.is_empty() {
return Vec::new();
}
if words.len() <= k {
return vec![words.join(" ")];
}
let mut out = Vec::with_capacity(words.len() - k + 1);
for i in 0..=words.len() - k {
out.push(words[i..i + k].join(" "));
}
out
}
#[cfg(test)]
mod tests {
use super::*;
fn idx() -> Index {
Index::new(Config::default()).unwrap()
}
#[test]
fn empty_index_returns_no_hits() {
let i = idx();
assert!(i.near_duplicates("anything", 0.0).is_empty());
assert_eq!(i.len(), 0);
assert!(i.is_empty());
}
#[test]
fn identical_text_hits_with_high_similarity() {
let mut i = idx();
i.insert("a", "the quick brown fox jumps over the lazy dog")
.unwrap();
let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
assert_eq!(hits.len(), 1);
assert!(hits[0].similarity > 0.95, "got {}", hits[0].similarity);
}
#[test]
fn near_duplicate_detected() {
let mut i = idx();
i.insert("a", "the quick brown fox jumps over the lazy dog")
.unwrap();
i.insert("b", "the quick brown fox jumps over a lazy dog")
.unwrap(); let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
assert!(!hits.is_empty());
assert_eq!(hits[0].id, "a");
}
#[test]
fn unrelated_text_below_threshold() {
let mut i = idx();
i.insert("a", "the quick brown fox jumps over the lazy dog")
.unwrap();
i.insert(
"b",
"completely unrelated text about space exploration and rockets",
)
.unwrap();
let hits = i.near_duplicates("the quick brown fox jumps over the lazy dog", 0.5);
assert!(hits.iter().all(|h| h.id != "b"));
}
#[test]
fn jaccard_zero_for_disjoint() {
let i = idx();
let s1 = i.signature("alpha beta gamma");
let s2 = i.signature("delta epsilon zeta");
let j = Index::jaccard(&s1, &s2);
assert!(j < 0.2, "expected near-zero Jaccard, got {j}");
}
#[test]
fn jaccard_one_for_identical() {
let i = idx();
let s = i.signature("alpha beta gamma delta epsilon");
assert_eq!(Index::jaccard(&s, &s), 1.0);
}
#[test]
fn signatures_deterministic_across_indices() {
let i1 = Index::new(Config {
seed: 42,
..Config::default()
})
.unwrap();
let i2 = Index::new(Config {
seed: 42,
..Config::default()
})
.unwrap();
let s1 = i1.signature("hello world");
let s2 = i2.signature("hello world");
assert_eq!(s1, s2);
}
#[test]
fn signatures_differ_with_different_seed() {
let i1 = Index::new(Config {
seed: 1,
..Config::default()
})
.unwrap();
let i2 = Index::new(Config {
seed: 2,
..Config::default()
})
.unwrap();
let s1 = i1.signature("hello world");
let s2 = i2.signature("hello world");
assert_ne!(s1, s2);
}
#[test]
fn invalid_config_rejected() {
let bad = Config {
num_hashes: 100,
bands: 33,
..Default::default()
};
assert!(Index::new(bad).is_err());
let bad = Config {
num_hashes: 0,
bands: 1,
..Default::default()
};
assert!(Index::new(bad).is_err());
let bad = Config {
num_hashes: 100,
bands: 0,
..Default::default()
};
assert!(Index::new(bad).is_err());
let bad = Config {
shingle_size: 0,
..Default::default()
};
assert!(Index::new(bad).is_err());
}
#[test]
fn empty_input_safe_signature() {
let i = idx();
let s = i.signature("");
assert_eq!(s.len(), 128);
assert!(s.iter().all(|&x| x == u32::MAX));
}
#[test]
fn short_input_still_signs() {
let i = idx();
let s = i.signature("hi");
assert_eq!(s.len(), 128);
assert!(s.iter().any(|&x| x != u32::MAX));
}
#[test]
fn word_shingles_work() {
let i = Index::new(Config {
shingle_unit: ShingleUnit::Word,
shingle_size: 3,
..Config::default()
})
.unwrap();
let s1 = i.signature("the quick brown fox jumps");
let s2 = i.signature("the quick brown fox runs");
let j = Index::jaccard(&s1, &s2);
assert!(j > 0.3, "expected high Jaccard, got {j}");
}
#[test]
fn three_doc_corpus_returns_two_dupes() {
let mut i = idx();
i.insert(
"dup-1",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
)
.unwrap();
i.insert(
"dup-2",
"Lorem ipsum dolor sit amet consectetur adipiscing elit",
)
.unwrap();
i.insert(
"other",
"completely different text on a different subject entirely",
)
.unwrap();
let hits = i.near_duplicates(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
0.4,
);
let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
assert!(ids.contains("dup-1"));
assert!(ids.contains("dup-2"));
assert!(!ids.contains("other"));
}
#[test]
fn hits_sorted_by_similarity_desc() {
let mut i = idx();
i.insert("identical", "alpha beta gamma delta epsilon zeta")
.unwrap();
i.insert("partial", "alpha beta gamma OMEGA SIGMA TAU")
.unwrap();
let hits = i.near_duplicates("alpha beta gamma delta epsilon zeta", 0.0);
assert_eq!(hits[0].id, "identical");
for win in hits.windows(2) {
assert!(win[0].similarity >= win[1].similarity);
}
}
fn tempfile() -> std::path::PathBuf {
let dir = std::env::temp_dir().join(format!(
"lshdedup-test-{}-{}",
std::process::id(),
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos()
));
std::fs::create_dir_all(&dir).unwrap();
dir.join("index.json")
}
#[test]
fn save_load_round_trip() {
let path = tempfile();
let mut i = idx();
i.insert("a", "the quick brown fox jumps over the lazy dog")
.unwrap();
i.insert("b", "the quick brown fox jumps over a lazy dog")
.unwrap();
i.insert("c", "completely unrelated text on a different subject")
.unwrap();
i.save(&path).unwrap();
let loaded = Index::load(&path).unwrap();
assert_eq!(loaded.len(), 3);
let hits = loaded.near_duplicates("the quick brown fox jumps over the lazy dog", 0.4);
let ids: std::collections::HashSet<String> = hits.iter().map(|h| h.id.clone()).collect();
assert!(ids.contains("a"));
}
#[test]
fn load_missing_file_errors() {
let r = Index::load("/no/such/path/should/exist.json");
assert!(r.is_err());
}
#[test]
fn loaded_signatures_match_originals() {
let path = tempfile();
let mut i = Index::new(Config {
seed: 42,
..Config::default()
})
.unwrap();
i.insert("doc", "lorem ipsum dolor sit amet").unwrap();
let original_sig = i.signature("lorem ipsum dolor sit amet");
i.save(&path).unwrap();
let loaded = Index::load(&path).unwrap();
let loaded_sig = loaded.signature("lorem ipsum dolor sit amet");
assert_eq!(original_sig, loaded_sig);
}
}