use std::collections::HashSet;
use gxhash::GxBuildHasher;
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DeduplicationMode {
Exact,
Normalized,
MinHash {
num_hashes: usize,
threshold: f32,
shingle_size: usize,
},
}
impl Default for DeduplicationMode {
fn default() -> Self {
Self::Normalized
}
}
pub struct Deduplicator {
seen: HashSet<u64, GxBuildHasher>,
mode: DeduplicationMode,
minhash_signatures: Vec<MinHashSignature>,
stats: DeduplicationStats,
}
struct MinHashSignature {
hashes: Vec<u64>,
}
impl MinHashSignature {
fn from_shingles(shingles: &[u64], num_hashes: usize) -> Self {
let mut hashes = vec![u64::MAX; num_hashes];
for &shingle in shingles {
for (i, hash) in hashes.iter_mut().enumerate() {
let h = Self::hash_with_seed(shingle, i as u64);
if h < *hash {
*hash = h;
}
}
}
Self { hashes }
}
fn hash_with_seed(value: u64, seed: u64) -> u64 {
use std::hash::{DefaultHasher, Hasher};
let mut hasher = DefaultHasher::new();
hasher.write_u64(seed);
hasher.write_u64(value);
hasher.finish()
}
fn jaccard_similarity(&self, other: &MinHashSignature) -> f32 {
if self.hashes.len() != other.hashes.len() {
return 0.0;
}
let matching = self
.hashes
.iter()
.zip(other.hashes.iter())
.filter(|(a, b)| a == b)
.count();
matching as f32 / self.hashes.len() as f32
}
}
#[derive(Debug, Clone, Default)]
pub struct DeduplicationStats {
pub total: usize,
pub unique: usize,
pub duplicates: usize,
}
impl DeduplicationStats {
pub fn dedup_rate(&self) -> f64 {
if self.total == 0 {
0.0
} else {
100.0 * self.duplicates as f64 / self.total as f64
}
}
}
impl std::fmt::Display for DeduplicationStats {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "Deduplication Statistics:")?;
writeln!(f, " Total: {}", self.total)?;
writeln!(f, " Unique: {}", self.unique)?;
writeln!(
f,
" Duplicates: {} ({:.1}%)",
self.duplicates,
self.dedup_rate()
)?;
Ok(())
}
}
impl Deduplicator {
pub fn new(mode: DeduplicationMode) -> Self {
Self {
seen: HashSet::with_hasher(GxBuildHasher::default()),
mode,
minhash_signatures: Vec::new(),
stats: DeduplicationStats::default(),
}
}
pub fn exact() -> Self {
Self::new(DeduplicationMode::Exact)
}
pub fn normalized() -> Self {
Self::new(DeduplicationMode::Normalized)
}
pub fn minhash(num_hashes: usize, threshold: f32, shingle_size: usize) -> Self {
Self::new(DeduplicationMode::MinHash {
num_hashes,
threshold,
shingle_size,
})
}
pub fn minhash_default() -> Self {
Self::minhash(128, 0.8, 3)
}
pub fn is_unique(&mut self, sentence: &str) -> bool {
self.stats.total += 1;
let is_unique = match &self.mode {
DeduplicationMode::Exact => self.check_exact(sentence),
DeduplicationMode::Normalized => self.check_normalized(sentence),
DeduplicationMode::MinHash {
num_hashes,
threshold,
shingle_size,
} => self.check_minhash(sentence, *num_hashes, *threshold, *shingle_size),
};
if is_unique {
self.stats.unique += 1;
} else {
self.stats.duplicates += 1;
}
is_unique
}
fn check_exact(&mut self, sentence: &str) -> bool {
let hash = Self::hash_string(sentence);
self.seen.insert(hash)
}
fn check_normalized(&mut self, sentence: &str) -> bool {
let normalized = Self::normalize(sentence);
let hash = Self::hash_string(&normalized);
self.seen.insert(hash)
}
fn check_minhash(
&mut self,
sentence: &str,
num_hashes: usize,
threshold: f32,
shingle_size: usize,
) -> bool {
let shingles = Self::create_shingles(sentence, shingle_size);
if shingles.is_empty() {
return false;
}
let signature = MinHashSignature::from_shingles(&shingles, num_hashes);
for existing in &self.minhash_signatures {
if signature.jaccard_similarity(existing) >= threshold {
return false; }
}
self.minhash_signatures.push(signature);
true
}
fn hash_string(s: &str) -> u64 {
crate::util::hash::safe_hash(s.as_bytes())
}
fn normalize(sentence: &str) -> String {
sentence
.chars()
.filter_map(|c| {
if c.is_alphabetic() {
Some(c.to_lowercase().next().unwrap_or(c))
} else if c.is_whitespace() {
Some(' ')
} else if c.is_numeric() {
Some(c)
} else {
None
}
})
.collect::<String>()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
}
fn create_shingles(sentence: &str, n: usize) -> Vec<u64> {
let normalized = Self::normalize(sentence);
let chars: Vec<char> = normalized.chars().collect();
if chars.len() < n {
return vec![];
}
let mut shingles = Vec::with_capacity(chars.len() - n + 1);
for window in chars.windows(n) {
let shingle: String = window.iter().collect();
shingles.push(Self::hash_string(&shingle));
}
shingles
}
pub fn filter<'a, I>(&'a mut self, sentences: I) -> impl Iterator<Item = String> + 'a
where
I: Iterator<Item = String> + 'a,
{
sentences.filter(move |s| self.is_unique(s))
}
pub fn stats(&self) -> &DeduplicationStats {
&self.stats
}
pub fn reset(&mut self) {
self.seen.clear();
self.minhash_signatures.clear();
self.stats = DeduplicationStats::default();
}
pub fn memory_usage(&self) -> usize {
let seen_size = self.seen.capacity() * std::mem::size_of::<u64>();
let minhash_size = self
.minhash_signatures
.iter()
.map(|s| s.hashes.len() * 8)
.sum::<usize>();
seen_size + minhash_size
}
}
#[derive(Debug, Clone)]
pub struct DeduplicatorBuilder {
mode: DeduplicationMode,
initial_capacity: Option<usize>,
}
impl DeduplicatorBuilder {
pub fn new() -> Self {
Self {
mode: DeduplicationMode::Normalized,
initial_capacity: None,
}
}
pub fn exact(mut self) -> Self {
self.mode = DeduplicationMode::Exact;
self
}
pub fn normalized(mut self) -> Self {
self.mode = DeduplicationMode::Normalized;
self
}
pub fn minhash(mut self, num_hashes: usize, threshold: f32, shingle_size: usize) -> Self {
self.mode = DeduplicationMode::MinHash {
num_hashes,
threshold,
shingle_size,
};
self
}
pub fn capacity(mut self, capacity: usize) -> Self {
self.initial_capacity = Some(capacity);
self
}
pub fn build(self) -> Deduplicator {
let mut dedup = Deduplicator::new(self.mode);
if let Some(cap) = self.initial_capacity {
dedup.seen.reserve(cap);
}
dedup
}
}
impl Default for DeduplicatorBuilder {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_exact_dedup() {
let mut dedup = Deduplicator::exact();
assert!(dedup.is_unique("Hello world!"));
assert!(!dedup.is_unique("Hello world!"));
assert!(dedup.is_unique("HELLO WORLD!"));
assert!(dedup.is_unique("Goodbye world!"));
}
#[test]
fn test_normalized_dedup() {
let mut dedup = Deduplicator::normalized();
assert!(dedup.is_unique("Hello world!"));
assert!(!dedup.is_unique("Hello world!"));
assert!(!dedup.is_unique("HELLO WORLD!"));
assert!(!dedup.is_unique("Hello, world."));
assert!(dedup.is_unique("Goodbye world!"));
}
#[test]
fn test_minhash_dedup() {
let mut dedup = Deduplicator::minhash(64, 0.7, 3);
assert!(dedup.is_unique("The quick brown fox jumps over the lazy dog."));
assert!(!dedup.is_unique("The quick brown fox jumps over a lazy dog."));
assert!(dedup.is_unique("A completely different sentence about cats."));
}
#[test]
fn test_normalization() {
assert_eq!(Deduplicator::normalize("Hello World!"), "hello world");
assert_eq!(
Deduplicator::normalize(" Multiple spaces "),
"multiple spaces"
);
assert_eq!(Deduplicator::normalize("Numbers123Here"), "numbers123here");
assert_eq!(Deduplicator::normalize("!@#$%^&*()"), "");
}
#[test]
fn test_shingle_creation() {
let shingles = Deduplicator::create_shingles("hello", 3);
assert_eq!(shingles.len(), 3);
let shingles = Deduplicator::create_shingles("hi", 3);
assert!(shingles.is_empty()); }
#[test]
fn test_stats() {
let mut dedup = Deduplicator::normalized();
dedup.is_unique("Sentence one.");
dedup.is_unique("Sentence two.");
dedup.is_unique("Sentence one.");
let stats = dedup.stats();
assert_eq!(stats.total, 3);
assert_eq!(stats.unique, 2);
assert_eq!(stats.duplicates, 1);
}
#[test]
fn test_filter() {
let mut dedup = Deduplicator::exact();
let sentences = vec![
"First".to_string(),
"Second".to_string(),
"First".to_string(), "Third".to_string(),
];
let unique: Vec<String> = dedup.filter(sentences.into_iter()).collect();
assert_eq!(unique, vec!["First", "Second", "Third"]);
}
#[test]
fn test_reset() {
let mut dedup = Deduplicator::normalized();
dedup.is_unique("Hello");
assert!(!dedup.is_unique("Hello"));
dedup.reset();
assert!(dedup.is_unique("Hello"));
}
#[test]
fn test_builder() {
let dedup = DeduplicatorBuilder::new()
.normalized()
.capacity(1000)
.build();
assert!(matches!(dedup.mode, DeduplicationMode::Normalized));
}
#[test]
fn test_unicode_dedup() {
let mut dedup = Deduplicator::normalized();
assert!(dedup.is_unique("你好世界"));
assert!(!dedup.is_unique("你好世界"));
assert!(dedup.is_unique("こんにちは"));
assert!(!dedup.is_unique("こんにちは"));
assert!(dedup.is_unique("Hello 世界!"));
assert!(!dedup.is_unique("HELLO 世界!"));
}
}