use sha2::{Sha256, Digest};
use std::collections::HashSet;
use std::sync::RwLock;
/// Tracks which URLs have already been encountered.
///
/// The set lives behind an [`RwLock`], so every method takes `&self` and the
/// tracker can be shared between threads.
pub struct UrlDedup {
    /// Every URL recorded so far, stored exactly as it was passed in.
    seen: RwLock<HashSet<String>>,
}

impl Default for UrlDedup {
    /// Equivalent to [`UrlDedup::new`].
    fn default() -> Self {
        UrlDedup::new()
    }
}

impl UrlDedup {
    /// Creates a tracker with no recorded URLs.
    pub fn new() -> Self {
        let seen = RwLock::new(HashSet::new());
        Self { seen }
    }

    /// Returns `true` if `url` has been recorded before.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn is_duplicate(&self, url: &str) -> bool {
        let urls = self.seen.read().unwrap();
        urls.contains(url)
    }

    /// Records `url` unconditionally.
    ///
    /// Returns `true` if the URL was not present before this call.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn mark_seen(&self, url: &str) -> bool {
        let mut urls = self.seen.write().unwrap();
        urls.insert(url.to_owned())
    }

    /// Checks and records `url` under a single write lock.
    ///
    /// Returns `true` if the URL is new (and has now been recorded), `false`
    /// if it was already present.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn check_and_mark(&self, url: &str) -> bool {
        let mut urls = self.seen.write().unwrap();
        // Look up by `&str` first so a duplicate never costs an allocation.
        if urls.contains(url) {
            return false;
        }
        urls.insert(url.to_owned());
        true
    }

    /// Number of distinct URLs recorded so far.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn count(&self) -> usize {
        let urls = self.seen.read().unwrap();
        urls.len()
    }

    /// Forgets every recorded URL.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn clear(&self) {
        let mut urls = self.seen.write().unwrap();
        urls.clear();
    }
}
/// Content de-duplicator combining exact and near-duplicate detection.
///
/// Exact duplicates are detected through hex-encoded SHA-256 digests; near
/// duplicates through 64-bit SimHash fingerprints compared by Hamming
/// distance. Both collections sit behind [`RwLock`]s, so every method takes
/// `&self`.
///
/// Lock order: whenever both locks are held at once, `exact_hashes` is
/// acquired before `simhashes`.
pub struct ContentDedup {
    /// Hex-encoded SHA-256 digests of all content recorded so far.
    exact_hashes: RwLock<HashSet<String>>,
    /// SimHash fingerprints of recorded content; scanned linearly on lookup.
    simhashes: RwLock<Vec<u64>>,
    /// Largest Hamming distance (inclusive) at which two fingerprints are
    /// treated as near-duplicates.
    hamming_threshold: u32,
}

impl Default for ContentDedup {
    /// Creates a de-duplicator with a Hamming threshold of 3 bits.
    fn default() -> Self {
        Self::new(3)
    }
}

impl ContentDedup {
    /// Creates an empty de-duplicator.
    ///
    /// `hamming_threshold` is the maximum number of differing fingerprint
    /// bits (inclusive) for two pieces of content to count as near-duplicates.
    pub fn new(hamming_threshold: u32) -> Self {
        Self {
            exact_hashes: RwLock::new(HashSet::new()),
            simhashes: RwLock::new(Vec::new()),
            hamming_threshold,
        }
    }

    /// Returns the lowercase hex SHA-256 digest of `content`.
    pub fn hash_content(content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }

    /// Computes a 64-bit SimHash fingerprint over the whitespace-separated
    /// tokens of `content`.
    ///
    /// Each token is hashed with FNV-1a; every bit position collects a +1/-1
    /// vote per token, and the fingerprint bit is set where the vote total is
    /// strictly positive. Content without any tokens yields `0`.
    pub fn simhash(content: &str) -> u64 {
        let mut votes = [0i32; 64];
        for word in content.split_whitespace() {
            let hash = Self::fnv_hash(word);
            for (bit, vote) in votes.iter_mut().enumerate() {
                if (hash >> bit) & 1 == 1 {
                    *vote += 1;
                } else {
                    *vote -= 1;
                }
            }
        }
        votes
            .iter()
            .enumerate()
            .filter(|&(_, &vote)| vote > 0)
            .fold(0u64, |fingerprint, (bit, _)| fingerprint | (1u64 << bit))
    }

    /// 64-bit FNV-1a hash of the UTF-8 bytes of `s`.
    fn fnv_hash(s: &str) -> u64 {
        let mut hash: u64 = 0xcbf29ce484222325;
        for byte in s.bytes() {
            hash ^= byte as u64;
            hash = hash.wrapping_mul(0x100000001b3);
        }
        hash
    }

    /// Number of bit positions in which `a` and `b` differ.
    pub fn hamming_distance(a: u64, b: u64) -> u32 {
        (a ^ b).count_ones()
    }

    /// Returns `true` if content with the same SHA-256 digest was recorded.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn is_exact_duplicate(&self, content: &str) -> bool {
        let hash = Self::hash_content(content);
        self.exact_hashes.read().unwrap().contains(&hash)
    }

    /// Returns `true` if any recorded fingerprint lies within the configured
    /// Hamming threshold of `content`'s fingerprint.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn is_near_duplicate(&self, content: &str) -> bool {
        let simhash = Self::simhash(content);
        let simhashes = self.simhashes.read().unwrap();
        simhashes
            .iter()
            .any(|&existing| Self::hamming_distance(simhash, existing) <= self.hamming_threshold)
    }

    /// Returns `true` if `content` is an exact or near duplicate of recorded
    /// content.
    ///
    /// # Panics
    ///
    /// Panics if an internal lock is poisoned.
    pub fn is_duplicate(&self, content: &str) -> bool {
        self.is_exact_duplicate(content) || self.is_near_duplicate(content)
    }

    /// Records `content` unconditionally.
    ///
    /// Both collections are updated while both write locks are held, so a
    /// concurrent `clear` cannot leave them out of step. The fingerprint is
    /// only appended when the exact hash is new, which keeps repeated calls
    /// with identical content from growing the fingerprint list.
    ///
    /// # Panics
    ///
    /// Panics if an internal lock is poisoned.
    pub fn mark_seen(&self, content: &str) {
        // Hash before locking so the locks are held as briefly as possible.
        let hash = Self::hash_content(content);
        let simhash = Self::simhash(content);
        let mut exact = self.exact_hashes.write().unwrap();
        let mut fingerprints = self.simhashes.write().unwrap();
        if exact.insert(hash) {
            fingerprints.push(simhash);
        }
    }

    /// Atomically checks whether `content` is a duplicate and records it if
    /// it is not.
    ///
    /// Returns `true` if the content is new (and has now been recorded),
    /// `false` if it is an exact or near duplicate. The check and the insert
    /// happen under the same write locks, so two threads submitting the same
    /// content cannot both receive `true`.
    ///
    /// # Panics
    ///
    /// Panics if an internal lock is poisoned.
    pub fn check_and_mark(&self, content: &str) -> bool {
        // Hash once, before locking.
        let hash = Self::hash_content(content);
        let simhash = Self::simhash(content);
        let mut exact = self.exact_hashes.write().unwrap();
        let mut fingerprints = self.simhashes.write().unwrap();
        if exact.contains(&hash) {
            return false;
        }
        let near_duplicate = fingerprints
            .iter()
            .any(|&existing| Self::hamming_distance(simhash, existing) <= self.hamming_threshold);
        if near_duplicate {
            return false;
        }
        exact.insert(hash);
        fingerprints.push(simhash);
        true
    }

    /// Number of distinct exact hashes recorded so far.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn count(&self) -> usize {
        self.exact_hashes.read().unwrap().len()
    }

    /// Forgets all recorded hashes and fingerprints.
    ///
    /// # Panics
    ///
    /// Panics if an internal lock is poisoned.
    pub fn clear(&self) {
        // Hold both locks so no other writer sees one collection cleared and
        // the other still populated.
        let mut exact = self.exact_hashes.write().unwrap();
        let mut fingerprints = self.simhashes.write().unwrap();
        exact.clear();
        fingerprints.clear();
    }
}
/// Normalizes `text` for de-duplication: lowercases it, trims leading and
/// trailing whitespace, and collapses every internal whitespace run into a
/// single ASCII space.
pub fn normalize_text_for_dedup(text: &str) -> String {
    // Lowercase the whole text first, then tokenize the lowered copy.
    let lowered = text.to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    for token in lowered.split_whitespace() {
        // Tokens are never empty, so a non-empty buffer means a token
        // precedes this one and a separator is needed.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }
    normalized
}