halldyll_core/crawl/
dedup.rs

1//! Dedup - Deduplication by URL and by content
2
3use sha2::{Sha256, Digest};
4use std::collections::HashSet;
5use std::sync::RwLock;
6
7/// Deduplication by URL
8pub struct UrlDedup {
9    /// Set of normalized URLs seen
10    seen: RwLock<HashSet<String>>,
11}
12
13impl Default for UrlDedup {
14    fn default() -> Self {
15        Self::new()
16    }
17}
18
19impl UrlDedup {
20    /// New deduplicator
21    pub fn new() -> Self {
22        Self {
23            seen: RwLock::new(HashSet::new()),
24        }
25    }
26
27    /// Check if the URL has already been seen
28    pub fn is_duplicate(&self, url: &str) -> bool {
29        self.seen.read().unwrap().contains(url)
30    }
31
32    /// Mark URL as seen, returns true if new
33    pub fn mark_seen(&self, url: &str) -> bool {
34        self.seen.write().unwrap().insert(url.to_string())
35    }
36
37    /// Check and mark in one operation
38    pub fn check_and_mark(&self, url: &str) -> bool {
39        let mut seen = self.seen.write().unwrap();
40        if seen.contains(url) {
41            false
42        } else {
43            seen.insert(url.to_string());
44            true
45        }
46    }
47
48    /// Number of URLs seen
49    pub fn count(&self) -> usize {
50        self.seen.read().unwrap().len()
51    }
52
53    /// Clear the deduplicator
54    pub fn clear(&self) {
55        self.seen.write().unwrap().clear();
56    }
57}
58
59/// Deduplication by content (hash + simhash)
60pub struct ContentDedup {
61    /// Set of exact hashes seen
62    exact_hashes: RwLock<HashSet<String>>,
63    /// Set of simhashes for near-duplicates
64    simhashes: RwLock<Vec<u64>>,
65    /// Hamming distance threshold for simhash
66    hamming_threshold: u32,
67}
68
69impl Default for ContentDedup {
70    fn default() -> Self {
71        Self::new(3)
72    }
73}
74
75impl ContentDedup {
76    /// New deduplicator with Hamming threshold
77    pub fn new(hamming_threshold: u32) -> Self {
78        Self {
79            exact_hashes: RwLock::new(HashSet::new()),
80            simhashes: RwLock::new(Vec::new()),
81            hamming_threshold,
82        }
83    }
84
85    /// SHA-256 hash of content
86    pub fn hash_content(content: &str) -> String {
87        let mut hasher = Sha256::new();
88        hasher.update(content.as_bytes());
89        format!("{:x}", hasher.finalize())
90    }
91
92    /// Simhash of content (for near-duplicates)
93    pub fn simhash(content: &str) -> u64 {
94        let mut v = [0i32; 64];
95        
96        // Tokenize into words and compute features
97        for word in content.split_whitespace() {
98            let hash = Self::fnv_hash(word);
99            for i in 0..64 {
100                if (hash >> i) & 1 == 1 {
101                    v[i] += 1;
102                } else {
103                    v[i] -= 1;
104                }
105            }
106        }
107
108        // Convert to fingerprint
109        let mut simhash: u64 = 0;
110        for (i, &val) in v.iter().enumerate() {
111            if val > 0 {
112                simhash |= 1 << i;
113            }
114        }
115        simhash
116    }
117
118    /// FNV-1a hash for tokens
119    fn fnv_hash(s: &str) -> u64 {
120        let mut hash: u64 = 0xcbf29ce484222325;
121        for byte in s.bytes() {
122            hash ^= byte as u64;
123            hash = hash.wrapping_mul(0x100000001b3);
124        }
125        hash
126    }
127
128    /// Hamming distance between two simhashes
129    pub fn hamming_distance(a: u64, b: u64) -> u32 {
130        (a ^ b).count_ones()
131    }
132
133    /// Check if content is an exact duplicate
134    pub fn is_exact_duplicate(&self, content: &str) -> bool {
135        let hash = Self::hash_content(content);
136        self.exact_hashes.read().unwrap().contains(&hash)
137    }
138
139    /// Check if content is a near-duplicate
140    pub fn is_near_duplicate(&self, content: &str) -> bool {
141        let simhash = Self::simhash(content);
142        let simhashes = self.simhashes.read().unwrap();
143        
144        simhashes.iter().any(|&existing| {
145            Self::hamming_distance(simhash, existing) <= self.hamming_threshold
146        })
147    }
148
149    /// Check if duplicate (exact or near)
150    pub fn is_duplicate(&self, content: &str) -> bool {
151        self.is_exact_duplicate(content) || self.is_near_duplicate(content)
152    }
153
154    /// Mark content as seen
155    pub fn mark_seen(&self, content: &str) {
156        let hash = Self::hash_content(content);
157        let simhash = Self::simhash(content);
158        
159        self.exact_hashes.write().unwrap().insert(hash);
160        self.simhashes.write().unwrap().push(simhash);
161    }
162
163    /// Check and mark in one operation
164    pub fn check_and_mark(&self, content: &str) -> bool {
165        if self.is_duplicate(content) {
166            false
167        } else {
168            self.mark_seen(content);
169            true
170        }
171    }
172
173    /// Number of contents seen (exact)
174    pub fn count(&self) -> usize {
175        self.exact_hashes.read().unwrap().len()
176    }
177
178    /// Clear the deduplicator
179    pub fn clear(&self) {
180        self.exact_hashes.write().unwrap().clear();
181        self.simhashes.write().unwrap().clear();
182    }
183}
184
185/// Normalize text before deduplication
186pub fn normalize_text_for_dedup(text: &str) -> String {
187    text.to_lowercase()
188        .split_whitespace()
189        .collect::<Vec<_>>()
190        .join(" ")
191}