halldyll_core/crawl/
dedup.rs1use sha2::{Sha256, Digest};
4use std::collections::HashSet;
5use std::sync::RwLock;
6
7pub struct UrlDedup {
9 seen: RwLock<HashSet<String>>,
11}
12
13impl Default for UrlDedup {
14 fn default() -> Self {
15 Self::new()
16 }
17}
18
19impl UrlDedup {
20 pub fn new() -> Self {
22 Self {
23 seen: RwLock::new(HashSet::new()),
24 }
25 }
26
27 pub fn is_duplicate(&self, url: &str) -> bool {
29 self.seen.read().unwrap().contains(url)
30 }
31
32 pub fn mark_seen(&self, url: &str) -> bool {
34 self.seen.write().unwrap().insert(url.to_string())
35 }
36
37 pub fn check_and_mark(&self, url: &str) -> bool {
39 let mut seen = self.seen.write().unwrap();
40 if seen.contains(url) {
41 false
42 } else {
43 seen.insert(url.to_string());
44 true
45 }
46 }
47
48 pub fn count(&self) -> usize {
50 self.seen.read().unwrap().len()
51 }
52
53 pub fn clear(&self) {
55 self.seen.write().unwrap().clear();
56 }
57}
58
59pub struct ContentDedup {
61 exact_hashes: RwLock<HashSet<String>>,
63 simhashes: RwLock<Vec<u64>>,
65 hamming_threshold: u32,
67}
68
69impl Default for ContentDedup {
70 fn default() -> Self {
71 Self::new(3)
72 }
73}
74
75impl ContentDedup {
76 pub fn new(hamming_threshold: u32) -> Self {
78 Self {
79 exact_hashes: RwLock::new(HashSet::new()),
80 simhashes: RwLock::new(Vec::new()),
81 hamming_threshold,
82 }
83 }
84
85 pub fn hash_content(content: &str) -> String {
87 let mut hasher = Sha256::new();
88 hasher.update(content.as_bytes());
89 format!("{:x}", hasher.finalize())
90 }
91
92 pub fn simhash(content: &str) -> u64 {
94 let mut v = [0i32; 64];
95
96 for word in content.split_whitespace() {
98 let hash = Self::fnv_hash(word);
99 for i in 0..64 {
100 if (hash >> i) & 1 == 1 {
101 v[i] += 1;
102 } else {
103 v[i] -= 1;
104 }
105 }
106 }
107
108 let mut simhash: u64 = 0;
110 for (i, &val) in v.iter().enumerate() {
111 if val > 0 {
112 simhash |= 1 << i;
113 }
114 }
115 simhash
116 }
117
118 fn fnv_hash(s: &str) -> u64 {
120 let mut hash: u64 = 0xcbf29ce484222325;
121 for byte in s.bytes() {
122 hash ^= byte as u64;
123 hash = hash.wrapping_mul(0x100000001b3);
124 }
125 hash
126 }
127
128 pub fn hamming_distance(a: u64, b: u64) -> u32 {
130 (a ^ b).count_ones()
131 }
132
133 pub fn is_exact_duplicate(&self, content: &str) -> bool {
135 let hash = Self::hash_content(content);
136 self.exact_hashes.read().unwrap().contains(&hash)
137 }
138
139 pub fn is_near_duplicate(&self, content: &str) -> bool {
141 let simhash = Self::simhash(content);
142 let simhashes = self.simhashes.read().unwrap();
143
144 simhashes.iter().any(|&existing| {
145 Self::hamming_distance(simhash, existing) <= self.hamming_threshold
146 })
147 }
148
149 pub fn is_duplicate(&self, content: &str) -> bool {
151 self.is_exact_duplicate(content) || self.is_near_duplicate(content)
152 }
153
154 pub fn mark_seen(&self, content: &str) {
156 let hash = Self::hash_content(content);
157 let simhash = Self::simhash(content);
158
159 self.exact_hashes.write().unwrap().insert(hash);
160 self.simhashes.write().unwrap().push(simhash);
161 }
162
163 pub fn check_and_mark(&self, content: &str) -> bool {
165 if self.is_duplicate(content) {
166 false
167 } else {
168 self.mark_seen(content);
169 true
170 }
171 }
172
173 pub fn count(&self) -> usize {
175 self.exact_hashes.read().unwrap().len()
176 }
177
178 pub fn clear(&self) {
180 self.exact_hashes.write().unwrap().clear();
181 self.simhashes.write().unwrap().clear();
182 }
183}
184
185pub fn normalize_text_for_dedup(text: &str) -> String {
187 text.to_lowercase()
188 .split_whitespace()
189 .collect::<Vec<_>>()
190 .join(" ")
191}