Skip to main content

html_cleaning/
dedup.rs

1//! Content deduplication utilities.
2//!
3//! LRU-based cache for detecting duplicate text content.
4
5use std::collections::HashMap;
6
7/// Simple LRU cache for text deduplication.
8///
9/// Maintains insertion order and evicts the oldest entry when capacity is reached.
10/// Note: Updating an existing key does NOT refresh its position (matches Go behavior).
11///
12/// # Example
13///
14/// ```
15/// use html_cleaning::dedup::LruCache;
16///
17/// let mut cache = LruCache::new(3);
18/// cache.put("a", 1);
19/// cache.put("b", 2);
20/// assert_eq!(cache.get("a"), Some(1));
21/// assert!(cache.has("b"));
22/// ```
23pub struct LruCache {
24    max_size: usize,
25    keys: Vec<String>,
26    data: HashMap<String, i32>,
27}
28
29impl LruCache {
30    /// Create new cache with specified capacity.
31    #[must_use]
32    pub fn new(max_size: usize) -> Self {
33        Self {
34            max_size,
35            keys: Vec::new(),
36            data: HashMap::new(),
37        }
38    }
39
40    /// Get value for key.
41    #[must_use]
42    pub fn get(&self, key: &str) -> Option<i32> {
43        self.data.get(key).copied()
44    }
45
46    /// Check if key exists.
47    #[must_use]
48    pub fn has(&self, key: &str) -> bool {
49        self.data.contains_key(key)
50    }
51
52    /// Store key-value pair, evicting oldest if at capacity.
53    pub fn put(&mut self, key: &str, value: i32) {
54        if self.data.contains_key(key) {
55            self.data.insert(key.to_string(), value);
56            return;
57        }
58
59        if self.max_size == 0 {
60            return;
61        }
62
63        if self.keys.len() >= self.max_size {
64            if let Some(oldest_key) = self.keys.first().cloned() {
65                self.keys.remove(0);
66                self.data.remove(&oldest_key);
67            }
68        }
69
70        self.keys.push(key.to_string());
71        self.data.insert(key.to_string(), value);
72    }
73
74    /// Clear all entries.
75    pub fn clear(&mut self) {
76        self.keys.clear();
77        self.data.clear();
78    }
79
80    /// Get current size.
81    #[must_use]
82    pub fn len(&self) -> usize {
83        self.keys.len()
84    }
85
86    /// Check if empty.
87    #[must_use]
88    pub fn is_empty(&self) -> bool {
89        self.keys.is_empty()
90    }
91
92    /// Remove a specific key from the cache.
93    pub fn remove(&mut self, key: &str) {
94        if !self.data.contains_key(key) {
95            return;
96        }
97
98        // Find and remove from keys list
99        if let Some(idx) = self.keys.iter().position(|k| k == key) {
100            self.keys.remove(idx);
101        }
102        self.data.remove(key);
103    }
104}
105
106/// LRU-based content deduplicator.
107///
108/// Tracks seen text fragments to detect duplicates during processing.
109///
110/// # Example
111///
112/// ```
113/// use html_cleaning::dedup::Deduplicator;
114///
115/// let mut dedup = Deduplicator::new(100);
116///
117/// assert!(!dedup.is_duplicate("first occurrence"));
118/// assert!(!dedup.is_duplicate("first occurrence")); // seen once
119/// assert!(!dedup.is_duplicate("first occurrence")); // seen twice
120/// assert!(dedup.is_duplicate("first occurrence"));  // now duplicate (>2)
121/// ```
122pub struct Deduplicator {
123    cache: LruCache,
124    threshold: i32,
125}
126
127impl Deduplicator {
128    /// Create with specified capacity.
129    ///
130    /// Uses default threshold of 2 (text seen more than 2 times is duplicate).
131    ///
132    /// # Arguments
133    /// * `capacity` - Maximum number of entries to track
134    #[must_use]
135    pub fn new(capacity: usize) -> Self {
136        Self {
137            cache: LruCache::new(capacity),
138            threshold: 2,
139        }
140    }
141
142    /// Create with capacity and custom duplicate threshold.
143    ///
144    /// # Arguments
145    /// * `capacity` - Maximum entries
146    /// * `threshold` - Number of times text can appear before considered duplicate
147    #[must_use]
148    pub fn with_threshold(capacity: usize, threshold: i32) -> Self {
149        Self {
150            cache: LruCache::new(capacity),
151            threshold,
152        }
153    }
154
155    /// Check if text is duplicate, adding to cache.
156    ///
157    /// Returns `true` if text has been seen more than threshold times.
158    pub fn is_duplicate(&mut self, text: &str) -> bool {
159        let count = self.cache.get(text).unwrap_or(0);
160        let is_dup = count > self.threshold;
161
162        // Always increment count
163        self.cache.put(text, count + 1);
164
165        is_dup
166    }
167
168    /// Check without adding to cache.
169    #[must_use]
170    pub fn check(&self, text: &str) -> bool {
171        self.cache
172            .get(text)
173            .is_some_and(|count| count > self.threshold)
174    }
175
176    /// Check if text has been seen (regardless of count).
177    #[must_use]
178    pub fn has_seen(&self, text: &str) -> bool {
179        self.cache.has(text)
180    }
181
182    /// Clear the cache.
183    pub fn clear(&mut self) {
184        self.cache.clear();
185    }
186
187    /// Get current cache size.
188    #[must_use]
189    pub fn len(&self) -> usize {
190        self.cache.len()
191    }
192
193    /// Check if cache is empty.
194    #[must_use]
195    pub fn is_empty(&self) -> bool {
196        self.cache.is_empty()
197    }
198}
199
200#[cfg(test)]
201mod tests {
202    use super::*;
203
204    #[test]
205    fn test_new_deduplicator() {
206        let dedup = Deduplicator::new(100);
207        assert!(dedup.is_empty());
208        assert_eq!(dedup.len(), 0);
209    }
210
211    #[test]
212    fn test_is_duplicate() {
213        let mut dedup = Deduplicator::new(100);
214
215        // First 3 occurrences should not be duplicate (threshold is 2)
216        assert!(!dedup.is_duplicate("test"));
217        assert!(!dedup.is_duplicate("test"));
218        assert!(!dedup.is_duplicate("test"));
219
220        // Fourth should be duplicate
221        assert!(dedup.is_duplicate("test"));
222    }
223
224    #[test]
225    fn test_check_without_adding() {
226        let mut dedup = Deduplicator::new(100);
227
228        // check on unseen text returns false without modifying state
229        assert!(!dedup.check("test"));
230        assert!(!dedup.has_seen("test"));
231
232        // Add text enough times to exceed threshold
233        dedup.is_duplicate("test");
234        dedup.is_duplicate("test");
235        dedup.is_duplicate("test");
236
237        // check returns true now (count 3 > threshold 2)
238        assert!(dedup.check("test"));
239    }
240
241    #[test]
242    fn test_has_seen() {
243        let mut dedup = Deduplicator::new(100);
244
245        assert!(!dedup.has_seen("test"));
246        dedup.is_duplicate("test");
247        assert!(dedup.has_seen("test"));
248    }
249
250    #[test]
251    fn test_custom_threshold() {
252        let mut dedup = Deduplicator::with_threshold(100, 1);
253
254        // With threshold 1, second occurrence is duplicate
255        assert!(!dedup.is_duplicate("test")); // count: 1
256        assert!(!dedup.is_duplicate("test")); // count: 2, 1 > 1 = false
257        assert!(dedup.is_duplicate("test")); // count: 3, 2 > 1 = true
258    }
259
260    #[test]
261    fn test_clear() {
262        let mut dedup = Deduplicator::new(100);
263
264        dedup.is_duplicate("test");
265        assert!(!dedup.is_empty());
266
267        dedup.clear();
268        assert!(dedup.is_empty());
269        assert!(!dedup.has_seen("test"));
270    }
271
272    #[test]
273    fn test_capacity_eviction() {
274        let mut dedup = Deduplicator::new(2);
275
276        dedup.is_duplicate("a");
277        dedup.is_duplicate("b");
278        assert_eq!(dedup.len(), 2);
279
280        dedup.is_duplicate("c"); // Should evict "a"
281        assert_eq!(dedup.len(), 2);
282        assert!(!dedup.has_seen("a"));
283        assert!(dedup.has_seen("b"));
284        assert!(dedup.has_seen("c"));
285    }
286
287    // ========== LruCache Tests ==========
288
289    #[test]
290    fn test_lru_basic_put_get() {
291        let mut cache = LruCache::new(10);
292
293        cache.put("key1", 1);
294        cache.put("key2", 2);
295
296        assert_eq!(cache.get("key1"), Some(1));
297        assert_eq!(cache.get("key2"), Some(2));
298        assert_eq!(cache.get("key3"), None);
299    }
300
301    #[test]
302    fn test_lru_has() {
303        let mut cache = LruCache::new(10);
304
305        cache.put("exists", 1);
306
307        assert!(cache.has("exists"));
308        assert!(!cache.has("missing"));
309    }
310
311    #[test]
312    fn test_lru_eviction_at_capacity() {
313        let mut cache = LruCache::new(3);
314
315        cache.put("a", 1);
316        cache.put("b", 2);
317        cache.put("c", 3);
318
319        // All three should exist
320        assert!(cache.has("a"));
321        assert!(cache.has("b"));
322        assert!(cache.has("c"));
323
324        // Add fourth, should evict "a" (oldest)
325        cache.put("d", 4);
326
327        assert!(!cache.has("a")); // Evicted
328        assert!(cache.has("b"));
329        assert!(cache.has("c"));
330        assert!(cache.has("d"));
331    }
332
333    #[test]
334    fn test_lru_update_existing_key() {
335        let mut cache = LruCache::new(3);
336
337        cache.put("a", 1);
338        cache.put("b", 2);
339        cache.put("c", 3);
340
341        // Update "a" - should NOT change order
342        cache.put("a", 100);
343
344        assert_eq!(cache.get("a"), Some(100));
345
346        // Add new key, should still evict "a" (oldest by insertion)
347        cache.put("d", 4);
348
349        // Updating doesn't refresh position, so "a" should be evicted
350        assert!(!cache.has("a"));
351    }
352
353    #[test]
354    fn test_lru_remove() {
355        let mut cache = LruCache::new(10);
356
357        cache.put("a", 1);
358        cache.put("b", 2);
359
360        assert!(cache.has("a"));
361        cache.remove("a");
362        assert!(!cache.has("a"));
363        assert!(cache.has("b"));
364    }
365
366    #[test]
367    fn test_lru_clear() {
368        let mut cache = LruCache::new(10);
369
370        cache.put("a", 1);
371        cache.put("b", 2);
372
373        assert_eq!(cache.len(), 2);
374        cache.clear();
375        assert_eq!(cache.len(), 0);
376        assert!(cache.is_empty());
377    }
378
379    #[test]
380    fn test_lru_zero_capacity() {
381        let mut cache = LruCache::new(0);
382
383        // With zero capacity, nothing should be stored
384        cache.put("a", 1);
385        assert!(!cache.has("a"));
386    }
387
388    #[test]
389    fn test_lru_len() {
390        let mut cache = LruCache::new(10);
391
392        assert_eq!(cache.len(), 0);
393        cache.put("a", 1);
394        assert_eq!(cache.len(), 1);
395        cache.put("b", 2);
396        assert_eq!(cache.len(), 2);
397    }
398
399    #[test]
400    fn test_lru_multiple_evictions() {
401        let mut cache = LruCache::new(2);
402
403        cache.put("a", 1);
404        cache.put("b", 2);
405        cache.put("c", 3); // Evicts "a"
406        cache.put("d", 4); // Evicts "b"
407
408        assert!(!cache.has("a"));
409        assert!(!cache.has("b"));
410        assert!(cache.has("c"));
411        assert!(cache.has("d"));
412        assert_eq!(cache.len(), 2);
413    }
414
415    #[test]
416    fn test_lru_remove_then_add() {
417        let mut cache = LruCache::new(3);
418
419        cache.put("a", 1);
420        cache.put("b", 2);
421        cache.remove("a");
422
423        // Should have room for "a" again
424        cache.put("a", 10);
425        assert_eq!(cache.get("a"), Some(10));
426        assert_eq!(cache.len(), 2);
427    }
428
429    #[test]
430    fn test_lru_single_capacity() {
431        let mut cache = LruCache::new(1);
432
433        cache.put("a", 1);
434        assert!(cache.has("a"));
435
436        cache.put("b", 2);
437        assert!(!cache.has("a")); // Evicted
438        assert!(cache.has("b"));
439    }
440
441    #[test]
442    fn test_lru_empty_operations() {
443        let mut cache = LruCache::new(10);
444
445        assert!(cache.is_empty());
446        assert_eq!(cache.len(), 0);
447        assert_eq!(cache.get("missing"), None);
448        assert!(!cache.has("missing"));
449
450        // Remove on empty cache should not panic
451        cache.remove("missing");
452        assert!(cache.is_empty());
453
454        // Clear on empty cache should not panic
455        cache.clear();
456        assert!(cache.is_empty());
457    }
458}