Skip to main content

oximedia_dedup/
near_duplicate.rs

1//! Near-duplicate detection using locality-sensitive hashing (LSH).
2//!
3//! This module provides:
4//! - `SimHash`: SimHash fingerprint for text/feature vectors
5//! - `MinHash`: MinHash with K independent hash functions
6//! - `LshIndex`: LSH bucket index for approximate nearest-neighbour search
7
8#![allow(dead_code)]
9
10// ---------------------------------------------------------------------------
11// SimHash
12// ---------------------------------------------------------------------------
13
14/// SimHash fingerprint for text or feature vectors.
15///
16/// Computed by weighted XOR accumulation with threshold by bit position.
17#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
18pub struct SimHash(pub u64);
19
20impl SimHash {
21    /// Compute a SimHash from a slice of feature values.
22    ///
23    /// Each feature votes for/against each of the 64 bit positions.
24    /// If the weighted vote for a position is positive, that bit is 1.
25    #[must_use]
26    pub fn compute(features: &[u64]) -> Self {
27        let mut counts = [0i64; 64];
28
29        for &feature in features {
30            for bit in 0..64u32 {
31                if (feature >> bit) & 1 == 1 {
32                    counts[bit as usize] += 1;
33                } else {
34                    counts[bit as usize] -= 1;
35                }
36            }
37        }
38
39        let mut hash = 0u64;
40        for (bit, &count) in counts.iter().enumerate() {
41            if count > 0 {
42                hash |= 1u64 << bit;
43            }
44        }
45
46        Self(hash)
47    }
48
49    /// Compute Hamming distance to another SimHash.
50    #[must_use]
51    pub fn hamming_distance(self, other: &Self) -> u32 {
52        (self.0 ^ other.0).count_ones()
53    }
54
55    /// Return the raw 64-bit value.
56    #[must_use]
57    pub fn bits(self) -> u64 {
58        self.0
59    }
60
61    /// Similarity in [0.0, 1.0].
62    #[must_use]
63    pub fn similarity(self, other: &Self) -> f32 {
64        1.0 - self.hamming_distance(other) as f32 / 64.0
65    }
66}
67
68// ---------------------------------------------------------------------------
69// MinHash
70// ---------------------------------------------------------------------------
71
72/// MinHash with K independent hash functions.
73///
74/// Useful for estimating Jaccard similarity between sets of shingles.
75#[derive(Debug, Clone, PartialEq, Eq)]
76pub struct MinHash {
77    /// Number of hash functions (k).
78    pub k: usize,
79    /// MinHash signatures: k minimum hash values.
80    pub signatures: Vec<u64>,
81}
82
83/// FNV-1a-based hash mixing: combines `seed` with `value`.
84fn fnv_hash(seed: u64, value: u64) -> u64 {
85    let mut h: u64 = seed ^ 0xcbf2_9ce4_8422_2325;
86    h = h.wrapping_mul(0x0000_0100_0000_01b3);
87    h ^= value;
88    h = h.wrapping_mul(0x0000_0100_0000_01b3);
89    h
90}
91
92impl MinHash {
93    /// Compute MinHash signatures for a set of shingles using `k` hash functions.
94    ///
95    /// Each hash function is simulated by FNV mixing with a distinct seed.
96    #[must_use]
97    pub fn compute(shingles: &[u64], k: usize) -> Self {
98        if shingles.is_empty() || k == 0 {
99            return Self {
100                k,
101                signatures: vec![u64::MAX; k],
102            };
103        }
104
105        let mut signatures = vec![u64::MAX; k];
106
107        for &shingle in shingles {
108            for (i, sig) in signatures.iter_mut().enumerate() {
109                let h = fnv_hash(i as u64, shingle);
110                if h < *sig {
111                    *sig = h;
112                }
113            }
114        }
115
116        Self { k, signatures }
117    }
118
119    /// Estimate Jaccard similarity as the fraction of matching min-hashes.
120    #[must_use]
121    pub fn jaccard_estimate(&self, other: &Self) -> f32 {
122        let len = self.signatures.len().min(other.signatures.len());
123        if len == 0 {
124            return 0.0;
125        }
126
127        let matches = self
128            .signatures
129            .iter()
130            .zip(other.signatures.iter())
131            .filter(|(a, b)| a == b)
132            .count();
133
134        matches as f32 / len as f32
135    }
136}
137
138// ---------------------------------------------------------------------------
139// LSH Index
140// ---------------------------------------------------------------------------
141
142/// A single LSH bucket.
143#[derive(Debug, Clone)]
144pub struct LshBucket {
145    /// The bucket hash (band signature).
146    pub hash: u64,
147    /// IDs of items in this bucket.
148    pub item_ids: Vec<u64>,
149}
150
151/// LSH index for approximate nearest-neighbour search over MinHash signatures.
152///
153/// Uses a band-based approach: signatures are divided into `b` bands of `r` rows.
154/// Items sharing the same band hash are candidates.
155pub struct LshIndex {
156    /// Number of bands.
157    bands: usize,
158    /// Number of rows per band.
159    rows_per_band: usize,
160    /// Stored (id, minhash) pairs.
161    items: Vec<(u64, MinHash)>,
162}
163
164impl LshIndex {
165    /// Create a new LSH index.
166    ///
167    /// `bands` × `rows_per_band` should equal the MinHash `k` value.
168    #[must_use]
169    pub fn new(bands: usize, rows_per_band: usize) -> Self {
170        Self {
171            bands,
172            rows_per_band,
173            items: Vec::new(),
174        }
175    }
176
177    /// Add an item with given `id` and its `minhash` signature.
178    pub fn add(&mut self, id: u64, minhash: &MinHash) {
179        self.items.push((id, minhash.clone()));
180    }
181
182    /// Find candidate IDs whose estimated Jaccard similarity to `query` is at least `threshold`.
183    #[must_use]
184    pub fn find_candidates(&self, query: &MinHash, threshold: f32) -> Vec<u64> {
185        let mut candidates = Vec::new();
186
187        for &(id, ref minhash) in &self.items {
188            // Check band collision (any band matches → candidate)
189            let mut band_collision = false;
190            for band in 0..self.bands {
191                let start = band * self.rows_per_band;
192                let end = (start + self.rows_per_band).min(minhash.signatures.len());
193                let qend = (start + self.rows_per_band).min(query.signatures.len());
194
195                if start >= minhash.signatures.len() || start >= query.signatures.len() {
196                    break;
197                }
198
199                let band_hash_a = hash_band(&minhash.signatures[start..end]);
200                let band_hash_b = hash_band(&query.signatures[start..qend]);
201
202                if band_hash_a == band_hash_b {
203                    band_collision = true;
204                    break;
205                }
206            }
207
208            if band_collision {
209                // Verify with full Jaccard estimate
210                let sim = minhash.jaccard_estimate(query);
211                if sim >= threshold {
212                    candidates.push(id);
213                }
214            }
215        }
216
217        candidates
218    }
219
220    /// Return the number of indexed items.
221    #[must_use]
222    pub fn len(&self) -> usize {
223        self.items.len()
224    }
225
226    /// Return true if the index is empty.
227    #[must_use]
228    pub fn is_empty(&self) -> bool {
229        self.items.is_empty()
230    }
231}
232
233/// Compute a hash for a band of signature rows.
234fn hash_band(rows: &[u64]) -> u64 {
235    let mut h = 0xcbf2_9ce4_8422_2325u64;
236    for &v in rows {
237        h ^= v;
238        h = h.wrapping_mul(0x0000_0100_0000_01b3);
239    }
240    h
241}
242
243// ---------------------------------------------------------------------------
244// Unit tests
245// ---------------------------------------------------------------------------
246
247#[cfg(test)]
248mod tests {
249    use super::*;
250
251    // --- SimHash tests ---
252
253    #[test]
254    fn test_simhash_empty() {
255        let h = SimHash::compute(&[]);
256        // No features: all counts 0 → all bits 0
257        assert_eq!(h.0, 0);
258    }
259
260    #[test]
261    fn test_simhash_all_ones() {
262        // Feature with all bits set → all counts positive → all bits 1
263        let h = SimHash::compute(&[u64::MAX]);
264        assert_eq!(h.0, u64::MAX);
265    }
266
267    #[test]
268    fn test_simhash_deterministic() {
269        let features = vec![42u64, 123, 9999, 0xDEAD_BEEF];
270        let h1 = SimHash::compute(&features);
271        let h2 = SimHash::compute(&features);
272        assert_eq!(h1, h2);
273    }
274
275    #[test]
276    fn test_simhash_hamming_same() {
277        let h = SimHash::compute(&[42, 100, 200]);
278        assert_eq!(h.hamming_distance(&h), 0);
279    }
280
281    #[test]
282    fn test_simhash_hamming_range() {
283        let h1 = SimHash::compute(&[1, 2, 3]);
284        let h2 = SimHash::compute(&[4, 5, 6]);
285        let dist = h1.hamming_distance(&h2);
286        assert!(dist <= 64);
287    }
288
289    #[test]
290    fn test_simhash_similarity_range() {
291        let h1 = SimHash::compute(&[1, 2, 3]);
292        let h2 = SimHash::compute(&[4, 5, 6]);
293        let sim = h1.similarity(&h2);
294        assert!((0.0..=1.0).contains(&sim));
295    }
296
297    #[test]
298    fn test_simhash_identical_features_equal() {
299        let features = vec![11u64, 22, 33, 44, 55];
300        let h1 = SimHash::compute(&features);
301        let h2 = SimHash::compute(&features);
302        assert_eq!(h1.similarity(&h2), 1.0);
303    }
304
305    // --- MinHash tests ---
306
307    #[test]
308    fn test_minhash_empty_shingles() {
309        let mh = MinHash::compute(&[], 10);
310        assert_eq!(mh.signatures.len(), 10);
311        assert!(mh.signatures.iter().all(|&s| s == u64::MAX));
312    }
313
314    #[test]
315    fn test_minhash_identical_sets_max_similarity() {
316        let shingles = vec![1u64, 2, 3, 4, 5];
317        let mh1 = MinHash::compute(&shingles, 64);
318        let mh2 = MinHash::compute(&shingles, 64);
319        assert_eq!(mh1.jaccard_estimate(&mh2), 1.0);
320    }
321
322    #[test]
323    fn test_minhash_disjoint_sets_low_similarity() {
324        let set_a: Vec<u64> = (0..50).collect();
325        let set_b: Vec<u64> = (1000..1050).collect();
326        let mh1 = MinHash::compute(&set_a, 128);
327        let mh2 = MinHash::compute(&set_b, 128);
328        // Disjoint sets → estimate near 0
329        assert!(mh1.jaccard_estimate(&mh2) < 0.1);
330    }
331
332    #[test]
333    fn test_minhash_partial_overlap() {
334        // 50% overlap
335        let set_a: Vec<u64> = (0..100).collect();
336        let set_b: Vec<u64> = (50..150).collect();
337        let mh1 = MinHash::compute(&set_a, 256);
338        let mh2 = MinHash::compute(&set_b, 256);
339        let sim = mh1.jaccard_estimate(&mh2);
340        // True Jaccard = 50/150 ≈ 0.33; estimate may vary
341        assert!(sim > 0.0 && sim < 1.0);
342    }
343
344    #[test]
345    fn test_minhash_deterministic() {
346        let shingles = vec![7u64, 8, 9, 10];
347        let mh1 = MinHash::compute(&shingles, 32);
348        let mh2 = MinHash::compute(&shingles, 32);
349        assert_eq!(mh1, mh2);
350    }
351
352    // --- LshIndex tests ---
353
354    #[test]
355    fn test_lsh_empty_index() {
356        let index = LshIndex::new(4, 4);
357        let query = MinHash::compute(&[1, 2, 3], 16);
358        let candidates = index.find_candidates(&query, 0.5);
359        assert!(candidates.is_empty());
360    }
361
362    #[test]
363    fn test_lsh_identical_item_found() {
364        let mut index = LshIndex::new(4, 4);
365        let shingles = vec![1u64, 2, 3, 4, 5, 6, 7, 8, 9, 10];
366        let mh = MinHash::compute(&shingles, 16);
367        index.add(42, &mh);
368
369        let candidates = index.find_candidates(&mh, 0.9);
370        assert!(candidates.contains(&42));
371    }
372
373    #[test]
374    fn test_lsh_different_item_not_found_high_threshold() {
375        let mut index = LshIndex::new(4, 4);
376        let shingles_a: Vec<u64> = (0..20).collect();
377        let shingles_b: Vec<u64> = (1000..1020).collect();
378        let mh_a = MinHash::compute(&shingles_a, 16);
379        let mh_b = MinHash::compute(&shingles_b, 16);
380        index.add(1, &mh_a);
381
382        let candidates = index.find_candidates(&mh_b, 0.9);
383        assert!(!candidates.contains(&1));
384    }
385
386    #[test]
387    fn test_lsh_len() {
388        let mut index = LshIndex::new(4, 4);
389        assert_eq!(index.len(), 0);
390        assert!(index.is_empty());
391        let mh = MinHash::compute(&[1, 2, 3], 16);
392        index.add(1, &mh);
393        index.add(2, &mh);
394        assert_eq!(index.len(), 2);
395        assert!(!index.is_empty());
396    }
397
398    #[test]
399    fn test_lsh_multiple_items() {
400        let mut index = LshIndex::new(4, 4);
401        let base: Vec<u64> = (0..30).collect();
402        for i in 0..5u64 {
403            let mh = MinHash::compute(&base, 16);
404            index.add(i, &mh);
405        }
406
407        let query = MinHash::compute(&base, 16);
408        let candidates = index.find_candidates(&query, 0.9);
409        // All 5 should be found (same shingles)
410        assert_eq!(candidates.len(), 5);
411    }
412}