Skip to main content

oximedia_dedup/
minhash.rs

1#![allow(dead_code)]
2//! `MinHash`-based approximate similarity estimation.
3//!
4//! Implements the `MinHash` algorithm for efficiently estimating Jaccard
5//! similarity between sets, useful for near-duplicate detection on
6//! large media collections without pairwise comparison.
7
8use std::collections::HashSet;
9
10/// Default number of hash functions in a `MinHash` signature.
11const DEFAULT_NUM_HASHES: usize = 128;
12
13/// Large prime for hash computation.
14const HASH_PRIME: u64 = 0x0001_0000_01B3;
15
16/// Modulus for hash computation.
17const HASH_MOD: u64 = (1u64 << 61) - 1;
18
19/// A single hash function parameterized by (a, b).
20#[derive(Debug, Clone, Copy)]
21pub struct HashParams {
22    /// Multiplier.
23    a: u64,
24    /// Offset.
25    b: u64,
26}
27
28impl HashParams {
29    /// Create new hash parameters.
30    pub fn new(a: u64, b: u64) -> Self {
31        Self { a, b }
32    }
33
34    /// Apply this hash function to a value.
35    pub fn apply(&self, value: u64) -> u64 {
36        // (a * value + b) mod HASH_MOD
37        let av = self.a.wrapping_mul(value);
38        let avb = av.wrapping_add(self.b);
39        avb % HASH_MOD
40    }
41}
42
43/// Generate deterministic hash parameters from a seed.
44fn generate_hash_params(num_hashes: usize, seed: u64) -> Vec<HashParams> {
45    let mut params = Vec::with_capacity(num_hashes);
46    let mut state = seed;
47    for _ in 0..num_hashes {
48        state = state.wrapping_mul(HASH_PRIME).wrapping_add(1);
49        let a = (state % HASH_MOD).max(1);
50        state = state.wrapping_mul(HASH_PRIME).wrapping_add(1);
51        let b = state % HASH_MOD;
52        params.push(HashParams::new(a, b));
53    }
54    params
55}
56
57/// A `MinHash` signature representing a set.
58#[derive(Debug, Clone)]
59pub struct MinHashSignature {
60    /// The minimum hash values for each hash function.
61    values: Vec<u64>,
62}
63
64impl MinHashSignature {
65    /// Create a new empty signature with the given number of hashes.
66    pub fn new(num_hashes: usize) -> Self {
67        Self {
68            values: vec![u64::MAX; num_hashes],
69        }
70    }
71
72    /// Get the signature values.
73    pub fn values(&self) -> &[u64] {
74        &self.values
75    }
76
77    /// Get the number of hash functions.
78    pub fn num_hashes(&self) -> usize {
79        self.values.len()
80    }
81
82    /// Estimate Jaccard similarity with another signature.
83    #[allow(clippy::cast_precision_loss)]
84    pub fn jaccard_similarity(&self, other: &Self) -> f64 {
85        if self.values.len() != other.values.len() {
86            return 0.0;
87        }
88        if self.values.is_empty() {
89            return 1.0;
90        }
91        let matches = self
92            .values
93            .iter()
94            .zip(other.values.iter())
95            .filter(|(a, b)| a == b)
96            .count();
97        matches as f64 / self.values.len() as f64
98    }
99}
100
101/// `MinHash` engine for computing signatures.
102#[derive(Debug, Clone)]
103pub struct MinHasher {
104    /// Hash function parameters.
105    params: Vec<HashParams>,
106    /// Number of hash functions.
107    num_hashes: usize,
108}
109
110impl MinHasher {
111    /// Create a new `MinHash` engine with the default number of hashes.
112    pub fn new() -> Self {
113        Self::with_num_hashes(DEFAULT_NUM_HASHES)
114    }
115
116    /// Create a new `MinHash` engine with a specific number of hashes.
117    pub fn with_num_hashes(num_hashes: usize) -> Self {
118        let num_hashes = num_hashes.max(1);
119        let params = generate_hash_params(num_hashes, 42);
120        Self { params, num_hashes }
121    }
122
123    /// Create with a custom seed for reproducibility.
124    pub fn with_seed(num_hashes: usize, seed: u64) -> Self {
125        let num_hashes = num_hashes.max(1);
126        let params = generate_hash_params(num_hashes, seed);
127        Self { params, num_hashes }
128    }
129
130    /// Compute a `MinHash` signature from a set of elements.
131    pub fn compute_signature(&self, elements: &[u64]) -> MinHashSignature {
132        let mut sig = MinHashSignature::new(self.num_hashes);
133        for &elem in elements {
134            for (i, param) in self.params.iter().enumerate() {
135                let h = param.apply(elem);
136                if h < sig.values[i] {
137                    sig.values[i] = h;
138                }
139            }
140        }
141        sig
142    }
143
144    /// Compute signature from byte shingles of a given width.
145    pub fn compute_from_bytes(&self, data: &[u8], shingle_width: usize) -> MinHashSignature {
146        let width = shingle_width.max(1);
147        if data.len() < width {
148            return MinHashSignature::new(self.num_hashes);
149        }
150
151        let mut elements = Vec::new();
152        for window in data.windows(width) {
153            let mut h: u64 = 0;
154            for &b in window {
155                h = h.wrapping_mul(31).wrapping_add(u64::from(b));
156            }
157            elements.push(h);
158        }
159        self.compute_signature(&elements)
160    }
161
162    /// Get the number of hash functions.
163    pub fn num_hashes(&self) -> usize {
164        self.num_hashes
165    }
166}
167
168impl Default for MinHasher {
169    fn default() -> Self {
170        Self::new()
171    }
172}
173
174/// A collection of `MinHash` signatures for batch similarity queries.
175#[derive(Debug, Clone)]
176pub struct MinHashIndex {
177    /// Stored signatures with labels.
178    entries: Vec<(String, MinHashSignature)>,
179    /// Similarity threshold for duplicate detection.
180    threshold: f64,
181}
182
183impl MinHashIndex {
184    /// Create a new index with the given similarity threshold.
185    pub fn new(threshold: f64) -> Self {
186        Self {
187            entries: Vec::new(),
188            threshold: threshold.clamp(0.0, 1.0),
189        }
190    }
191
192    /// Add a signature to the index.
193    pub fn insert(&mut self, label: String, signature: MinHashSignature) {
194        self.entries.push((label, signature));
195    }
196
197    /// Get the number of entries.
198    pub fn len(&self) -> usize {
199        self.entries.len()
200    }
201
202    /// Check if the index is empty.
203    pub fn is_empty(&self) -> bool {
204        self.entries.is_empty()
205    }
206
207    /// Find all entries similar to a query signature.
208    pub fn query_similar(&self, query: &MinHashSignature) -> Vec<(&str, f64)> {
209        let mut results = Vec::new();
210        for (label, sig) in &self.entries {
211            let sim = query.jaccard_similarity(sig);
212            if sim >= self.threshold {
213                results.push((label.as_str(), sim));
214            }
215        }
216        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
217        results
218    }
219
220    /// Find all duplicate pairs above the threshold.
221    pub fn find_duplicates(&self) -> Vec<(&str, &str, f64)> {
222        let mut pairs = Vec::new();
223        for i in 0..self.entries.len() {
224            for j in (i + 1)..self.entries.len() {
225                let sim = self.entries[i].1.jaccard_similarity(&self.entries[j].1);
226                if sim >= self.threshold {
227                    pairs.push((self.entries[i].0.as_str(), self.entries[j].0.as_str(), sim));
228                }
229            }
230        }
231        pairs.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
232        pairs
233    }
234
235    /// Get the threshold.
236    pub fn threshold(&self) -> f64 {
237        self.threshold
238    }
239}
240
241/// Compute exact Jaccard similarity between two sets for comparison.
242#[allow(clippy::cast_precision_loss)]
243pub fn exact_jaccard(set_a: &[u64], set_b: &[u64]) -> f64 {
244    let a: HashSet<u64> = set_a.iter().copied().collect();
245    let b: HashSet<u64> = set_b.iter().copied().collect();
246    let intersection = a.intersection(&b).count();
247    let union = a.union(&b).count();
248    if union == 0 {
249        return 1.0;
250    }
251    intersection as f64 / union as f64
252}
253
254#[cfg(test)]
255mod tests {
256    use super::*;
257
258    #[test]
259    fn test_hash_params_apply() {
260        let p = HashParams::new(3, 7);
261        let h1 = p.apply(10);
262        let h2 = p.apply(10);
263        assert_eq!(h1, h2); // deterministic
264    }
265
266    #[test]
267    fn test_generate_hash_params() {
268        let params = generate_hash_params(10, 42);
269        assert_eq!(params.len(), 10);
270        // All a values should be >= 1
271        for p in &params {
272            assert!(p.a >= 1);
273        }
274    }
275
276    #[test]
277    fn test_minhash_signature_new() {
278        let sig = MinHashSignature::new(64);
279        assert_eq!(sig.num_hashes(), 64);
280        assert!(sig.values().iter().all(|&v| v == u64::MAX));
281    }
282
283    #[test]
284    fn test_jaccard_identical() {
285        let hasher = MinHasher::with_num_hashes(64);
286        let elements = vec![1, 2, 3, 4, 5];
287        let sig1 = hasher.compute_signature(&elements);
288        let sig2 = hasher.compute_signature(&elements);
289        assert!((sig1.jaccard_similarity(&sig2) - 1.0).abs() < f64::EPSILON);
290    }
291
292    #[test]
293    fn test_jaccard_disjoint() {
294        let hasher = MinHasher::with_num_hashes(256);
295        let a: Vec<u64> = (0..100).collect();
296        let b: Vec<u64> = (10000..10100).collect();
297        let sig_a = hasher.compute_signature(&a);
298        let sig_b = hasher.compute_signature(&b);
299        let sim = sig_a.jaccard_similarity(&sig_b);
300        // Disjoint sets should have very low similarity
301        assert!(sim < 0.15, "Expected low similarity, got {sim}");
302    }
303
304    #[test]
305    fn test_jaccard_similar_sets() {
306        let hasher = MinHasher::with_num_hashes(256);
307        // 90% overlap
308        let a: Vec<u64> = (0..100).collect();
309        let b: Vec<u64> = (0..90).chain(200..210).collect();
310        let sig_a = hasher.compute_signature(&a);
311        let sig_b = hasher.compute_signature(&b);
312        let estimated = sig_a.jaccard_similarity(&sig_b);
313        let exact = exact_jaccard(&a, &b);
314        // Estimated should be reasonably close to exact
315        assert!(
316            (estimated - exact).abs() < 0.2,
317            "Estimated {estimated} vs exact {exact}"
318        );
319    }
320
321    #[test]
322    fn test_jaccard_different_lengths() {
323        let sig_a = MinHashSignature::new(10);
324        let sig_b = MinHashSignature::new(20);
325        assert!((sig_a.jaccard_similarity(&sig_b) - 0.0).abs() < f64::EPSILON);
326    }
327
328    #[test]
329    fn test_jaccard_empty_signature() {
330        let sig_a = MinHashSignature::new(0);
331        let sig_b = MinHashSignature::new(0);
332        assert!((sig_a.jaccard_similarity(&sig_b) - 1.0).abs() < f64::EPSILON);
333    }
334
335    #[test]
336    fn test_minhasher_default() {
337        let hasher = MinHasher::default();
338        assert_eq!(hasher.num_hashes(), DEFAULT_NUM_HASHES);
339    }
340
341    #[test]
342    fn test_compute_from_bytes() {
343        let hasher = MinHasher::with_num_hashes(64);
344        let data = b"Hello, World! This is a test string for minhash.";
345        let sig = hasher.compute_from_bytes(data, 4);
346        assert_eq!(sig.num_hashes(), 64);
347        // Should have found some minimums
348        assert!(sig.values().iter().any(|&v| v < u64::MAX));
349    }
350
351    #[test]
352    fn test_compute_from_bytes_too_short() {
353        let hasher = MinHasher::with_num_hashes(16);
354        let sig = hasher.compute_from_bytes(b"ab", 5);
355        // Data too short for shingle_width=5, should be all MAX
356        assert!(sig.values().iter().all(|&v| v == u64::MAX));
357    }
358
359    #[test]
360    fn test_minhash_index_insert_and_query() {
361        let hasher = MinHasher::with_num_hashes(64);
362        let mut index = MinHashIndex::new(0.5);
363        let sig1 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
364        let sig2 = hasher.compute_signature(&[1, 2, 3, 4, 5]); // identical
365        let sig3 = hasher.compute_signature(&[100, 200, 300, 400, 500]);
366
367        index.insert("file1".to_string(), sig1);
368        index.insert("file2".to_string(), sig3);
369
370        let results = index.query_similar(&sig2);
371        // sig2 == sig1, so file1 should be found
372        assert!(!results.is_empty());
373        assert_eq!(results[0].0, "file1");
374    }
375
376    #[test]
377    fn test_minhash_index_find_duplicates() {
378        let hasher = MinHasher::with_num_hashes(64);
379        let mut index = MinHashIndex::new(0.9);
380        let sig1 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
381        let sig2 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
382        let sig3 = hasher.compute_signature(&[100, 200, 300]);
383
384        index.insert("a.mp4".to_string(), sig1);
385        index.insert("b.mp4".to_string(), sig2);
386        index.insert("c.mp4".to_string(), sig3);
387
388        let dupes = index.find_duplicates();
389        // a.mp4 and b.mp4 should be duplicates
390        assert!(dupes.iter().any(|(a, b, _)| *a == "a.mp4" && *b == "b.mp4"));
391    }
392
393    #[test]
394    fn test_minhash_index_empty() {
395        let index = MinHashIndex::new(0.5);
396        assert!(index.is_empty());
397        assert_eq!(index.len(), 0);
398    }
399
400    #[test]
401    fn test_exact_jaccard() {
402        let a = vec![1, 2, 3, 4, 5];
403        let b = vec![3, 4, 5, 6, 7];
404        let j = exact_jaccard(&a, &b);
405        // intersection = {3,4,5} = 3, union = {1,2,3,4,5,6,7} = 7
406        assert!((j - 3.0 / 7.0).abs() < f64::EPSILON);
407    }
408
409    #[test]
410    fn test_exact_jaccard_empty() {
411        let j = exact_jaccard(&[], &[]);
412        assert!((j - 1.0).abs() < f64::EPSILON);
413    }
414
415    #[test]
416    fn test_with_seed_deterministic() {
417        let h1 = MinHasher::with_seed(32, 12345);
418        let h2 = MinHasher::with_seed(32, 12345);
419        let data = vec![10, 20, 30];
420        let s1 = h1.compute_signature(&data);
421        let s2 = h2.compute_signature(&data);
422        assert_eq!(s1.values(), s2.values());
423    }
424}