1#![allow(dead_code)]
2use std::collections::HashSet;
9
10const DEFAULT_NUM_HASHES: usize = 128;
12
13const HASH_PRIME: u64 = 0x0001_0000_01B3;
15
16const HASH_MOD: u64 = (1u64 << 61) - 1;
18
19#[derive(Debug, Clone, Copy)]
21pub struct HashParams {
22 a: u64,
24 b: u64,
26}
27
28impl HashParams {
29 pub fn new(a: u64, b: u64) -> Self {
31 Self { a, b }
32 }
33
34 pub fn apply(&self, value: u64) -> u64 {
36 let av = self.a.wrapping_mul(value);
38 let avb = av.wrapping_add(self.b);
39 avb % HASH_MOD
40 }
41}
42
43fn generate_hash_params(num_hashes: usize, seed: u64) -> Vec<HashParams> {
45 let mut params = Vec::with_capacity(num_hashes);
46 let mut state = seed;
47 for _ in 0..num_hashes {
48 state = state.wrapping_mul(HASH_PRIME).wrapping_add(1);
49 let a = (state % HASH_MOD).max(1);
50 state = state.wrapping_mul(HASH_PRIME).wrapping_add(1);
51 let b = state % HASH_MOD;
52 params.push(HashParams::new(a, b));
53 }
54 params
55}
56
57#[derive(Debug, Clone)]
59pub struct MinHashSignature {
60 values: Vec<u64>,
62}
63
64impl MinHashSignature {
65 pub fn new(num_hashes: usize) -> Self {
67 Self {
68 values: vec![u64::MAX; num_hashes],
69 }
70 }
71
72 pub fn values(&self) -> &[u64] {
74 &self.values
75 }
76
77 pub fn num_hashes(&self) -> usize {
79 self.values.len()
80 }
81
82 #[allow(clippy::cast_precision_loss)]
84 pub fn jaccard_similarity(&self, other: &Self) -> f64 {
85 if self.values.len() != other.values.len() {
86 return 0.0;
87 }
88 if self.values.is_empty() {
89 return 1.0;
90 }
91 let matches = self
92 .values
93 .iter()
94 .zip(other.values.iter())
95 .filter(|(a, b)| a == b)
96 .count();
97 matches as f64 / self.values.len() as f64
98 }
99}
100
101#[derive(Debug, Clone)]
103pub struct MinHasher {
104 params: Vec<HashParams>,
106 num_hashes: usize,
108}
109
110impl MinHasher {
111 pub fn new() -> Self {
113 Self::with_num_hashes(DEFAULT_NUM_HASHES)
114 }
115
116 pub fn with_num_hashes(num_hashes: usize) -> Self {
118 let num_hashes = num_hashes.max(1);
119 let params = generate_hash_params(num_hashes, 42);
120 Self { params, num_hashes }
121 }
122
123 pub fn with_seed(num_hashes: usize, seed: u64) -> Self {
125 let num_hashes = num_hashes.max(1);
126 let params = generate_hash_params(num_hashes, seed);
127 Self { params, num_hashes }
128 }
129
130 pub fn compute_signature(&self, elements: &[u64]) -> MinHashSignature {
132 let mut sig = MinHashSignature::new(self.num_hashes);
133 for &elem in elements {
134 for (i, param) in self.params.iter().enumerate() {
135 let h = param.apply(elem);
136 if h < sig.values[i] {
137 sig.values[i] = h;
138 }
139 }
140 }
141 sig
142 }
143
144 pub fn compute_from_bytes(&self, data: &[u8], shingle_width: usize) -> MinHashSignature {
146 let width = shingle_width.max(1);
147 if data.len() < width {
148 return MinHashSignature::new(self.num_hashes);
149 }
150
151 let mut elements = Vec::new();
152 for window in data.windows(width) {
153 let mut h: u64 = 0;
154 for &b in window {
155 h = h.wrapping_mul(31).wrapping_add(u64::from(b));
156 }
157 elements.push(h);
158 }
159 self.compute_signature(&elements)
160 }
161
162 pub fn num_hashes(&self) -> usize {
164 self.num_hashes
165 }
166}
167
168impl Default for MinHasher {
169 fn default() -> Self {
170 Self::new()
171 }
172}
173
174#[derive(Debug, Clone)]
176pub struct MinHashIndex {
177 entries: Vec<(String, MinHashSignature)>,
179 threshold: f64,
181}
182
183impl MinHashIndex {
184 pub fn new(threshold: f64) -> Self {
186 Self {
187 entries: Vec::new(),
188 threshold: threshold.clamp(0.0, 1.0),
189 }
190 }
191
192 pub fn insert(&mut self, label: String, signature: MinHashSignature) {
194 self.entries.push((label, signature));
195 }
196
197 pub fn len(&self) -> usize {
199 self.entries.len()
200 }
201
202 pub fn is_empty(&self) -> bool {
204 self.entries.is_empty()
205 }
206
207 pub fn query_similar(&self, query: &MinHashSignature) -> Vec<(&str, f64)> {
209 let mut results = Vec::new();
210 for (label, sig) in &self.entries {
211 let sim = query.jaccard_similarity(sig);
212 if sim >= self.threshold {
213 results.push((label.as_str(), sim));
214 }
215 }
216 results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
217 results
218 }
219
220 pub fn find_duplicates(&self) -> Vec<(&str, &str, f64)> {
222 let mut pairs = Vec::new();
223 for i in 0..self.entries.len() {
224 for j in (i + 1)..self.entries.len() {
225 let sim = self.entries[i].1.jaccard_similarity(&self.entries[j].1);
226 if sim >= self.threshold {
227 pairs.push((self.entries[i].0.as_str(), self.entries[j].0.as_str(), sim));
228 }
229 }
230 }
231 pairs.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
232 pairs
233 }
234
235 pub fn threshold(&self) -> f64 {
237 self.threshold
238 }
239}
240
241#[allow(clippy::cast_precision_loss)]
243pub fn exact_jaccard(set_a: &[u64], set_b: &[u64]) -> f64 {
244 let a: HashSet<u64> = set_a.iter().copied().collect();
245 let b: HashSet<u64> = set_b.iter().copied().collect();
246 let intersection = a.intersection(&b).count();
247 let union = a.union(&b).count();
248 if union == 0 {
249 return 1.0;
250 }
251 intersection as f64 / union as f64
252}
253
254#[cfg(test)]
255mod tests {
256 use super::*;
257
258 #[test]
259 fn test_hash_params_apply() {
260 let p = HashParams::new(3, 7);
261 let h1 = p.apply(10);
262 let h2 = p.apply(10);
263 assert_eq!(h1, h2); }
265
266 #[test]
267 fn test_generate_hash_params() {
268 let params = generate_hash_params(10, 42);
269 assert_eq!(params.len(), 10);
270 for p in ¶ms {
272 assert!(p.a >= 1);
273 }
274 }
275
276 #[test]
277 fn test_minhash_signature_new() {
278 let sig = MinHashSignature::new(64);
279 assert_eq!(sig.num_hashes(), 64);
280 assert!(sig.values().iter().all(|&v| v == u64::MAX));
281 }
282
283 #[test]
284 fn test_jaccard_identical() {
285 let hasher = MinHasher::with_num_hashes(64);
286 let elements = vec![1, 2, 3, 4, 5];
287 let sig1 = hasher.compute_signature(&elements);
288 let sig2 = hasher.compute_signature(&elements);
289 assert!((sig1.jaccard_similarity(&sig2) - 1.0).abs() < f64::EPSILON);
290 }
291
292 #[test]
293 fn test_jaccard_disjoint() {
294 let hasher = MinHasher::with_num_hashes(256);
295 let a: Vec<u64> = (0..100).collect();
296 let b: Vec<u64> = (10000..10100).collect();
297 let sig_a = hasher.compute_signature(&a);
298 let sig_b = hasher.compute_signature(&b);
299 let sim = sig_a.jaccard_similarity(&sig_b);
300 assert!(sim < 0.15, "Expected low similarity, got {sim}");
302 }
303
304 #[test]
305 fn test_jaccard_similar_sets() {
306 let hasher = MinHasher::with_num_hashes(256);
307 let a: Vec<u64> = (0..100).collect();
309 let b: Vec<u64> = (0..90).chain(200..210).collect();
310 let sig_a = hasher.compute_signature(&a);
311 let sig_b = hasher.compute_signature(&b);
312 let estimated = sig_a.jaccard_similarity(&sig_b);
313 let exact = exact_jaccard(&a, &b);
314 assert!(
316 (estimated - exact).abs() < 0.2,
317 "Estimated {estimated} vs exact {exact}"
318 );
319 }
320
321 #[test]
322 fn test_jaccard_different_lengths() {
323 let sig_a = MinHashSignature::new(10);
324 let sig_b = MinHashSignature::new(20);
325 assert!((sig_a.jaccard_similarity(&sig_b) - 0.0).abs() < f64::EPSILON);
326 }
327
328 #[test]
329 fn test_jaccard_empty_signature() {
330 let sig_a = MinHashSignature::new(0);
331 let sig_b = MinHashSignature::new(0);
332 assert!((sig_a.jaccard_similarity(&sig_b) - 1.0).abs() < f64::EPSILON);
333 }
334
335 #[test]
336 fn test_minhasher_default() {
337 let hasher = MinHasher::default();
338 assert_eq!(hasher.num_hashes(), DEFAULT_NUM_HASHES);
339 }
340
341 #[test]
342 fn test_compute_from_bytes() {
343 let hasher = MinHasher::with_num_hashes(64);
344 let data = b"Hello, World! This is a test string for minhash.";
345 let sig = hasher.compute_from_bytes(data, 4);
346 assert_eq!(sig.num_hashes(), 64);
347 assert!(sig.values().iter().any(|&v| v < u64::MAX));
349 }
350
351 #[test]
352 fn test_compute_from_bytes_too_short() {
353 let hasher = MinHasher::with_num_hashes(16);
354 let sig = hasher.compute_from_bytes(b"ab", 5);
355 assert!(sig.values().iter().all(|&v| v == u64::MAX));
357 }
358
359 #[test]
360 fn test_minhash_index_insert_and_query() {
361 let hasher = MinHasher::with_num_hashes(64);
362 let mut index = MinHashIndex::new(0.5);
363 let sig1 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
364 let sig2 = hasher.compute_signature(&[1, 2, 3, 4, 5]); let sig3 = hasher.compute_signature(&[100, 200, 300, 400, 500]);
366
367 index.insert("file1".to_string(), sig1);
368 index.insert("file2".to_string(), sig3);
369
370 let results = index.query_similar(&sig2);
371 assert!(!results.is_empty());
373 assert_eq!(results[0].0, "file1");
374 }
375
376 #[test]
377 fn test_minhash_index_find_duplicates() {
378 let hasher = MinHasher::with_num_hashes(64);
379 let mut index = MinHashIndex::new(0.9);
380 let sig1 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
381 let sig2 = hasher.compute_signature(&[1, 2, 3, 4, 5]);
382 let sig3 = hasher.compute_signature(&[100, 200, 300]);
383
384 index.insert("a.mp4".to_string(), sig1);
385 index.insert("b.mp4".to_string(), sig2);
386 index.insert("c.mp4".to_string(), sig3);
387
388 let dupes = index.find_duplicates();
389 assert!(dupes.iter().any(|(a, b, _)| *a == "a.mp4" && *b == "b.mp4"));
391 }
392
393 #[test]
394 fn test_minhash_index_empty() {
395 let index = MinHashIndex::new(0.5);
396 assert!(index.is_empty());
397 assert_eq!(index.len(), 0);
398 }
399
400 #[test]
401 fn test_exact_jaccard() {
402 let a = vec![1, 2, 3, 4, 5];
403 let b = vec![3, 4, 5, 6, 7];
404 let j = exact_jaccard(&a, &b);
405 assert!((j - 3.0 / 7.0).abs() < f64::EPSILON);
407 }
408
409 #[test]
410 fn test_exact_jaccard_empty() {
411 let j = exact_jaccard(&[], &[]);
412 assert!((j - 1.0).abs() < f64::EPSILON);
413 }
414
415 #[test]
416 fn test_with_seed_deterministic() {
417 let h1 = MinHasher::with_seed(32, 12345);
418 let h2 = MinHasher::with_seed(32, 12345);
419 let data = vec![10, 20, 30];
420 let s1 = h1.compute_signature(&data);
421 let s2 = h2.compute_signature(&data);
422 assert_eq!(s1.values(), s2.values());
423 }
424}