1#![allow(dead_code)]
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
18pub struct SimHash(pub u64);
19
20impl SimHash {
21 #[must_use]
26 pub fn compute(features: &[u64]) -> Self {
27 let mut counts = [0i64; 64];
28
29 for &feature in features {
30 for bit in 0..64u32 {
31 if (feature >> bit) & 1 == 1 {
32 counts[bit as usize] += 1;
33 } else {
34 counts[bit as usize] -= 1;
35 }
36 }
37 }
38
39 let mut hash = 0u64;
40 for (bit, &count) in counts.iter().enumerate() {
41 if count > 0 {
42 hash |= 1u64 << bit;
43 }
44 }
45
46 Self(hash)
47 }
48
49 #[must_use]
51 pub fn hamming_distance(self, other: &Self) -> u32 {
52 (self.0 ^ other.0).count_ones()
53 }
54
55 #[must_use]
57 pub fn bits(self) -> u64 {
58 self.0
59 }
60
61 #[must_use]
63 pub fn similarity(self, other: &Self) -> f32 {
64 1.0 - self.hamming_distance(other) as f32 / 64.0
65 }
66}
67
68#[derive(Debug, Clone, PartialEq, Eq)]
76pub struct MinHash {
77 pub k: usize,
79 pub signatures: Vec<u64>,
81}
82
83fn fnv_hash(seed: u64, value: u64) -> u64 {
85 let mut h: u64 = seed ^ 0xcbf2_9ce4_8422_2325;
86 h = h.wrapping_mul(0x0000_0100_0000_01b3);
87 h ^= value;
88 h = h.wrapping_mul(0x0000_0100_0000_01b3);
89 h
90}
91
92impl MinHash {
93 #[must_use]
97 pub fn compute(shingles: &[u64], k: usize) -> Self {
98 if shingles.is_empty() || k == 0 {
99 return Self {
100 k,
101 signatures: vec![u64::MAX; k],
102 };
103 }
104
105 let mut signatures = vec![u64::MAX; k];
106
107 for &shingle in shingles {
108 for (i, sig) in signatures.iter_mut().enumerate() {
109 let h = fnv_hash(i as u64, shingle);
110 if h < *sig {
111 *sig = h;
112 }
113 }
114 }
115
116 Self { k, signatures }
117 }
118
119 #[must_use]
121 pub fn jaccard_estimate(&self, other: &Self) -> f32 {
122 let len = self.signatures.len().min(other.signatures.len());
123 if len == 0 {
124 return 0.0;
125 }
126
127 let matches = self
128 .signatures
129 .iter()
130 .zip(other.signatures.iter())
131 .filter(|(a, b)| a == b)
132 .count();
133
134 matches as f32 / len as f32
135 }
136}
137
138#[derive(Debug, Clone)]
144pub struct LshBucket {
145 pub hash: u64,
147 pub item_ids: Vec<u64>,
149}
150
151pub struct LshIndex {
156 bands: usize,
158 rows_per_band: usize,
160 items: Vec<(u64, MinHash)>,
162}
163
164impl LshIndex {
165 #[must_use]
169 pub fn new(bands: usize, rows_per_band: usize) -> Self {
170 Self {
171 bands,
172 rows_per_band,
173 items: Vec::new(),
174 }
175 }
176
177 pub fn add(&mut self, id: u64, minhash: &MinHash) {
179 self.items.push((id, minhash.clone()));
180 }
181
182 #[must_use]
184 pub fn find_candidates(&self, query: &MinHash, threshold: f32) -> Vec<u64> {
185 let mut candidates = Vec::new();
186
187 for &(id, ref minhash) in &self.items {
188 let mut band_collision = false;
190 for band in 0..self.bands {
191 let start = band * self.rows_per_band;
192 let end = (start + self.rows_per_band).min(minhash.signatures.len());
193 let qend = (start + self.rows_per_band).min(query.signatures.len());
194
195 if start >= minhash.signatures.len() || start >= query.signatures.len() {
196 break;
197 }
198
199 let band_hash_a = hash_band(&minhash.signatures[start..end]);
200 let band_hash_b = hash_band(&query.signatures[start..qend]);
201
202 if band_hash_a == band_hash_b {
203 band_collision = true;
204 break;
205 }
206 }
207
208 if band_collision {
209 let sim = minhash.jaccard_estimate(query);
211 if sim >= threshold {
212 candidates.push(id);
213 }
214 }
215 }
216
217 candidates
218 }
219
220 #[must_use]
222 pub fn len(&self) -> usize {
223 self.items.len()
224 }
225
226 #[must_use]
228 pub fn is_empty(&self) -> bool {
229 self.items.is_empty()
230 }
231}
232
233fn hash_band(rows: &[u64]) -> u64 {
235 let mut h = 0xcbf2_9ce4_8422_2325u64;
236 for &v in rows {
237 h ^= v;
238 h = h.wrapping_mul(0x0000_0100_0000_01b3);
239 }
240 h
241}
242
243#[cfg(test)]
248mod tests {
249 use super::*;
250
251 #[test]
254 fn test_simhash_empty() {
255 let h = SimHash::compute(&[]);
256 assert_eq!(h.0, 0);
258 }
259
260 #[test]
261 fn test_simhash_all_ones() {
262 let h = SimHash::compute(&[u64::MAX]);
264 assert_eq!(h.0, u64::MAX);
265 }
266
267 #[test]
268 fn test_simhash_deterministic() {
269 let features = vec![42u64, 123, 9999, 0xDEAD_BEEF];
270 let h1 = SimHash::compute(&features);
271 let h2 = SimHash::compute(&features);
272 assert_eq!(h1, h2);
273 }
274
275 #[test]
276 fn test_simhash_hamming_same() {
277 let h = SimHash::compute(&[42, 100, 200]);
278 assert_eq!(h.hamming_distance(&h), 0);
279 }
280
281 #[test]
282 fn test_simhash_hamming_range() {
283 let h1 = SimHash::compute(&[1, 2, 3]);
284 let h2 = SimHash::compute(&[4, 5, 6]);
285 let dist = h1.hamming_distance(&h2);
286 assert!(dist <= 64);
287 }
288
289 #[test]
290 fn test_simhash_similarity_range() {
291 let h1 = SimHash::compute(&[1, 2, 3]);
292 let h2 = SimHash::compute(&[4, 5, 6]);
293 let sim = h1.similarity(&h2);
294 assert!((0.0..=1.0).contains(&sim));
295 }
296
297 #[test]
298 fn test_simhash_identical_features_equal() {
299 let features = vec![11u64, 22, 33, 44, 55];
300 let h1 = SimHash::compute(&features);
301 let h2 = SimHash::compute(&features);
302 assert_eq!(h1.similarity(&h2), 1.0);
303 }
304
305 #[test]
308 fn test_minhash_empty_shingles() {
309 let mh = MinHash::compute(&[], 10);
310 assert_eq!(mh.signatures.len(), 10);
311 assert!(mh.signatures.iter().all(|&s| s == u64::MAX));
312 }
313
314 #[test]
315 fn test_minhash_identical_sets_max_similarity() {
316 let shingles = vec![1u64, 2, 3, 4, 5];
317 let mh1 = MinHash::compute(&shingles, 64);
318 let mh2 = MinHash::compute(&shingles, 64);
319 assert_eq!(mh1.jaccard_estimate(&mh2), 1.0);
320 }
321
322 #[test]
323 fn test_minhash_disjoint_sets_low_similarity() {
324 let set_a: Vec<u64> = (0..50).collect();
325 let set_b: Vec<u64> = (1000..1050).collect();
326 let mh1 = MinHash::compute(&set_a, 128);
327 let mh2 = MinHash::compute(&set_b, 128);
328 assert!(mh1.jaccard_estimate(&mh2) < 0.1);
330 }
331
332 #[test]
333 fn test_minhash_partial_overlap() {
334 let set_a: Vec<u64> = (0..100).collect();
336 let set_b: Vec<u64> = (50..150).collect();
337 let mh1 = MinHash::compute(&set_a, 256);
338 let mh2 = MinHash::compute(&set_b, 256);
339 let sim = mh1.jaccard_estimate(&mh2);
340 assert!(sim > 0.0 && sim < 1.0);
342 }
343
344 #[test]
345 fn test_minhash_deterministic() {
346 let shingles = vec![7u64, 8, 9, 10];
347 let mh1 = MinHash::compute(&shingles, 32);
348 let mh2 = MinHash::compute(&shingles, 32);
349 assert_eq!(mh1, mh2);
350 }
351
352 #[test]
355 fn test_lsh_empty_index() {
356 let index = LshIndex::new(4, 4);
357 let query = MinHash::compute(&[1, 2, 3], 16);
358 let candidates = index.find_candidates(&query, 0.5);
359 assert!(candidates.is_empty());
360 }
361
362 #[test]
363 fn test_lsh_identical_item_found() {
364 let mut index = LshIndex::new(4, 4);
365 let shingles = vec![1u64, 2, 3, 4, 5, 6, 7, 8, 9, 10];
366 let mh = MinHash::compute(&shingles, 16);
367 index.add(42, &mh);
368
369 let candidates = index.find_candidates(&mh, 0.9);
370 assert!(candidates.contains(&42));
371 }
372
373 #[test]
374 fn test_lsh_different_item_not_found_high_threshold() {
375 let mut index = LshIndex::new(4, 4);
376 let shingles_a: Vec<u64> = (0..20).collect();
377 let shingles_b: Vec<u64> = (1000..1020).collect();
378 let mh_a = MinHash::compute(&shingles_a, 16);
379 let mh_b = MinHash::compute(&shingles_b, 16);
380 index.add(1, &mh_a);
381
382 let candidates = index.find_candidates(&mh_b, 0.9);
383 assert!(!candidates.contains(&1));
384 }
385
386 #[test]
387 fn test_lsh_len() {
388 let mut index = LshIndex::new(4, 4);
389 assert_eq!(index.len(), 0);
390 assert!(index.is_empty());
391 let mh = MinHash::compute(&[1, 2, 3], 16);
392 index.add(1, &mh);
393 index.add(2, &mh);
394 assert_eq!(index.len(), 2);
395 assert!(!index.is_empty());
396 }
397
398 #[test]
399 fn test_lsh_multiple_items() {
400 let mut index = LshIndex::new(4, 4);
401 let base: Vec<u64> = (0..30).collect();
402 for i in 0..5u64 {
403 let mh = MinHash::compute(&base, 16);
404 index.add(i, &mh);
405 }
406
407 let query = MinHash::compute(&base, 16);
408 let candidates = index.find_candidates(&query, 0.9);
409 assert_eq!(candidates.len(), 5);
411 }
412}