minhash_rs/minhash.rs
1//! Module providing the MinHash data structure.
2
3use crate::{
4 atomic::IterHashes,
5 prelude::{Min, Primitive},
6 xorshift::XorShift,
7 zero::Zero,
8};
9use core::hash::Hash;
10use core::ops::Index;
11use core::ops::IndexMut;
12
13use crate::prelude::Maximal;
14
15#[repr(transparent)]
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
17pub struct MinHash<Word, const PERMUTATIONS: usize> {
18 words: [Word; PERMUTATIONS],
19}
20
21impl<Word: Maximal, const PERMUTATIONS: usize> Default for MinHash<Word, PERMUTATIONS> {
22 /// Create a new MinHash with the maximal value.
23 ///
24 /// # Examples
25 ///
26 /// ```
27 /// use minhash_rs::prelude::*;
28 ///
29 /// let mut minhash = MinHash::<u64, 128>::default();
30 ///
31 /// assert_eq!(minhash, MinHash::<u64, 128>::new());
32 /// ```
33 fn default() -> Self {
34 Self::new()
35 }
36}
37
38impl<Word: Maximal, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
39 /// Create a new MinHash.
40 ///
41 /// # Examples
42 ///
43 /// ```
44 /// use minhash_rs::prelude::*;
45 ///
46 /// let mut minhash = MinHash::<u64, 128>::new();
47 /// ```
48 pub fn new() -> Self {
49 Self {
50 words: [Word::maximal(); PERMUTATIONS],
51 }
52 }
53}
54
55impl<Word: Min + XorShift + Copy + Eq + Maximal + Zero, const PERMUTATIONS: usize>
56 MinHash<Word, PERMUTATIONS>
57where
58 u64: Primitive<Word>,
59{
60 /// Returns whether the MinHash is empty.
61 ///
62 /// # Examples
63 ///
64 /// ```
65 /// use minhash_rs::prelude::*;
66 ///
67 /// let mut minhash = MinHash::<u8, 16>::new();
68 ///
69 /// assert!(minhash.is_empty());
70 /// minhash.insert_with_siphashes13(42);
71 /// assert!(!minhash.is_empty());
72 /// ```
73 ///
74 pub fn is_empty(&self) -> bool {
75 self.iter().all(|word| *word == Word::maximal())
76 }
77
78 /// Returns whether the MinHash is fully saturated.
79 ///
80 /// # Examples
81 ///
82 /// ```
83 /// use minhash_rs::prelude::*;
84 ///
85 /// let mut minhash = MinHash::<u8, 16>::new();
86 ///
87 /// assert!(!minhash.is_full());
88 ///
89 /// for i in 0..1024 {
90 /// minhash.insert_with_siphashes13(i);
91 /// }
92 ///
93 /// assert!(minhash.is_full());
94 /// ```
95 ///
96 pub fn is_full(&self) -> bool {
97 self.iter().all(|word| *word == Word::zero())
98 }
99}
100
101impl<Word: Min + XorShift + Copy + Eq, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS>
102where
103 Self: IterHashes<Word, PERMUTATIONS>,
104 u64: Primitive<Word>,
105{
106 /// Returns whether the MinHash may contain the provided value, using the SipHasher13.
107 ///
108 /// # Arguments
109 /// * `value` - The value to check.
110 ///
111 /// # Implementative details
112 /// The procedure estimates whether the provided value is contained
113 /// in the current MinHash data structure by checking whether all of
114 /// the words are smaller or equal to all of the hash values that
115 /// are calculated using the provided value as seed.
116 ///
117 /// # Examples
118 ///
119 /// ```
120 /// use minhash_rs::prelude::*;
121 ///
122 /// let mut minhash = MinHash::<u64, 128>::new();
123 ///
124 /// assert!(!minhash.may_contain_value_with_siphashes13(42));
125 /// minhash.insert_with_siphashes13(42);
126 /// assert!(minhash.may_contain_value_with_siphashes13(42));
127 /// minhash.insert_with_siphashes13(47);
128 /// assert!(minhash.may_contain_value_with_siphashes13(47));
129 /// ```
130 ///
131 pub fn may_contain_value_with_siphashes13<H: Hash>(&self, value: H) -> bool {
132 self.iter()
133 .zip(Self::iter_siphashes13_from_value(value))
134 .all(|(word, hash)| word.is_min(hash))
135 }
136
137 /// Insert a value into the MinHash using the SipHasher13.
138 ///
139 /// # Arguments
140 /// * `value` - The value to insert.
141 ///
142 /// # Examples
143 /// In the following example we show how we can
144 /// create a MinHash and insert a value in it.
145 ///
146 /// ```
147 /// use minhash_rs::prelude::*;
148 ///
149 /// let mut minhash = MinHash::<u64, 128>::new();
150 ///
151 /// assert!(!minhash.may_contain_value_with_siphashes13(42));
152 /// minhash.insert_with_siphashes13(42);
153 /// assert!(minhash.may_contain_value_with_siphashes13(42));
154 /// minhash.insert_with_siphashes13(47);
155 /// assert!(minhash.may_contain_value_with_siphashes13(47));
156 /// ```
157 pub fn insert_with_siphashes13<H: Hash>(&mut self, value: H) {
158 for (word, hash) in self
159 .iter_mut()
160 .zip(Self::iter_siphashes13_from_value(value))
161 {
162 word.set_min(hash);
163 }
164 }
165
166 /// Returns whether the MinHash may contain the provided value, using the keyed SipHasher13.
167 ///
168 /// # Arguments
169 /// * `value` - The value to check.
170 /// * `key0` - The first key.
171 /// * `key1` - The second key.
172 ///
173 /// # Implementative details
174 /// The procedure estimates whether the provided value is contained
175 /// in the current MinHash data structure by checking whether all of
176 /// the words are smaller or equal to all of the hash values that
177 /// are calculated using the provided value as seed.
178 ///
179 /// # Examples
180 ///
181 /// ```
182 /// use minhash_rs::prelude::*;
183 ///
184 /// let mut minhash = MinHash::<u64, 128>::new();
185 /// let key0 = 0x0123456789ABCDEF;
186 /// let key1 = 0xFEDCBA9876543210;
187 ///
188 /// assert!(!minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
189 /// minhash.insert_with_keyed_siphashes13(42, key0, key1);
190 /// assert!(minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
191 /// minhash.insert_with_keyed_siphashes13(47, key0, key1);
192 /// assert!(minhash.may_contain_value_with_keyed_siphashes13(47, key0, key1));
193 /// ```
194 ///
195 pub fn may_contain_value_with_keyed_siphashes13<H: Hash>(
196 &self,
197 value: H,
198 key0: u64,
199 key1: u64,
200 ) -> bool {
201 self.iter()
202 .zip(Self::iter_keyed_siphashes13_from_value(value, key0, key1))
203 .all(|(word, hash)| word.is_min(hash))
204 }
205
206 /// Insert a value into the MinHash using the keyed SipHasher13.
207 ///
208 /// # Arguments
209 /// * `value` - The value to insert.
210 /// * `key0` - The first key.
211 /// * `key1` - The second key.
212 ///
213 /// # Examples
214 /// In the following example we show how we can
215 /// create a MinHash and insert a value in it.
216 ///
217 /// ```
218 /// use minhash_rs::prelude::*;
219 ///
220 /// let mut minhash = MinHash::<u64, 128>::new();
221 /// let key0 = 0x0123456789ABCDEF;
222 /// let key1 = 0xFEDCBA9876543210;
223 ///
224 /// assert!(!minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
225 /// minhash.insert_with_keyed_siphashes13(42, key0, key1);
226 /// assert!(minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
227 /// minhash.insert_with_keyed_siphashes13(47, key0, key1);
228 /// assert!(minhash.may_contain_value_with_keyed_siphashes13(47, key0, key1));
229 /// ```
230 pub fn insert_with_keyed_siphashes13<H: Hash>(&mut self, value: H, key0: u64, key1: u64) {
231 for (word, hash) in self
232 .iter_mut()
233 .zip(Self::iter_keyed_siphashes13_from_value(value, key0, key1))
234 {
235 word.set_min(hash);
236 }
237 }
238
239 /// Returns whether the MinHash may contain the provided value, using the FVN.
240 ///
241 /// # Arguments
242 /// * `value` - The value to check.
243 ///
244 /// # Implementative details
245 /// The procedure estimates whether the provided value is contained
246 /// in the current MinHash data structure by checking whether all of
247 /// the words are smaller or equal to all of the hash values that
248 /// are calculated using the provided value as seed.
249 ///
250 /// # Examples
251 ///
252 /// ```
253 /// use minhash_rs::prelude::*;
254 ///
255 /// let mut minhash = MinHash::<u64, 128>::new();
256 ///
257 /// assert!(!minhash.may_contain_value_with_fvn(42));
258 /// minhash.insert_with_fvn(42);
259 /// assert!(minhash.may_contain_value_with_fvn(42));
260 /// minhash.insert_with_fvn(47);
261 /// assert!(minhash.may_contain_value_with_fvn(47));
262 /// ```
263 ///
264 pub fn may_contain_value_with_fvn<H: Hash>(&self, value: H) -> bool {
265 self.iter()
266 .zip(Self::iter_fvn_from_value(value))
267 .all(|(word, hash)| word.is_min(hash))
268 }
269
270 /// Insert a value into the MinHash using the FVN.
271 ///
272 /// # Arguments
273 /// * `value` - The value to insert.
274 ///
275 /// # Examples
276 /// In the following example we show how we can
277 /// create a MinHash and insert a value in it.
278 ///
279 /// ```
280 /// use minhash_rs::prelude::*;
281 ///
282 /// let mut minhash = MinHash::<u64, 128>::new();
283 ///
284 /// assert!(!minhash.may_contain_value_with_fvn(42));
285 /// minhash.insert_with_fvn(42);
286 /// assert!(minhash.may_contain_value_with_fvn(42));
287 /// minhash.insert_with_fvn(47);
288 /// assert!(minhash.may_contain_value_with_fvn(47));
289 /// ```
290 pub fn insert_with_fvn<H: Hash>(&mut self, value: H) {
291 for (word, hash) in self.iter_mut().zip(Self::iter_fvn_from_value(value)) {
292 word.set_min(hash);
293 }
294 }
295
296 /// Returns whether the MinHash may contain the provided value, using the keyed FVN.
297 ///
298 /// # Arguments
299 /// * `value` - The value to check.
300 /// * `key` - The first key.
301 ///
302 /// # Implementative details
303 /// The procedure estimates whether the provided value is contained
304 /// in the current MinHash data structure by checking whether all of
305 /// the words are smaller or equal to all of the hash values that
306 /// are calculated using the provided value as seed.
307 ///
308 /// # Examples
309 ///
310 /// ```
311 /// use minhash_rs::prelude::*;
312 ///
313 /// let mut minhash = MinHash::<u64, 128>::new();
314 /// let key = 0x0123456789ABCDEF;
315 ///
316 /// assert!(!minhash.may_contain_value_with_keyed_fvn(42, key));
317 /// minhash.insert_with_keyed_fvn(42, key);
318 /// assert!(minhash.may_contain_value_with_keyed_fvn(42, key));
319 /// minhash.insert_with_keyed_fvn(47, key);
320 /// assert!(minhash.may_contain_value_with_keyed_fvn(47, key));
321 /// ```
322 ///
323 pub fn may_contain_value_with_keyed_fvn<H: Hash>(&self, value: H, key: u64) -> bool {
324 self.iter()
325 .zip(Self::iter_keyed_fvn_from_value(value, key))
326 .all(|(word, hash)| word.is_min(hash))
327 }
328
329 /// Insert a value into the MinHash using the keyed FVN.
330 ///
331 /// # Arguments
332 /// * `value` - The value to insert.
333 /// * `key` - The first key.
334 ///
335 /// # Examples
336 /// In the following example we show how we can
337 /// create a MinHash and insert a value in it.
338 ///
339 /// ```
340 /// use minhash_rs::prelude::*;
341 ///
342 /// let mut minhash = MinHash::<u64, 128>::new();
343 /// let key = 0x0123456789ABCDEF;
344 ///
345 /// assert!(!minhash.may_contain_value_with_keyed_fvn(42, key));
346 /// minhash.insert_with_keyed_fvn(42, key);
347 /// assert!(minhash.may_contain_value_with_keyed_fvn(42, key));
348 /// minhash.insert_with_keyed_fvn(47, key);
349 /// assert!(minhash.may_contain_value_with_keyed_fvn(47, key));
350 /// ```
351 pub fn insert_with_keyed_fvn<H: Hash>(&mut self, value: H, key: u64) {
352 for (word, hash) in self
353 .iter_mut()
354 .zip(Self::iter_keyed_fvn_from_value(value, key))
355 {
356 word.set_min(hash);
357 }
358 }
359}
360
361impl<Word, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
362 /// Iterate over the words.
363 pub fn iter(&self) -> impl Iterator<Item = &Word> {
364 self.words.iter()
365 }
366
367 /// Iterate over the words mutably.
368 pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut Word> {
369 self.words.iter_mut()
370 }
371
372 /// Returns the number of permutations.
373 ///
374 /// # Examples
375 ///
376 /// ```
377 /// use minhash_rs::prelude::*;
378 ///
379 /// let minhash = MinHash::<u64, 128>::new();
380 ///
381 /// assert_eq!(minhash.number_of_permutations(), 128);
382 /// ```
383 pub fn number_of_permutations(&self) -> usize {
384 PERMUTATIONS
385 }
386
387 /// Returns memory required to store the MinHash in bits.
388 ///
389 /// # Examples
390 /// For a MinHash with 128 permutations and 64 bit words, the memory required is 128 * 64 * 8.
391 ///
392 /// ```
393 /// use minhash_rs::prelude::*;
394 ///
395 /// let minhash = MinHash::<u64, 128>::new();
396 ///
397 /// assert_eq!(minhash.memory(), 128 * 64);
398 /// ```
399 ///
400 /// For a MinHash with 128 permutations and 32 bit words, the memory required is 128 * 32 * 8.
401 ///
402 /// ```
403 /// use minhash_rs::prelude::*;
404 ///
405 /// let minhash = MinHash::<u32, 128>::new();
406 ///
407 /// assert_eq!(minhash.memory(), 128 * 32);
408 /// ```
409 ///
410 pub fn memory(&self) -> usize {
411 PERMUTATIONS * core::mem::size_of::<Word>() * 8
412 }
413}
414
415impl<Word: Eq, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
416 /// Calculate the similarity between two MinHashes.
417 ///
418 /// # Arguments
419 /// * `other` - The other MinHash to compare to.
420 ///
421 ///
422 /// # Examples
423 ///
424 /// ```
425 /// use std::collections::HashSet;
426 /// use minhash_rs::prelude::*;
427 ///
428 /// let first_set: HashSet<u64> = [1_u64, 2_u64, 3_u64, 4_u64, 5_u64, 6_u64, 7_u64, 8_u64].iter().copied().collect();
429 /// let second_set: HashSet<u64> = [5_u64, 6_u64, 7_u64, 8_u64, 9_u64, 10_u64, 11_u64, 12_u64].iter().copied().collect();
430 ///
431 /// let mut first_minhash: MinHash<u64, 128> = first_set.iter().collect();
432 /// let mut second_minhash: MinHash<u64, 128> = second_set.iter().collect();
433 ///
434 /// let approximation = first_minhash.estimate_jaccard_index(&second_minhash);
435 /// let ground_truth = first_set.intersection(&second_set).count() as f64 / first_set.union(&second_set).count() as f64;
436 ///
437 /// assert!((approximation - ground_truth).abs() < 0.01, concat!(
438 /// "We expected the approximation to be close to the ground truth, ",
439 /// "but got an error of {} instead. The ground truth is {} and the approximation is {}."
440 /// ), (approximation - ground_truth).abs(), ground_truth, approximation
441 /// );
442 /// ```
443 pub fn estimate_jaccard_index(&self, other: &Self) -> f64 {
444 self.iter()
445 .zip(other.iter())
446 .map(|(l, r)| (l == r) as usize)
447 .sum::<usize>() as f64
448 / PERMUTATIONS as f64
449 }
450}
451
452/// We also implement AsRef and AsMut for direct access on the MinHash words.
453impl<Word, const PERMUTATIONS: usize> AsRef<[Word]> for MinHash<Word, PERMUTATIONS> {
454 fn as_ref(&self) -> &[Word] {
455 &self.words
456 }
457}
458
459impl<Word, const PERMUTATIONS: usize> AsMut<[Word]> for MinHash<Word, PERMUTATIONS> {
460 fn as_mut(&mut self) -> &mut [Word] {
461 &mut self.words
462 }
463}
464
465/// We also provide indexing for the MinHash.
466impl<W: Maximal, const PERMUTATIONS: usize> Index<usize> for MinHash<W, PERMUTATIONS> {
467 type Output = W;
468
469 fn index(&self, index: usize) -> &Self::Output {
470 &self.words[index]
471 }
472}
473
474impl<W: Maximal, const PERMUTATIONS: usize> IndexMut<usize> for MinHash<W, PERMUTATIONS> {
475 fn index_mut(&mut self, index: usize) -> &mut Self::Output {
476 &mut self.words[index]
477 }
478}