minhash_rs/
minhash.rs

1//! Module providing the MinHash data structure.
2
3use crate::{
4    atomic::IterHashes,
5    prelude::{Min, Primitive},
6    xorshift::XorShift,
7    zero::Zero,
8};
9use core::hash::Hash;
10use core::ops::Index;
11use core::ops::IndexMut;
12
13use crate::prelude::Maximal;
14
15#[repr(transparent)]
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
17pub struct MinHash<Word, const PERMUTATIONS: usize> {
18    words: [Word; PERMUTATIONS],
19}
20
21impl<Word: Maximal, const PERMUTATIONS: usize> Default for MinHash<Word, PERMUTATIONS> {
22    /// Create a new MinHash with the maximal value.
23    ///
24    /// # Examples
25    ///
26    /// ```
27    /// use minhash_rs::prelude::*;
28    ///
29    /// let mut minhash = MinHash::<u64, 128>::default();
30    ///
31    /// assert_eq!(minhash, MinHash::<u64, 128>::new());
32    /// ```
33    fn default() -> Self {
34        Self::new()
35    }
36}
37
38impl<Word: Maximal, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
39    /// Create a new MinHash.
40    ///
41    /// # Examples
42    ///
43    /// ```
44    /// use minhash_rs::prelude::*;
45    ///
46    /// let mut minhash = MinHash::<u64, 128>::new();
47    /// ```
48    pub fn new() -> Self {
49        Self {
50            words: [Word::maximal(); PERMUTATIONS],
51        }
52    }
53}
54
55impl<Word: Min + XorShift + Copy + Eq + Maximal + Zero, const PERMUTATIONS: usize>
56    MinHash<Word, PERMUTATIONS>
57where
58    u64: Primitive<Word>,
59{
60    /// Returns whether the MinHash is empty.
61    ///
62    /// # Examples
63    ///
64    /// ```
65    /// use minhash_rs::prelude::*;
66    ///
67    /// let mut minhash = MinHash::<u8, 16>::new();
68    ///
69    /// assert!(minhash.is_empty());
70    /// minhash.insert_with_siphashes13(42);
71    /// assert!(!minhash.is_empty());
72    /// ```
73    ///
74    pub fn is_empty(&self) -> bool {
75        self.iter().all(|word| *word == Word::maximal())
76    }
77
78    /// Returns whether the MinHash is fully saturated.
79    ///
80    /// # Examples
81    ///
82    /// ```
83    /// use minhash_rs::prelude::*;
84    ///
85    /// let mut minhash = MinHash::<u8, 16>::new();
86    ///
87    /// assert!(!minhash.is_full());
88    ///
89    /// for i in 0..1024 {
90    ///    minhash.insert_with_siphashes13(i);
91    /// }
92    ///
93    /// assert!(minhash.is_full());
94    /// ```
95    ///
96    pub fn is_full(&self) -> bool {
97        self.iter().all(|word| *word == Word::zero())
98    }
99}
100
101impl<Word: Min + XorShift + Copy + Eq, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS>
102where
103    Self: IterHashes<Word, PERMUTATIONS>,
104    u64: Primitive<Word>,
105{
106    /// Returns whether the MinHash may contain the provided value, using the SipHasher13.
107    ///
108    /// # Arguments
109    /// * `value` - The value to check.
110    ///
111    /// # Implementative details
112    /// The procedure estimates whether the provided value is contained
113    /// in the current MinHash data structure by checking whether all of
114    /// the words are smaller or equal to all of the hash values that
115    /// are calculated using the provided value as seed.
116    ///
117    /// # Examples
118    ///
119    /// ```
120    /// use minhash_rs::prelude::*;
121    ///
122    /// let mut minhash = MinHash::<u64, 128>::new();
123    ///
124    /// assert!(!minhash.may_contain_value_with_siphashes13(42));
125    /// minhash.insert_with_siphashes13(42);
126    /// assert!(minhash.may_contain_value_with_siphashes13(42));
127    /// minhash.insert_with_siphashes13(47);
128    /// assert!(minhash.may_contain_value_with_siphashes13(47));
129    /// ```
130    ///
131    pub fn may_contain_value_with_siphashes13<H: Hash>(&self, value: H) -> bool {
132        self.iter()
133            .zip(Self::iter_siphashes13_from_value(value))
134            .all(|(word, hash)| word.is_min(hash))
135    }
136
137    /// Insert a value into the MinHash using the SipHasher13.
138    ///
139    /// # Arguments
140    /// * `value` - The value to insert.
141    ///
142    /// # Examples
143    /// In the following example we show how we can
144    /// create a MinHash and insert a value in it.
145    ///
146    /// ```
147    /// use minhash_rs::prelude::*;
148    ///
149    /// let mut minhash = MinHash::<u64, 128>::new();
150    ///
151    /// assert!(!minhash.may_contain_value_with_siphashes13(42));
152    /// minhash.insert_with_siphashes13(42);
153    /// assert!(minhash.may_contain_value_with_siphashes13(42));
154    /// minhash.insert_with_siphashes13(47);
155    /// assert!(minhash.may_contain_value_with_siphashes13(47));
156    /// ```
157    pub fn insert_with_siphashes13<H: Hash>(&mut self, value: H) {
158        for (word, hash) in self
159            .iter_mut()
160            .zip(Self::iter_siphashes13_from_value(value))
161        {
162            word.set_min(hash);
163        }
164    }
165
166    /// Returns whether the MinHash may contain the provided value, using the keyed SipHasher13.
167    ///
168    /// # Arguments
169    /// * `value` - The value to check.
170    /// * `key0` - The first key.
171    /// * `key1` - The second key.
172    ///
173    /// # Implementative details
174    /// The procedure estimates whether the provided value is contained
175    /// in the current MinHash data structure by checking whether all of
176    /// the words are smaller or equal to all of the hash values that
177    /// are calculated using the provided value as seed.
178    ///
179    /// # Examples
180    ///
181    /// ```
182    /// use minhash_rs::prelude::*;
183    ///
184    /// let mut minhash = MinHash::<u64, 128>::new();
185    /// let key0 = 0x0123456789ABCDEF;
186    /// let key1 = 0xFEDCBA9876543210;
187    ///
188    /// assert!(!minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
189    /// minhash.insert_with_keyed_siphashes13(42, key0, key1);
190    /// assert!(minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
191    /// minhash.insert_with_keyed_siphashes13(47, key0, key1);
192    /// assert!(minhash.may_contain_value_with_keyed_siphashes13(47, key0, key1));
193    /// ```
194    ///
195    pub fn may_contain_value_with_keyed_siphashes13<H: Hash>(
196        &self,
197        value: H,
198        key0: u64,
199        key1: u64,
200    ) -> bool {
201        self.iter()
202            .zip(Self::iter_keyed_siphashes13_from_value(value, key0, key1))
203            .all(|(word, hash)| word.is_min(hash))
204    }
205
206    /// Insert a value into the MinHash using the keyed SipHasher13.
207    ///
208    /// # Arguments
209    /// * `value` - The value to insert.
210    /// * `key0` - The first key.
211    /// * `key1` - The second key.
212    ///
213    /// # Examples
214    /// In the following example we show how we can
215    /// create a MinHash and insert a value in it.
216    ///
217    /// ```
218    /// use minhash_rs::prelude::*;
219    ///
220    /// let mut minhash = MinHash::<u64, 128>::new();
221    /// let key0 = 0x0123456789ABCDEF;
222    /// let key1 = 0xFEDCBA9876543210;
223    ///
224    /// assert!(!minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
225    /// minhash.insert_with_keyed_siphashes13(42, key0, key1);
226    /// assert!(minhash.may_contain_value_with_keyed_siphashes13(42, key0, key1));
227    /// minhash.insert_with_keyed_siphashes13(47, key0, key1);
228    /// assert!(minhash.may_contain_value_with_keyed_siphashes13(47, key0, key1));
229    /// ```
230    pub fn insert_with_keyed_siphashes13<H: Hash>(&mut self, value: H, key0: u64, key1: u64) {
231        for (word, hash) in self
232            .iter_mut()
233            .zip(Self::iter_keyed_siphashes13_from_value(value, key0, key1))
234        {
235            word.set_min(hash);
236        }
237    }
238
239    /// Returns whether the MinHash may contain the provided value, using the FVN.
240    ///
241    /// # Arguments
242    /// * `value` - The value to check.
243    ///
244    /// # Implementative details
245    /// The procedure estimates whether the provided value is contained
246    /// in the current MinHash data structure by checking whether all of
247    /// the words are smaller or equal to all of the hash values that
248    /// are calculated using the provided value as seed.
249    ///
250    /// # Examples
251    ///
252    /// ```
253    /// use minhash_rs::prelude::*;
254    ///
255    /// let mut minhash = MinHash::<u64, 128>::new();
256    ///
257    /// assert!(!minhash.may_contain_value_with_fvn(42));
258    /// minhash.insert_with_fvn(42);
259    /// assert!(minhash.may_contain_value_with_fvn(42));
260    /// minhash.insert_with_fvn(47);
261    /// assert!(minhash.may_contain_value_with_fvn(47));
262    /// ```
263    ///
264    pub fn may_contain_value_with_fvn<H: Hash>(&self, value: H) -> bool {
265        self.iter()
266            .zip(Self::iter_fvn_from_value(value))
267            .all(|(word, hash)| word.is_min(hash))
268    }
269
270    /// Insert a value into the MinHash using the FVN.
271    ///
272    /// # Arguments
273    /// * `value` - The value to insert.
274    ///
275    /// # Examples
276    /// In the following example we show how we can
277    /// create a MinHash and insert a value in it.
278    ///
279    /// ```
280    /// use minhash_rs::prelude::*;
281    ///
282    /// let mut minhash = MinHash::<u64, 128>::new();
283    ///
284    /// assert!(!minhash.may_contain_value_with_fvn(42));
285    /// minhash.insert_with_fvn(42);
286    /// assert!(minhash.may_contain_value_with_fvn(42));
287    /// minhash.insert_with_fvn(47);
288    /// assert!(minhash.may_contain_value_with_fvn(47));
289    /// ```
290    pub fn insert_with_fvn<H: Hash>(&mut self, value: H) {
291        for (word, hash) in self.iter_mut().zip(Self::iter_fvn_from_value(value)) {
292            word.set_min(hash);
293        }
294    }
295
296    /// Returns whether the MinHash may contain the provided value, using the keyed FVN.
297    ///
298    /// # Arguments
299    /// * `value` - The value to check.
300    /// * `key` - The first key.
301    ///
302    /// # Implementative details
303    /// The procedure estimates whether the provided value is contained
304    /// in the current MinHash data structure by checking whether all of
305    /// the words are smaller or equal to all of the hash values that
306    /// are calculated using the provided value as seed.
307    ///
308    /// # Examples
309    ///
310    /// ```
311    /// use minhash_rs::prelude::*;
312    ///
313    /// let mut minhash = MinHash::<u64, 128>::new();
314    /// let key = 0x0123456789ABCDEF;
315    ///
316    /// assert!(!minhash.may_contain_value_with_keyed_fvn(42, key));
317    /// minhash.insert_with_keyed_fvn(42, key);
318    /// assert!(minhash.may_contain_value_with_keyed_fvn(42, key));
319    /// minhash.insert_with_keyed_fvn(47, key);
320    /// assert!(minhash.may_contain_value_with_keyed_fvn(47, key));
321    /// ```
322    ///
323    pub fn may_contain_value_with_keyed_fvn<H: Hash>(&self, value: H, key: u64) -> bool {
324        self.iter()
325            .zip(Self::iter_keyed_fvn_from_value(value, key))
326            .all(|(word, hash)| word.is_min(hash))
327    }
328
329    /// Insert a value into the MinHash using the keyed FVN.
330    ///
331    /// # Arguments
332    /// * `value` - The value to insert.
333    /// * `key` - The first key.
334    ///
335    /// # Examples
336    /// In the following example we show how we can
337    /// create a MinHash and insert a value in it.
338    ///
339    /// ```
340    /// use minhash_rs::prelude::*;
341    ///
342    /// let mut minhash = MinHash::<u64, 128>::new();
343    /// let key = 0x0123456789ABCDEF;
344    ///
345    /// assert!(!minhash.may_contain_value_with_keyed_fvn(42, key));
346    /// minhash.insert_with_keyed_fvn(42, key);
347    /// assert!(minhash.may_contain_value_with_keyed_fvn(42, key));
348    /// minhash.insert_with_keyed_fvn(47, key);
349    /// assert!(minhash.may_contain_value_with_keyed_fvn(47, key));
350    /// ```
351    pub fn insert_with_keyed_fvn<H: Hash>(&mut self, value: H, key: u64) {
352        for (word, hash) in self
353            .iter_mut()
354            .zip(Self::iter_keyed_fvn_from_value(value, key))
355        {
356            word.set_min(hash);
357        }
358    }
359}
360
361impl<Word, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
362    /// Iterate over the words.
363    pub fn iter(&self) -> impl Iterator<Item = &Word> {
364        self.words.iter()
365    }
366
367    /// Iterate over the words mutably.
368    pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut Word> {
369        self.words.iter_mut()
370    }
371
372    /// Returns the number of permutations.
373    ///
374    /// # Examples
375    ///
376    /// ```
377    /// use minhash_rs::prelude::*;
378    ///
379    /// let minhash = MinHash::<u64, 128>::new();
380    ///
381    /// assert_eq!(minhash.number_of_permutations(), 128);
382    /// ```
383    pub fn number_of_permutations(&self) -> usize {
384        PERMUTATIONS
385    }
386
387    /// Returns memory required to store the MinHash in bits.
388    ///
389    /// # Examples
390    /// For a MinHash with 128 permutations and 64 bit words, the memory required is 128 * 64 * 8.
391    ///
392    /// ```
393    /// use minhash_rs::prelude::*;
394    ///
395    /// let minhash = MinHash::<u64, 128>::new();
396    ///
397    /// assert_eq!(minhash.memory(), 128 * 64);
398    /// ```
399    ///
400    /// For a MinHash with 128 permutations and 32 bit words, the memory required is 128 * 32 * 8.
401    ///
402    /// ```
403    /// use minhash_rs::prelude::*;
404    ///
405    /// let minhash = MinHash::<u32, 128>::new();
406    ///
407    /// assert_eq!(minhash.memory(), 128 * 32);
408    /// ```
409    ///
410    pub fn memory(&self) -> usize {
411        PERMUTATIONS * core::mem::size_of::<Word>() * 8
412    }
413}
414
415impl<Word: Eq, const PERMUTATIONS: usize> MinHash<Word, PERMUTATIONS> {
416    /// Calculate the similarity between two MinHashes.
417    ///
418    /// # Arguments
419    /// * `other` - The other MinHash to compare to.
420    ///
421    ///
422    /// # Examples
423    ///
424    /// ```
425    /// use std::collections::HashSet;
426    /// use minhash_rs::prelude::*;
427    ///
428    /// let first_set: HashSet<u64> = [1_u64, 2_u64, 3_u64, 4_u64, 5_u64, 6_u64, 7_u64, 8_u64].iter().copied().collect();
429    /// let second_set: HashSet<u64> = [5_u64, 6_u64, 7_u64, 8_u64, 9_u64, 10_u64, 11_u64, 12_u64].iter().copied().collect();
430    ///
431    /// let mut first_minhash: MinHash<u64, 128> = first_set.iter().collect();
432    /// let mut second_minhash: MinHash<u64, 128> = second_set.iter().collect();
433    ///
434    /// let approximation = first_minhash.estimate_jaccard_index(&second_minhash);
435    /// let ground_truth = first_set.intersection(&second_set).count() as f64 / first_set.union(&second_set).count() as f64;
436    ///
437    /// assert!((approximation - ground_truth).abs() < 0.01, concat!(
438    ///     "We expected the approximation to be close to the ground truth, ",
439    ///    "but got an error of {} instead. The ground truth is {} and the approximation is {}."
440    ///    ), (approximation - ground_truth).abs(), ground_truth, approximation
441    /// );
442    /// ```
443    pub fn estimate_jaccard_index(&self, other: &Self) -> f64 {
444        self.iter()
445            .zip(other.iter())
446            .map(|(l, r)| (l == r) as usize)
447            .sum::<usize>() as f64
448            / PERMUTATIONS as f64
449    }
450}
451
452/// We also implement AsRef and AsMut for direct access on the MinHash words.
453impl<Word, const PERMUTATIONS: usize> AsRef<[Word]> for MinHash<Word, PERMUTATIONS> {
454    fn as_ref(&self) -> &[Word] {
455        &self.words
456    }
457}
458
459impl<Word, const PERMUTATIONS: usize> AsMut<[Word]> for MinHash<Word, PERMUTATIONS> {
460    fn as_mut(&mut self) -> &mut [Word] {
461        &mut self.words
462    }
463}
464
465/// We also provide indexing for the MinHash.
466impl<W: Maximal, const PERMUTATIONS: usize> Index<usize> for MinHash<W, PERMUTATIONS> {
467    type Output = W;
468
469    fn index(&self, index: usize) -> &Self::Output {
470        &self.words[index]
471    }
472}
473
474impl<W: Maximal, const PERMUTATIONS: usize> IndexMut<usize> for MinHash<W, PERMUTATIONS> {
475    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
476        &mut self.words[index]
477    }
478}