Skip to main content

bloom_lib/
minhash.rs

1//! A MinHash sketch for estimating Jaccard similarity between sets.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{
8    hash::{DefaultHashBuilder, HashPair},
9    Error,
10};
11
12/// A fixed-size signature that estimates the Jaccard similarity of the set it
13/// summarises against another sketch.
14///
15/// MinHash represents a set by the minimum hash value each of `k` hash functions
16/// produces over the set's members. The fraction of signature slots two sketches
17/// agree on is an unbiased estimate of the
18/// [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index)
19/// `|A ∩ B| / |A ∪ B|` of the two sets — computed in `O(k)` time and `O(k)`
20/// space, independent of how large the sets are. More hash functions tighten the
21/// estimate (standard error ≈ `1 / sqrt(k)`).
22///
23/// The sketch is generic over the item type `T` and a
24/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
25/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder). Two sketches are only
26/// comparable when built with the same number of hash functions and the same
27/// hasher.
28///
29/// # Examples
30///
31/// ```
32/// use bloom_lib::MinHash;
33///
34/// let mut a = MinHash::new(256).unwrap();
35/// let mut b = MinHash::new(256).unwrap();
36///
37/// for w in ["the", "quick", "brown", "fox"] {
38///     a.insert(w);
39/// }
40/// for w in ["the", "quick", "red", "fox"] {
41///     b.insert(w);
42/// }
43///
44/// // True Jaccard similarity is 3/5 = 0.6 (shared: the, quick, fox).
45/// let similarity = a.similarity(&b).unwrap();
46/// assert!((0.45..=0.75).contains(&similarity));
47/// ```
48#[derive(Debug, Clone)]
49#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
50pub struct MinHash<T: ?Sized, S = DefaultHashBuilder> {
51    signature: Vec<u64>,
52    #[cfg_attr(feature = "serde", serde(skip))]
53    hasher: S,
54    #[cfg_attr(feature = "serde", serde(skip))]
55    _marker: PhantomData<fn(&T)>,
56}
57
58impl<T: ?Sized> MinHash<T, DefaultHashBuilder> {
59    /// Creates a sketch with `num_hashes` hash functions, using the default
60    /// hasher.
61    ///
62    /// More hash functions give a more precise similarity estimate at
63    /// proportional memory and update cost. A few hundred is typical; the
64    /// standard error of the estimate is about `1 / sqrt(num_hashes)`.
65    ///
66    /// # Parameters
67    ///
68    /// - `num_hashes`: the signature length. Must be non-zero.
69    ///
70    /// # Errors
71    ///
72    /// Returns [`Error::InvalidParameter`] if `num_hashes` is zero.
73    ///
74    /// # Examples
75    ///
76    /// ```
77    /// use bloom_lib::MinHash;
78    ///
79    /// let sketch = MinHash::<&str>::new(128).unwrap();
80    /// assert_eq!(sketch.num_hashes(), 128);
81    /// assert!(sketch.is_empty());
82    /// ```
83    pub fn new(num_hashes: usize) -> Result<Self, Error> {
84        Self::with_hasher(num_hashes, DefaultHashBuilder)
85    }
86}
87
88impl<T: ?Sized, S: BuildHasher> MinHash<T, S> {
89    /// Creates a sketch with `num_hashes` hash functions and a caller-supplied
90    /// hasher.
91    ///
92    /// # Errors
93    ///
94    /// Returns [`Error::InvalidParameter`] if `num_hashes` is zero.
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// # #[cfg(feature = "std")] {
100    /// use std::collections::hash_map::RandomState;
101    /// use bloom_lib::MinHash;
102    ///
103    /// let sketch: MinHash<&str, RandomState> =
104    ///     MinHash::with_hasher(128, RandomState::new()).unwrap();
105    /// # }
106    /// ```
107    pub fn with_hasher(num_hashes: usize, hasher: S) -> Result<Self, Error> {
108        if num_hashes == 0 {
109            return Err(Error::InvalidParameter {
110                param: "num_hashes",
111                reason: "must be greater than zero",
112            });
113        }
114        Ok(Self {
115            signature: vec![u64::MAX; num_hashes],
116            hasher,
117            _marker: PhantomData,
118        })
119    }
120
121    /// Adds `item` to the set this sketch summarises.
122    ///
123    /// Each of the `k` hash functions is evaluated on `item` and the
124    /// corresponding signature slot keeps the running minimum. Re-inserting an
125    /// item already present has no effect, so the sketch depends only on the
126    /// distinct members of the set.
127    ///
128    /// # Examples
129    ///
130    /// ```
131    /// use bloom_lib::MinHash;
132    ///
133    /// let mut sketch = MinHash::new(64).unwrap();
134    /// sketch.insert("member");
135    /// assert!(!sketch.is_empty());
136    /// ```
137    pub fn insert(&mut self, item: &T)
138    where
139        T: core::hash::Hash,
140    {
141        let pair = HashPair::new(item, &self.hasher);
142        for (i, slot) in self.signature.iter_mut().enumerate() {
143            let value = pair.nth(i as u64);
144            if value < *slot {
145                *slot = value;
146            }
147        }
148    }
149
150    /// Estimates the Jaccard similarity between this sketch and `other`.
151    ///
152    /// The result is the fraction of signature slots whose minimum hash agrees,
153    /// which is an unbiased estimator of `|A ∩ B| / |A ∪ B|`. Identical sets
154    /// estimate `1.0`; disjoint sets estimate near `0.0`. Two empty sketches
155    /// agree on every slot and so report `1.0`.
156    ///
157    /// # Errors
158    ///
159    /// Returns [`Error::IncompatibleParameters`] if the two sketches have
160    /// different signature lengths.
161    ///
162    /// # Examples
163    ///
164    /// ```
165    /// use bloom_lib::MinHash;
166    ///
167    /// let mut a = MinHash::new(256).unwrap();
168    /// let mut b = MinHash::new(256).unwrap();
169    /// for i in 0..1_000u32 {
170    ///     a.insert(&i);
171    /// }
172    /// for i in 0..1_000u32 {
173    ///     b.insert(&i);
174    /// }
175    /// // Identical sets.
176    /// assert_eq!(a.similarity(&b).unwrap(), 1.0);
177    /// ```
178    pub fn similarity(&self, other: &Self) -> Result<f64, Error> {
179        if self.signature.len() != other.signature.len() {
180            return Err(Error::IncompatibleParameters);
181        }
182        let matches = self
183            .signature
184            .iter()
185            .zip(other.signature.iter())
186            .filter(|(a, b)| a == b)
187            .count();
188        Ok(matches as f64 / self.signature.len() as f64)
189    }
190
191    /// Merges `other` into `self`, producing the sketch of the *union* of the
192    /// two sets.
193    ///
194    /// Each signature slot becomes the minimum of the two sketches' slots, which
195    /// is exactly the slot the union would have produced. Both sketches must
196    /// have the same signature length.
197    ///
198    /// # Errors
199    ///
200    /// Returns [`Error::IncompatibleParameters`] if the signature lengths differ.
201    ///
202    /// # Examples
203    ///
204    /// ```
205    /// use bloom_lib::MinHash;
206    ///
207    /// let mut a = MinHash::new(128).unwrap();
208    /// let mut b = MinHash::new(128).unwrap();
209    /// a.insert("x");
210    /// b.insert("y");
211    ///
212    /// a.merge(&b).unwrap();
213    /// // `a` now summarises {x, y}.
214    /// ```
215    pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
216        if self.signature.len() != other.signature.len() {
217            return Err(Error::IncompatibleParameters);
218        }
219        for (dst, src) in self.signature.iter_mut().zip(other.signature.iter()) {
220            *dst = (*dst).min(*src);
221        }
222        Ok(())
223    }
224
225    /// The number of hash functions in the signature.
226    #[inline]
227    #[must_use]
228    pub fn num_hashes(&self) -> usize {
229        self.signature.len()
230    }
231
232    /// Returns `true` if no items have been inserted.
233    #[inline]
234    #[must_use]
235    pub fn is_empty(&self) -> bool {
236        self.signature.iter().all(|&slot| slot == u64::MAX)
237    }
238
239    /// Resets the sketch to empty, retaining the allocation.
240    ///
241    /// # Examples
242    ///
243    /// ```
244    /// use bloom_lib::MinHash;
245    ///
246    /// let mut sketch = MinHash::new(64).unwrap();
247    /// sketch.insert("x");
248    /// sketch.clear();
249    /// assert!(sketch.is_empty());
250    /// ```
251    pub fn clear(&mut self) {
252        self.signature.iter_mut().for_each(|slot| *slot = u64::MAX);
253    }
254}
255
256#[cfg(test)]
257mod tests {
258    #![allow(clippy::unwrap_used)]
259
260    use super::*;
261
262    #[test]
263    fn test_new_rejects_zero() {
264        assert!(matches!(
265            MinHash::<&str>::new(0),
266            Err(Error::InvalidParameter { .. })
267        ));
268    }
269
270    #[test]
271    fn test_identical_sets_are_fully_similar() {
272        let mut a = MinHash::new(256).unwrap();
273        let mut b = MinHash::new(256).unwrap();
274        for i in 0..1_000u32 {
275            a.insert(&i);
276            b.insert(&i);
277        }
278        assert_eq!(a.similarity(&b).unwrap(), 1.0);
279    }
280
281    #[test]
282    fn test_disjoint_sets_are_dissimilar() {
283        let mut a = MinHash::new(256).unwrap();
284        let mut b = MinHash::new(256).unwrap();
285        for i in 0..1_000u32 {
286            a.insert(&i);
287        }
288        for i in 10_000..11_000u32 {
289            b.insert(&i);
290        }
291        let similarity = a.similarity(&b).unwrap();
292        assert!(similarity < 0.1, "disjoint sets too similar: {similarity}");
293    }
294
295    #[test]
296    fn test_partial_overlap_estimate() {
297        // A = [0, 1000), B = [500, 1500). |A∩B| = 500, |A∪B| = 1500 -> J = 1/3.
298        let mut a = MinHash::new(512).unwrap();
299        let mut b = MinHash::new(512).unwrap();
300        for i in 0..1_000u32 {
301            a.insert(&i);
302        }
303        for i in 500..1_500u32 {
304            b.insert(&i);
305        }
306        let similarity = a.similarity(&b).unwrap();
307        assert!(
308            (0.27..=0.40).contains(&similarity),
309            "estimate {similarity} far from 1/3"
310        );
311    }
312
313    #[test]
314    fn test_similarity_rejects_mismatched_lengths() {
315        let a = MinHash::<u32>::new(128).unwrap();
316        let b = MinHash::<u32>::new(256).unwrap();
317        assert_eq!(a.similarity(&b), Err(Error::IncompatibleParameters));
318    }
319
320    #[test]
321    fn test_merge_forms_union() {
322        let mut a = MinHash::new(256).unwrap();
323        let mut b = MinHash::new(256).unwrap();
324        let mut union = MinHash::new(256).unwrap();
325
326        for i in 0..500u32 {
327            a.insert(&i);
328            union.insert(&i);
329        }
330        for i in 500..1_000u32 {
331            b.insert(&i);
332            union.insert(&i);
333        }
334
335        a.merge(&b).unwrap();
336        // The merged sketch should match the directly-built union exactly.
337        assert_eq!(a.similarity(&union).unwrap(), 1.0);
338    }
339
340    #[test]
341    fn test_merge_rejects_mismatched_lengths() {
342        let mut a = MinHash::<u32>::new(128).unwrap();
343        let b = MinHash::<u32>::new(64).unwrap();
344        assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
345    }
346
347    #[test]
348    fn test_clear() {
349        let mut sketch = MinHash::new(64).unwrap();
350        sketch.insert("x");
351        assert!(!sketch.is_empty());
352        sketch.clear();
353        assert!(sketch.is_empty());
354    }
355}