bloom_lib/minhash.rs
1//! A MinHash sketch for estimating Jaccard similarity between sets.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{
8 hash::{DefaultHashBuilder, HashPair},
9 Error,
10};
11
12/// A fixed-size signature that estimates the Jaccard similarity of the set it
13/// summarises against another sketch.
14///
15/// MinHash represents a set by the minimum hash value each of `k` hash functions
16/// produces over the set's members. The fraction of signature slots two sketches
17/// agree on is an unbiased estimate of the
18/// [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index)
19/// `|A ∩ B| / |A ∪ B|` of the two sets — computed in `O(k)` time and `O(k)`
20/// space, independent of how large the sets are. More hash functions tighten the
21/// estimate (standard error ≈ `1 / sqrt(k)`).
22///
23/// The sketch is generic over the item type `T` and a
24/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
25/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder). Two sketches are only
26/// comparable when built with the same number of hash functions and the same
27/// hasher.
28///
29/// # Examples
30///
31/// ```
32/// use bloom_lib::MinHash;
33///
34/// let mut a = MinHash::new(256).unwrap();
35/// let mut b = MinHash::new(256).unwrap();
36///
37/// for w in ["the", "quick", "brown", "fox"] {
38/// a.insert(w);
39/// }
40/// for w in ["the", "quick", "red", "fox"] {
41/// b.insert(w);
42/// }
43///
44/// // True Jaccard similarity is 3/5 = 0.6 (shared: the, quick, fox).
45/// let similarity = a.similarity(&b).unwrap();
46/// assert!((0.45..=0.75).contains(&similarity));
47/// ```
48#[derive(Debug, Clone)]
49#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
50pub struct MinHash<T: ?Sized, S = DefaultHashBuilder> {
51 signature: Vec<u64>,
52 #[cfg_attr(feature = "serde", serde(skip))]
53 hasher: S,
54 #[cfg_attr(feature = "serde", serde(skip))]
55 _marker: PhantomData<fn(&T)>,
56}
57
58impl<T: ?Sized> MinHash<T, DefaultHashBuilder> {
59 /// Creates a sketch with `num_hashes` hash functions, using the default
60 /// hasher.
61 ///
62 /// More hash functions give a more precise similarity estimate at
63 /// proportional memory and update cost. A few hundred is typical; the
64 /// standard error of the estimate is about `1 / sqrt(num_hashes)`.
65 ///
66 /// # Parameters
67 ///
68 /// - `num_hashes`: the signature length. Must be non-zero.
69 ///
70 /// # Errors
71 ///
72 /// Returns [`Error::InvalidParameter`] if `num_hashes` is zero.
73 ///
74 /// # Examples
75 ///
76 /// ```
77 /// use bloom_lib::MinHash;
78 ///
79 /// let sketch = MinHash::<&str>::new(128).unwrap();
80 /// assert_eq!(sketch.num_hashes(), 128);
81 /// assert!(sketch.is_empty());
82 /// ```
83 pub fn new(num_hashes: usize) -> Result<Self, Error> {
84 Self::with_hasher(num_hashes, DefaultHashBuilder)
85 }
86}
87
88impl<T: ?Sized, S: BuildHasher> MinHash<T, S> {
89 /// Creates a sketch with `num_hashes` hash functions and a caller-supplied
90 /// hasher.
91 ///
92 /// # Errors
93 ///
94 /// Returns [`Error::InvalidParameter`] if `num_hashes` is zero.
95 ///
96 /// # Examples
97 ///
98 /// ```
99 /// # #[cfg(feature = "std")] {
100 /// use std::collections::hash_map::RandomState;
101 /// use bloom_lib::MinHash;
102 ///
103 /// let sketch: MinHash<&str, RandomState> =
104 /// MinHash::with_hasher(128, RandomState::new()).unwrap();
105 /// # }
106 /// ```
107 pub fn with_hasher(num_hashes: usize, hasher: S) -> Result<Self, Error> {
108 if num_hashes == 0 {
109 return Err(Error::InvalidParameter {
110 param: "num_hashes",
111 reason: "must be greater than zero",
112 });
113 }
114 Ok(Self {
115 signature: vec![u64::MAX; num_hashes],
116 hasher,
117 _marker: PhantomData,
118 })
119 }
120
121 /// Adds `item` to the set this sketch summarises.
122 ///
123 /// Each of the `k` hash functions is evaluated on `item` and the
124 /// corresponding signature slot keeps the running minimum. Re-inserting an
125 /// item already present has no effect, so the sketch depends only on the
126 /// distinct members of the set.
127 ///
128 /// # Examples
129 ///
130 /// ```
131 /// use bloom_lib::MinHash;
132 ///
133 /// let mut sketch = MinHash::new(64).unwrap();
134 /// sketch.insert("member");
135 /// assert!(!sketch.is_empty());
136 /// ```
137 pub fn insert(&mut self, item: &T)
138 where
139 T: core::hash::Hash,
140 {
141 let pair = HashPair::new(item, &self.hasher);
142 for (i, slot) in self.signature.iter_mut().enumerate() {
143 let value = pair.nth(i as u64);
144 if value < *slot {
145 *slot = value;
146 }
147 }
148 }
149
150 /// Estimates the Jaccard similarity between this sketch and `other`.
151 ///
152 /// The result is the fraction of signature slots whose minimum hash agrees,
153 /// which is an unbiased estimator of `|A ∩ B| / |A ∪ B|`. Identical sets
154 /// estimate `1.0`; disjoint sets estimate near `0.0`. Two empty sketches
155 /// agree on every slot and so report `1.0`.
156 ///
157 /// # Errors
158 ///
159 /// Returns [`Error::IncompatibleParameters`] if the two sketches have
160 /// different signature lengths.
161 ///
162 /// # Examples
163 ///
164 /// ```
165 /// use bloom_lib::MinHash;
166 ///
167 /// let mut a = MinHash::new(256).unwrap();
168 /// let mut b = MinHash::new(256).unwrap();
169 /// for i in 0..1_000u32 {
170 /// a.insert(&i);
171 /// }
172 /// for i in 0..1_000u32 {
173 /// b.insert(&i);
174 /// }
175 /// // Identical sets.
176 /// assert_eq!(a.similarity(&b).unwrap(), 1.0);
177 /// ```
178 pub fn similarity(&self, other: &Self) -> Result<f64, Error> {
179 if self.signature.len() != other.signature.len() {
180 return Err(Error::IncompatibleParameters);
181 }
182 let matches = self
183 .signature
184 .iter()
185 .zip(other.signature.iter())
186 .filter(|(a, b)| a == b)
187 .count();
188 Ok(matches as f64 / self.signature.len() as f64)
189 }
190
191 /// Merges `other` into `self`, producing the sketch of the *union* of the
192 /// two sets.
193 ///
194 /// Each signature slot becomes the minimum of the two sketches' slots, which
195 /// is exactly the slot the union would have produced. Both sketches must
196 /// have the same signature length.
197 ///
198 /// # Errors
199 ///
200 /// Returns [`Error::IncompatibleParameters`] if the signature lengths differ.
201 ///
202 /// # Examples
203 ///
204 /// ```
205 /// use bloom_lib::MinHash;
206 ///
207 /// let mut a = MinHash::new(128).unwrap();
208 /// let mut b = MinHash::new(128).unwrap();
209 /// a.insert("x");
210 /// b.insert("y");
211 ///
212 /// a.merge(&b).unwrap();
213 /// // `a` now summarises {x, y}.
214 /// ```
215 pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
216 if self.signature.len() != other.signature.len() {
217 return Err(Error::IncompatibleParameters);
218 }
219 for (dst, src) in self.signature.iter_mut().zip(other.signature.iter()) {
220 *dst = (*dst).min(*src);
221 }
222 Ok(())
223 }
224
225 /// The number of hash functions in the signature.
226 #[inline]
227 #[must_use]
228 pub fn num_hashes(&self) -> usize {
229 self.signature.len()
230 }
231
232 /// Returns `true` if no items have been inserted.
233 #[inline]
234 #[must_use]
235 pub fn is_empty(&self) -> bool {
236 self.signature.iter().all(|&slot| slot == u64::MAX)
237 }
238
239 /// Resets the sketch to empty, retaining the allocation.
240 ///
241 /// # Examples
242 ///
243 /// ```
244 /// use bloom_lib::MinHash;
245 ///
246 /// let mut sketch = MinHash::new(64).unwrap();
247 /// sketch.insert("x");
248 /// sketch.clear();
249 /// assert!(sketch.is_empty());
250 /// ```
251 pub fn clear(&mut self) {
252 self.signature.iter_mut().for_each(|slot| *slot = u64::MAX);
253 }
254}
255
256#[cfg(test)]
257mod tests {
258 #![allow(clippy::unwrap_used)]
259
260 use super::*;
261
262 #[test]
263 fn test_new_rejects_zero() {
264 assert!(matches!(
265 MinHash::<&str>::new(0),
266 Err(Error::InvalidParameter { .. })
267 ));
268 }
269
270 #[test]
271 fn test_identical_sets_are_fully_similar() {
272 let mut a = MinHash::new(256).unwrap();
273 let mut b = MinHash::new(256).unwrap();
274 for i in 0..1_000u32 {
275 a.insert(&i);
276 b.insert(&i);
277 }
278 assert_eq!(a.similarity(&b).unwrap(), 1.0);
279 }
280
281 #[test]
282 fn test_disjoint_sets_are_dissimilar() {
283 let mut a = MinHash::new(256).unwrap();
284 let mut b = MinHash::new(256).unwrap();
285 for i in 0..1_000u32 {
286 a.insert(&i);
287 }
288 for i in 10_000..11_000u32 {
289 b.insert(&i);
290 }
291 let similarity = a.similarity(&b).unwrap();
292 assert!(similarity < 0.1, "disjoint sets too similar: {similarity}");
293 }
294
295 #[test]
296 fn test_partial_overlap_estimate() {
297 // A = [0, 1000), B = [500, 1500). |A∩B| = 500, |A∪B| = 1500 -> J = 1/3.
298 let mut a = MinHash::new(512).unwrap();
299 let mut b = MinHash::new(512).unwrap();
300 for i in 0..1_000u32 {
301 a.insert(&i);
302 }
303 for i in 500..1_500u32 {
304 b.insert(&i);
305 }
306 let similarity = a.similarity(&b).unwrap();
307 assert!(
308 (0.27..=0.40).contains(&similarity),
309 "estimate {similarity} far from 1/3"
310 );
311 }
312
313 #[test]
314 fn test_similarity_rejects_mismatched_lengths() {
315 let a = MinHash::<u32>::new(128).unwrap();
316 let b = MinHash::<u32>::new(256).unwrap();
317 assert_eq!(a.similarity(&b), Err(Error::IncompatibleParameters));
318 }
319
320 #[test]
321 fn test_merge_forms_union() {
322 let mut a = MinHash::new(256).unwrap();
323 let mut b = MinHash::new(256).unwrap();
324 let mut union = MinHash::new(256).unwrap();
325
326 for i in 0..500u32 {
327 a.insert(&i);
328 union.insert(&i);
329 }
330 for i in 500..1_000u32 {
331 b.insert(&i);
332 union.insert(&i);
333 }
334
335 a.merge(&b).unwrap();
336 // The merged sketch should match the directly-built union exactly.
337 assert_eq!(a.similarity(&union).unwrap(), 1.0);
338 }
339
340 #[test]
341 fn test_merge_rejects_mismatched_lengths() {
342 let mut a = MinHash::<u32>::new(128).unwrap();
343 let b = MinHash::<u32>::new(64).unwrap();
344 assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
345 }
346
347 #[test]
348 fn test_clear() {
349 let mut sketch = MinHash::new(64).unwrap();
350 sketch.insert("x");
351 assert!(!sketch.is_empty());
352 sketch.clear();
353 assert!(sketch.is_empty());
354 }
355}