bloom_lib/bloom.rs
1//! A classic Bloom filter with a tunable false-positive rate.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use crate::{
6 bit_set::BitSet,
7 hash::{reduce, DefaultHashBuilder, HashPair},
8 Error,
9};
10
11/// Natural logarithm of 2, reused by the sizing formulas.
12const LN_2: f64 = core::f64::consts::LN_2;
13
14/// A space-efficient probabilistic set membership test.
15///
16/// A Bloom filter answers "have I seen this item?" using a fraction of the
17/// memory a real set would need. The trade-off is one-sided error:
18/// [`contains`](Self::contains) never reports a false negative (an inserted
19/// item always tests positive) but may report a false positive (an item that
20/// was never inserted may test positive). The probability of a false positive
21/// is tunable at construction time.
22///
23/// Items are not stored, so a Bloom filter cannot enumerate or remove its
24/// contents. When deletion is required, reach for
25/// [`CuckooFilter`](crate::CuckooFilter) instead.
26///
27/// The filter is generic over the item type `T` and a
28/// [`BuildHasher`](core::hash::BuildHasher) `S`, which defaults to the
29/// deterministic [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
30///
31/// # Examples
32///
33/// ```
34/// use bloom_lib::BloomFilter;
35///
36/// // Size for 10,000 items at a 1% false-positive rate.
37/// let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
38///
39/// filter.insert("alice");
40/// filter.insert("bob");
41///
42/// assert!(filter.contains("alice"));
43/// assert!(filter.contains("bob"));
44/// assert!(!filter.contains("carol")); // very likely absent
45/// ```
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub struct BloomFilter<T: ?Sized, S = DefaultHashBuilder> {
49 bits: BitSet,
50 num_hashes: u32,
51 #[cfg_attr(feature = "serde", serde(skip))]
52 hasher: S,
53 #[cfg_attr(feature = "serde", serde(skip))]
54 _marker: PhantomData<fn(&T)>,
55}
56
57impl<T: ?Sized> BloomFilter<T, DefaultHashBuilder> {
58 /// Creates a filter sized for `capacity` items at the target false-positive
59 /// `rate`, using the default hasher.
60 ///
61 /// `capacity` is the number of distinct items you expect to insert; the
62 /// false-positive rate is honoured at that fill level and degrades
63 /// gracefully beyond it. The bit count and hash count are derived from the
64 /// standard Bloom filter formulas.
65 ///
66 /// # Parameters
67 ///
68 /// - `capacity`: expected number of distinct insertions. Must be non-zero.
69 /// - `rate`: desired false-positive probability. Must lie in `(0.0, 1.0)`.
70 ///
71 /// # Errors
72 ///
73 /// Returns [`Error::InvalidParameter`] if `capacity` is zero or `rate` is
74 /// not a finite value strictly between `0.0` and `1.0`.
75 ///
76 /// # Examples
77 ///
78 /// ```
79 /// use bloom_lib::BloomFilter;
80 ///
81 /// let filter = BloomFilter::<&str>::new(1_000, 0.001).unwrap();
82 /// assert!(filter.num_bits() >= 1_000);
83 /// assert!(filter.num_hashes() >= 1);
84 /// ```
85 pub fn new(capacity: usize, rate: f64) -> Result<Self, Error> {
86 Self::with_hasher(capacity, rate, DefaultHashBuilder)
87 }
88
89 /// Creates a filter with an explicit bit count and hash count, using the
90 /// default hasher.
91 ///
92 /// Use this when you want direct control over the filter's geometry rather
93 /// than deriving it from a capacity and rate.
94 ///
95 /// # Parameters
96 ///
97 /// - `num_bits`: size of the bit array. Must be non-zero.
98 /// - `num_hashes`: number of hash positions probed per item. Must be
99 /// non-zero.
100 ///
101 /// # Errors
102 ///
103 /// Returns [`Error::InvalidParameter`] if either argument is zero.
104 ///
105 /// # Examples
106 ///
107 /// ```
108 /// use bloom_lib::BloomFilter;
109 ///
110 /// let filter = BloomFilter::<u64>::with_dimensions(8_192, 5).unwrap();
111 /// assert_eq!(filter.num_bits(), 8_192);
112 /// assert_eq!(filter.num_hashes(), 5);
113 /// ```
114 pub fn with_dimensions(num_bits: u64, num_hashes: u32) -> Result<Self, Error> {
115 Self::with_dimensions_and_hasher(num_bits, num_hashes, DefaultHashBuilder)
116 }
117}
118
119impl<T: ?Sized, S: BuildHasher> BloomFilter<T, S> {
120 /// Creates a filter sized for `capacity` items at the target false-positive
121 /// `rate`, using a caller-supplied hasher.
122 ///
123 /// Identical to [`new`](Self::new) but lets you plug in any
124 /// [`BuildHasher`](core::hash::BuildHasher) — for example a randomly-seeded
125 /// one for resistance against adversarial inputs.
126 ///
127 /// # Errors
128 ///
129 /// Returns [`Error::InvalidParameter`] if `capacity` is zero or `rate` is
130 /// not a finite value strictly between `0.0` and `1.0`.
131 ///
132 /// # Examples
133 ///
134 /// ```
135 /// # #[cfg(feature = "std")] {
136 /// use std::collections::hash_map::RandomState;
137 /// use bloom_lib::BloomFilter;
138 ///
139 /// let filter: BloomFilter<&str, RandomState> =
140 /// BloomFilter::with_hasher(1_000, 0.01, RandomState::new()).unwrap();
141 /// # }
142 /// ```
143 pub fn with_hasher(capacity: usize, rate: f64, hasher: S) -> Result<Self, Error> {
144 if capacity == 0 {
145 return Err(Error::InvalidParameter {
146 param: "capacity",
147 reason: "must be greater than zero",
148 });
149 }
150 if !(rate.is_finite() && rate > 0.0 && rate < 1.0) {
151 return Err(Error::InvalidParameter {
152 param: "rate",
153 reason: "must be a finite value in the open interval (0.0, 1.0)",
154 });
155 }
156
157 let num_bits = optimal_num_bits(capacity, rate);
158 let num_hashes = optimal_num_hashes(num_bits, capacity);
159 Self::with_dimensions_and_hasher(num_bits, num_hashes, hasher)
160 }
161
162 /// Creates a filter with an explicit geometry and a caller-supplied hasher.
163 ///
164 /// # Errors
165 ///
166 /// Returns [`Error::InvalidParameter`] if `num_bits` or `num_hashes` is
167 /// zero.
168 ///
169 /// # Examples
170 ///
171 /// ```
172 /// use bloom_lib::{hash::DefaultHashBuilder, BloomFilter};
173 ///
174 /// let filter = BloomFilter::<u64>::with_dimensions_and_hasher(
175 /// 4_096,
176 /// 4,
177 /// DefaultHashBuilder,
178 /// )
179 /// .unwrap();
180 /// assert_eq!(filter.num_hashes(), 4);
181 /// ```
182 pub fn with_dimensions_and_hasher(
183 num_bits: u64,
184 num_hashes: u32,
185 hasher: S,
186 ) -> Result<Self, Error> {
187 if num_bits == 0 {
188 return Err(Error::InvalidParameter {
189 param: "num_bits",
190 reason: "must be greater than zero",
191 });
192 }
193 if num_hashes == 0 {
194 return Err(Error::InvalidParameter {
195 param: "num_hashes",
196 reason: "must be greater than zero",
197 });
198 }
199
200 Ok(Self {
201 bits: BitSet::new(num_bits),
202 num_hashes,
203 hasher,
204 _marker: PhantomData,
205 })
206 }
207
208 /// Inserts `item`, returning `true` if it was probably not present before.
209 ///
210 /// A return of `true` means at least one of the item's bits was previously
211 /// unset, so the item had definitely not been inserted. A return of `false`
212 /// means every bit was already set, so the item was *probably* already
213 /// present (subject to the filter's false-positive rate). This makes
214 /// `insert` a convenient deduplicating primitive for streaming input.
215 ///
216 /// # Examples
217 ///
218 /// ```
219 /// use bloom_lib::BloomFilter;
220 ///
221 /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
222 /// assert!(filter.insert("first")); // newly added
223 /// assert!(!filter.insert("first")); // already present
224 /// ```
225 pub fn insert(&mut self, item: &T) -> bool
226 where
227 T: core::hash::Hash,
228 {
229 let pair = HashPair::new(item, &self.hasher);
230 let num_bits = self.bits.len();
231 let mut newly_added = false;
232 for i in 0..u64::from(self.num_hashes) {
233 let index = reduce(pair.nth(i), num_bits);
234 // `set` returns the previous value; a `false` means this bit was
235 // unset, so the item is definitely new.
236 if !self.bits.set(index) {
237 newly_added = true;
238 }
239 }
240 newly_added
241 }
242
243 /// Tests whether `item` is in the filter.
244 ///
245 /// Returns `false` only if the item was definitely never inserted. A return
246 /// of `true` means the item is *probably* present, with the filter's
247 /// configured false-positive probability.
248 ///
249 /// # Examples
250 ///
251 /// ```
252 /// use bloom_lib::BloomFilter;
253 ///
254 /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
255 /// filter.insert(&7u32);
256 /// assert!(filter.contains(&7u32));
257 /// assert!(!filter.contains(&99u32));
258 /// ```
259 #[must_use]
260 pub fn contains(&self, item: &T) -> bool
261 where
262 T: core::hash::Hash,
263 {
264 let pair = HashPair::new(item, &self.hasher);
265 let num_bits = self.bits.len();
266 (0..u64::from(self.num_hashes)).all(|i| self.bits.get(reduce(pair.nth(i), num_bits)))
267 }
268
269 /// Removes every item, leaving an empty filter with the same geometry.
270 ///
271 /// The bit allocation is retained, so reusing a cleared filter avoids a
272 /// fresh allocation.
273 ///
274 /// # Examples
275 ///
276 /// ```
277 /// use bloom_lib::BloomFilter;
278 ///
279 /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
280 /// filter.insert("x");
281 /// filter.clear();
282 /// assert!(filter.is_empty());
283 /// assert!(!filter.contains("x"));
284 /// ```
285 pub fn clear(&mut self) {
286 self.bits.clear();
287 }
288
289 /// The size of the underlying bit array.
290 #[inline]
291 #[must_use]
292 pub fn num_bits(&self) -> u64 {
293 self.bits.len()
294 }
295
296 /// The number of hash positions probed per item.
297 #[inline]
298 #[must_use]
299 pub fn num_hashes(&self) -> u32 {
300 self.num_hashes
301 }
302
303 /// The number of set bits in the filter.
304 ///
305 /// This is the raw population count of the bit array, useful for monitoring
306 /// fill level. See [`estimated_len`](Self::estimated_len) for an estimate of
307 /// the number of distinct items inserted.
308 #[inline]
309 #[must_use]
310 pub fn count_ones(&self) -> u64 {
311 self.bits.count_ones()
312 }
313
314 /// Returns `true` if no bits are set, i.e. nothing has been inserted.
315 ///
316 /// # Examples
317 ///
318 /// ```
319 /// use bloom_lib::BloomFilter;
320 ///
321 /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
322 /// assert!(filter.is_empty());
323 /// filter.insert("x");
324 /// assert!(!filter.is_empty());
325 /// ```
326 #[inline]
327 #[must_use]
328 pub fn is_empty(&self) -> bool {
329 self.bits.count_ones() == 0
330 }
331
332 /// Estimates the number of distinct items inserted so far.
333 ///
334 /// Derived from the fraction of set bits using the standard estimator
335 /// `n ≈ -(m / k) · ln(1 − X / m)`, where `m` is the bit count, `k` the hash
336 /// count, and `X` the number of set bits. The estimate is approximate and
337 /// loses accuracy as the filter saturates.
338 ///
339 /// # Examples
340 ///
341 /// ```
342 /// use bloom_lib::BloomFilter;
343 ///
344 /// let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
345 /// for i in 0..1_000u32 {
346 /// filter.insert(&i);
347 /// }
348 /// let estimate = filter.estimated_len();
349 /// // Within a few percent of the true count of 1,000.
350 /// assert!((900..=1_100).contains(&estimate));
351 /// ```
352 #[must_use]
353 pub fn estimated_len(&self) -> u64 {
354 let m = self.bits.len() as f64;
355 let k = f64::from(self.num_hashes);
356 let x = self.bits.count_ones() as f64;
357 if x == 0.0 {
358 return 0;
359 }
360 if x >= m {
361 // Saturated: the estimator diverges, so report the bit count as a
362 // conservative floor rather than infinity.
363 return self.bits.len();
364 }
365 let estimate = -(m / k) * libm::log(1.0 - x / m);
366 // `estimate` is finite and non-negative here; round to the nearest item.
367 libm::round(estimate) as u64
368 }
369
370 /// Estimates the current false-positive probability at the present fill
371 /// level.
372 ///
373 /// Computed as `(X / m)^k`, where `X` is the number of set bits, `m` the bit
374 /// count, and `k` the hash count. This reflects the *actual* fill, so it
375 /// rises above the configured target rate once the filter holds more than
376 /// its design capacity.
377 ///
378 /// # Examples
379 ///
380 /// ```
381 /// use bloom_lib::BloomFilter;
382 ///
383 /// let filter = BloomFilter::<u32>::new(1_000, 0.01).unwrap();
384 /// // An empty filter cannot produce a false positive.
385 /// assert_eq!(filter.estimated_false_positive_rate(), 0.0);
386 /// ```
387 #[must_use]
388 pub fn estimated_false_positive_rate(&self) -> f64 {
389 let m = self.bits.len() as f64;
390 let k = f64::from(self.num_hashes);
391 let fill = self.bits.count_ones() as f64 / m;
392 libm::pow(fill, k)
393 }
394
395 /// Merges `other` into `self` by unioning their bit arrays.
396 ///
397 /// After a successful merge, the filter reports positive for every item
398 /// that was in either operand. Both filters must have been built with
399 /// identical dimensions (bit count and hash count); the hasher is assumed to
400 /// match, which holds automatically for the default hasher.
401 ///
402 /// # Errors
403 ///
404 /// Returns [`Error::IncompatibleParameters`] if the two filters differ in
405 /// bit count or hash count.
406 ///
407 /// # Examples
408 ///
409 /// ```
410 /// use bloom_lib::BloomFilter;
411 ///
412 /// let mut a = BloomFilter::new(1_000, 0.01).unwrap();
413 /// let mut b = BloomFilter::new(1_000, 0.01).unwrap();
414 /// a.insert("from-a");
415 /// b.insert("from-b");
416 ///
417 /// a.merge(&b).unwrap();
418 /// assert!(a.contains("from-a"));
419 /// assert!(a.contains("from-b"));
420 /// ```
421 pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
422 if self.num_hashes != other.num_hashes || !self.bits.is_compatible(&other.bits) {
423 return Err(Error::IncompatibleParameters);
424 }
425 self.bits.union_with(&other.bits);
426 Ok(())
427 }
428}
429
430/// Optimal bit count `m = ceil(-n·ln(p) / (ln2)^2)`, clamped to at least one.
431fn optimal_num_bits(capacity: usize, rate: f64) -> u64 {
432 let n = capacity as f64;
433 let m = -(n * libm::log(rate)) / (LN_2 * LN_2);
434 let rounded = libm::ceil(m);
435 if rounded < 1.0 {
436 1
437 } else {
438 rounded as u64
439 }
440}
441
442/// Optimal hash count `k = round((m/n)·ln2)`, clamped to at least one.
443fn optimal_num_hashes(num_bits: u64, capacity: usize) -> u32 {
444 let k = (num_bits as f64 / capacity as f64) * LN_2;
445 let rounded = libm::round(k);
446 if rounded < 1.0 {
447 1
448 } else {
449 // `m` is bounded by realistic memory, so `k` never approaches u32::MAX.
450 rounded as u32
451 }
452}
453
454#[cfg(test)]
455mod tests {
456 // `insert` returns a novelty flag that most call sites here intentionally
457 // discard; the crate denies `unused_results` for shipping code.
458 #![allow(unused_results)]
459 // Unwrapping known-valid constructor results keeps the tests readable.
460 #![allow(clippy::unwrap_used)]
461
462 use super::*;
463
464 #[test]
465 fn test_new_rejects_zero_capacity() {
466 let err = BloomFilter::<&str>::new(0, 0.01).unwrap_err();
467 assert_eq!(
468 err,
469 Error::InvalidParameter {
470 param: "capacity",
471 reason: "must be greater than zero"
472 }
473 );
474 }
475
476 #[test]
477 fn test_new_rejects_out_of_range_rate() {
478 assert!(matches!(
479 BloomFilter::<&str>::new(10, 0.0),
480 Err(Error::InvalidParameter { .. })
481 ));
482 assert!(matches!(
483 BloomFilter::<&str>::new(10, 1.0),
484 Err(Error::InvalidParameter { .. })
485 ));
486 assert!(matches!(
487 BloomFilter::<&str>::new(10, f64::NAN),
488 Err(Error::InvalidParameter { .. })
489 ));
490 }
491
492 #[test]
493 fn test_with_dimensions_rejects_zeros() {
494 assert!(matches!(
495 BloomFilter::<u8>::with_dimensions(0, 3),
496 Err(Error::InvalidParameter { .. })
497 ));
498 assert!(matches!(
499 BloomFilter::<u8>::with_dimensions(64, 0),
500 Err(Error::InvalidParameter { .. })
501 ));
502 }
503
504 #[test]
505 fn test_no_false_negatives() {
506 let mut filter = BloomFilter::new(1_000, 0.01).unwrap();
507 for i in 0..1_000u32 {
508 filter.insert(&i);
509 }
510 for i in 0..1_000u32 {
511 assert!(filter.contains(&i), "inserted item {i} reported absent");
512 }
513 }
514
515 #[test]
516 fn test_insert_reports_novelty() {
517 let mut filter = BloomFilter::new(100, 0.01).unwrap();
518 assert!(filter.insert("alpha"));
519 assert!(!filter.insert("alpha"));
520 }
521
522 #[test]
523 fn test_false_positive_rate_is_near_target() {
524 let capacity = 10_000;
525 let target = 0.01;
526 let mut filter = BloomFilter::new(capacity, target).unwrap();
527 for i in 0..capacity as u64 {
528 filter.insert(&i);
529 }
530 // Query a disjoint range and measure the observed false-positive rate.
531 let trials = 100_000u64;
532 let mut hits = 0u64;
533 for i in capacity as u64..capacity as u64 + trials {
534 if filter.contains(&i) {
535 hits += 1;
536 }
537 }
538 let observed = hits as f64 / trials as f64;
539 // Allow generous headroom; the point is it is the right order of magnitude.
540 assert!(
541 observed < target * 3.0,
542 "observed FP rate {observed} far exceeds target {target}"
543 );
544 }
545
546 #[test]
547 fn test_clear_empties_filter() {
548 let mut filter = BloomFilter::new(100, 0.01).unwrap();
549 filter.insert("x");
550 assert!(!filter.is_empty());
551 filter.clear();
552 assert!(filter.is_empty());
553 assert!(!filter.contains("x"));
554 }
555
556 #[test]
557 fn test_merge_unions_membership() {
558 let mut a = BloomFilter::new(1_000, 0.01).unwrap();
559 let mut b = BloomFilter::new(1_000, 0.01).unwrap();
560 a.insert("a");
561 b.insert("b");
562 a.merge(&b).unwrap();
563 assert!(a.contains("a"));
564 assert!(a.contains("b"));
565 }
566
567 #[test]
568 fn test_merge_rejects_incompatible() {
569 let mut a = BloomFilter::<u32>::with_dimensions(1_024, 3).unwrap();
570 let b = BloomFilter::<u32>::with_dimensions(2_048, 3).unwrap();
571 assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
572
573 let c = BloomFilter::<u32>::with_dimensions(1_024, 4).unwrap();
574 assert_eq!(a.merge(&c), Err(Error::IncompatibleParameters));
575 }
576
577 #[test]
578 fn test_estimated_len_is_reasonable() {
579 let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
580 for i in 0..1_000u32 {
581 filter.insert(&i);
582 }
583 let estimate = filter.estimated_len();
584 assert!(
585 (900..=1_100).contains(&estimate),
586 "estimate {estimate} not within 10% of 1000"
587 );
588 }
589
590 #[test]
591 fn test_estimated_len_empty_is_zero() {
592 let filter = BloomFilter::<u32>::new(1_000, 0.01).unwrap();
593 assert_eq!(filter.estimated_len(), 0);
594 }
595
596 #[test]
597 fn test_sizing_formulas() {
598 // 10k items at 1% -> ~95,851 bits, k = 7 (textbook values).
599 let bits = optimal_num_bits(10_000, 0.01);
600 assert!(
601 (95_000..=96_500).contains(&bits),
602 "unexpected bit count {bits}"
603 );
604 let k = optimal_num_hashes(bits, 10_000);
605 assert_eq!(k, 7);
606 }
607}