Skip to main content

bloom_lib/
bloom.rs

1//! A classic Bloom filter with a tunable false-positive rate.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use crate::{
6    bit_set::BitSet,
7    hash::{reduce, DefaultHashBuilder, HashPair},
8    Error,
9};
10
11/// Natural logarithm of 2, reused by the sizing formulas.
12const LN_2: f64 = core::f64::consts::LN_2;
13
14/// A space-efficient probabilistic set membership test.
15///
16/// A Bloom filter answers "have I seen this item?" using a fraction of the
17/// memory a real set would need. The trade-off is one-sided error:
18/// [`contains`](Self::contains) never reports a false negative (an inserted
19/// item always tests positive) but may report a false positive (an item that
20/// was never inserted may test positive). The probability of a false positive
21/// is tunable at construction time.
22///
23/// Items are not stored, so a Bloom filter cannot enumerate or remove its
24/// contents. When deletion is required, reach for
25/// [`CuckooFilter`](crate::CuckooFilter) instead.
26///
27/// The filter is generic over the item type `T` and a
28/// [`BuildHasher`](core::hash::BuildHasher) `S`, which defaults to the
29/// deterministic [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
30///
31/// # Examples
32///
33/// ```
34/// use bloom_lib::BloomFilter;
35///
36/// // Size for 10,000 items at a 1% false-positive rate.
37/// let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
38///
39/// filter.insert("alice");
40/// filter.insert("bob");
41///
42/// assert!(filter.contains("alice"));
43/// assert!(filter.contains("bob"));
44/// assert!(!filter.contains("carol")); // very likely absent
45/// ```
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub struct BloomFilter<T: ?Sized, S = DefaultHashBuilder> {
49    bits: BitSet,
50    num_hashes: u32,
51    #[cfg_attr(feature = "serde", serde(skip))]
52    hasher: S,
53    #[cfg_attr(feature = "serde", serde(skip))]
54    _marker: PhantomData<fn(&T)>,
55}
56
57impl<T: ?Sized> BloomFilter<T, DefaultHashBuilder> {
58    /// Creates a filter sized for `capacity` items at the target false-positive
59    /// `rate`, using the default hasher.
60    ///
61    /// `capacity` is the number of distinct items you expect to insert; the
62    /// false-positive rate is honoured at that fill level and degrades
63    /// gracefully beyond it. The bit count and hash count are derived from the
64    /// standard Bloom filter formulas.
65    ///
66    /// # Parameters
67    ///
68    /// - `capacity`: expected number of distinct insertions. Must be non-zero.
69    /// - `rate`: desired false-positive probability. Must lie in `(0.0, 1.0)`.
70    ///
71    /// # Errors
72    ///
73    /// Returns [`Error::InvalidParameter`] if `capacity` is zero or `rate` is
74    /// not a finite value strictly between `0.0` and `1.0`.
75    ///
76    /// # Examples
77    ///
78    /// ```
79    /// use bloom_lib::BloomFilter;
80    ///
81    /// let filter = BloomFilter::<&str>::new(1_000, 0.001).unwrap();
82    /// assert!(filter.num_bits() >= 1_000);
83    /// assert!(filter.num_hashes() >= 1);
84    /// ```
85    pub fn new(capacity: usize, rate: f64) -> Result<Self, Error> {
86        Self::with_hasher(capacity, rate, DefaultHashBuilder)
87    }
88
89    /// Creates a filter with an explicit bit count and hash count, using the
90    /// default hasher.
91    ///
92    /// Use this when you want direct control over the filter's geometry rather
93    /// than deriving it from a capacity and rate.
94    ///
95    /// # Parameters
96    ///
97    /// - `num_bits`: size of the bit array. Must be non-zero.
98    /// - `num_hashes`: number of hash positions probed per item. Must be
99    ///   non-zero.
100    ///
101    /// # Errors
102    ///
103    /// Returns [`Error::InvalidParameter`] if either argument is zero.
104    ///
105    /// # Examples
106    ///
107    /// ```
108    /// use bloom_lib::BloomFilter;
109    ///
110    /// let filter = BloomFilter::<u64>::with_dimensions(8_192, 5).unwrap();
111    /// assert_eq!(filter.num_bits(), 8_192);
112    /// assert_eq!(filter.num_hashes(), 5);
113    /// ```
114    pub fn with_dimensions(num_bits: u64, num_hashes: u32) -> Result<Self, Error> {
115        Self::with_dimensions_and_hasher(num_bits, num_hashes, DefaultHashBuilder)
116    }
117}
118
119impl<T: ?Sized, S: BuildHasher> BloomFilter<T, S> {
120    /// Creates a filter sized for `capacity` items at the target false-positive
121    /// `rate`, using a caller-supplied hasher.
122    ///
123    /// Identical to [`new`](Self::new) but lets you plug in any
124    /// [`BuildHasher`](core::hash::BuildHasher) — for example a randomly-seeded
125    /// one for resistance against adversarial inputs.
126    ///
127    /// # Errors
128    ///
129    /// Returns [`Error::InvalidParameter`] if `capacity` is zero or `rate` is
130    /// not a finite value strictly between `0.0` and `1.0`.
131    ///
132    /// # Examples
133    ///
134    /// ```
135    /// # #[cfg(feature = "std")] {
136    /// use std::collections::hash_map::RandomState;
137    /// use bloom_lib::BloomFilter;
138    ///
139    /// let filter: BloomFilter<&str, RandomState> =
140    ///     BloomFilter::with_hasher(1_000, 0.01, RandomState::new()).unwrap();
141    /// # }
142    /// ```
143    pub fn with_hasher(capacity: usize, rate: f64, hasher: S) -> Result<Self, Error> {
144        if capacity == 0 {
145            return Err(Error::InvalidParameter {
146                param: "capacity",
147                reason: "must be greater than zero",
148            });
149        }
150        if !(rate.is_finite() && rate > 0.0 && rate < 1.0) {
151            return Err(Error::InvalidParameter {
152                param: "rate",
153                reason: "must be a finite value in the open interval (0.0, 1.0)",
154            });
155        }
156
157        let num_bits = optimal_num_bits(capacity, rate);
158        let num_hashes = optimal_num_hashes(num_bits, capacity);
159        Self::with_dimensions_and_hasher(num_bits, num_hashes, hasher)
160    }
161
162    /// Creates a filter with an explicit geometry and a caller-supplied hasher.
163    ///
164    /// # Errors
165    ///
166    /// Returns [`Error::InvalidParameter`] if `num_bits` or `num_hashes` is
167    /// zero.
168    ///
169    /// # Examples
170    ///
171    /// ```
172    /// use bloom_lib::{hash::DefaultHashBuilder, BloomFilter};
173    ///
174    /// let filter = BloomFilter::<u64>::with_dimensions_and_hasher(
175    ///     4_096,
176    ///     4,
177    ///     DefaultHashBuilder,
178    /// )
179    /// .unwrap();
180    /// assert_eq!(filter.num_hashes(), 4);
181    /// ```
182    pub fn with_dimensions_and_hasher(
183        num_bits: u64,
184        num_hashes: u32,
185        hasher: S,
186    ) -> Result<Self, Error> {
187        if num_bits == 0 {
188            return Err(Error::InvalidParameter {
189                param: "num_bits",
190                reason: "must be greater than zero",
191            });
192        }
193        if num_hashes == 0 {
194            return Err(Error::InvalidParameter {
195                param: "num_hashes",
196                reason: "must be greater than zero",
197            });
198        }
199
200        Ok(Self {
201            bits: BitSet::new(num_bits),
202            num_hashes,
203            hasher,
204            _marker: PhantomData,
205        })
206    }
207
208    /// Inserts `item`, returning `true` if it was probably not present before.
209    ///
210    /// A return of `true` means at least one of the item's bits was previously
211    /// unset, so the item had definitely not been inserted. A return of `false`
212    /// means every bit was already set, so the item was *probably* already
213    /// present (subject to the filter's false-positive rate). This makes
214    /// `insert` a convenient deduplicating primitive for streaming input.
215    ///
216    /// # Examples
217    ///
218    /// ```
219    /// use bloom_lib::BloomFilter;
220    ///
221    /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
222    /// assert!(filter.insert("first")); // newly added
223    /// assert!(!filter.insert("first")); // already present
224    /// ```
225    pub fn insert(&mut self, item: &T) -> bool
226    where
227        T: core::hash::Hash,
228    {
229        let pair = HashPair::new(item, &self.hasher);
230        let num_bits = self.bits.len();
231        let mut newly_added = false;
232        for i in 0..u64::from(self.num_hashes) {
233            let index = reduce(pair.nth(i), num_bits);
234            // `set` returns the previous value; a `false` means this bit was
235            // unset, so the item is definitely new.
236            if !self.bits.set(index) {
237                newly_added = true;
238            }
239        }
240        newly_added
241    }
242
243    /// Tests whether `item` is in the filter.
244    ///
245    /// Returns `false` only if the item was definitely never inserted. A return
246    /// of `true` means the item is *probably* present, with the filter's
247    /// configured false-positive probability.
248    ///
249    /// # Examples
250    ///
251    /// ```
252    /// use bloom_lib::BloomFilter;
253    ///
254    /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
255    /// filter.insert(&7u32);
256    /// assert!(filter.contains(&7u32));
257    /// assert!(!filter.contains(&99u32));
258    /// ```
259    #[must_use]
260    pub fn contains(&self, item: &T) -> bool
261    where
262        T: core::hash::Hash,
263    {
264        let pair = HashPair::new(item, &self.hasher);
265        let num_bits = self.bits.len();
266        (0..u64::from(self.num_hashes)).all(|i| self.bits.get(reduce(pair.nth(i), num_bits)))
267    }
268
269    /// Removes every item, leaving an empty filter with the same geometry.
270    ///
271    /// The bit allocation is retained, so reusing a cleared filter avoids a
272    /// fresh allocation.
273    ///
274    /// # Examples
275    ///
276    /// ```
277    /// use bloom_lib::BloomFilter;
278    ///
279    /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
280    /// filter.insert("x");
281    /// filter.clear();
282    /// assert!(filter.is_empty());
283    /// assert!(!filter.contains("x"));
284    /// ```
285    pub fn clear(&mut self) {
286        self.bits.clear();
287    }
288
289    /// The size of the underlying bit array.
290    #[inline]
291    #[must_use]
292    pub fn num_bits(&self) -> u64 {
293        self.bits.len()
294    }
295
296    /// The number of hash positions probed per item.
297    #[inline]
298    #[must_use]
299    pub fn num_hashes(&self) -> u32 {
300        self.num_hashes
301    }
302
303    /// The number of set bits in the filter.
304    ///
305    /// This is the raw population count of the bit array, useful for monitoring
306    /// fill level. See [`estimated_len`](Self::estimated_len) for an estimate of
307    /// the number of distinct items inserted.
308    #[inline]
309    #[must_use]
310    pub fn count_ones(&self) -> u64 {
311        self.bits.count_ones()
312    }
313
314    /// Returns `true` if no bits are set, i.e. nothing has been inserted.
315    ///
316    /// # Examples
317    ///
318    /// ```
319    /// use bloom_lib::BloomFilter;
320    ///
321    /// let mut filter = BloomFilter::new(100, 0.01).unwrap();
322    /// assert!(filter.is_empty());
323    /// filter.insert("x");
324    /// assert!(!filter.is_empty());
325    /// ```
326    #[inline]
327    #[must_use]
328    pub fn is_empty(&self) -> bool {
329        self.bits.count_ones() == 0
330    }
331
332    /// Estimates the number of distinct items inserted so far.
333    ///
334    /// Derived from the fraction of set bits using the standard estimator
335    /// `n ≈ -(m / k) · ln(1 − X / m)`, where `m` is the bit count, `k` the hash
336    /// count, and `X` the number of set bits. The estimate is approximate and
337    /// loses accuracy as the filter saturates.
338    ///
339    /// # Examples
340    ///
341    /// ```
342    /// use bloom_lib::BloomFilter;
343    ///
344    /// let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
345    /// for i in 0..1_000u32 {
346    ///     filter.insert(&i);
347    /// }
348    /// let estimate = filter.estimated_len();
349    /// // Within a few percent of the true count of 1,000.
350    /// assert!((900..=1_100).contains(&estimate));
351    /// ```
352    #[must_use]
353    pub fn estimated_len(&self) -> u64 {
354        let m = self.bits.len() as f64;
355        let k = f64::from(self.num_hashes);
356        let x = self.bits.count_ones() as f64;
357        if x == 0.0 {
358            return 0;
359        }
360        if x >= m {
361            // Saturated: the estimator diverges, so report the bit count as a
362            // conservative floor rather than infinity.
363            return self.bits.len();
364        }
365        let estimate = -(m / k) * libm::log(1.0 - x / m);
366        // `estimate` is finite and non-negative here; round to the nearest item.
367        libm::round(estimate) as u64
368    }
369
370    /// Estimates the current false-positive probability at the present fill
371    /// level.
372    ///
373    /// Computed as `(X / m)^k`, where `X` is the number of set bits, `m` the bit
374    /// count, and `k` the hash count. This reflects the *actual* fill, so it
375    /// rises above the configured target rate once the filter holds more than
376    /// its design capacity.
377    ///
378    /// # Examples
379    ///
380    /// ```
381    /// use bloom_lib::BloomFilter;
382    ///
383    /// let filter = BloomFilter::<u32>::new(1_000, 0.01).unwrap();
384    /// // An empty filter cannot produce a false positive.
385    /// assert_eq!(filter.estimated_false_positive_rate(), 0.0);
386    /// ```
387    #[must_use]
388    pub fn estimated_false_positive_rate(&self) -> f64 {
389        let m = self.bits.len() as f64;
390        let k = f64::from(self.num_hashes);
391        let fill = self.bits.count_ones() as f64 / m;
392        libm::pow(fill, k)
393    }
394
395    /// Merges `other` into `self` by unioning their bit arrays.
396    ///
397    /// After a successful merge, the filter reports positive for every item
398    /// that was in either operand. Both filters must have been built with
399    /// identical dimensions (bit count and hash count); the hasher is assumed to
400    /// match, which holds automatically for the default hasher.
401    ///
402    /// # Errors
403    ///
404    /// Returns [`Error::IncompatibleParameters`] if the two filters differ in
405    /// bit count or hash count.
406    ///
407    /// # Examples
408    ///
409    /// ```
410    /// use bloom_lib::BloomFilter;
411    ///
412    /// let mut a = BloomFilter::new(1_000, 0.01).unwrap();
413    /// let mut b = BloomFilter::new(1_000, 0.01).unwrap();
414    /// a.insert("from-a");
415    /// b.insert("from-b");
416    ///
417    /// a.merge(&b).unwrap();
418    /// assert!(a.contains("from-a"));
419    /// assert!(a.contains("from-b"));
420    /// ```
421    pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
422        if self.num_hashes != other.num_hashes || !self.bits.is_compatible(&other.bits) {
423            return Err(Error::IncompatibleParameters);
424        }
425        self.bits.union_with(&other.bits);
426        Ok(())
427    }
428}
429
430/// Optimal bit count `m = ceil(-n·ln(p) / (ln2)^2)`, clamped to at least one.
431fn optimal_num_bits(capacity: usize, rate: f64) -> u64 {
432    let n = capacity as f64;
433    let m = -(n * libm::log(rate)) / (LN_2 * LN_2);
434    let rounded = libm::ceil(m);
435    if rounded < 1.0 {
436        1
437    } else {
438        rounded as u64
439    }
440}
441
442/// Optimal hash count `k = round((m/n)·ln2)`, clamped to at least one.
443fn optimal_num_hashes(num_bits: u64, capacity: usize) -> u32 {
444    let k = (num_bits as f64 / capacity as f64) * LN_2;
445    let rounded = libm::round(k);
446    if rounded < 1.0 {
447        1
448    } else {
449        // `m` is bounded by realistic memory, so `k` never approaches u32::MAX.
450        rounded as u32
451    }
452}
453
454#[cfg(test)]
455mod tests {
456    // `insert` returns a novelty flag that most call sites here intentionally
457    // discard; the crate denies `unused_results` for shipping code.
458    #![allow(unused_results)]
459    // Unwrapping known-valid constructor results keeps the tests readable.
460    #![allow(clippy::unwrap_used)]
461
462    use super::*;
463
464    #[test]
465    fn test_new_rejects_zero_capacity() {
466        let err = BloomFilter::<&str>::new(0, 0.01).unwrap_err();
467        assert_eq!(
468            err,
469            Error::InvalidParameter {
470                param: "capacity",
471                reason: "must be greater than zero"
472            }
473        );
474    }
475
476    #[test]
477    fn test_new_rejects_out_of_range_rate() {
478        assert!(matches!(
479            BloomFilter::<&str>::new(10, 0.0),
480            Err(Error::InvalidParameter { .. })
481        ));
482        assert!(matches!(
483            BloomFilter::<&str>::new(10, 1.0),
484            Err(Error::InvalidParameter { .. })
485        ));
486        assert!(matches!(
487            BloomFilter::<&str>::new(10, f64::NAN),
488            Err(Error::InvalidParameter { .. })
489        ));
490    }
491
492    #[test]
493    fn test_with_dimensions_rejects_zeros() {
494        assert!(matches!(
495            BloomFilter::<u8>::with_dimensions(0, 3),
496            Err(Error::InvalidParameter { .. })
497        ));
498        assert!(matches!(
499            BloomFilter::<u8>::with_dimensions(64, 0),
500            Err(Error::InvalidParameter { .. })
501        ));
502    }
503
504    #[test]
505    fn test_no_false_negatives() {
506        let mut filter = BloomFilter::new(1_000, 0.01).unwrap();
507        for i in 0..1_000u32 {
508            filter.insert(&i);
509        }
510        for i in 0..1_000u32 {
511            assert!(filter.contains(&i), "inserted item {i} reported absent");
512        }
513    }
514
515    #[test]
516    fn test_insert_reports_novelty() {
517        let mut filter = BloomFilter::new(100, 0.01).unwrap();
518        assert!(filter.insert("alpha"));
519        assert!(!filter.insert("alpha"));
520    }
521
522    #[test]
523    fn test_false_positive_rate_is_near_target() {
524        let capacity = 10_000;
525        let target = 0.01;
526        let mut filter = BloomFilter::new(capacity, target).unwrap();
527        for i in 0..capacity as u64 {
528            filter.insert(&i);
529        }
530        // Query a disjoint range and measure the observed false-positive rate.
531        let trials = 100_000u64;
532        let mut hits = 0u64;
533        for i in capacity as u64..capacity as u64 + trials {
534            if filter.contains(&i) {
535                hits += 1;
536            }
537        }
538        let observed = hits as f64 / trials as f64;
539        // Allow generous headroom; the point is it is the right order of magnitude.
540        assert!(
541            observed < target * 3.0,
542            "observed FP rate {observed} far exceeds target {target}"
543        );
544    }
545
546    #[test]
547    fn test_clear_empties_filter() {
548        let mut filter = BloomFilter::new(100, 0.01).unwrap();
549        filter.insert("x");
550        assert!(!filter.is_empty());
551        filter.clear();
552        assert!(filter.is_empty());
553        assert!(!filter.contains("x"));
554    }
555
556    #[test]
557    fn test_merge_unions_membership() {
558        let mut a = BloomFilter::new(1_000, 0.01).unwrap();
559        let mut b = BloomFilter::new(1_000, 0.01).unwrap();
560        a.insert("a");
561        b.insert("b");
562        a.merge(&b).unwrap();
563        assert!(a.contains("a"));
564        assert!(a.contains("b"));
565    }
566
567    #[test]
568    fn test_merge_rejects_incompatible() {
569        let mut a = BloomFilter::<u32>::with_dimensions(1_024, 3).unwrap();
570        let b = BloomFilter::<u32>::with_dimensions(2_048, 3).unwrap();
571        assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
572
573        let c = BloomFilter::<u32>::with_dimensions(1_024, 4).unwrap();
574        assert_eq!(a.merge(&c), Err(Error::IncompatibleParameters));
575    }
576
577    #[test]
578    fn test_estimated_len_is_reasonable() {
579        let mut filter = BloomFilter::new(10_000, 0.01).unwrap();
580        for i in 0..1_000u32 {
581            filter.insert(&i);
582        }
583        let estimate = filter.estimated_len();
584        assert!(
585            (900..=1_100).contains(&estimate),
586            "estimate {estimate} not within 10% of 1000"
587        );
588    }
589
590    #[test]
591    fn test_estimated_len_empty_is_zero() {
592        let filter = BloomFilter::<u32>::new(1_000, 0.01).unwrap();
593        assert_eq!(filter.estimated_len(), 0);
594    }
595
596    #[test]
597    fn test_sizing_formulas() {
598        // 10k items at 1% -> ~95,851 bits, k = 7 (textbook values).
599        let bits = optimal_num_bits(10_000, 0.01);
600        assert!(
601            (95_000..=96_500).contains(&bits),
602            "unexpected bit count {bits}"
603        );
604        let k = optimal_num_hashes(bits, 10_000);
605        assert_eq!(k, 7);
606    }
607}