stable_bloom_filter/lib.rs
1// StableBloomFilter implements a Stable Bloom Filter as described by Deng and
2// Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable
3// Bloom Filters:
4//
5// http://webdocs.cs.ualberta.ca/~drafiei/papers/DupDet06Sigmod.pdf
6//
7// A Stable Bloom Filter (SBF) continuously evicts stale information so that it
8// has room for more recent elements. Like traditional Bloom filters, an SBF
9// has a non-zero probability of false positives, which is controlled by
10// several parameters. Unlike the classic Bloom filter, an SBF has a tight
11// upper bound on the rate of false positives while introducing a non-zero rate
12// of false negatives. The false-positive rate of a classic Bloom filter
13// eventually reaches 1, after which all queries result in a false positive.
14// The stable-point property of an SBF means the false-positive rate
15// asymptotically approaches a configurable fixed constant. A classic Bloom
16// filter is actually a special case of SBF where the eviction rate is zero, so
17// this package provides support for them as well.
18//
19// Stable Bloom Filters are useful for cases where the size of the data set
20// isn't known a priori, which is a requirement for traditional Bloom filters,
21// and memory is bounded. For example, an SBF can be used to deduplicate
22// events from an unbounded event stream with a specified upper bound on false
23// positives and minimal false negatives.
24pub mod buckets;
25pub mod fnv;
26pub mod stable;
27
28pub trait Filter {
29 fn test(&self, _data: &[u8]) -> bool;
30
31 fn add(&mut self, _data: &[u8]) -> &Self;
32
33 fn test_and_add(&mut self, _data: &[u8]) -> bool;
34}
35
36/// Calculates the optimal number of hash functions to use for a Bloom
37/// filter based on the desired rate of false positives.
38pub(crate) fn optimal_k(fp_rate: f64) -> usize {
39 (1.0 / fp_rate).log2().ceil() as usize
40}
41
42/// Returns the optimal number of cells to decrement, p, per
43/// iteration for the provided parameters of an SBF.
44pub(crate) fn optimal_stable_p(m: usize, k: usize, d: u8, fp_rate: f64) -> usize {
45 let max = (2_u64.pow(u32::from(d)) - 1) as f64;
46 let sub_denom = (1.0 - fp_rate.powf(1.0 / (k as f64))).powf(1.0 / max);
47 let denom = (1.0 / sub_denom - 1.0) * (1.0 / (k as f64) - 1.0 / (m as f64));
48
49 let mut p = (1.0 / denom) as usize;
50
51 if p == 0 {
52 p = 1;
53 }
54
55 p
56}