fastbloom_rs/
builder.rs

1use crate::bloom::{BloomFilter, CountingBloomFilter};
2use crate::Membership;
3
4/// Builder for Bloom Filters.
5#[derive(Clone)]
6#[derive(Debug)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
8pub struct FilterBuilder {
9    pub expected_elements: u64,
10    pub false_positive_probability: f64,
11    pub size: u64,
12    pub hashes: u32,
13    /// Usage for CountingBloomFilter.
14    pub enable_repeat_insert: bool,
15    pub(crate) done: bool,
16}
17
18#[cfg(target_pointer_width = "32")]
19pub(crate) const SUFFIX: usize = 0b0001_1111;
20#[cfg(target_pointer_width = "64")]
21pub(crate) const SUFFIX: usize = 0b0011_1111;
22#[cfg(target_pointer_width = "32")]
23pub(crate) const MASK: u64 = 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11100000;
24#[cfg(target_pointer_width = "64")]
25pub(crate) const MASK: u64 = 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11000000;
26
27/// Calculates the optimal size `m` of the bloom filter in bits given `n` (expected
28/// number of elements in bloom filter) and `p` (tolerable false positive rate).
29#[inline]
30fn optimal_m(n: u64, p: f64) -> u64 {
31    let fact = -(n as f64) * p.ln();
32    let div = 2f64.ln().powi(2);
33    let m: f64 = fact / div;
34    let mut m = m.ceil() as u64;
35    if (m & SUFFIX as u64) != 0 {
36        m = (m & MASK) + SUFFIX as u64 + 1;
37    };
38    m
39}
40
41/// Calculates the optimal `hashes` (number of hash function) given `n` (expected number of
42/// elements in bloom filter) and `m` (size of bloom filter in bits).
43#[inline]
44fn optimal_k(n: u64, m: u64) -> u32 {
45    let k: f64 = (m as f64 * 2f64.ln()) / n as f64;
46    k.ceil() as u32
47}
48
49/// Calculates the amount of elements a Bloom filter for which the given configuration of size `m`
50/// and hashes `k` is optimal.
51#[inline]
52fn optimal_n(k: u32, m: u64) -> u64 {
53    let n = (2f64.ln() * m as f64) / k as f64;
54    n.ceil() as u64
55}
56
57
58/// Calculates the best-case (uniform hash function) false positive probability.
59/// `k` number of hashes.
60/// `m` The size of the bloom filter in bits.
61/// `n` number of elements inserted in the filter.
62#[inline]
63fn optimal_p(k: u32, m: u64, n: u64) -> f64 {
64    let nk = -(k as f64);
65    (1.0 - (nk * n as f64 / m as f64).exp()).powi(k as i32)
66}
67
68impl FilterBuilder {
69    /// Constructs a new Bloom Filter Builder by specifying the expected size of the filter and the
70    /// tolerable false positive probability. The size of the BLoom filter in in bits and the
71    /// optimal number of hash functions will be inferred from this.
72    ///
73    /// # Examples
74    ///
75    /// ```rust
76    /// use fastbloom_rs::FilterBuilder;
77    /// let mut builder = FilterBuilder::new(100_000_000, 0.01);
78    /// let bloom = builder.build_bloom_filter();
79    ///
80    /// ```
81    pub fn new(expected_elements: u64, false_positive_probability: f64) -> Self {
82        FilterBuilder {
83            expected_elements,
84            false_positive_probability,
85            size: 0,
86            hashes: 0,
87            enable_repeat_insert: true,
88            done: false,
89        }
90    }
91
92    /// Constructs a new Bloom Filter Builder by specifying the size of the bloom filter in bits
93    /// and the number of hashes. The expected size of the filter and the tolerable false positive
94    /// probability will be inferred from this.
95    pub fn from_size_and_hashes(size: u64, hashes: u32) -> Self {
96        let n = optimal_n(hashes, size);
97        let p = optimal_p(hashes, size, n);
98        FilterBuilder {
99            expected_elements: n,
100            false_positive_probability: p,
101            size,
102            hashes,
103            enable_repeat_insert: true,
104            done: true,
105        }
106    }
107
108    /// set the expected size of the filter.
109    fn expected_elements(&mut self, expected_elements: u64) {
110        assert!(expected_elements > 0, "expected_elements must larger than 0!");
111        self.expected_elements = expected_elements;
112    }
113
114    /// set the tolerable false positive probability.
115    fn false_positive_probability(&mut self, false_positive_probability: f64) {
116        assert!(false_positive_probability < 1.0 && false_positive_probability > 0.0,
117                "false_positive_probability must between (0.0, 1.0)!");
118        self.false_positive_probability = false_positive_probability;
119    }
120
121    /// Use for CountingBloomFilter.
122    ///
123    /// # Example:
124    /// ```rust
125    /// use fastbloom_rs::{Deletable, FilterBuilder, Membership};
126    ///
127    /// let mut builder = FilterBuilder::new(100_000, 0.01);
128    /// // enable_repeat_insert is true
129    /// builder.enable_repeat_insert(true);
130    /// let mut cbf = builder.build_counting_bloom_filter();
131    /// cbf.add(b"hello"); // modify underlying vector counter.
132    /// cbf.add(b"hello"); // modify underlying vector counter.
133    /// assert_eq!(cbf.contains(b"hello"), true);
134    /// cbf.remove(b"hello");
135    /// assert_eq!(cbf.contains(b"hello"), true);
136    /// cbf.remove(b"hello");
137    /// assert_eq!(cbf.contains(b"hello"), false);
138    ///
139    /// // enable_repeat_insert is false
140    /// builder.enable_repeat_insert(false);
141    /// let mut cbf = builder.build_counting_bloom_filter();
142    /// cbf.add(b"hello"); // modify underlying vector counter.
143    /// cbf.add(b"hello"); // not modify underlying vector counter because b"hello" has been added.
144    /// assert_eq!(cbf.contains(b"hello"), true);
145    /// cbf.remove(b"hello");
146    /// assert_eq!(cbf.contains(b"hello"), false);
147    /// ```
148    pub fn enable_repeat_insert(&mut self, enable: bool) {
149        self.enable_repeat_insert = enable;
150    }
151
152    /// set  the size of the bloom filter in bits.
153    fn size(&mut self, size: u64) {
154        assert_eq!(size & SUFFIX as u64, 0);
155        self.size = size;
156    }
157
158
159    /// Checks if all necessary parameters were set and tries to infer optimal parameters (e.g.
160    /// size and hashes from given expected_elements (`n`) and falsePositiveProbability (`p`)).
161    /// This is done automatically.
162    pub(crate) fn complete(&mut self) {
163        if !self.done {
164            if self.size == 0 {
165                self.size = optimal_m(self.expected_elements, self.false_positive_probability);
166                self.hashes = optimal_k(self.expected_elements, self.size);
167            }
168            self.done = true;
169        }
170    }
171
172    /// Constructs a Bloom filter using the specified parameters and computing missing parameters
173    /// if possible (e.g. the optimal Bloom filter bit size).
174    pub fn build_bloom_filter(&mut self) -> BloomFilter {
175        self.complete();
176        BloomFilter::new(self.clone())
177    }
178
179    /// Constructs a Counting Bloom filter using the specified parameters and computing missing parameters
180    /// if possible (e.g. the optimal Bloom filter bit size).
181    pub fn build_counting_bloom_filter(&mut self) -> CountingBloomFilter {
182        self.complete();
183        CountingBloomFilter::new(self.clone())
184    }
185
186    /// Checks whether a configuration is compatible to another configuration based on the size of
187    /// the Bloom filter and its hash functions.
188    pub(crate) fn is_compatible_to(&self, other: &FilterBuilder) -> bool {
189        self.size == other.size && self.hashes == other.hashes
190    }
191}
192
193#[test]
194fn optimal_test() {
195    let m = optimal_m(100_000_000, 0.01);
196    let k = optimal_k(100_000_000, m);
197    let n = optimal_n(k, m);
198    let p = optimal_p(k, m, n);
199    println!("{m} {k} {n} {p}");
200    assert_eq!(m, 958505856);
201    assert_eq!(k, 7)
202}
203
204#[test]
205fn builder_test() {
206    let mut bloom = FilterBuilder::new(100_000_000, 0.01)
207        .build_bloom_filter();
208    bloom.add(b"helloworld");
209    assert_eq!(bloom.contains(b"helloworld"), true);
210    assert_eq!(bloom.contains(b"helloworld!"), false);
211}