fastbloom_rs/builder.rs
1use crate::bloom::{BloomFilter, CountingBloomFilter};
2use crate::Membership;
3
4/// Builder for Bloom Filters.
5#[derive(Clone)]
6#[derive(Debug)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
8pub struct FilterBuilder {
9 pub expected_elements: u64,
10 pub false_positive_probability: f64,
11 pub size: u64,
12 pub hashes: u32,
13 /// Usage for CountingBloomFilter.
14 pub enable_repeat_insert: bool,
15 pub(crate) done: bool,
16}
17
18#[cfg(target_pointer_width = "32")]
19pub(crate) const SUFFIX: usize = 0b0001_1111;
20#[cfg(target_pointer_width = "64")]
21pub(crate) const SUFFIX: usize = 0b0011_1111;
22#[cfg(target_pointer_width = "32")]
23pub(crate) const MASK: u64 = 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11100000;
24#[cfg(target_pointer_width = "64")]
25pub(crate) const MASK: u64 = 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11000000;
26
27/// Calculates the optimal size `m` of the bloom filter in bits given `n` (expected
28/// number of elements in bloom filter) and `p` (tolerable false positive rate).
29#[inline]
30fn optimal_m(n: u64, p: f64) -> u64 {
31 let fact = -(n as f64) * p.ln();
32 let div = 2f64.ln().powi(2);
33 let m: f64 = fact / div;
34 let mut m = m.ceil() as u64;
35 if (m & SUFFIX as u64) != 0 {
36 m = (m & MASK) + SUFFIX as u64 + 1;
37 };
38 m
39}
40
41/// Calculates the optimal `hashes` (number of hash function) given `n` (expected number of
42/// elements in bloom filter) and `m` (size of bloom filter in bits).
43#[inline]
44fn optimal_k(n: u64, m: u64) -> u32 {
45 let k: f64 = (m as f64 * 2f64.ln()) / n as f64;
46 k.ceil() as u32
47}
48
49/// Calculates the amount of elements a Bloom filter for which the given configuration of size `m`
50/// and hashes `k` is optimal.
51#[inline]
52fn optimal_n(k: u32, m: u64) -> u64 {
53 let n = (2f64.ln() * m as f64) / k as f64;
54 n.ceil() as u64
55}
56
57
58/// Calculates the best-case (uniform hash function) false positive probability.
59/// `k` number of hashes.
60/// `m` The size of the bloom filter in bits.
61/// `n` number of elements inserted in the filter.
62#[inline]
63fn optimal_p(k: u32, m: u64, n: u64) -> f64 {
64 let nk = -(k as f64);
65 (1.0 - (nk * n as f64 / m as f64).exp()).powi(k as i32)
66}
67
68impl FilterBuilder {
69 /// Constructs a new Bloom Filter Builder by specifying the expected size of the filter and the
70 /// tolerable false positive probability. The size of the BLoom filter in in bits and the
71 /// optimal number of hash functions will be inferred from this.
72 ///
73 /// # Examples
74 ///
75 /// ```rust
76 /// use fastbloom_rs::FilterBuilder;
77 /// let mut builder = FilterBuilder::new(100_000_000, 0.01);
78 /// let bloom = builder.build_bloom_filter();
79 ///
80 /// ```
81 pub fn new(expected_elements: u64, false_positive_probability: f64) -> Self {
82 FilterBuilder {
83 expected_elements,
84 false_positive_probability,
85 size: 0,
86 hashes: 0,
87 enable_repeat_insert: true,
88 done: false,
89 }
90 }
91
92 /// Constructs a new Bloom Filter Builder by specifying the size of the bloom filter in bits
93 /// and the number of hashes. The expected size of the filter and the tolerable false positive
94 /// probability will be inferred from this.
95 pub fn from_size_and_hashes(size: u64, hashes: u32) -> Self {
96 let n = optimal_n(hashes, size);
97 let p = optimal_p(hashes, size, n);
98 FilterBuilder {
99 expected_elements: n,
100 false_positive_probability: p,
101 size,
102 hashes,
103 enable_repeat_insert: true,
104 done: true,
105 }
106 }
107
108 /// set the expected size of the filter.
109 fn expected_elements(&mut self, expected_elements: u64) {
110 assert!(expected_elements > 0, "expected_elements must larger than 0!");
111 self.expected_elements = expected_elements;
112 }
113
114 /// set the tolerable false positive probability.
115 fn false_positive_probability(&mut self, false_positive_probability: f64) {
116 assert!(false_positive_probability < 1.0 && false_positive_probability > 0.0,
117 "false_positive_probability must between (0.0, 1.0)!");
118 self.false_positive_probability = false_positive_probability;
119 }
120
121 /// Use for CountingBloomFilter.
122 ///
123 /// # Example:
124 /// ```rust
125 /// use fastbloom_rs::{Deletable, FilterBuilder, Membership};
126 ///
127 /// let mut builder = FilterBuilder::new(100_000, 0.01);
128 /// // enable_repeat_insert is true
129 /// builder.enable_repeat_insert(true);
130 /// let mut cbf = builder.build_counting_bloom_filter();
131 /// cbf.add(b"hello"); // modify underlying vector counter.
132 /// cbf.add(b"hello"); // modify underlying vector counter.
133 /// assert_eq!(cbf.contains(b"hello"), true);
134 /// cbf.remove(b"hello");
135 /// assert_eq!(cbf.contains(b"hello"), true);
136 /// cbf.remove(b"hello");
137 /// assert_eq!(cbf.contains(b"hello"), false);
138 ///
139 /// // enable_repeat_insert is false
140 /// builder.enable_repeat_insert(false);
141 /// let mut cbf = builder.build_counting_bloom_filter();
142 /// cbf.add(b"hello"); // modify underlying vector counter.
143 /// cbf.add(b"hello"); // not modify underlying vector counter because b"hello" has been added.
144 /// assert_eq!(cbf.contains(b"hello"), true);
145 /// cbf.remove(b"hello");
146 /// assert_eq!(cbf.contains(b"hello"), false);
147 /// ```
148 pub fn enable_repeat_insert(&mut self, enable: bool) {
149 self.enable_repeat_insert = enable;
150 }
151
152 /// set the size of the bloom filter in bits.
153 fn size(&mut self, size: u64) {
154 assert_eq!(size & SUFFIX as u64, 0);
155 self.size = size;
156 }
157
158
159 /// Checks if all necessary parameters were set and tries to infer optimal parameters (e.g.
160 /// size and hashes from given expected_elements (`n`) and falsePositiveProbability (`p`)).
161 /// This is done automatically.
162 pub(crate) fn complete(&mut self) {
163 if !self.done {
164 if self.size == 0 {
165 self.size = optimal_m(self.expected_elements, self.false_positive_probability);
166 self.hashes = optimal_k(self.expected_elements, self.size);
167 }
168 self.done = true;
169 }
170 }
171
172 /// Constructs a Bloom filter using the specified parameters and computing missing parameters
173 /// if possible (e.g. the optimal Bloom filter bit size).
174 pub fn build_bloom_filter(&mut self) -> BloomFilter {
175 self.complete();
176 BloomFilter::new(self.clone())
177 }
178
179 /// Constructs a Counting Bloom filter using the specified parameters and computing missing parameters
180 /// if possible (e.g. the optimal Bloom filter bit size).
181 pub fn build_counting_bloom_filter(&mut self) -> CountingBloomFilter {
182 self.complete();
183 CountingBloomFilter::new(self.clone())
184 }
185
186 /// Checks whether a configuration is compatible to another configuration based on the size of
187 /// the Bloom filter and its hash functions.
188 pub(crate) fn is_compatible_to(&self, other: &FilterBuilder) -> bool {
189 self.size == other.size && self.hashes == other.hashes
190 }
191}
192
193#[test]
194fn optimal_test() {
195 let m = optimal_m(100_000_000, 0.01);
196 let k = optimal_k(100_000_000, m);
197 let n = optimal_n(k, m);
198 let p = optimal_p(k, m, n);
199 println!("{m} {k} {n} {p}");
200 assert_eq!(m, 958505856);
201 assert_eq!(k, 7)
202}
203
204#[test]
205fn builder_test() {
206 let mut bloom = FilterBuilder::new(100_000_000, 0.01)
207 .build_bloom_filter();
208 bloom.add(b"helloworld");
209 assert_eq!(bloom.contains(b"helloworld"), true);
210 assert_eq!(bloom.contains(b"helloworld!"), false);
211}