deterministic_bloom/runtime_size.rs
1use crate::{
2 common::{BloomParams, HashIndexIterator},
3 utils::HexFieldDebug,
4};
5use bitvec::{prelude::Lsb0, view::BitView};
6use std::fmt::Debug;
7
8//------------------------------------------------------------------------------
9// Type Definitions
10//------------------------------------------------------------------------------
11
12/// An implementation of a basic [bloom filter].
13///
14/// Its size can be chosen (or made optimal for given parameters) at creation time,
15/// but its size will have to stay the same during usage. I.e. you need to know
16/// what your target capacity and false positive rates should be in advance.
17///
18/// Unlike the [`const_size::BloomFilter`](crate::const_size::BloomFilter) however,
19/// this implementation doesn't require you to know the parameters at compile time.
20///
21/// # Example
22///
23/// ```
24/// use deterministic_bloom::runtime_size::BloomFilter;
25///
26/// let mut filter = BloomFilter::new_from_fpr(1_000, 1.0 / 1_000_000.0);
27/// filter.insert(b"Hello, World!");
28///
29/// assert!(filter.contains(b"Hello, World!"));
30/// assert!(!filter.contains(b"Hello?")); // true in all but 1 in a million cases
31/// ```
32///
33/// [bloom filter]: https://en.wikipedia.org/wiki/Bloom_filter
34#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
35pub struct BloomFilter {
36 k_hashes: usize,
37 bytes: Box<[u8]>,
38}
39
40impl BloomFilter {
41 /// Construct a bloom filter with optimal parameters for given maximum capacity `n_elems`
42 /// and false positive rate `fpr`.
43 ///
44 /// `n_elems` must be non-zero and `fpr` must be a number between 0 and 1 exclusive.
45 ///
46 /// # Example
47 ///
48 /// ```
49 /// use deterministic_bloom::runtime_size::BloomFilter;
50 ///
51 /// let mut filter = BloomFilter::new_from_fpr(1_000_000, 1.0 / 1_000_000.0);
52 ///
53 /// assert_eq!(filter.as_bytes().len(), 3594397);
54 /// assert_eq!(filter.hash_count(), 20);
55 /// ```
56 pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
57 let params = BloomParams::new_from_fpr(n_elems, fpr);
58 let bits = Box::from(vec![0u8; params.byte_size].as_ref());
59 Self {
60 k_hashes: params.k_hashes,
61 bytes: bits,
62 }
63 }
64
65 /// Construct an optimal power-of-two (po2) sized bloom filter for given maximum capacity
66 /// `n_elems` and false positive rate `fpr`.
67 ///
68 /// `n_elems` must be non-zero and `fpr` must be a number between 0 and 1 exclusive.
69 ///
70 /// Using a power-of-two size can be beneficial due to not requiring rejection sampling
71 /// when generating the hash indicies for items inserted into the filter.
72 ///
73 /// # Example
74 ///
75 /// ```
76 /// use deterministic_bloom::runtime_size::BloomFilter;
77 ///
78 /// let mut filter = BloomFilter::new_from_fpr_po2(10_000_000, 0.01);
79 ///
80 /// assert_eq!(filter.as_bytes().len(), 16_777_216);
81 /// assert_eq!(filter.as_bytes().len().count_ones(), 1); // size is a power of two
82 /// assert_eq!(filter.hash_count(), 10);
83 /// ```
84 pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
85 let params = BloomParams::new_from_fpr_po2(n_elems, fpr);
86 let bits = Box::from(vec![0u8; params.byte_size].as_ref());
87 Self {
88 k_hashes: params.k_hashes,
89 bytes: bits,
90 }
91 }
92
93 /// Construct a bloom filter with given target size and target capacity, both must
94 /// be non-zero.
95 ///
96 /// This will compute the optimal number of hash evaluations per item inserted, but the
97 /// false positive rate completely depends on the given filter size to maximum capacity
98 /// ratio.
99 ///
100 /// # Example
101 ///
102 /// ```
103 /// use deterministic_bloom::runtime_size::BloomFilter;
104 ///
105 /// let mut filter = BloomFilter::new_from_size(1000, 1000);
106 ///
107 /// // False positive rate isn't controlled though:
108 /// assert!((filter.false_positive_rate_at(1000) - 0.0215).abs() < 1e-4);
109 /// ```
110 pub fn new_from_size(bloom_bytes: usize, n_elems: u64) -> Self {
111 let params = BloomParams::new_from_size(bloom_bytes, n_elems);
112 let bits = Box::from(vec![0u8; params.byte_size].as_ref());
113 Self {
114 k_hashes: params.k_hashes,
115 bytes: bits,
116 }
117 }
118
119 /// Construct the bloom filter from existing components.
120 ///
121 /// This is useful when e.g. deserializing a bloom filter.
122 ///
123 /// # Example
124 ///
125 /// ```
126 /// use deterministic_bloom::runtime_size::BloomFilter;
127 ///
128 /// let mut filter = BloomFilter::new_from_fpr_po2(100_000, 0.01);
129 ///
130 /// filter.insert(b"Hello, World!");
131 ///
132 /// // Serialize
133 /// let k_hashes = filter.hash_count();
134 /// let bytes = Box::from(filter.as_bytes());
135 ///
136 /// // Deserialize
137 /// let filter2 = BloomFilter::new_with(k_hashes, bytes);
138 /// assert_eq!(filter, filter2);
139 /// ```
140 pub fn new_with(k_hashes: usize, bytes: Box<[u8]>) -> Self {
141 Self { k_hashes, bytes }
142 }
143
144 /// Compute the bloom parameters for this bloom filter.
145 /// This contains information about its size and hash function evaluations per
146 /// item (`k_hashes`).
147 pub fn get_bloom_params(&self) -> BloomParams {
148 BloomParams {
149 k_hashes: self.k_hashes,
150 byte_size: self.bytes.len(),
151 }
152 }
153
154 /// Get the approximate false positive rate at given capacity for this bloom filter.
155 /// Returns a number between 0 and 1.
156 pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
157 self.get_bloom_params().false_positive_rate_at(n_elems)
158 }
159
160 /// Get the approximate false positive rate at the current capacity of this bloom filter.
161 /// Returns a number between 0 and 1.
162 pub fn current_false_positive_rate(&self) -> f64 {
163 let m = (self.bytes.len() * 8) as f64;
164 let m_set = self.count_ones() as f64;
165 let load = m_set / m;
166 load.powi(self.hash_count() as i32)
167 }
168
169 /// Counts the amount of bits set in the bloom filter.
170 pub fn count_ones(&self) -> usize {
171 self.bytes.view_bits::<Lsb0>().count_ones()
172 }
173
174 /// Insert an element into the bloom filter.
175 ///
176 /// The element will be hashed, thus it needs to be representable as bytes.
177 ///
178 /// Note: If you're using the bloom filter in a non-trusted
179 /// environment, so e.g. the items can be chosen by an adversary, please
180 /// make sure to pre-hash your items with a cryptographic hashing function
181 /// like SHA-256 or BLAKE3.
182 /// Otherwise an adversary will be able to generate elements that cause
183 /// the bloom filter to e.g. be unusually full with an unusually high false
184 /// positive rate or cheaply generate elements that are false positives.
185 ///
186 /// # Example
187 ///
188 /// ```
189 /// use deterministic_bloom::runtime_size::BloomFilter;
190 ///
191 /// let mut filter = BloomFilter::new_from_fpr(1000, 0.0001);
192 ///
193 /// for i in 0u32..1000 {
194 /// filter.insert(&i.to_le_bytes());
195 /// }
196 ///
197 /// // Slightly more than half filled with zeros
198 /// assert_eq!(filter.as_bytes().len() / 2 * 8, filter.count_ones() - 322);
199 ///
200 /// assert!(filter.contains(&10u32.to_le_bytes()));
201 /// assert!(!filter.contains(&1001u32.to_le_bytes())); // Except in 0.01%
202 /// ```
203 pub fn insert(&mut self, item: &impl AsRef<[u8]>) {
204 for i in self.hash_indices(item) {
205 self.bytes.view_bits_mut::<Lsb0>().set(i, true);
206 }
207 }
208
209 /// Check whether an element was added into the bloom filter.
210 ///
211 /// This will always return true if the element was indeed added before,
212 /// but it *may* sometimes return true, even if it wasn't.
213 /// This is called a false positive and the false positive rate
214 /// at certain capacities can be checked with [`false_positive_rate_at`](BloomFilter::false_positive_rate_at)
215 /// and a desired false positive rate can be configured on creation with
216 /// [`new_from_fpr`](BloomFilter::new_from_fpr) or [`new_from_fpr_po2`](BloomFilter::new_from_fpr_po2).
217 ///
218 /// # Example
219 ///
220 /// ```
221 /// use deterministic_bloom::runtime_size::BloomFilter;
222 ///
223 /// let mut filter = BloomFilter::new_from_fpr(100, 0.1); // very high false-positive rate
224 ///
225 /// for i in 0u32..100 {
226 /// filter.insert(&i.to_le_bytes());
227 /// }
228 ///
229 /// // Inserted items will always return true
230 /// assert!(filter.contains(&50u32.to_le_bytes()));
231 /// // Non-inserted items mostly return false, but sometimes true
232 /// assert!(!filter.contains(&101u32.to_le_bytes()));
233 /// // But sometimes there exist false positives (in this case 10% of the time)
234 /// assert!(filter.contains(&106u32.to_le_bytes()));
235 /// ```
236 pub fn contains(&self, item: &impl AsRef<[u8]>) -> bool {
237 for i in self.hash_indices(item) {
238 if !self.bytes.view_bits::<Lsb0>()[i] {
239 return false;
240 }
241 }
242 true
243 }
244
245 /// Returns how many hash function invocations are used pre item inserted
246 pub fn hash_count(&self) -> usize {
247 self.k_hashes
248 }
249
250 /// Return the underlying array used to store the bloom bits (always on the heap)
251 pub fn as_bytes(&self) -> &[u8] {
252 &self.bytes
253 }
254
255 /// Return the indices that a given element would set in the filter
256 pub fn hash_indices<'a>(&self, item: &'a impl AsRef<[u8]>) -> impl Iterator<Item = usize> + 'a {
257 HashIndexIterator::new(item, self.bytes.len() * 8).take(self.hash_count())
258 }
259}
260
261impl Debug for BloomFilter {
262 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263 f.debug_struct("BloomFilter")
264 .field("k_hashes", &self.k_hashes)
265 .field("bytes", &HexFieldDebug(&self.bytes))
266 .finish()
267 }
268}
269
270#[cfg(test)]
271mod tests {
272 use super::BloomFilter;
273
274 #[test]
275 fn serialization_round_trip() {
276 let mut filter = BloomFilter::new_from_fpr(100, 0.001);
277 filter.insert(b"Hello");
278 filter.insert(b"World!");
279 let serialized_bytes = filter.as_bytes();
280 let serialized_k = filter.hash_count();
281 let deserialized = BloomFilter::new_with(serialized_k, Box::from(serialized_bytes));
282 assert!(deserialized.contains(b"Hello"));
283 assert!(!deserialized.contains(b"abc"));
284 assert_eq!(deserialized, filter);
285 }
286
287 #[test]
288 fn empty_bloom_filter() {
289 let filter = BloomFilter::new_with(3, Box::new([]));
290 // Technically an empty bloom "contains" anything, since everything is a false positive.
291 assert!(filter.contains(&[1, 2, 3]));
292 }
293}
294
295#[cfg(test)]
296mod proptests {
297 use super::BloomFilter;
298 use proptest::prop_assert;
299 use test_strategy::proptest;
300
301 #[proptest]
302 fn inserted_always_contained(items: Vec<u64>, #[strategy(100usize..10_000)] size: usize) {
303 let capacity = std::cmp::max(items.len() as u64, 1);
304 let mut filter = BloomFilter::new_from_size(size, capacity);
305
306 for item in items.iter() {
307 filter.insert(&item.to_le_bytes());
308 }
309
310 for item in items.iter() {
311 prop_assert!(filter.contains(&item.to_le_bytes()));
312 }
313 }
314
315 #[proptest]
316 fn false_positive_rate_as_predicted(
317 #[strategy(100u64..1_000)] n_elems: u64,
318 #[strategy(100.0..10_000.0)] inv_fpr: f64,
319 ) {
320 let fpr = 1.0 / inv_fpr;
321 let mut filter = BloomFilter::new_from_fpr(n_elems, fpr);
322
323 for i in 0..n_elems {
324 filter.insert(&i.to_le_bytes());
325 }
326
327 let measurements = 100_000;
328 let mut false_positives = 0;
329
330 for i in n_elems..n_elems + measurements {
331 if filter.contains(&i.to_le_bytes()) {
332 false_positives += 1;
333 }
334 }
335
336 let computed_fpr = false_positives as f64 / measurements as f64;
337 // The actual FPR should be pretty close
338 prop_assert!((computed_fpr - fpr).abs() < 1.5e-3);
339 }
340}