deterministic_bloom/
runtime_size.rs

1use crate::{
2    common::{BloomParams, HashIndexIterator},
3    utils::HexFieldDebug,
4};
5use bitvec::{prelude::Lsb0, view::BitView};
6use std::fmt::Debug;
7
8//------------------------------------------------------------------------------
9// Type Definitions
10//------------------------------------------------------------------------------
11
12/// An implementation of a basic [bloom filter].
13///
14/// Its size can be chosen (or made optimal for given parameters) at creation time,
15/// but its size will have to stay the same during usage. I.e. you need to know
16/// what your target capacity and false positive rates should be in advance.
17///
18/// Unlike the [`const_size::BloomFilter`](crate::const_size::BloomFilter) however,
19/// this implementation doesn't require you to know the parameters at compile time.
20///
21/// # Example
22///
23/// ```
24/// use deterministic_bloom::runtime_size::BloomFilter;
25///
26/// let mut filter = BloomFilter::new_from_fpr(1_000, 1.0 / 1_000_000.0);
27/// filter.insert(b"Hello, World!");
28///
29/// assert!(filter.contains(b"Hello, World!"));
30/// assert!(!filter.contains(b"Hello?")); // true in all but 1 in a million cases
31/// ```
32///
33/// [bloom filter]: https://en.wikipedia.org/wiki/Bloom_filter
34#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
35pub struct BloomFilter {
36    k_hashes: usize,
37    bytes: Box<[u8]>,
38}
39
40impl BloomFilter {
41    /// Construct a bloom filter with optimal parameters for given maximum capacity `n_elems`
42    /// and false positive rate `fpr`.
43    ///
44    /// `n_elems` must be non-zero and `fpr` must be a number between 0 and 1 exclusive.
45    ///
46    /// # Example
47    ///
48    /// ```
49    /// use deterministic_bloom::runtime_size::BloomFilter;
50    ///
51    /// let mut filter = BloomFilter::new_from_fpr(1_000_000, 1.0 / 1_000_000.0);
52    ///
53    /// assert_eq!(filter.as_bytes().len(), 3594397);
54    /// assert_eq!(filter.hash_count(), 20);
55    /// ```
56    pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
57        let params = BloomParams::new_from_fpr(n_elems, fpr);
58        let bits = Box::from(vec![0u8; params.byte_size].as_ref());
59        Self {
60            k_hashes: params.k_hashes,
61            bytes: bits,
62        }
63    }
64
65    /// Construct an optimal power-of-two (po2) sized bloom filter for given maximum capacity
66    /// `n_elems` and false positive rate `fpr`.
67    ///
68    /// `n_elems` must be non-zero and `fpr` must be a number between 0 and 1 exclusive.
69    ///
70    /// Using a power-of-two size can be beneficial due to not requiring rejection sampling
71    /// when generating the hash indicies for items inserted into the filter.
72    ///
73    /// # Example
74    ///
75    /// ```
76    /// use deterministic_bloom::runtime_size::BloomFilter;
77    ///
78    /// let mut filter = BloomFilter::new_from_fpr_po2(10_000_000, 0.01);
79    ///
80    /// assert_eq!(filter.as_bytes().len(), 16_777_216);
81    /// assert_eq!(filter.as_bytes().len().count_ones(), 1); // size is a power of two
82    /// assert_eq!(filter.hash_count(), 10);
83    /// ```
84    pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
85        let params = BloomParams::new_from_fpr_po2(n_elems, fpr);
86        let bits = Box::from(vec![0u8; params.byte_size].as_ref());
87        Self {
88            k_hashes: params.k_hashes,
89            bytes: bits,
90        }
91    }
92
93    /// Construct a bloom filter with given target size and target capacity, both must
94    /// be non-zero.
95    ///
96    /// This will compute the optimal number of hash evaluations per item inserted, but the
97    /// false positive rate completely depends on the given filter size to maximum capacity
98    /// ratio.
99    ///
100    /// # Example
101    ///
102    /// ```
103    /// use deterministic_bloom::runtime_size::BloomFilter;
104    ///
105    /// let mut filter = BloomFilter::new_from_size(1000, 1000);
106    ///
107    /// // False positive rate isn't controlled though:
108    /// assert!((filter.false_positive_rate_at(1000) - 0.0215).abs() < 1e-4);
109    /// ```
110    pub fn new_from_size(bloom_bytes: usize, n_elems: u64) -> Self {
111        let params = BloomParams::new_from_size(bloom_bytes, n_elems);
112        let bits = Box::from(vec![0u8; params.byte_size].as_ref());
113        Self {
114            k_hashes: params.k_hashes,
115            bytes: bits,
116        }
117    }
118
119    /// Construct the bloom filter from existing components.
120    ///
121    /// This is useful when e.g. deserializing a bloom filter.
122    ///
123    /// # Example
124    ///
125    /// ```
126    /// use deterministic_bloom::runtime_size::BloomFilter;
127    ///
128    /// let mut filter = BloomFilter::new_from_fpr_po2(100_000, 0.01);
129    ///
130    /// filter.insert(b"Hello, World!");
131    ///
132    /// // Serialize
133    /// let k_hashes = filter.hash_count();
134    /// let bytes = Box::from(filter.as_bytes());
135    ///
136    /// // Deserialize
137    /// let filter2 = BloomFilter::new_with(k_hashes, bytes);
138    /// assert_eq!(filter, filter2);
139    /// ```
140    pub fn new_with(k_hashes: usize, bytes: Box<[u8]>) -> Self {
141        Self { k_hashes, bytes }
142    }
143
144    /// Compute the bloom parameters for this bloom filter.
145    /// This contains information about its size and hash function evaluations per
146    /// item (`k_hashes`).
147    pub fn get_bloom_params(&self) -> BloomParams {
148        BloomParams {
149            k_hashes: self.k_hashes,
150            byte_size: self.bytes.len(),
151        }
152    }
153
154    /// Get the approximate false positive rate at given capacity for this bloom filter.
155    /// Returns a number between 0 and 1.
156    pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
157        self.get_bloom_params().false_positive_rate_at(n_elems)
158    }
159
160    /// Get the approximate false positive rate at the current capacity of this bloom filter.
161    /// Returns a number between 0 and 1.
162    pub fn current_false_positive_rate(&self) -> f64 {
163        let m = (self.bytes.len() * 8) as f64;
164        let m_set = self.count_ones() as f64;
165        let load = m_set / m;
166        load.powi(self.hash_count() as i32)
167    }
168
169    /// Counts the amount of bits set in the bloom filter.
170    pub fn count_ones(&self) -> usize {
171        self.bytes.view_bits::<Lsb0>().count_ones()
172    }
173
174    /// Insert an element into the bloom filter.
175    ///
176    /// The element will be hashed, thus it needs to be representable as bytes.
177    ///
178    /// Note: If you're using the bloom filter in a non-trusted
179    /// environment, so e.g. the items can be chosen by an adversary, please
180    /// make sure to pre-hash your items with a cryptographic hashing function
181    /// like SHA-256 or BLAKE3.
182    /// Otherwise an adversary will be able to generate elements that cause
183    /// the bloom filter to e.g. be unusually full with an unusually high false
184    /// positive rate or cheaply generate elements that are false positives.
185    ///
186    /// # Example
187    ///
188    /// ```
189    /// use deterministic_bloom::runtime_size::BloomFilter;
190    ///
191    /// let mut filter = BloomFilter::new_from_fpr(1000, 0.0001);
192    ///
193    /// for i in 0u32..1000 {
194    ///     filter.insert(&i.to_le_bytes());
195    /// }
196    ///
197    /// // Slightly more than half filled with zeros
198    /// assert_eq!(filter.as_bytes().len() / 2 * 8, filter.count_ones() - 322);
199    ///
200    /// assert!(filter.contains(&10u32.to_le_bytes()));
201    /// assert!(!filter.contains(&1001u32.to_le_bytes())); // Except in 0.01%
202    /// ```
203    pub fn insert(&mut self, item: &impl AsRef<[u8]>) {
204        for i in self.hash_indices(item) {
205            self.bytes.view_bits_mut::<Lsb0>().set(i, true);
206        }
207    }
208
209    /// Check whether an element was added into the bloom filter.
210    ///
211    /// This will always return true if the element was indeed added before,
212    /// but it *may* sometimes return true, even if it wasn't.
213    /// This is called a false positive and the false positive rate
214    /// at certain capacities can be checked with [`false_positive_rate_at`](BloomFilter::false_positive_rate_at)
215    /// and a desired false positive rate can be configured on creation with
216    /// [`new_from_fpr`](BloomFilter::new_from_fpr) or [`new_from_fpr_po2`](BloomFilter::new_from_fpr_po2).
217    ///
218    /// # Example
219    ///
220    /// ```
221    /// use deterministic_bloom::runtime_size::BloomFilter;
222    ///
223    /// let mut filter = BloomFilter::new_from_fpr(100, 0.1); // very high false-positive rate
224    ///
225    /// for i in 0u32..100 {
226    ///     filter.insert(&i.to_le_bytes());
227    /// }
228    ///
229    /// // Inserted items will always return true
230    /// assert!(filter.contains(&50u32.to_le_bytes()));
231    /// // Non-inserted items mostly return false, but sometimes true
232    /// assert!(!filter.contains(&101u32.to_le_bytes()));
233    /// // But sometimes there exist false positives (in this case 10% of the time)
234    /// assert!(filter.contains(&106u32.to_le_bytes()));
235    /// ```
236    pub fn contains(&self, item: &impl AsRef<[u8]>) -> bool {
237        for i in self.hash_indices(item) {
238            if !self.bytes.view_bits::<Lsb0>()[i] {
239                return false;
240            }
241        }
242        true
243    }
244
245    /// Returns how many hash function invocations are used pre item inserted
246    pub fn hash_count(&self) -> usize {
247        self.k_hashes
248    }
249
250    /// Return the underlying array used to store the bloom bits (always on the heap)
251    pub fn as_bytes(&self) -> &[u8] {
252        &self.bytes
253    }
254
255    /// Return the indices that a given element would set in the filter
256    pub fn hash_indices<'a>(&self, item: &'a impl AsRef<[u8]>) -> impl Iterator<Item = usize> + 'a {
257        HashIndexIterator::new(item, self.bytes.len() * 8).take(self.hash_count())
258    }
259}
260
261impl Debug for BloomFilter {
262    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263        f.debug_struct("BloomFilter")
264            .field("k_hashes", &self.k_hashes)
265            .field("bytes", &HexFieldDebug(&self.bytes))
266            .finish()
267    }
268}
269
270#[cfg(test)]
271mod tests {
272    use super::BloomFilter;
273
274    #[test]
275    fn serialization_round_trip() {
276        let mut filter = BloomFilter::new_from_fpr(100, 0.001);
277        filter.insert(b"Hello");
278        filter.insert(b"World!");
279        let serialized_bytes = filter.as_bytes();
280        let serialized_k = filter.hash_count();
281        let deserialized = BloomFilter::new_with(serialized_k, Box::from(serialized_bytes));
282        assert!(deserialized.contains(b"Hello"));
283        assert!(!deserialized.contains(b"abc"));
284        assert_eq!(deserialized, filter);
285    }
286
287    #[test]
288    fn empty_bloom_filter() {
289        let filter = BloomFilter::new_with(3, Box::new([]));
290        // Technically an empty bloom "contains" anything, since everything is a false positive.
291        assert!(filter.contains(&[1, 2, 3]));
292    }
293}
294
295#[cfg(test)]
296mod proptests {
297    use super::BloomFilter;
298    use proptest::prop_assert;
299    use test_strategy::proptest;
300
301    #[proptest]
302    fn inserted_always_contained(items: Vec<u64>, #[strategy(100usize..10_000)] size: usize) {
303        let capacity = std::cmp::max(items.len() as u64, 1);
304        let mut filter = BloomFilter::new_from_size(size, capacity);
305
306        for item in items.iter() {
307            filter.insert(&item.to_le_bytes());
308        }
309
310        for item in items.iter() {
311            prop_assert!(filter.contains(&item.to_le_bytes()));
312        }
313    }
314
315    #[proptest]
316    fn false_positive_rate_as_predicted(
317        #[strategy(100u64..1_000)] n_elems: u64,
318        #[strategy(100.0..10_000.0)] inv_fpr: f64,
319    ) {
320        let fpr = 1.0 / inv_fpr;
321        let mut filter = BloomFilter::new_from_fpr(n_elems, fpr);
322
323        for i in 0..n_elems {
324            filter.insert(&i.to_le_bytes());
325        }
326
327        let measurements = 100_000;
328        let mut false_positives = 0;
329
330        for i in n_elems..n_elems + measurements {
331            if filter.contains(&i.to_le_bytes()) {
332                false_positives += 1;
333            }
334        }
335
336        let computed_fpr = false_positives as f64 / measurements as f64;
337        // The actual FPR should be pretty close
338        prop_assert!((computed_fpr - fpr).abs() < 1.5e-3);
339    }
340}