deterministic_bloom/
common.rs

1use std::{f64::consts::LN_2, fmt::Debug};
2use xxhash_rust::xxh3;
3
4/// An iterator that generates indices into some bloom filter based on deterministic hashing of specified item.
5///
6/// # Examples
7///
8/// ```
9/// use deterministic_bloom::const_size::BloomFilter;
10///
11/// let filter = BloomFilter::<256, 30>::default();
12/// let indices = filter.hash_indices(&[0xF5u8; 32]);
13/// let indices = indices.collect::<Vec<_>>();
14///
15/// assert_eq!(indices.len(), 30);
16/// ```
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub struct HashIndexIterator<'a, T: AsRef<[u8]>> {
19    item: &'a T,
20    bit_size: usize,
21    index: u64,
22}
23
24/// Optimal bloom parameters for some false positive rate at a maximum number of
25/// elements added, or for some byte size with target element count, etc.
26///
27/// Captures the bloom filter byte size needed as well as the number of hash function
28/// evaluations needed per item to insert.
29///
30/// To construct this, use
31/// - [`BloomParams::new_from_fpr`] for constructing this from a given false positive rate and desired capacity,
32/// - similarly [`BloomParams::new_from_fpr_po2`], but with power-of-two sizes,
33/// - [`BloomParams::new_from_size`] for constructing from desired size and capacity.
34#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
35pub struct BloomParams {
36    /// size of the bloom filter in bytes, non-zero
37    pub byte_size: usize,
38    /// hashing functions used/number of bits set per element, non-zero
39    pub k_hashes: usize,
40}
41
42/// Errors for [BloomFilter] operations.
43#[derive(thiserror::Error, miette::Diagnostic, Debug)]
44pub enum Error {
45    /// Report a size mismatch when importing a Bloom filter from a [Vec].
46    #[error("Cannot convert vector to BloomFilter: expected {expected}, but got {actual}")]
47    #[diagnostic(url(docsrs))]
48    VectorImportSizeMismatch {
49        /// The expected size in the [BloomFilter].
50        expected: usize,
51
52        /// The actual size of the [Vec].
53        actual: usize,
54    },
55}
56
57//------------------------------------------------------------------------------
58// Implementations
59//------------------------------------------------------------------------------
60
61impl<'a, T: AsRef<[u8]>> HashIndexIterator<'a, T> {
62    /// Creates a new iterator.
63    pub fn new(item: &'a T, bit_size: usize) -> Self {
64        Self {
65            item,
66            index: 0,
67            bit_size,
68        }
69    }
70}
71
72impl<T: AsRef<[u8]>> Iterator for HashIndexIterator<'_, T> {
73    type Item = usize;
74
75    fn next(&mut self) -> Option<Self::Item> {
76        if self.bit_size == 0 {
77            // This avoids an infinite loop in rejection sampling.
78            return None;
79        }
80
81        let bit_size_po2 = self.bit_size.next_power_of_two();
82        loop {
83            let hash = xxh3::xxh3_64_with_seed(self.item.as_ref(), self.index) as usize;
84            self.index += 1;
85
86            // Rejection sampling for non-power-of-two bit sizes
87            let value = hash % bit_size_po2;
88            if value < self.bit_size {
89                return Some(value);
90            }
91        }
92    }
93}
94
95impl BloomParams {
96    /// Construct optimal bloom parameters for given number maximum elements
97    /// that the bloom filter will hold as well as the approximate
98    /// false positive rate it should have at that capacity.
99    ///
100    /// `n_elems` must be non-zero, and `fpr` must be between 0 and 1, exclusive.
101    ///
102    /// This will generate non-power-of-two sizes for bloom filters.
103    /// For a variant that power-of-two (po2) sizes, see [`BloomParams::new_from_fpr_po2`].
104    ///
105    /// # Example
106    ///
107    /// ```
108    /// use deterministic_bloom::common::BloomParams;
109    ///
110    /// // figure out bloom parameters for 47 elements with a one in a billion false positive rate:
111    /// let params = BloomParams::new_from_fpr(47, 1.0 / 1_000_000_000.0);
112    /// assert_eq!(params, BloomParams {
113    ///     byte_size: 254,
114    ///     k_hashes: 30,
115    /// })
116    /// ```
117    pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
118        let byte_size = Self::optimal_byte_size(n_elems, fpr);
119        let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
120
121        Self {
122            byte_size,
123            k_hashes,
124        }
125    }
126
127    /// Construct optimal bloom parameters for given capacity `n_elems` and false positive rate,
128    /// where the target size will always be a power-of-two.
129    ///
130    /// `n_elems` must be non-zero, and `fpr` must be between 0.0 and 1.0, exclusive.
131    ///
132    /// It is often desirable to go for power-of-two sizes, since that simplifies generating
133    /// bit indices by not requiring rejection sampling.
134    ///
135    /// # Example
136    ///
137    /// ```
138    /// use deterministic_bloom::common::BloomParams;
139    ///
140    /// // Generate some bloom parameters
141    /// let params = BloomParams::new_from_fpr_po2(1_000_000, 0.0001);
142    /// assert_eq!(params.byte_size, params.byte_size.next_power_of_two());
143    /// ```
144    pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
145        let byte_size = Self::optimal_byte_size(n_elems, fpr).next_power_of_two();
146        let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
147
148        Self {
149            byte_size,
150            k_hashes,
151        }
152    }
153
154    /// Construct optimal bloom parameters for given bloom filter `byte_size` and capacity `n_elems`.
155    pub fn new_from_size(byte_size: usize, n_elems: u64) -> Self {
156        Self {
157            byte_size,
158            k_hashes: Self::optimal_k_hashes(byte_size * 8, n_elems),
159        }
160    }
161
162    /// Compute the approximate false positive rate at `n_elems`.
163    /// `n_elems` must be non-zero.
164    ///
165    /// Returns the false positive rate as a number between 0.0 and 1.0.
166    pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
167        debug_assert!(n_elems != 0);
168
169        let k = self.k_hashes as f64;
170        let ki = self.k_hashes as i32;
171        let m = (self.byte_size * 8) as f64;
172        let n = n_elems as f64;
173
174        // see https://hur.st/bloomfilter/
175        (1.0 - (-k / (m / n)).exp()).powi(ki)
176    }
177
178    fn optimal_byte_size(n_elems: u64, fpr: f64) -> usize {
179        debug_assert!(n_elems != 0);
180        debug_assert!(fpr > 0.0 && fpr < 1.0);
181
182        let n = n_elems as f64;
183        let bit_size = n * fpr.ln() / -(LN_2 * LN_2);
184        (bit_size / 8.0).ceil() as usize
185    }
186
187    fn optimal_k_hashes(bloom_bits: usize, n_elems: u64) -> usize {
188        debug_assert!(bloom_bits != 0);
189        debug_assert!(n_elems != 0);
190
191        let m = bloom_bits as f64;
192        let n = n_elems as f64;
193        let k_hashes = ((m / n) * LN_2).ceil() as usize;
194        std::cmp::max(k_hashes, 1)
195    }
196}
197
198#[cfg(test)]
199mod tests {
200    use super::HashIndexIterator;
201
202    #[test]
203    fn test_zero_bit_size() {
204        let mut iterator = HashIndexIterator::new(&[1, 2, 3], 0);
205        assert_eq!(iterator.next(), None);
206    }
207}
208
209#[cfg(test)]
210mod proptests {
211    use super::BloomParams;
212    use proptest::prop_assert;
213    use test_strategy::proptest;
214
215    #[proptest(cases = 10_000)]
216    fn bloom_params_fpr_calc_round_trips(
217        #[strategy(100u64..1_000_000)] n_elems: u64,
218        #[strategy(0.0..0.1)] fpr: f64,
219    ) {
220        if fpr == 0.0 {
221            return Ok(());
222        }
223
224        let params = BloomParams::new_from_fpr(n_elems, fpr);
225        let fpr_computed = params.false_positive_rate_at(n_elems);
226
227        // The computed FPR can differ from the target FPR due to
228        // rounding errors and the fact that only multiple-of-8
229        // bloom sizes are allowed.
230        let fpr_diff = (fpr_computed - fpr).abs();
231        // We're fine if it's within 15% of a margin-of-error.
232        prop_assert!(fpr_diff < fpr * 0.15);
233    }
234}