deterministic_bloom/common.rs
1use std::{f64::consts::LN_2, fmt::Debug};
2use xxhash_rust::xxh3;
3
4/// An iterator that generates indices into some bloom filter based on deterministic hashing of specified item.
5///
6/// # Examples
7///
8/// ```
9/// use deterministic_bloom::const_size::BloomFilter;
10///
11/// let filter = BloomFilter::<256, 30>::default();
12/// let indices = filter.hash_indices(&[0xF5u8; 32]);
13/// let indices = indices.collect::<Vec<_>>();
14///
15/// assert_eq!(indices.len(), 30);
16/// ```
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub struct HashIndexIterator<'a, T: AsRef<[u8]>> {
19 item: &'a T,
20 bit_size: usize,
21 index: u64,
22}
23
24/// Optimal bloom parameters for some false positive rate at a maximum number of
25/// elements added, or for some byte size with target element count, etc.
26///
27/// Captures the bloom filter byte size needed as well as the number of hash function
28/// evaluations needed per item to insert.
29///
30/// To construct this, use
31/// - [`BloomParams::new_from_fpr`] for constructing this from a given false positive rate and desired capacity,
32/// - similarly [`BloomParams::new_from_fpr_po2`], but with power-of-two sizes,
33/// - [`BloomParams::new_from_size`] for constructing from desired size and capacity.
34#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
35pub struct BloomParams {
36 /// size of the bloom filter in bytes, non-zero
37 pub byte_size: usize,
38 /// hashing functions used/number of bits set per element, non-zero
39 pub k_hashes: usize,
40}
41
42/// Errors for [BloomFilter] operations.
43#[derive(thiserror::Error, miette::Diagnostic, Debug)]
44pub enum Error {
45 /// Report a size mismatch when importing a Bloom filter from a [Vec].
46 #[error("Cannot convert vector to BloomFilter: expected {expected}, but got {actual}")]
47 #[diagnostic(url(docsrs))]
48 VectorImportSizeMismatch {
49 /// The expected size in the [BloomFilter].
50 expected: usize,
51
52 /// The actual size of the [Vec].
53 actual: usize,
54 },
55}
56
57//------------------------------------------------------------------------------
58// Implementations
59//------------------------------------------------------------------------------
60
61impl<'a, T: AsRef<[u8]>> HashIndexIterator<'a, T> {
62 /// Creates a new iterator.
63 pub fn new(item: &'a T, bit_size: usize) -> Self {
64 Self {
65 item,
66 index: 0,
67 bit_size,
68 }
69 }
70}
71
72impl<T: AsRef<[u8]>> Iterator for HashIndexIterator<'_, T> {
73 type Item = usize;
74
75 fn next(&mut self) -> Option<Self::Item> {
76 if self.bit_size == 0 {
77 // This avoids an infinite loop in rejection sampling.
78 return None;
79 }
80
81 let bit_size_po2 = self.bit_size.next_power_of_two();
82 loop {
83 let hash = xxh3::xxh3_64_with_seed(self.item.as_ref(), self.index) as usize;
84 self.index += 1;
85
86 // Rejection sampling for non-power-of-two bit sizes
87 let value = hash % bit_size_po2;
88 if value < self.bit_size {
89 return Some(value);
90 }
91 }
92 }
93}
94
95impl BloomParams {
96 /// Construct optimal bloom parameters for given number maximum elements
97 /// that the bloom filter will hold as well as the approximate
98 /// false positive rate it should have at that capacity.
99 ///
100 /// `n_elems` must be non-zero, and `fpr` must be between 0 and 1, exclusive.
101 ///
102 /// This will generate non-power-of-two sizes for bloom filters.
103 /// For a variant that power-of-two (po2) sizes, see [`BloomParams::new_from_fpr_po2`].
104 ///
105 /// # Example
106 ///
107 /// ```
108 /// use deterministic_bloom::common::BloomParams;
109 ///
110 /// // figure out bloom parameters for 47 elements with a one in a billion false positive rate:
111 /// let params = BloomParams::new_from_fpr(47, 1.0 / 1_000_000_000.0);
112 /// assert_eq!(params, BloomParams {
113 /// byte_size: 254,
114 /// k_hashes: 30,
115 /// })
116 /// ```
117 pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
118 let byte_size = Self::optimal_byte_size(n_elems, fpr);
119 let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
120
121 Self {
122 byte_size,
123 k_hashes,
124 }
125 }
126
127 /// Construct optimal bloom parameters for given capacity `n_elems` and false positive rate,
128 /// where the target size will always be a power-of-two.
129 ///
130 /// `n_elems` must be non-zero, and `fpr` must be between 0.0 and 1.0, exclusive.
131 ///
132 /// It is often desirable to go for power-of-two sizes, since that simplifies generating
133 /// bit indices by not requiring rejection sampling.
134 ///
135 /// # Example
136 ///
137 /// ```
138 /// use deterministic_bloom::common::BloomParams;
139 ///
140 /// // Generate some bloom parameters
141 /// let params = BloomParams::new_from_fpr_po2(1_000_000, 0.0001);
142 /// assert_eq!(params.byte_size, params.byte_size.next_power_of_two());
143 /// ```
144 pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
145 let byte_size = Self::optimal_byte_size(n_elems, fpr).next_power_of_two();
146 let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
147
148 Self {
149 byte_size,
150 k_hashes,
151 }
152 }
153
154 /// Construct optimal bloom parameters for given bloom filter `byte_size` and capacity `n_elems`.
155 pub fn new_from_size(byte_size: usize, n_elems: u64) -> Self {
156 Self {
157 byte_size,
158 k_hashes: Self::optimal_k_hashes(byte_size * 8, n_elems),
159 }
160 }
161
162 /// Compute the approximate false positive rate at `n_elems`.
163 /// `n_elems` must be non-zero.
164 ///
165 /// Returns the false positive rate as a number between 0.0 and 1.0.
166 pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
167 debug_assert!(n_elems != 0);
168
169 let k = self.k_hashes as f64;
170 let ki = self.k_hashes as i32;
171 let m = (self.byte_size * 8) as f64;
172 let n = n_elems as f64;
173
174 // see https://hur.st/bloomfilter/
175 (1.0 - (-k / (m / n)).exp()).powi(ki)
176 }
177
178 fn optimal_byte_size(n_elems: u64, fpr: f64) -> usize {
179 debug_assert!(n_elems != 0);
180 debug_assert!(fpr > 0.0 && fpr < 1.0);
181
182 let n = n_elems as f64;
183 let bit_size = n * fpr.ln() / -(LN_2 * LN_2);
184 (bit_size / 8.0).ceil() as usize
185 }
186
187 fn optimal_k_hashes(bloom_bits: usize, n_elems: u64) -> usize {
188 debug_assert!(bloom_bits != 0);
189 debug_assert!(n_elems != 0);
190
191 let m = bloom_bits as f64;
192 let n = n_elems as f64;
193 let k_hashes = ((m / n) * LN_2).ceil() as usize;
194 std::cmp::max(k_hashes, 1)
195 }
196}
197
198#[cfg(test)]
199mod tests {
200 use super::HashIndexIterator;
201
202 #[test]
203 fn test_zero_bit_size() {
204 let mut iterator = HashIndexIterator::new(&[1, 2, 3], 0);
205 assert_eq!(iterator.next(), None);
206 }
207}
208
209#[cfg(test)]
210mod proptests {
211 use super::BloomParams;
212 use proptest::prop_assert;
213 use test_strategy::proptest;
214
215 #[proptest(cases = 10_000)]
216 fn bloom_params_fpr_calc_round_trips(
217 #[strategy(100u64..1_000_000)] n_elems: u64,
218 #[strategy(0.0..0.1)] fpr: f64,
219 ) {
220 if fpr == 0.0 {
221 return Ok(());
222 }
223
224 let params = BloomParams::new_from_fpr(n_elems, fpr);
225 let fpr_computed = params.false_positive_rate_at(n_elems);
226
227 // The computed FPR can differ from the target FPR due to
228 // rounding errors and the fact that only multiple-of-8
229 // bloom sizes are allowed.
230 let fpr_diff = (fpr_computed - fpr).abs();
231 // We're fine if it's within 15% of a margin-of-error.
232 prop_assert!(fpr_diff < fpr * 0.15);
233 }
234}