1use serde::{Deserialize, Serialize};
34
35const MIN_BITS: u64 = 64;
38
39const MIN_HASHES: u8 = 1;
42
43const MAX_HASHES: u8 = 32;
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct BloomFilter {
51 bits: Vec<u64>,
53 hashes: u8,
55 n_bits: u64,
58}
59
60impl BloomFilter {
61 #[must_use]
78 pub fn with_size_and_fp_rate(n: usize, fp: f64) -> Self {
79 let fp = if fp.is_nan() || fp <= 0.0 {
80 0.000_001
81 } else if fp >= 1.0 {
82 0.999_999
83 } else {
84 fp
85 };
86
87 let (n_bits, hashes) = compute_params(n, fp);
88 let chunk_count = (n_bits / 64) as usize;
89 Self {
90 bits: vec![0_u64; chunk_count],
91 hashes,
92 n_bits,
93 }
94 }
95
96 #[must_use]
98 pub fn n_bits(&self) -> u64 {
99 self.n_bits
100 }
101
102 #[must_use]
104 pub fn hash_count(&self) -> u8 {
105 self.hashes
106 }
107
108 pub fn insert(&mut self, key: &[u8]) {
113 for idx in self.indices(key) {
114 self.set_bit(idx);
115 }
116 }
117
118 #[must_use]
126 pub fn contains(&self, key: &[u8]) -> bool {
127 for idx in self.indices(key) {
128 if !self.test_bit(idx) {
129 return false;
130 }
131 }
132 true
133 }
134
135 #[must_use]
143 pub fn false_positive_rate(&self, n_inserted: usize) -> f64 {
144 if n_inserted == 0 || self.n_bits == 0 {
145 return 0.0;
146 }
147 let k = f64::from(self.hashes);
148 let n = usize_to_f64(n_inserted);
149 let m = u64_to_f64(self.n_bits);
150 let occupancy = 1.0 - (-k * n / m).exp();
151 occupancy.powf(k)
152 }
153
154 fn indices(&self, key: &[u8]) -> impl Iterator<Item = u64> {
160 let h = blake3::hash(key);
161 let bytes = h.as_bytes();
162 let mut h1_buf = [0_u8; 8];
163 let mut h2_buf = [0_u8; 8];
164 h1_buf.copy_from_slice(&bytes[0..8]);
165 h2_buf.copy_from_slice(&bytes[8..16]);
166 let h1 = u64::from_le_bytes(h1_buf);
167 let h2 = u64::from_le_bytes(h2_buf);
168 let n_bits = self.n_bits;
169 let k = self.hashes;
170 (0..u64::from(k)).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) % n_bits)
171 }
172
173 fn set_bit(&mut self, idx: u64) {
174 let chunk = u64_to_usize(idx / 64);
175 let off = idx % 64;
176 self.bits[chunk] |= 1_u64 << off;
177 }
178
179 fn test_bit(&self, idx: u64) -> bool {
180 let chunk = u64_to_usize(idx / 64);
181 let off = idx % 64;
182 (self.bits[chunk] >> off) & 1 == 1
183 }
184}
185
186fn compute_params(n: usize, fp: f64) -> (u64, u8) {
192 let n_f = usize_to_f64(n);
193 let raw_m = if n == 0 {
194 u64_to_f64(MIN_BITS)
195 } else {
196 -n_f * fp.ln() / (std::f64::consts::LN_2 * std::f64::consts::LN_2)
197 };
198 let m_rounded = raw_m.ceil().max(u64_to_f64(MIN_BITS));
199
200 let m_as_u64 = f64_to_u64_saturating(m_rounded);
203 let m_u64_chunks = m_as_u64.div_ceil(64).max(1);
204 let n_bits = m_u64_chunks * 64;
205
206 let k_raw = if n == 0 {
207 f64::from(MIN_HASHES)
208 } else {
209 (u64_to_f64(n_bits) / n_f) * std::f64::consts::LN_2
210 };
211 let k_rounded = k_raw.ceil().max(f64::from(MIN_HASHES));
212 let hashes_u32 = f64_to_u32_saturating(k_rounded).min(u32::from(MAX_HASHES));
213 let hashes = u8::try_from(hashes_u32)
214 .unwrap_or(MAX_HASHES)
215 .max(MIN_HASHES);
216 (n_bits, hashes)
217}
218
219#[allow(
224 clippy::cast_precision_loss,
225 reason = "bloom dimension formula: u64 -> f64 widens by design; values are bounded by the configured bit count and have far fewer than 2^53 significant bits in practice."
226)]
227fn u64_to_f64(x: u64) -> f64 {
228 x as f64
229}
230
231#[allow(
233 clippy::cast_precision_loss,
234 reason = "bloom dimension formula: usize -> f64 may round for n above 2^53; that is the same precision loss the reference Mitzenmacher derivation accepts because the formula uses ln(fp), which dominates the rounding budget."
235)]
236fn usize_to_f64(x: usize) -> f64 {
237 x as f64
238}
239
240#[allow(
243 clippy::cast_possible_truncation,
244 clippy::cast_sign_loss,
245 reason = "bloom dimension formula: f64 -> u64 with explicit guards. Caller has already ceiled the input and the formula's outputs are positive; the saturating cast pins the rare overflow case to u64::MAX."
246)]
247fn f64_to_u64_saturating(x: f64) -> u64 {
248 if !x.is_finite() || x <= 0.0 {
249 return 0;
250 }
251 if x >= u64_to_f64(u64::MAX) {
252 return u64::MAX;
253 }
254 x as u64
255}
256
257#[allow(
260 clippy::cast_possible_truncation,
261 clippy::cast_sign_loss,
262 reason = "bloom dimension formula: f64 -> u32 with explicit guards. The hash count is bounded by MAX_HASHES (32) so the cast is well within u32 range; the saturation arm guards against pathological inputs."
263)]
264fn f64_to_u32_saturating(x: f64) -> u32 {
265 if !x.is_finite() || x <= 0.0 {
266 return 0;
267 }
268 if x >= f64::from(u32::MAX) {
269 return u32::MAX;
270 }
271 x as u32
272}
273
274#[allow(
280 clippy::cast_possible_truncation,
281 reason = "indexing into a Vec<u64> by chunk: idx is already bounded by self.n_bits, which was derived from a Vec capacity (usize-bounded) at construction time."
282)]
283fn u64_to_usize(x: u64) -> usize {
284 x as usize
285}
286
287#[cfg(test)]
288mod tests {
289 use super::*;
290
291 #[test]
292 fn bloom_no_false_negatives_on_inserted_keys() {
293 let mut bf = BloomFilter::with_size_and_fp_rate(1000, 0.01);
294 let keys: Vec<Vec<u8>> = (0..200_u32)
295 .map(|i| format!("key-{i}").into_bytes())
296 .collect();
297 for k in &keys {
298 bf.insert(k);
299 }
300 for k in &keys {
301 assert!(bf.contains(k), "false negative on inserted key {k:?}");
302 }
303 }
304
305 #[test]
306 fn bloom_false_positive_rate_under_5pct_at_design_load() {
307 let n: u32 = 10_000;
308 let fp_target = 0.01;
309 let mut bf = BloomFilter::with_size_and_fp_rate(n as usize, fp_target);
310 for i in 0..n {
311 let k = format!("inserted-{i}");
312 bf.insert(k.as_bytes());
313 }
314 let probes = 5_000_u32;
316 let mut fps: u32 = 0;
317 for i in 0..probes {
318 let k = format!("probe-{i}-not-in-filter");
319 if bf.contains(k.as_bytes()) {
320 fps += 1;
321 }
322 }
323 let observed = f64::from(fps) / f64::from(probes);
324 assert!(
325 observed < 0.05,
326 "observed fp rate {observed} exceeded 5%; \
327 theoretical={}",
328 bf.false_positive_rate(n as usize)
329 );
330 }
331
332 #[test]
333 fn bloom_zero_keys_returns_false_for_anything() {
334 let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
335 assert!(!bf.contains(b"nope"));
336 assert!(!bf.contains(b""));
337 assert!(!bf.contains(b"another"));
338 }
339
340 #[test]
341 fn bloom_with_zero_n_does_not_panic() {
342 let mut bf = BloomFilter::with_size_and_fp_rate(0, 0.01);
343 assert_eq!(bf.n_bits(), MIN_BITS);
344 bf.insert(b"x");
345 assert!(bf.contains(b"x"));
346 }
347
348 #[test]
349 fn bloom_with_extreme_fp_rates_clamps() {
350 let bf_low = BloomFilter::with_size_and_fp_rate(100, 0.0);
351 assert!(bf_low.n_bits() >= MIN_BITS);
352 let bf_high = BloomFilter::with_size_and_fp_rate(100, 1.0);
353 assert!(bf_high.n_bits() >= MIN_BITS);
354 let bf_nan = BloomFilter::with_size_and_fp_rate(100, f64::NAN);
355 assert!(bf_nan.n_bits() >= MIN_BITS);
356 }
357
358 #[test]
359 fn bloom_hash_count_is_capped() {
360 let bf = BloomFilter::with_size_and_fp_rate(10, 1e-30);
363 assert!(bf.hash_count() <= MAX_HASHES);
364 }
365
366 #[test]
367 fn bloom_false_positive_rate_is_zero_for_empty_filter() {
368 let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
369 let rate = bf.false_positive_rate(0);
370 assert!(rate.abs() < f64::EPSILON);
371 }
372}