Skip to main content

dyntext/
bloom.rs

1//! Standard bloom filter with configurable bit count and hash
2//! count.
3//!
4//! The filter is parameterised by a target false positive rate
5//! `fp` and an expected insert count `n`. The constructor
6//! computes the optimal bit count and hash count from the
7//! standard textbook formulas:
8//!
9//! ```text
10//! m_bits  = ceil( -n * ln(fp) / (ln 2)^2 )
11//! k_hashes = ceil( (m_bits / n) * ln 2 )
12//! ```
13//!
14//! # Hashing
15//!
16//! Each insert / contains call BLAKE3-hashes the key once and
17//! synthesises the `k` bit indices via double hashing
18//! (`h1 + i * h2 mod m`). This is the standard Kirsch-Mitzenmacher
19//! construction; for two independent base hashes the
20//! double-hashing scheme reproduces the false positive rate of
21//! a true `k`-independent hash family within rounding error,
22//! and BLAKE3's first sixteen output bytes split cleanly into
23//! two independent `u64` halves.
24//!
25//! # No false negatives
26//!
27//! `contains` returns `true` for every key the filter has ever
28//! observed via `insert`. Conversely, a `true` return does NOT
29//! prove the key was inserted; the false positive rate is
30//! determined by the constructor parameters and the actual
31//! number of distinct keys inserted.
32
33use serde::{Deserialize, Serialize};
34
35/// Minimum bit count. Provides a usable filter even when the
36/// caller's `n` is zero (degenerate corner case).
37const MIN_BITS: u64 = 64;
38
39/// Minimum number of hash functions. A single hash is enough to
40/// give the filter its no-false-negative guarantee.
41const MIN_HASHES: u8 = 1;
42
43/// Maximum number of hash functions. Beyond this point the
44/// false-positive rate climbs again and additional hashes only
45/// slow inserts and lookups; cap it at a sensible ceiling.
46const MAX_HASHES: u8 = 32;
47
48/// Bloom filter over arbitrary byte keys.
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct BloomFilter {
51    /// Bit vector packed into `u64` chunks.
52    bits: Vec<u64>,
53    /// Number of hash functions per insert / lookup.
54    hashes: u8,
55    /// Total bit count `64 * bits.len()`. Cached so the modulo
56    /// computation does not re-multiply on every hash.
57    n_bits: u64,
58}
59
60impl BloomFilter {
61    /// Construct a filter sized for `n` expected inserts at a
62    /// target false positive rate of `fp`.
63    ///
64    /// `fp` must be strictly between 0 and 1; values outside
65    /// that range are clamped. `n` of zero produces the minimum
66    /// usable filter (64 bits, 1 hash) so callers do not need
67    /// to special-case empty corpora.
68    ///
69    /// # Examples
70    ///
71    /// ```
72    /// use dyntext::bloom::BloomFilter;
73    /// let mut bf = BloomFilter::with_size_and_fp_rate(1000, 0.01);
74    /// bf.insert(b"hello");
75    /// assert!(bf.contains(b"hello"));
76    /// ```
77    #[must_use]
78    pub fn with_size_and_fp_rate(n: usize, fp: f64) -> Self {
79        let fp = if fp.is_nan() || fp <= 0.0 {
80            0.000_001
81        } else if fp >= 1.0 {
82            0.999_999
83        } else {
84            fp
85        };
86
87        let (n_bits, hashes) = compute_params(n, fp);
88        let chunk_count = (n_bits / 64) as usize;
89        Self {
90            bits: vec![0_u64; chunk_count],
91            hashes,
92            n_bits,
93        }
94    }
95
96    /// Total bit count (multiple of 64).
97    #[must_use]
98    pub fn n_bits(&self) -> u64 {
99        self.n_bits
100    }
101
102    /// Number of hash functions per insert / lookup.
103    #[must_use]
104    pub fn hash_count(&self) -> u8 {
105        self.hashes
106    }
107
108    /// Insert `key` into the filter.
109    ///
110    /// After this call, [`Self::contains`] returns `true` for
111    /// `key`. Repeated inserts of the same key are idempotent.
112    pub fn insert(&mut self, key: &[u8]) {
113        for idx in self.indices(key) {
114            self.set_bit(idx);
115        }
116    }
117
118    /// Test whether `key` MAY be present.
119    ///
120    /// Returns `false` only if `key` has provably never been
121    /// inserted (no false negatives). Returns `true` if every
122    /// hash bit is set, which can happen either because `key`
123    /// was inserted or because the bits were collectively set
124    /// by other keys (false positive).
125    #[must_use]
126    pub fn contains(&self, key: &[u8]) -> bool {
127        for idx in self.indices(key) {
128            if !self.test_bit(idx) {
129                return false;
130            }
131        }
132        true
133    }
134
135    /// Theoretical false positive rate after `n_inserted`
136    /// distinct insertions.
137    ///
138    /// Formula: `(1 - exp(-k * n / m))^k`, where `k` is the
139    /// hash count and `m` is the bit count. Returns 0 for an
140    /// empty filter (no inserts) and approaches 1 for a
141    /// fully saturated filter.
142    #[must_use]
143    pub fn false_positive_rate(&self, n_inserted: usize) -> f64 {
144        if n_inserted == 0 || self.n_bits == 0 {
145            return 0.0;
146        }
147        let k = f64::from(self.hashes);
148        let n = usize_to_f64(n_inserted);
149        let m = u64_to_f64(self.n_bits);
150        let occupancy = 1.0 - (-k * n / m).exp();
151        occupancy.powf(k)
152    }
153
154    /// Compute the `k` bit indices a key hashes to.
155    ///
156    /// Uses double hashing (`h1 + i * h2`) over the first
157    /// 16 bytes of the BLAKE3 digest split as two `u64`s. This
158    /// is the Kirsch-Mitzenmacher construction.
159    fn indices(&self, key: &[u8]) -> impl Iterator<Item = u64> {
160        let h = blake3::hash(key);
161        let bytes = h.as_bytes();
162        let mut h1_buf = [0_u8; 8];
163        let mut h2_buf = [0_u8; 8];
164        h1_buf.copy_from_slice(&bytes[0..8]);
165        h2_buf.copy_from_slice(&bytes[8..16]);
166        let h1 = u64::from_le_bytes(h1_buf);
167        let h2 = u64::from_le_bytes(h2_buf);
168        let n_bits = self.n_bits;
169        let k = self.hashes;
170        (0..u64::from(k)).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) % n_bits)
171    }
172
173    fn set_bit(&mut self, idx: u64) {
174        let chunk = u64_to_usize(idx / 64);
175        let off = idx % 64;
176        self.bits[chunk] |= 1_u64 << off;
177    }
178
179    fn test_bit(&self, idx: u64) -> bool {
180        let chunk = u64_to_usize(idx / 64);
181        let off = idx % 64;
182        (self.bits[chunk] >> off) & 1 == 1
183    }
184}
185
186/// Compute `(n_bits, hashes)` for a filter sized for `n` keys at
187/// false positive rate `fp` (already clamped to `(0, 1)`).
188///
189/// Returns `n_bits` rounded up to a multiple of 64, and `hashes`
190/// clamped into `[MIN_HASHES, MAX_HASHES]`.
191fn compute_params(n: usize, fp: f64) -> (u64, u8) {
192    let n_f = usize_to_f64(n);
193    let raw_m = if n == 0 {
194        u64_to_f64(MIN_BITS)
195    } else {
196        -n_f * fp.ln() / (std::f64::consts::LN_2 * std::f64::consts::LN_2)
197    };
198    let m_rounded = raw_m.ceil().max(u64_to_f64(MIN_BITS));
199
200    // Round up to the next multiple of 64 so the bit vector
201    // packs cleanly into u64 chunks.
202    let m_as_u64 = f64_to_u64_saturating(m_rounded);
203    let m_u64_chunks = m_as_u64.div_ceil(64).max(1);
204    let n_bits = m_u64_chunks * 64;
205
206    let k_raw = if n == 0 {
207        f64::from(MIN_HASHES)
208    } else {
209        (u64_to_f64(n_bits) / n_f) * std::f64::consts::LN_2
210    };
211    let k_rounded = k_raw.ceil().max(f64::from(MIN_HASHES));
212    let hashes_u32 = f64_to_u32_saturating(k_rounded).min(u32::from(MAX_HASHES));
213    let hashes = u8::try_from(hashes_u32)
214        .unwrap_or(MAX_HASHES)
215        .max(MIN_HASHES);
216    (n_bits, hashes)
217}
218
219/// Lossless conversion of a small `u64` (always <= 2^53) into
220/// `f64`. Wider values lose precision, which is acceptable for
221/// the bloom-dimension formula but is documented at every call
222/// site.
223#[allow(
224    clippy::cast_precision_loss,
225    reason = "bloom dimension formula: u64 -> f64 widens by design; values are bounded by the configured bit count and have far fewer than 2^53 significant bits in practice."
226)]
227fn u64_to_f64(x: u64) -> f64 {
228    x as f64
229}
230
231/// Lossless-or-rounded conversion of `usize` to `f64`.
232#[allow(
233    clippy::cast_precision_loss,
234    reason = "bloom dimension formula: usize -> f64 may round for n above 2^53; that is the same precision loss the reference Mitzenmacher derivation accepts because the formula uses ln(fp), which dominates the rounding budget."
235)]
236fn usize_to_f64(x: usize) -> f64 {
237    x as f64
238}
239
240/// Convert a non-negative finite `f64` to `u64`, saturating at
241/// `u64::MAX`. Negative or NaN inputs are mapped to 0.
242#[allow(
243    clippy::cast_possible_truncation,
244    clippy::cast_sign_loss,
245    reason = "bloom dimension formula: f64 -> u64 with explicit guards. Caller has already ceiled the input and the formula's outputs are positive; the saturating cast pins the rare overflow case to u64::MAX."
246)]
247fn f64_to_u64_saturating(x: f64) -> u64 {
248    if !x.is_finite() || x <= 0.0 {
249        return 0;
250    }
251    if x >= u64_to_f64(u64::MAX) {
252        return u64::MAX;
253    }
254    x as u64
255}
256
257/// Convert a non-negative finite `f64` to `u32`, saturating at
258/// `u32::MAX`. Negative or NaN inputs are mapped to 0.
259#[allow(
260    clippy::cast_possible_truncation,
261    clippy::cast_sign_loss,
262    reason = "bloom dimension formula: f64 -> u32 with explicit guards. The hash count is bounded by MAX_HASHES (32) so the cast is well within u32 range; the saturation arm guards against pathological inputs."
263)]
264fn f64_to_u32_saturating(x: f64) -> u32 {
265    if !x.is_finite() || x <= 0.0 {
266        return 0;
267    }
268    if x >= f64::from(u32::MAX) {
269        return u32::MAX;
270    }
271    x as u32
272}
273
274/// Convert a `u64` bit-vector index to `usize`. On 64-bit
275/// targets this is lossless; on 32-bit targets values larger
276/// than `usize::MAX` are saturated, but the bloom code never
277/// produces them because the bit-vector length is itself bounded
278/// by the host's heap allocator (which is also `usize`-bounded).
279#[allow(
280    clippy::cast_possible_truncation,
281    reason = "indexing into a Vec<u64> by chunk: idx is already bounded by self.n_bits, which was derived from a Vec capacity (usize-bounded) at construction time."
282)]
283fn u64_to_usize(x: u64) -> usize {
284    x as usize
285}
286
287#[cfg(test)]
288mod tests {
289    use super::*;
290
291    #[test]
292    fn bloom_no_false_negatives_on_inserted_keys() {
293        let mut bf = BloomFilter::with_size_and_fp_rate(1000, 0.01);
294        let keys: Vec<Vec<u8>> = (0..200_u32)
295            .map(|i| format!("key-{i}").into_bytes())
296            .collect();
297        for k in &keys {
298            bf.insert(k);
299        }
300        for k in &keys {
301            assert!(bf.contains(k), "false negative on inserted key {k:?}");
302        }
303    }
304
305    #[test]
306    fn bloom_false_positive_rate_under_5pct_at_design_load() {
307        let n: u32 = 10_000;
308        let fp_target = 0.01;
309        let mut bf = BloomFilter::with_size_and_fp_rate(n as usize, fp_target);
310        for i in 0..n {
311            let k = format!("inserted-{i}");
312            bf.insert(k.as_bytes());
313        }
314        // Probe with disjoint keys; count false positives.
315        let probes = 5_000_u32;
316        let mut fps: u32 = 0;
317        for i in 0..probes {
318            let k = format!("probe-{i}-not-in-filter");
319            if bf.contains(k.as_bytes()) {
320                fps += 1;
321            }
322        }
323        let observed = f64::from(fps) / f64::from(probes);
324        assert!(
325            observed < 0.05,
326            "observed fp rate {observed} exceeded 5%; \
327             theoretical={}",
328            bf.false_positive_rate(n as usize)
329        );
330    }
331
332    #[test]
333    fn bloom_zero_keys_returns_false_for_anything() {
334        let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
335        assert!(!bf.contains(b"nope"));
336        assert!(!bf.contains(b""));
337        assert!(!bf.contains(b"another"));
338    }
339
340    #[test]
341    fn bloom_with_zero_n_does_not_panic() {
342        let mut bf = BloomFilter::with_size_and_fp_rate(0, 0.01);
343        assert_eq!(bf.n_bits(), MIN_BITS);
344        bf.insert(b"x");
345        assert!(bf.contains(b"x"));
346    }
347
348    #[test]
349    fn bloom_with_extreme_fp_rates_clamps() {
350        let bf_low = BloomFilter::with_size_and_fp_rate(100, 0.0);
351        assert!(bf_low.n_bits() >= MIN_BITS);
352        let bf_high = BloomFilter::with_size_and_fp_rate(100, 1.0);
353        assert!(bf_high.n_bits() >= MIN_BITS);
354        let bf_nan = BloomFilter::with_size_and_fp_rate(100, f64::NAN);
355        assert!(bf_nan.n_bits() >= MIN_BITS);
356    }
357
358    #[test]
359    fn bloom_hash_count_is_capped() {
360        // Asking for a microscopic fp rate should cap k at
361        // MAX_HASHES rather than spending forever per query.
362        let bf = BloomFilter::with_size_and_fp_rate(10, 1e-30);
363        assert!(bf.hash_count() <= MAX_HASHES);
364    }
365
366    #[test]
367    fn bloom_false_positive_rate_is_zero_for_empty_filter() {
368        let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
369        let rate = bf.false_positive_rate(0);
370        assert!(rate.abs() < f64::EPSILON);
371    }
372}