use serde::{Deserialize, Serialize};
const MIN_BITS: u64 = 64;
const MIN_HASHES: u8 = 1;
const MAX_HASHES: u8 = 32;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BloomFilter {
bits: Vec<u64>,
hashes: u8,
n_bits: u64,
}
impl BloomFilter {
#[must_use]
pub fn with_size_and_fp_rate(n: usize, fp: f64) -> Self {
let fp = if fp.is_nan() || fp <= 0.0 {
0.000_001
} else if fp >= 1.0 {
0.999_999
} else {
fp
};
let (n_bits, hashes) = compute_params(n, fp);
let chunk_count = (n_bits / 64) as usize;
Self {
bits: vec![0_u64; chunk_count],
hashes,
n_bits,
}
}
#[must_use]
pub fn n_bits(&self) -> u64 {
self.n_bits
}
#[must_use]
pub fn hash_count(&self) -> u8 {
self.hashes
}
pub fn insert(&mut self, key: &[u8]) {
for idx in self.indices(key) {
self.set_bit(idx);
}
}
#[must_use]
pub fn contains(&self, key: &[u8]) -> bool {
for idx in self.indices(key) {
if !self.test_bit(idx) {
return false;
}
}
true
}
#[must_use]
pub fn false_positive_rate(&self, n_inserted: usize) -> f64 {
if n_inserted == 0 || self.n_bits == 0 {
return 0.0;
}
let k = f64::from(self.hashes);
let n = usize_to_f64(n_inserted);
let m = u64_to_f64(self.n_bits);
let occupancy = 1.0 - (-k * n / m).exp();
occupancy.powf(k)
}
fn indices(&self, key: &[u8]) -> impl Iterator<Item = u64> {
let h = blake3::hash(key);
let bytes = h.as_bytes();
let mut h1_buf = [0_u8; 8];
let mut h2_buf = [0_u8; 8];
h1_buf.copy_from_slice(&bytes[0..8]);
h2_buf.copy_from_slice(&bytes[8..16]);
let h1 = u64::from_le_bytes(h1_buf);
let h2 = u64::from_le_bytes(h2_buf);
let n_bits = self.n_bits;
let k = self.hashes;
(0..u64::from(k)).map(move |i| h1.wrapping_add(i.wrapping_mul(h2)) % n_bits)
}
fn set_bit(&mut self, idx: u64) {
let chunk = u64_to_usize(idx / 64);
let off = idx % 64;
self.bits[chunk] |= 1_u64 << off;
}
fn test_bit(&self, idx: u64) -> bool {
let chunk = u64_to_usize(idx / 64);
let off = idx % 64;
(self.bits[chunk] >> off) & 1 == 1
}
}
fn compute_params(n: usize, fp: f64) -> (u64, u8) {
let n_f = usize_to_f64(n);
let raw_m = if n == 0 {
u64_to_f64(MIN_BITS)
} else {
-n_f * fp.ln() / (std::f64::consts::LN_2 * std::f64::consts::LN_2)
};
let m_rounded = raw_m.ceil().max(u64_to_f64(MIN_BITS));
let m_as_u64 = f64_to_u64_saturating(m_rounded);
let m_u64_chunks = m_as_u64.div_ceil(64).max(1);
let n_bits = m_u64_chunks * 64;
let k_raw = if n == 0 {
f64::from(MIN_HASHES)
} else {
(u64_to_f64(n_bits) / n_f) * std::f64::consts::LN_2
};
let k_rounded = k_raw.ceil().max(f64::from(MIN_HASHES));
let hashes_u32 = f64_to_u32_saturating(k_rounded).min(u32::from(MAX_HASHES));
let hashes = u8::try_from(hashes_u32)
.unwrap_or(MAX_HASHES)
.max(MIN_HASHES);
(n_bits, hashes)
}
#[allow(
clippy::cast_precision_loss,
reason = "bloom dimension formula: u64 -> f64 widens by design; values are bounded by the configured bit count and have far fewer than 2^53 significant bits in practice."
)]
fn u64_to_f64(x: u64) -> f64 {
x as f64
}
#[allow(
clippy::cast_precision_loss,
reason = "bloom dimension formula: usize -> f64 may round for n above 2^53; that is the same precision loss the reference Mitzenmacher derivation accepts because the formula uses ln(fp), which dominates the rounding budget."
)]
fn usize_to_f64(x: usize) -> f64 {
x as f64
}
#[allow(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "bloom dimension formula: f64 -> u64 with explicit guards. Caller has already ceiled the input and the formula's outputs are positive; the saturating cast pins the rare overflow case to u64::MAX."
)]
fn f64_to_u64_saturating(x: f64) -> u64 {
if !x.is_finite() || x <= 0.0 {
return 0;
}
if x >= u64_to_f64(u64::MAX) {
return u64::MAX;
}
x as u64
}
#[allow(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "bloom dimension formula: f64 -> u32 with explicit guards. The hash count is bounded by MAX_HASHES (32) so the cast is well within u32 range; the saturation arm guards against pathological inputs."
)]
fn f64_to_u32_saturating(x: f64) -> u32 {
if !x.is_finite() || x <= 0.0 {
return 0;
}
if x >= f64::from(u32::MAX) {
return u32::MAX;
}
x as u32
}
#[allow(
clippy::cast_possible_truncation,
reason = "indexing into a Vec<u64> by chunk: idx is already bounded by self.n_bits, which was derived from a Vec capacity (usize-bounded) at construction time."
)]
fn u64_to_usize(x: u64) -> usize {
x as usize
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn bloom_no_false_negatives_on_inserted_keys() {
let mut bf = BloomFilter::with_size_and_fp_rate(1000, 0.01);
let keys: Vec<Vec<u8>> = (0..200_u32)
.map(|i| format!("key-{i}").into_bytes())
.collect();
for k in &keys {
bf.insert(k);
}
for k in &keys {
assert!(bf.contains(k), "false negative on inserted key {k:?}");
}
}
#[test]
fn bloom_false_positive_rate_under_5pct_at_design_load() {
let n: u32 = 10_000;
let fp_target = 0.01;
let mut bf = BloomFilter::with_size_and_fp_rate(n as usize, fp_target);
for i in 0..n {
let k = format!("inserted-{i}");
bf.insert(k.as_bytes());
}
let probes = 5_000_u32;
let mut fps: u32 = 0;
for i in 0..probes {
let k = format!("probe-{i}-not-in-filter");
if bf.contains(k.as_bytes()) {
fps += 1;
}
}
let observed = f64::from(fps) / f64::from(probes);
assert!(
observed < 0.05,
"observed fp rate {observed} exceeded 5%; \
theoretical={}",
bf.false_positive_rate(n as usize)
);
}
#[test]
fn bloom_zero_keys_returns_false_for_anything() {
let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
assert!(!bf.contains(b"nope"));
assert!(!bf.contains(b""));
assert!(!bf.contains(b"another"));
}
#[test]
fn bloom_with_zero_n_does_not_panic() {
let mut bf = BloomFilter::with_size_and_fp_rate(0, 0.01);
assert_eq!(bf.n_bits(), MIN_BITS);
bf.insert(b"x");
assert!(bf.contains(b"x"));
}
#[test]
fn bloom_with_extreme_fp_rates_clamps() {
let bf_low = BloomFilter::with_size_and_fp_rate(100, 0.0);
assert!(bf_low.n_bits() >= MIN_BITS);
let bf_high = BloomFilter::with_size_and_fp_rate(100, 1.0);
assert!(bf_high.n_bits() >= MIN_BITS);
let bf_nan = BloomFilter::with_size_and_fp_rate(100, f64::NAN);
assert!(bf_nan.n_bits() >= MIN_BITS);
}
#[test]
fn bloom_hash_count_is_capped() {
let bf = BloomFilter::with_size_and_fp_rate(10, 1e-30);
assert!(bf.hash_count() <= MAX_HASHES);
}
#[test]
fn bloom_false_positive_rate_is_zero_for_empty_filter() {
let bf = BloomFilter::with_size_and_fp_rate(100, 0.01);
let rate = bf.false_positive_rate(0);
assert!(rate.abs() < f64::EPSILON);
}
}