const K: usize = 4;
fn fnv64a(data: &[u8], seed: u64) -> u64 {
const PRIME: u64 = 0x00000100000001B3;
let mut h = 0xcbf29ce484222325u64 ^ seed;
for &b in data {
h ^= b as u64;
h = h.wrapping_mul(PRIME);
}
h
}
#[derive(Clone)]
pub struct BloomFilter {
bits: Vec<u64>,
num_bits: usize,
}
impl BloomFilter {
pub fn with_capacity(capacity: usize, fpr: f64) -> Self {
let n = capacity.max(1) as f64;
let bits_f = -n * fpr.ln() / std::f64::consts::LN_2.powi(2);
let num_bits = (bits_f.ceil() as usize).max(64);
let num_bits = (num_bits + 63) & !63;
Self {
bits: vec![0u64; num_bits / 64],
num_bits,
}
}
fn probes(&self, term: &[u8]) -> [usize; K] {
let h1 = fnv64a(term, 0);
let h2 = fnv64a(term, 0xcbf29ce484222325u64);
let m = self.num_bits as u64;
let mut out = [0usize; K];
for (i, slot) in out.iter_mut().enumerate().take(K) {
*slot = (h1.wrapping_add((i as u64).wrapping_mul(h2)) % m) as usize;
}
out
}
pub fn insert(&mut self, term: &str) {
for pos in self.probes(term.as_bytes()) {
self.bits[pos / 64] |= 1u64 << (pos % 64);
}
}
pub fn may_contain(&self, term: &str) -> bool {
for pos in self.probes(term.as_bytes()) {
if self.bits[pos / 64] & (1u64 << (pos % 64)) == 0 {
return false;
}
}
true
}
pub fn to_bytes(&self) -> Vec<u8> {
let mut out = Vec::with_capacity(8 + self.bits.len() * 8);
out.extend_from_slice(&(self.num_bits as u64).to_le_bytes());
for &w in &self.bits {
out.extend_from_slice(&w.to_le_bytes());
}
out
}
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
if bytes.len() < 8 {
return None;
}
let num_bits = u64::from_le_bytes(bytes[0..8].try_into().ok()?) as usize;
if num_bits == 0 || !num_bits.is_multiple_of(64) {
return None;
}
let word_count = num_bits / 64;
if bytes.len() < 8 + word_count * 8 {
return None;
}
let bits: Vec<u64> = (0..word_count)
.map(|i| u64::from_le_bytes(bytes[8 + i * 8..16 + i * 8].try_into().unwrap()))
.collect();
Some(Self { bits, num_bits })
}
pub fn num_bits(&self) -> usize {
self.num_bits
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn inserted_terms_always_found() {
let mut bf = BloomFilter::with_capacity(100, 0.01);
let terms = ["rust", "iceberg", "puffin", "vector", "bloom"];
for t in &terms {
bf.insert(t);
}
for t in &terms {
assert!(
bf.may_contain(t),
"term '{t}' should be found after insertion"
);
}
}
#[test]
fn absent_terms_mostly_absent() {
let mut bf = BloomFilter::with_capacity(1000, 0.01);
for i in 0..500u32 {
bf.insert(&format!("term_{i}"));
}
let fp: usize = (500u32..600)
.filter(|i| bf.may_contain(&format!("absent_{i}")))
.count();
assert!(fp <= 5, "too many false positives: {fp}/100");
}
#[test]
fn roundtrip_serialization() {
let mut bf = BloomFilter::with_capacity(50, 0.01);
bf.insert("hello");
bf.insert("world");
let bytes = bf.to_bytes();
let restored = BloomFilter::from_bytes(&bytes).expect("deserialization should succeed");
assert!(restored.may_contain("hello"));
assert!(restored.may_contain("world"));
assert_eq!(restored.num_bits(), bf.num_bits());
}
#[test]
fn from_bytes_returns_none_on_short_input() {
assert!(BloomFilter::from_bytes(&[]).is_none());
assert!(BloomFilter::from_bytes(&[0u8; 7]).is_none());
}
#[test]
fn definitely_absent_term_returns_false() {
let bf = BloomFilter::with_capacity(64, 0.001);
assert!(
!bf.may_contain("anything"),
"empty filter must return false for all terms"
);
}
}