Skip to main content

subms_bloom_filter/
lib.rs

1//! Minimal bloom filter - standalone, reusable, zero-dependency.
2//!
3//! Standard double-hashed bloom filter: FNV-1a 64-bit produces two 32-bit
4//! subhashes for the double-hashing trick. Sizing defaults to ~10 bits per
5//! key and k=7, which gives ~1% false-positive rate. Suitable as a building
6//! block for other cookbook samples (LSM tree SSTables, in particular).
7//!
8//! The on-disk layout is fixed and language-agnostic:
9//!
10//! ```text
11//! bit_count: u32 (big-endian)
12//! k:         u32 (big-endian)
13//! words:     u32 (big-endian) - number of u64 words
14//! bits:      (u64 big-endian) * words
15//! ```
16
17#[cfg(feature = "harness")]
18pub mod recipe;
19
20// Opt-in feature modules. Each is independent of the base filter and
21// gated by its own Cargo feature; `cargo add subms-bloom-filter` alone
22// keeps the base zero-dep + std-only shape.
23//
24// See README and the cookbook page for the per-feature p99 numbers,
25// memory cost, and composition guidance.
26#[cfg(any(feature = "counting", feature = "scalable", feature = "partitioned"))]
27pub mod features;
28
29#[cfg(feature = "counting")]
30pub use features::counting::CountingBloomFilter;
31#[cfg(feature = "partitioned")]
32pub use features::partitioned::PartitionedBloomFilter;
33#[cfg(feature = "scalable")]
34pub use features::scalable::ScalableBloomFilter;
35
36use std::io::{self, Write};
37
38pub(crate) const FNV_OFFSET: u64 = 0xcbf29ce484222325;
39pub(crate) const FNV_PRIME: u64 = 0x100000001b3;
40
41pub struct BloomFilter {
42    bit_count: u32,
43    k: u32,
44    bits: Vec<u64>,
45}
46
47impl BloomFilter {
48    /// Build an empty filter sized for `expected_entries` at ~1% FPR
49    /// (10 bits/key, k=7). The 64-bit floor matters when expected_entries is small.
50    pub fn new(expected_entries: usize) -> Self {
51        let bit_count = expected_entries.saturating_mul(10).max(64) as u32;
52        let words = (bit_count as usize).div_ceil(64);
53        Self {
54            bit_count,
55            k: 7,
56            bits: vec![0u64; words],
57        }
58    }
59
60    pub fn add(&mut self, key: &str) {
61        let h = fnv1a64(key);
62        let h1 = h as u32;
63        let h2 = ((h >> 32) as u32) | 1;
64        for i in 0..self.k {
65            let idx = h1.wrapping_add(i.wrapping_mul(h2)) % self.bit_count;
66            self.bits[(idx / 64) as usize] |= 1u64 << (idx % 64);
67        }
68    }
69
70    pub fn might_contain(&self, key: &str) -> bool {
71        let h = fnv1a64(key);
72        let h1 = h as u32;
73        let h2 = ((h >> 32) as u32) | 1;
74        for i in 0..self.k {
75            let idx = h1.wrapping_add(i.wrapping_mul(h2)) % self.bit_count;
76            if self.bits[(idx / 64) as usize] & (1u64 << (idx % 64)) == 0 {
77                return false;
78            }
79        }
80        true
81    }
82
83    pub fn bit_count(&self) -> u32 {
84        self.bit_count
85    }
86
87    pub fn k(&self) -> u32 {
88        self.k
89    }
90
91    pub fn write_to<W: Write>(&self, out: &mut W) -> io::Result<()> {
92        out.write_all(&self.bit_count.to_be_bytes())?;
93        out.write_all(&self.k.to_be_bytes())?;
94        out.write_all(&(self.bits.len() as u32).to_be_bytes())?;
95        for w in &self.bits {
96            out.write_all(&w.to_be_bytes())?;
97        }
98        Ok(())
99    }
100
101    /// Parse a serialised bloom filter from `buf`. Errors if the buffer is
102    /// shorter than the header or truncated mid-bits.
103    pub fn parse(buf: &[u8]) -> io::Result<Self> {
104        if buf.len() < 12 {
105            return Err(io::Error::new(
106                io::ErrorKind::InvalidData,
107                "bloom section too short",
108            ));
109        }
110        let bit_count = u32::from_be_bytes(buf[0..4].try_into().unwrap());
111        let k = u32::from_be_bytes(buf[4..8].try_into().unwrap());
112        let words = u32::from_be_bytes(buf[8..12].try_into().unwrap()) as usize;
113        if buf.len() < 12 + words * 8 {
114            return Err(io::Error::new(
115                io::ErrorKind::InvalidData,
116                "bloom section truncated",
117            ));
118        }
119        let mut bits = Vec::with_capacity(words);
120        for i in 0..words {
121            let off = 12 + i * 8;
122            bits.push(u64::from_be_bytes(buf[off..off + 8].try_into().unwrap()));
123        }
124        Ok(Self { bit_count, k, bits })
125    }
126}
127
128pub(crate) fn fnv1a64(key: &str) -> u64 {
129    let mut h = FNV_OFFSET;
130    for &b in key.as_bytes() {
131        h ^= b as u64;
132        h = h.wrapping_mul(FNV_PRIME);
133    }
134    h
135}