use std::sync::Arc;
use wide::u64x4;
use xxhash_rust::xxh3::xxh3_64;
pub const BLOCK_BYTES: usize = 64;
pub const BLOCK_BITS: usize = BLOCK_BYTES * 8; const BLOCK_WORDS: usize = BLOCK_BYTES / 8; pub const K: usize = 4;
pub const DEFAULT_N_BLOCKS: usize = 1024;
pub const DEFAULT_BLOOM_BYTES: usize = DEFAULT_N_BLOCKS * BLOCK_BYTES;
const GOLDEN_RATIO_U64: u64 = 0x9E37_79B9_7F4A_7C15;
#[derive(Clone, Debug)]
pub struct Bloom {
words: Arc<[u64]>,
n_blocks_mask: u32,
}
impl Bloom {
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
if !bytes.len().is_multiple_of(BLOCK_BYTES) {
return None;
}
let n_blocks = bytes.len() / BLOCK_BYTES;
if n_blocks == 0 || !n_blocks.is_power_of_two() {
return None;
}
let mut words = vec![0u64; n_blocks * BLOCK_WORDS];
for (i, w) in words.iter_mut().enumerate() {
let off = i * 8;
let mut buf = [0u8; 8];
buf.copy_from_slice(&bytes[off..off + 8]);
*w = u64::from_le_bytes(buf);
}
Some(Self {
words: words.into(),
n_blocks_mask: (n_blocks - 1) as u32,
})
}
pub fn to_bytes(&self) -> Vec<u8> {
let mut out = Vec::with_capacity(self.words.len() * 8);
for w in self.words.iter() {
out.extend_from_slice(&w.to_le_bytes());
}
out
}
#[inline]
pub fn contains(&self, key: &[u8]) -> bool {
let h = xxh3_64(key);
let (block_idx, mask) = block_and_mask(h, self.n_blocks_mask);
let block_offset = block_idx * BLOCK_WORDS;
let block: &[u64; BLOCK_WORDS] = (&self.words[block_offset..block_offset + BLOCK_WORDS])
.try_into()
.expect("BLOCK_WORDS-sized slice");
contains_block(block, &mask)
}
pub fn n_blocks(&self) -> usize {
(self.n_blocks_mask as usize) + 1
}
pub fn len(&self) -> usize {
self.n_blocks() * BLOCK_BYTES
}
pub fn is_empty(&self) -> bool {
self.words.is_empty()
}
}
pub struct BloomBuilder {
words: Vec<u64>,
n_blocks_mask: u32,
}
impl BloomBuilder {
pub fn new() -> Self {
Self::with_n_blocks(DEFAULT_N_BLOCKS)
}
pub fn with_n_blocks(n_blocks: usize) -> Self {
debug_assert!(
n_blocks.is_power_of_two() && n_blocks >= 1,
"n_blocks must be a power of two ≥ 1; got {n_blocks}",
);
Self {
words: vec![0u64; n_blocks * BLOCK_WORDS],
n_blocks_mask: (n_blocks - 1) as u32,
}
}
pub fn insert(&mut self, key: &[u8]) {
let h = xxh3_64(key);
let (block_idx, mask) = block_and_mask(h, self.n_blocks_mask);
let block_offset = block_idx * BLOCK_WORDS;
let block = &mut self.words[block_offset..block_offset + BLOCK_WORDS];
for (b, m) in block.iter_mut().zip(mask.iter()) {
*b |= *m;
}
}
pub fn finish(self) -> Bloom {
Bloom {
words: self.words.into(),
n_blocks_mask: self.n_blocks_mask,
}
}
pub fn n_blocks(&self) -> usize {
(self.n_blocks_mask as usize) + 1
}
}
impl Default for BloomBuilder {
fn default() -> Self {
Self::new()
}
}
#[inline]
fn contains_block(block: &[u64; BLOCK_WORDS], mask: &[u64; BLOCK_WORDS]) -> bool {
let block_lo = u64x4::new([block[0], block[1], block[2], block[3]]);
let block_hi = u64x4::new([block[4], block[5], block[6], block[7]]);
let mask_lo = u64x4::new([mask[0], mask[1], mask[2], mask[3]]);
let mask_hi = u64x4::new([mask[4], mask[5], mask[6], mask[7]]);
let r_lo = !block_lo & mask_lo;
let r_hi = !block_hi & mask_hi;
let combined = r_lo | r_hi;
let parts = combined.to_array();
(parts[0] | parts[1] | parts[2] | parts[3]) == 0
}
#[inline]
fn block_and_mask(h: u64, n_blocks_mask: u32) -> (usize, [u64; BLOCK_WORDS]) {
let block_idx = ((h >> 32) as u32 & n_blocks_mask) as usize;
let h2 = h.wrapping_mul(GOLDEN_RATIO_U64);
let h_low = h2 as u32;
let h_high = (h2 >> 32) as u32;
let block_bits_mask = (BLOCK_BITS as u32) - 1;
let mut mask = [0u64; BLOCK_WORDS];
let p0 = (h_low & block_bits_mask) as usize;
let p1 = (h_low.wrapping_add(h_high) & block_bits_mask) as usize;
let p2 = (h_low.wrapping_add(h_high.wrapping_mul(2)) & block_bits_mask) as usize;
let p3 = (h_low.wrapping_add(h_high.wrapping_mul(3)) & block_bits_mask) as usize;
mask[p0 / 64] |= 1u64 << (p0 % 64);
mask[p1 / 64] |= 1u64 << (p1 % 64);
mask[p2 / 64] |= 1u64 << (p2 % 64);
mask[p3 / 64] |= 1u64 << (p3 % 64);
(block_idx, mask)
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, str::from_utf8};
use super::*;
fn build_with(keys: &[&[u8]]) -> Bloom {
let mut b = BloomBuilder::new();
for k in keys {
b.insert(k);
}
b.finish()
}
#[test]
fn empty_bloom_contains_nothing() {
let b = BloomBuilder::new().finish();
assert!(!b.contains(b"alpha"));
assert!(!b.contains(b"beta"));
assert!(!b.contains(b""));
}
#[test]
fn inserted_keys_are_definitely_present() {
let keys: &[&[u8]] = &[
b"alpha", b"beta", b"gamma", b"delta", b"epsilon", b"zeta", b"eta", b"theta",
];
let b = build_with(keys);
for k in keys {
assert!(
b.contains(k),
"inserted key {:?} must be reported present",
from_utf8(k).unwrap_or("<non-utf8>"),
);
}
}
#[test]
fn insert_is_idempotent() {
let mut b = BloomBuilder::new();
b.insert(b"hello");
let bytes_a = b.words.clone();
b.insert(b"hello");
let bytes_b = b.words.clone();
assert_eq!(bytes_a, bytes_b);
}
#[test]
fn fpr_is_within_target_band_at_100k_keys() {
let mut b = BloomBuilder::new();
let n_inserted = 100_000usize;
for i in 0..n_inserted {
b.insert(format!("term{i}").as_bytes());
}
let bloom = b.finish();
let mut false_positives = 0usize;
let n_probes = 100_000usize;
for i in 0..n_probes {
let probe = format!("absent_term_{i}");
if bloom.contains(probe.as_bytes()) {
false_positives += 1;
}
}
let fpr = false_positives as f64 / n_probes as f64;
assert!(
(0.03..=0.12).contains(&fpr),
"FPR {} outside expected band [0.03, 0.12]",
fpr,
);
}
#[test]
fn no_false_negatives_at_100k_keys() {
let mut b = BloomBuilder::new();
let n = 100_000usize;
for i in 0..n {
b.insert(format!("term{i}").as_bytes());
}
let bloom = b.finish();
for i in 0..n {
let key = format!("term{i}");
assert!(
bloom.contains(key.as_bytes()),
"false negative on inserted key {key}",
);
}
}
#[test]
fn default_size_is_64kib_1024_blocks() {
let b = BloomBuilder::new().finish();
assert_eq!(b.len(), DEFAULT_BLOOM_BYTES);
assert_eq!(b.n_blocks(), DEFAULT_N_BLOCKS);
}
#[test]
fn custom_size_round_trips_through_builder() {
for n_blocks in [1usize, 2, 4, 8, 256, 4096] {
let b = BloomBuilder::with_n_blocks(n_blocks).finish();
assert_eq!(b.n_blocks(), n_blocks);
assert_eq!(b.len(), n_blocks * BLOCK_BYTES);
}
}
#[test]
#[cfg(debug_assertions)]
#[should_panic(expected = "n_blocks must be a power of two")]
fn non_power_of_two_n_blocks_panics_in_debug() {
let _ = BloomBuilder::with_n_blocks(3);
}
#[test]
fn to_bytes_from_bytes_round_trip_preserves_membership() {
let keys: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
let b1 = build_with(keys);
let bytes = b1.to_bytes();
let b2 = Bloom::from_bytes(&bytes).expect("valid bytes");
assert_eq!(b2.n_blocks(), b1.n_blocks());
for k in keys {
assert!(b2.contains(k));
}
let probes: &[&[u8]] = &[b"never1", b"never2", b"never3", b"never4"];
let n_collisions = probes.iter().filter(|p| b2.contains(p)).count();
assert!(
n_collisions < probes.len(),
"all probes false-positive — bloom appears saturated",
);
}
#[test]
fn from_bytes_rejects_misaligned_buffer() {
assert!(Bloom::from_bytes(&[0u8; BLOCK_BYTES + 1]).is_none());
assert!(Bloom::from_bytes(&[0u8; BLOCK_BYTES * 3]).is_none());
assert!(Bloom::from_bytes(&[]).is_none());
}
#[test]
fn block_load_is_roughly_uniform() {
let n_blocks = DEFAULT_N_BLOCKS;
let mut load = vec![0usize; n_blocks];
for i in 0..100_000usize {
let h = xxh3_64(format!("term{i}").as_bytes());
let (block_idx, _) = block_and_mask(h, (n_blocks - 1) as u32);
load[block_idx] += 1;
}
let max = *load.iter().max().expect("non-empty");
let mean = load.iter().sum::<usize>() as f64 / n_blocks as f64;
assert!(
(max as f64) < 3.0 * mean,
"max-block-load {max} more than 3× mean {mean}",
);
}
#[test]
fn different_keys_rarely_collide_on_same_signature() {
let mut sigs: HashMap<(usize, [u64; BLOCK_WORDS]), Vec<String>> = HashMap::new();
for i in 0..1000usize {
let key = format!("a-{i}");
let h = xxh3_64(key.as_bytes());
let sig = block_and_mask(h, (DEFAULT_N_BLOCKS - 1) as u32);
sigs.entry(sig).or_default().push(key);
}
let collisions = sigs.values().filter(|v| v.len() > 1).count();
assert!(
collisions < 5,
"{collisions} signature collisions in 1000 keys"
);
}
#[test]
fn clone_shares_words_via_arc() {
let b1 = build_with(&[b"alpha", b"beta"]);
let b2 = b1.clone();
assert_eq!(b1.words.as_ptr(), b2.words.as_ptr());
}
#[test]
fn debug_format_doesnt_explode() {
let b = build_with(&[b"x"]);
let s = format!("{:?}", b);
assert!(s.contains("Bloom"));
}
#[test]
fn builder_and_bloom_geometry_is_consistent() {
const CUSTOM_BLOCKS: usize = 4;
let mut b = BloomBuilder::with_n_blocks(CUSTOM_BLOCKS);
assert_eq!(
b.n_blocks(),
CUSTOM_BLOCKS,
"builder reports its block count"
);
b.insert(b"alpha");
let bloom = b.finish();
assert_eq!(bloom.n_blocks(), CUSTOM_BLOCKS);
assert_eq!(bloom.len(), CUSTOM_BLOCKS * BLOCK_BYTES);
assert!(!bloom.is_empty(), "a sized bloom's word array is non-empty");
assert_eq!(
BloomBuilder::default().n_blocks(),
BloomBuilder::new().n_blocks()
);
}
}