use std::hash::{Hash, Hasher};
use twox_hash::XxHash64;
const CACHE_LINE_BYTES: usize = 64;
const CACHE_LINE_BITS: usize = CACHE_LINE_BYTES * 8; const WORDS_PER_BLOCK: usize = CACHE_LINE_BYTES / 8;
pub struct BlockedBloomFilter {
blocks: Vec<[u64; WORDS_PER_BLOCK]>,
num_blocks: usize,
num_hashes: usize,
count: usize,
}
impl BlockedBloomFilter {
#[must_use]
pub fn new(expected_elements: usize, false_positive_rate: f64) -> Self {
let num_bits = (-(expected_elements as f64) * false_positive_rate.ln()
/ (2.0_f64.ln().powi(2)))
.ceil() as usize;
let num_hashes =
((num_bits as f64 / expected_elements as f64) * 2.0_f64.ln()).ceil() as usize;
let num_blocks = num_bits.div_ceil(CACHE_LINE_BITS);
Self {
blocks: vec![[0u64; WORDS_PER_BLOCK]; num_blocks],
num_blocks,
num_hashes: num_hashes.clamp(1, CACHE_LINE_BITS), count: 0,
}
}
#[inline]
pub fn insert<T: Hash + ?Sized>(&mut self, item: &T) {
let (block_index, block_hash) = self.hash_to_block(item);
for i in 0..self.num_hashes {
let bit_index = Self::nth_bit_in_block(block_hash, i);
let word_index = bit_index / 64;
let bit_offset = bit_index % 64;
self.blocks[block_index][word_index] |= 1u64 << bit_offset;
}
self.count += 1;
}
#[inline]
pub fn contains<T: Hash + ?Sized>(&self, item: &T) -> bool {
let (block_index, block_hash) = self.hash_to_block(item);
for i in 0..self.num_hashes {
let bit_index = Self::nth_bit_in_block(block_hash, i);
let word_index = bit_index / 64;
let bit_offset = bit_index % 64;
if (self.blocks[block_index][word_index] & (1u64 << bit_offset)) == 0 {
return false; }
}
true }
#[must_use]
pub const fn len(&self) -> usize {
self.count
}
#[must_use]
pub const fn is_empty(&self) -> bool {
self.count == 0
}
#[must_use]
pub const fn size_bytes(&self) -> usize {
self.blocks.len() * CACHE_LINE_BYTES + std::mem::size_of::<Self>()
}
#[must_use]
pub fn false_positive_rate(&self) -> f64 {
if self.count == 0 {
return 0.0;
}
let k = self.num_hashes as f64;
let n = self.count as f64;
let m = (self.num_blocks * CACHE_LINE_BITS) as f64;
(1.0 - (-k * n / m).exp()).powf(k)
}
#[must_use]
pub fn to_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
bytes.extend_from_slice(&(self.num_blocks as u64).to_le_bytes());
bytes.extend_from_slice(&(self.num_hashes as u32).to_le_bytes());
bytes.extend_from_slice(&(self.count as u64).to_le_bytes());
for block in &self.blocks {
for &word in block {
bytes.extend_from_slice(&word.to_le_bytes());
}
}
bytes
}
#[must_use]
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
if bytes.len() < 20 {
return None; }
let num_blocks = u64::from_le_bytes(bytes[0..8].try_into().ok()?) as usize;
let num_hashes = u32::from_le_bytes(bytes[8..12].try_into().ok()?) as usize;
let count = u64::from_le_bytes(bytes[12..20].try_into().ok()?) as usize;
let expected_len = 20 + num_blocks * CACHE_LINE_BYTES;
if bytes.len() != expected_len {
return None;
}
let mut blocks = Vec::with_capacity(num_blocks);
for block_idx in 0..num_blocks {
let mut block = [0u64; WORDS_PER_BLOCK];
for (word_idx, word) in block.iter_mut().enumerate().take(WORDS_PER_BLOCK) {
let start = 20 + block_idx * CACHE_LINE_BYTES + word_idx * 8;
*word = u64::from_le_bytes(bytes[start..start + 8].try_into().ok()?);
}
blocks.push(block);
}
Some(Self {
blocks,
num_blocks,
num_hashes,
count,
})
}
fn hash_to_block<T: Hash + ?Sized>(&self, item: &T) -> (usize, u64) {
let h1 = {
let mut hasher = XxHash64::with_seed(0);
item.hash(&mut hasher);
hasher.finish()
};
let h2 = {
let mut hasher = XxHash64::with_seed(1);
item.hash(&mut hasher);
hasher.finish()
};
let block_index = (h1 % self.num_blocks as u64) as usize;
(block_index, h2)
}
const fn nth_bit_in_block(block_hash: u64, n: usize) -> usize {
let h = block_hash.wrapping_add((n as u64).wrapping_mul(block_hash >> 32));
(h % CACHE_LINE_BITS as u64) as usize
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_blocked_basic() {
let mut bloom = BlockedBloomFilter::new(100, 0.01);
bloom.insert(&"hello");
bloom.insert(&"world");
assert!(bloom.contains(&"hello"));
assert!(bloom.contains(&"world"));
assert!(!bloom.contains(&"foo"));
}
#[test]
fn test_blocked_serialization() {
let mut bloom = BlockedBloomFilter::new(100, 0.01);
bloom.insert(&"test1");
bloom.insert(&"test2");
let bytes = bloom.to_bytes();
let bloom2 = BlockedBloomFilter::from_bytes(&bytes).unwrap();
assert!(bloom2.contains(&"test1"));
assert!(bloom2.contains(&"test2"));
assert!(!bloom2.contains(&"test3"));
}
#[test]
fn test_blocked_cache_line_alignment() {
let bloom = BlockedBloomFilter::new(10000, 0.01);
assert_eq!(std::mem::size_of_val(&bloom.blocks[0]), CACHE_LINE_BYTES);
let (_block_idx, block_hash) = bloom.hash_to_block(&"test");
for i in 0..bloom.num_hashes {
let bit_idx = BlockedBloomFilter::nth_bit_in_block(block_hash, i);
assert!(
bit_idx < CACHE_LINE_BITS,
"Bit {} exceeds block boundary",
bit_idx
);
}
}
#[test]
fn test_blocked_space_efficiency() {
let bloom = BlockedBloomFilter::new(10000, 0.01);
let bytes_per_element = bloom.size_bytes() as f64 / 10000.0;
assert!(
bytes_per_element < 3.0,
"Space efficiency check: {} bytes/elem",
bytes_per_element
);
}
#[test]
fn test_blocked_false_positive_rate() {
let mut bloom = BlockedBloomFilter::new(1000, 0.01);
for i in 0..1000 {
bloom.insert(&format!("key{}", i));
}
let mut false_positives = 0;
for i in 1000..11000 {
if bloom.contains(&format!("key{}", i)) {
false_positives += 1;
}
}
let observed_fpr = false_positives as f64 / 10000.0;
assert!(
observed_fpr < 0.03,
"FPR too high: {:.4} (expected < 0.03)",
observed_fpr
);
}
}