const SALTS: [u32; 8] = [
0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31,
];
#[repr(align(32))]
#[derive(Clone, Default, PartialEq, Eq)]
pub struct Block {
words: [u32; 8],
}
impl std::fmt::Debug for Block {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Block({:08x?})", &self.words)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitBlockBloom {
blocks: Vec<Block>,
mask: usize,
}
impl SplitBlockBloom {
pub fn with_capacity(n: usize) -> Self {
let bits_needed = (n * 10).max(256);
let blocks_needed = bits_needed.div_ceil(256);
let num_blocks = blocks_needed.next_power_of_two();
Self {
blocks: vec![Block::default(); num_blocks],
mask: num_blocks - 1,
}
}
#[inline]
pub fn insert(&mut self, key: u32) {
let block_idx = (key as usize) & self.mask;
let block = &mut self.blocks[block_idx];
for (i, &salt) in SALTS.iter().enumerate() {
let bit = key.wrapping_mul(salt) >> 27; block.words[i] |= 1u32 << bit;
}
}
#[inline]
pub fn probe(&self, key: u32) -> bool {
let block_idx = (key as usize) & self.mask;
let block = &self.blocks[block_idx];
for (i, &salt) in SALTS.iter().enumerate() {
let bit = key.wrapping_mul(salt) >> 27;
if block.words[i] & (1u32 << bit) == 0 {
return false;
}
}
true
}
pub fn num_blocks(&self) -> usize {
self.blocks.len()
}
pub fn to_bytes(&self) -> Vec<u8> {
let mut out = Vec::with_capacity(4 + self.blocks.len() * 32);
out.extend_from_slice(&(self.blocks.len() as u32).to_le_bytes());
for block in &self.blocks {
for &w in &block.words {
out.extend_from_slice(&w.to_le_bytes());
}
}
out
}
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
if bytes.len() < 4 {
return None;
}
let num_blocks = u32::from_le_bytes(bytes[0..4].try_into().ok()?) as usize;
if num_blocks == 0 || !num_blocks.is_power_of_two() {
return None;
}
if bytes.len() < 4 + num_blocks * 32 {
return None;
}
let mut blocks = Vec::with_capacity(num_blocks);
let mut cur = 4;
for _ in 0..num_blocks {
let mut words = [0u32; 8];
for w in &mut words {
*w = u32::from_le_bytes(bytes[cur..cur + 4].try_into().ok()?);
cur += 4;
}
blocks.push(Block { words });
}
Some(Self {
blocks,
mask: num_blocks - 1,
})
}
}
pub fn hash_bytes_u32(bytes: &[u8]) -> u32 {
use std::hash::{Hash, Hasher};
let mut h = std::collections::hash_map::DefaultHasher::new();
bytes.hash(&mut h);
let bits = h.finish();
(bits ^ (bits >> 32)) as u32
}
pub fn hash_value_u32(v: &crate::storage::schema::Value) -> u32 {
use std::hash::{Hash, Hasher};
let mut h = std::collections::hash_map::DefaultHasher::new();
v.hash(&mut h);
let bits = h.finish();
(bits ^ (bits >> 32)) as u32
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_insert_then_probe() {
let mut bloom = SplitBlockBloom::with_capacity(100);
for i in 0u32..100 {
bloom.insert(i);
}
for i in 0u32..100 {
assert!(bloom.probe(i), "false negative for key {i}");
}
}
#[test]
fn test_absent_key_may_return_false() {
let mut bloom = SplitBlockBloom::with_capacity(1000);
for i in 0u32..1000 {
bloom.insert(i * 2); }
for i in 0u32..1000 {
assert!(bloom.probe(i * 2), "false negative for key {}", i * 2);
}
}
#[test]
fn to_bytes_from_bytes_round_trips_and_keeps_no_false_negatives() {
let mut bloom = SplitBlockBloom::with_capacity(500);
for i in 0u32..500 {
bloom.insert(i.wrapping_mul(2_654_435_761));
}
let blob = bloom.to_bytes();
let restored = SplitBlockBloom::from_bytes(&blob).expect("round-trips");
assert_eq!(restored, bloom);
for i in 0u32..500 {
assert!(restored.probe(i.wrapping_mul(2_654_435_761)));
}
}
#[test]
fn from_bytes_rejects_truncated_or_non_power_of_two() {
assert!(SplitBlockBloom::from_bytes(&[]).is_none());
assert!(SplitBlockBloom::from_bytes(&[1, 2, 3]).is_none());
assert!(SplitBlockBloom::from_bytes(&3u32.to_le_bytes()).is_none());
assert!(SplitBlockBloom::from_bytes(&2u32.to_le_bytes()).is_none());
}
#[test]
fn hash_bytes_u32_is_stable_across_calls() {
let a = hash_bytes_u32(&7u64.to_le_bytes());
let b = hash_bytes_u32(&7u64.to_le_bytes());
assert_eq!(a, b);
}
#[test]
fn test_false_positive_rate_approximately_one_percent() {
const N: usize = 10_000;
let mut bloom = SplitBlockBloom::with_capacity(N);
for i in 0u32..N as u32 {
bloom.insert(i);
}
let mut fp = 0usize;
let probes = 10_000usize;
for i in N as u32..(N as u32 + probes as u32) {
if bloom.probe(i) {
fp += 1;
}
}
let fpr = fp as f64 / probes as f64;
assert!(fpr < 0.05, "FPR too high: {fpr:.3}");
}
}