Skip to main content

reddb_server/storage/primitives/
split_block_bloom.rs

1//! SplitBlockBloomFilter — cache-line-aligned, SIMD-friendly bloom filter.
2//!
3//! Direct port of MongoDB's `SplitBlockBloomFilter` from
4//! `src/mongo/db/exec/sbe/util/bloom_filter.h`.
5//!
6//! # Design
7//!
8//! Each block is 32 bytes (8 × u32 = 256 bits), aligned to a cache line.
9//! Insert/probe uses 8 independent salt multiplications, one bit per word.
10//! Block index uses power-of-2 masking (fast modulo via bitwise AND).
11//!
12//! For k=8 bits per element: ~10 bits needed per element for ~1% FPR.
13//! At n=1000: 8 blocks (256 bytes). At n=10_000: 128 blocks (4 KB).
14//!
15//! # Usage in `CompiledEntityFilter`
16//!
17//! For `Filter::In` with >IN_BLOOM_THRESHOLD values, the compiled filter
18//! builds a `SplitBlockBloom` at compile time. At evaluate time:
19//! 1. Hash field value to u32 (fast, no allocation)
20//! 2. Bloom probe — if **false**, skip HashSet (definite miss, O(1))
21//! 3. HashSet probe — exact membership check (only ~1% FPR false positives reach here)
22//!
23//! Benefit: for rows where the field exists but doesn't match any IN value,
24//! the bloom eliminates the HashSet::contains call ~99% of the time.
25
26const SALTS: [u32; 8] = [
27    0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31,
28];
29
30/// One 32-byte cache-line-aligned block: 8 × u32 words.
31#[repr(align(32))]
32#[derive(Clone, Default)]
33pub struct Block {
34    words: [u32; 8],
35}
36
37impl std::fmt::Debug for Block {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        write!(f, "Block({:08x?})", &self.words)
40    }
41}
42
43/// Split-block bloom filter. Build once at compile time, probe at query time.
44/// Zero-allocation probe path.
45#[derive(Debug)]
46pub struct SplitBlockBloom {
47    blocks: Vec<Block>,
48    /// `num_blocks - 1` — mask for fast modulo via bitwise AND.
49    mask: usize,
50}
51
52impl SplitBlockBloom {
53    /// Build a filter sized for `n` elements with ~1% FPR.
54    pub fn with_capacity(n: usize) -> Self {
55        // ~10 bits per element for 1% FPR with 8 salt bits per insert.
56        // Each block holds 256 bits. Round up to next power of two.
57        let bits_needed = (n * 10).max(256);
58        let blocks_needed = bits_needed.div_ceil(256);
59        let num_blocks = blocks_needed.next_power_of_two();
60        Self {
61            blocks: vec![Block::default(); num_blocks],
62            mask: num_blocks - 1,
63        }
64    }
65
66    /// Insert a u32 key into the filter.
67    #[inline]
68    pub fn insert(&mut self, key: u32) {
69        let block_idx = (key as usize) & self.mask;
70        let block = &mut self.blocks[block_idx];
71        for (i, &salt) in SALTS.iter().enumerate() {
72            let bit = key.wrapping_mul(salt) >> 27; // 5-bit position 0..31
73            block.words[i] |= 1u32 << bit;
74        }
75    }
76
77    /// Return `true` if `key` **may** be in the set (false positives possible).
78    /// Return `false` if `key` is **definitely absent** (no false negatives).
79    #[inline]
80    pub fn probe(&self, key: u32) -> bool {
81        let block_idx = (key as usize) & self.mask;
82        let block = &self.blocks[block_idx];
83        for (i, &salt) in SALTS.iter().enumerate() {
84            let bit = key.wrapping_mul(salt) >> 27;
85            if block.words[i] & (1u32 << bit) == 0 {
86                return false;
87            }
88        }
89        true
90    }
91
92    /// Number of blocks allocated (each block = 32 bytes).
93    pub fn num_blocks(&self) -> usize {
94        self.blocks.len()
95    }
96}
97
98/// Hash a `Value` to a u32 for bloom filter use.
99/// Uses the standard Hash impl (which hashes discriminant + content).
100/// Folds the 64-bit DefaultHasher output to 32 bits via XOR.
101pub fn hash_value_u32(v: &crate::storage::schema::Value) -> u32 {
102    use std::hash::{Hash, Hasher};
103    let mut h = std::collections::hash_map::DefaultHasher::new();
104    v.hash(&mut h);
105    let bits = h.finish();
106    (bits ^ (bits >> 32)) as u32
107}
108
109#[cfg(test)]
110mod tests {
111    use super::*;
112
113    #[test]
114    fn test_insert_then_probe() {
115        let mut bloom = SplitBlockBloom::with_capacity(100);
116        for i in 0u32..100 {
117            bloom.insert(i);
118        }
119        for i in 0u32..100 {
120            assert!(bloom.probe(i), "false negative for key {i}");
121        }
122    }
123
124    #[test]
125    fn test_absent_key_may_return_false() {
126        let mut bloom = SplitBlockBloom::with_capacity(1000);
127        for i in 0u32..1000 {
128            bloom.insert(i * 2); // insert even numbers
129        }
130        // odd numbers were never inserted — some may be false positives, but
131        // most should be absent. We just verify there are NO false negatives.
132        for i in 0u32..1000 {
133            assert!(bloom.probe(i * 2), "false negative for key {}", i * 2);
134        }
135    }
136
137    #[test]
138    fn test_false_positive_rate_approximately_one_percent() {
139        const N: usize = 10_000;
140        let mut bloom = SplitBlockBloom::with_capacity(N);
141        for i in 0u32..N as u32 {
142            bloom.insert(i);
143        }
144        let mut fp = 0usize;
145        let probes = 10_000usize;
146        for i in N as u32..(N as u32 + probes as u32) {
147            if bloom.probe(i) {
148                fp += 1;
149            }
150        }
151        let fpr = fp as f64 / probes as f64;
152        // Allow up to 5% FPR — the theoretical ~1% varies with data patterns.
153        assert!(fpr < 0.05, "FPR too high: {fpr:.3}");
154    }
155}