reddb_server/storage/primitives/split_block_bloom.rs
1//! SplitBlockBloomFilter — cache-line-aligned, SIMD-friendly bloom filter.
2//!
3//! Direct port of MongoDB's `SplitBlockBloomFilter` from
4//! `src/mongo/db/exec/sbe/util/bloom_filter.h`.
5//!
6//! # Design
7//!
8//! Each block is 32 bytes (8 × u32 = 256 bits), aligned to a cache line.
9//! Insert/probe uses 8 independent salt multiplications, one bit per word.
10//! Block index uses power-of-2 masking (fast modulo via bitwise AND).
11//!
12//! For k=8 bits per element: ~10 bits needed per element for ~1% FPR.
13//! At n=1000: 8 blocks (256 bytes). At n=10_000: 128 blocks (4 KB).
14//!
15//! # Usage in `CompiledEntityFilter`
16//!
17//! For `Filter::In` with >IN_BLOOM_THRESHOLD values, the compiled filter
18//! builds a `SplitBlockBloom` at compile time. At evaluate time:
19//! 1. Hash field value to u32 (fast, no allocation)
20//! 2. Bloom probe — if **false**, skip HashSet (definite miss, O(1))
21//! 3. HashSet probe — exact membership check (only ~1% FPR false positives reach here)
22//!
23//! Benefit: for rows where the field exists but doesn't match any IN value,
24//! the bloom eliminates the HashSet::contains call ~99% of the time.
25
26const SALTS: [u32; 8] = [
27 0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31,
28];
29
30/// One 32-byte cache-line-aligned block: 8 × u32 words.
31#[repr(align(32))]
32#[derive(Clone, Default)]
33pub struct Block {
34 words: [u32; 8],
35}
36
37impl std::fmt::Debug for Block {
38 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39 write!(f, "Block({:08x?})", &self.words)
40 }
41}
42
43/// Split-block bloom filter. Build once at compile time, probe at query time.
44/// Zero-allocation probe path.
45#[derive(Debug)]
46pub struct SplitBlockBloom {
47 blocks: Vec<Block>,
48 /// `num_blocks - 1` — mask for fast modulo via bitwise AND.
49 mask: usize,
50}
51
52impl SplitBlockBloom {
53 /// Build a filter sized for `n` elements with ~1% FPR.
54 pub fn with_capacity(n: usize) -> Self {
55 // ~10 bits per element for 1% FPR with 8 salt bits per insert.
56 // Each block holds 256 bits. Round up to next power of two.
57 let bits_needed = (n * 10).max(256);
58 let blocks_needed = bits_needed.div_ceil(256);
59 let num_blocks = blocks_needed.next_power_of_two();
60 Self {
61 blocks: vec![Block::default(); num_blocks],
62 mask: num_blocks - 1,
63 }
64 }
65
66 /// Insert a u32 key into the filter.
67 #[inline]
68 pub fn insert(&mut self, key: u32) {
69 let block_idx = (key as usize) & self.mask;
70 let block = &mut self.blocks[block_idx];
71 for (i, &salt) in SALTS.iter().enumerate() {
72 let bit = key.wrapping_mul(salt) >> 27; // 5-bit position 0..31
73 block.words[i] |= 1u32 << bit;
74 }
75 }
76
77 /// Return `true` if `key` **may** be in the set (false positives possible).
78 /// Return `false` if `key` is **definitely absent** (no false negatives).
79 #[inline]
80 pub fn probe(&self, key: u32) -> bool {
81 let block_idx = (key as usize) & self.mask;
82 let block = &self.blocks[block_idx];
83 for (i, &salt) in SALTS.iter().enumerate() {
84 let bit = key.wrapping_mul(salt) >> 27;
85 if block.words[i] & (1u32 << bit) == 0 {
86 return false;
87 }
88 }
89 true
90 }
91
92 /// Number of blocks allocated (each block = 32 bytes).
93 pub fn num_blocks(&self) -> usize {
94 self.blocks.len()
95 }
96}
97
98/// Hash a `Value` to a u32 for bloom filter use.
99/// Uses the standard Hash impl (which hashes discriminant + content).
100/// Folds the 64-bit DefaultHasher output to 32 bits via XOR.
101pub fn hash_value_u32(v: &crate::storage::schema::Value) -> u32 {
102 use std::hash::{Hash, Hasher};
103 let mut h = std::collections::hash_map::DefaultHasher::new();
104 v.hash(&mut h);
105 let bits = h.finish();
106 (bits ^ (bits >> 32)) as u32
107}
108
109#[cfg(test)]
110mod tests {
111 use super::*;
112
113 #[test]
114 fn test_insert_then_probe() {
115 let mut bloom = SplitBlockBloom::with_capacity(100);
116 for i in 0u32..100 {
117 bloom.insert(i);
118 }
119 for i in 0u32..100 {
120 assert!(bloom.probe(i), "false negative for key {i}");
121 }
122 }
123
124 #[test]
125 fn test_absent_key_may_return_false() {
126 let mut bloom = SplitBlockBloom::with_capacity(1000);
127 for i in 0u32..1000 {
128 bloom.insert(i * 2); // insert even numbers
129 }
130 // odd numbers were never inserted — some may be false positives, but
131 // most should be absent. We just verify there are NO false negatives.
132 for i in 0u32..1000 {
133 assert!(bloom.probe(i * 2), "false negative for key {}", i * 2);
134 }
135 }
136
137 #[test]
138 fn test_false_positive_rate_approximately_one_percent() {
139 const N: usize = 10_000;
140 let mut bloom = SplitBlockBloom::with_capacity(N);
141 for i in 0u32..N as u32 {
142 bloom.insert(i);
143 }
144 let mut fp = 0usize;
145 let probes = 10_000usize;
146 for i in N as u32..(N as u32 + probes as u32) {
147 if bloom.probe(i) {
148 fp += 1;
149 }
150 }
151 let fpr = fp as f64 / probes as f64;
152 // Allow up to 5% FPR — the theoretical ~1% varies with data patterns.
153 assert!(fpr < 0.05, "FPR too high: {fpr:.3}");
154 }
155}