1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
//! Bloom filter for SST files.
//!
//! Uses the "double hashing" technique from Kirsch & Mitzenmacher
//! to generate k hash functions from two base hashes.
/// A bloom filter builder + checker.
pub struct BloomFilter {
/// Bits per key.
bits_per_key: u32,
/// Number of hash functions.
k: u32,
}
impl BloomFilter {
/// Create a new bloom filter policy.
pub fn new(bits_per_key: u32) -> Self {
// k = ln(2) * (bits_per_key)
let k = ((bits_per_key as f64) * std::f64::consts::LN_2).round() as u32;
let k = k.clamp(1, 30);
Self { bits_per_key, k }
}
/// Build a filter for the given set of keys.
/// Returns the filter data.
pub fn create_filter(&self, keys: &[&[u8]]) -> Vec<u8> {
// Compute the bit count in u64 to avoid overflow, then cap the byte
// count so that `bytes * 8` always fits in a u32 (the modulus type used
// during probing). Without the cap, a pathological bits_per_key/key
// count could wrap the bit count to 0 and panic on `h % bits`.
const MAX_FILTER_BYTES: u64 = (u32::MAX / 8) as u64;
let bits = (keys.len() as u64)
.saturating_mul(self.bits_per_key as u64)
.max(64);
let bytes = bits.div_ceil(8).min(MAX_FILTER_BYTES) as usize;
let bits = (bytes as u32) * 8;
let mut filter = vec![0u8; bytes + 1]; // +1 for k at the end
*filter.last_mut().unwrap() = self.k as u8;
for key in keys {
let h = bloom_hash(key);
let delta = h.rotate_left(15); // rotate_left(15) == rotate_right(17)
let mut h = h;
for _ in 0..self.k {
let bit_pos = h % bits;
filter[(bit_pos / 8) as usize] |= 1 << (bit_pos % 8);
h = h.wrapping_add(delta);
}
}
filter
}
/// Check if a key might be in the filter.
/// Returns `true` if possibly present, `false` if definitely absent.
pub fn key_may_match(key: &[u8], filter: &[u8]) -> bool {
if filter.len() < 2 {
return true; // degenerate: assume present
}
let k = *filter.last().unwrap() as u32;
if k > 30 {
return true; // reserved for future use
}
let bits = ((filter.len() - 1) * 8) as u32;
let h = bloom_hash(key);
let delta = h.rotate_left(15);
let mut h = h;
for _ in 0..k {
let bit_pos = h % bits;
if filter[(bit_pos / 8) as usize] & (1 << (bit_pos % 8)) == 0 {
return false;
}
h = h.wrapping_add(delta);
}
true
}
}
/// Hash function for bloom filter based on MurmurHash2.
/// Matches LevelDB/RocksDB's BloomHash for compatibility and quality.
fn bloom_hash(key: &[u8]) -> u32 {
let seed: u32 = 0xbc9f1d34;
let m: u32 = 0x5bd1e995;
let r: u32 = 24;
let len = key.len() as u32;
// Initialize with seed XOR'd with length (anti-length-extension)
let mut h: u32 = seed ^ len;
// Process 4-byte chunks
let mut i = 0;
while i + 4 <= key.len() {
let mut k = u32::from_le_bytes(key[i..i + 4].try_into().unwrap());
k = k.wrapping_mul(m);
k ^= k >> r;
k = k.wrapping_mul(m);
h = h.wrapping_mul(m);
h ^= k;
i += 4;
}
// Process remaining bytes
let remaining = key.len() - i;
if remaining >= 3 {
h ^= (key[i + 2] as u32) << 16;
}
if remaining >= 2 {
h ^= (key[i + 1] as u32) << 8;
}
if remaining >= 1 {
h ^= key[i] as u32;
h = h.wrapping_mul(m);
}
// Final avalanche mixing
h ^= h >> 13;
h = h.wrapping_mul(m);
h ^= h >> 15;
h
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bloom_filter_basic() {
let bf = BloomFilter::new(10);
let keys: Vec<&[u8]> = vec![b"hello", b"world", b"foo", b"bar"];
let filter = bf.create_filter(&keys);
// All inserted keys should match
for &key in &keys {
assert!(
BloomFilter::key_may_match(key, &filter),
"expected {:?} to match",
key
);
}
// Most non-inserted keys should not match (probabilistic)
let mut false_positives = 0;
for i in 0..1000 {
let key = format!("nonexistent_{}", i);
if BloomFilter::key_may_match(key.as_bytes(), &filter) {
false_positives += 1;
}
}
// With 10 bits per key and 4 keys, FP rate should be ~1%
assert!(
false_positives < 50,
"too many false positives: {}",
false_positives
);
}
#[test]
fn test_bloom_filter_empty() {
let bf = BloomFilter::new(10);
let filter = bf.create_filter(&[]);
// With empty filter, no keys should match
assert!(!BloomFilter::key_may_match(b"anything", &filter));
}
#[test]
fn test_bloom_filter_many_keys() {
let bf = BloomFilter::new(10);
let keys: Vec<Vec<u8>> = (0..10000)
.map(|i| format!("key_{:06}", i).into_bytes())
.collect();
let key_refs: Vec<&[u8]> = keys.iter().map(|k| k.as_slice()).collect();
let filter = bf.create_filter(&key_refs);
// All keys should match
for key in &keys {
assert!(BloomFilter::key_may_match(key, &filter));
}
// Check FP rate
let mut fp = 0;
for i in 10000..20000 {
let key = format!("key_{:06}", i);
if BloomFilter::key_may_match(key.as_bytes(), &filter) {
fp += 1;
}
}
let fp_rate = fp as f64 / 10000.0;
assert!(fp_rate < 0.02, "FP rate too high: {:.4}", fp_rate);
}
#[test]
fn test_bloom_bits_per_key_zero() {
// bits_per_key=0 should still produce a valid (minimal) filter
let bf = BloomFilter::new(0);
assert_eq!(bf.k, 1, "k should be clamped to minimum of 1");
let keys: Vec<&[u8]> = vec![b"aaa", b"bbb", b"ccc"];
let filter = bf.create_filter(&keys);
// Filter should still be valid (at least 2 bytes: min 64 bits -> 8 bytes + 1 for k)
assert!(filter.len() >= 2);
// Inserted keys should still match (bloom has no false negatives)
for &key in &keys {
assert!(
BloomFilter::key_may_match(key, &filter),
"inserted key {:?} must match",
key
);
}
}
#[test]
fn test_bloom_very_high_bits() {
// bits_per_key=100 should work and have very low FP rate
let bf = BloomFilter::new(100);
// k is clamped to max 30
assert!(bf.k <= 30, "k should be clamped to max 30, got {}", bf.k);
let keys: Vec<Vec<u8>> = (0..100)
.map(|i| format!("key_{:04}", i).into_bytes())
.collect();
let key_refs: Vec<&[u8]> = keys.iter().map(|k| k.as_slice()).collect();
let filter = bf.create_filter(&key_refs);
// All inserted keys must match
for key in &keys {
assert!(BloomFilter::key_may_match(key, &filter));
}
// With 100 bits per key, FP rate should be extremely low
let mut fp = 0;
for i in 100..1100 {
let key = format!("key_{:04}", i);
if BloomFilter::key_may_match(key.as_bytes(), &filter) {
fp += 1;
}
}
// Expect near-zero false positives
assert!(
fp < 10,
"with 100 bits/key, expected very few FPs but got {}",
fp
);
}
#[test]
fn test_bloom_identical_keys() {
// All keys are the same -- should not panic or produce invalid filter
let bf = BloomFilter::new(10);
let keys: Vec<&[u8]> = vec![b"same"; 1000];
let filter = bf.create_filter(&keys);
// The identical key should always match
assert!(BloomFilter::key_may_match(b"same", &filter));
// A different key should likely not match
// (not guaranteed, but with only 1 distinct key and 10 bits/key,
// the filter should be relatively sparse)
// We just verify no panic and the filter is structurally valid
let _ = BloomFilter::key_may_match(b"different", &filter);
}
#[test]
fn test_bloom_binary_keys() {
// Keys containing 0x00 and 0xFF bytes
let bf = BloomFilter::new(10);
let k1: Vec<u8> = vec![0x00, 0x00, 0x00];
let k2: Vec<u8> = vec![0xFF, 0xFF, 0xFF];
let k3: Vec<u8> = vec![0x00, 0xFF, 0x00, 0xFF];
let k4: Vec<u8> = vec![]; // empty key
let keys: Vec<&[u8]> = vec![&k1, &k2, &k3, &k4];
let filter = bf.create_filter(&keys);
// All inserted keys must match
assert!(BloomFilter::key_may_match(&k1, &filter));
assert!(BloomFilter::key_may_match(&k2, &filter));
assert!(BloomFilter::key_may_match(&k3, &filter));
assert!(BloomFilter::key_may_match(&k4, &filter));
// A key not in the set -- just verifying no panic on binary data
let not_in = vec![0x01, 0x02, 0x03, 0x04, 0x05];
let _ = BloomFilter::key_may_match(¬_in, &filter);
}
}