reddb_server/storage/primitives/
bloom.rs1pub struct BloomFilter {
7 bits: Vec<u8>, num_hashes: u8, size: u32, }
11
12impl BloomFilter {
13 pub fn new(size: u32, num_hashes: u8) -> Self {
15 let byte_size = size.div_ceil(8) as usize;
16 Self {
17 bits: vec![0u8; byte_size],
18 num_hashes,
19 size,
20 }
21 }
22
23 pub fn with_capacity(elements: usize, false_positive_rate: f64) -> Self {
27 let size = Self::optimal_size(elements, false_positive_rate);
28 let num_hashes = Self::optimal_hashes(elements, size);
29 Self::new(size, num_hashes)
30 }
31
32 fn optimal_size(elements: usize, fp_rate: f64) -> u32 {
33 let n = elements as f64;
34 let p = fp_rate;
35 let size = -(n * p.ln()) / (2.0_f64.ln().powi(2));
36 size.ceil() as u32
37 }
38
39 fn optimal_hashes(elements: usize, size: u32) -> u8 {
40 let n = elements as f64;
41 let m = size as f64;
42 let k = (m / n) * 2.0_f64.ln();
43 k.ceil().clamp(1.0, 10.0) as u8
44 }
45
46 pub fn insert(&mut self, key: &[u8]) {
48 for i in 0..self.num_hashes {
49 let hash = self.hash(key, i);
50 self.set_bit(hash);
51 }
52 }
53
54 pub fn contains(&self, key: &[u8]) -> bool {
56 for i in 0..self.num_hashes {
57 let hash = self.hash(key, i);
58 if !self.get_bit(hash) {
59 return false; }
61 }
62 true }
64
65 fn hash(&self, key: &[u8], index: u8) -> u32 {
67 match index {
68 0 => self.hash_fnv1a(key),
69 1 => self.hash_murmur3(key),
70 2 => self.hash_djb2(key),
71 3 => self.hash_sdbm(key),
72 4 => self.hash_lose(key),
73 _ => self.hash_fnv1a(key),
74 }
75 }
76
77 fn hash_fnv1a(&self, key: &[u8]) -> u32 {
79 let mut hash = 2166136261u32;
80 for &byte in key {
81 hash ^= byte as u32;
82 hash = hash.wrapping_mul(16777619);
83 }
84 hash % self.size
85 }
86
87 fn hash_murmur3(&self, key: &[u8]) -> u32 {
89 let mut hash = 0u32;
90 for &byte in key {
91 hash = hash.wrapping_add(byte as u32);
92 hash = hash.wrapping_add(hash << 10);
93 hash ^= hash >> 6;
94 }
95 hash = hash.wrapping_add(hash << 3);
96 hash ^= hash >> 11;
97 hash = hash.wrapping_add(hash << 15);
98 hash % self.size
99 }
100
101 fn hash_djb2(&self, key: &[u8]) -> u32 {
103 let mut hash = 5381u32;
104 for &byte in key {
105 hash = hash.wrapping_mul(33).wrapping_add(byte as u32);
106 }
107 hash % self.size
108 }
109
110 fn hash_sdbm(&self, key: &[u8]) -> u32 {
112 let mut hash = 0u32;
113 for &byte in key {
114 hash = (byte as u32)
115 .wrapping_add(hash << 6)
116 .wrapping_add(hash << 16)
117 .wrapping_sub(hash);
118 }
119 hash % self.size
120 }
121
122 fn hash_lose(&self, key: &[u8]) -> u32 {
124 let mut hash = 0u32;
125 for chunk in key.chunks(4) {
126 let mut val = 0u32;
127 for (i, &byte) in chunk.iter().enumerate() {
128 val |= (byte as u32) << (i * 8);
129 }
130 hash ^= val;
131 }
132 hash % self.size
133 }
134
135 fn set_bit(&mut self, pos: u32) {
137 let byte_index = (pos / 8) as usize;
138 let bit_offset = (pos % 8) as u8;
139 if byte_index < self.bits.len() {
140 self.bits[byte_index] |= 1 << bit_offset;
141 }
142 }
143
144 fn get_bit(&self, pos: u32) -> bool {
146 let byte_index = (pos / 8) as usize;
147 let bit_offset = (pos % 8) as u8;
148 if byte_index < self.bits.len() {
149 (self.bits[byte_index] & (1 << bit_offset)) != 0
150 } else {
151 false
152 }
153 }
154
155 pub fn as_bytes(&self) -> &[u8] {
157 &self.bits
158 }
159
160 pub fn from_bytes(bytes: Vec<u8>, num_hashes: u8) -> Self {
162 let size = (bytes.len() * 8) as u32;
163 Self {
164 bits: bytes,
165 num_hashes,
166 size,
167 }
168 }
169
170 pub fn from_bytes_with_size(bytes: Vec<u8>, num_hashes: u8, size: u32) -> Self {
177 Self {
178 bits: bytes,
179 num_hashes,
180 size,
181 }
182 }
183
184 pub fn byte_size(&self) -> usize {
186 self.bits.len()
187 }
188
189 pub fn bit_size(&self) -> u32 {
191 self.size
192 }
193
194 pub fn clear(&mut self) {
196 for byte in &mut self.bits {
197 *byte = 0;
198 }
199 }
200
201 pub fn estimate_fp_rate(&self, inserted_count: usize) -> f64 {
203 let m = self.size as f64;
204 let n = inserted_count as f64;
205 let k = self.num_hashes as f64;
206
207 let exp_term = (-k * n / m).exp();
209 (1.0 - exp_term).powf(k)
210 }
211
212 pub fn count_set_bits(&self) -> u32 {
214 let mut count = 0;
215 for &byte in &self.bits {
216 count += byte.count_ones();
217 }
218 count
219 }
220
221 pub fn fill_ratio(&self) -> f64 {
223 self.count_set_bits() as f64 / self.size as f64
224 }
225
226 pub fn merge(&self, other: &BloomFilter) -> Option<BloomFilter> {
229 if self.size != other.size || self.num_hashes != other.num_hashes {
230 return None;
231 }
232 let mut merged_bits = self.bits.clone();
233 for (i, &byte) in other.bits.iter().enumerate() {
234 merged_bits[i] |= byte;
235 }
236 Some(BloomFilter {
237 bits: merged_bits,
238 num_hashes: self.num_hashes,
239 size: self.size,
240 })
241 }
242
243 pub fn union_inplace(&mut self, other: &BloomFilter) -> bool {
246 if self.size != other.size || self.num_hashes != other.num_hashes {
247 return false;
248 }
249 for (i, &byte) in other.bits.iter().enumerate() {
250 self.bits[i] |= byte;
251 }
252 true
253 }
254
255 pub fn num_hashes(&self) -> u8 {
257 self.num_hashes
258 }
259}
260
261pub struct BloomFilterBuilder {
263 expected_elements: Option<usize>,
264 false_positive_rate: f64,
265 size: Option<u32>,
266 num_hashes: u8,
267}
268
269impl BloomFilterBuilder {
270 pub fn new() -> Self {
271 Self {
272 expected_elements: None,
273 false_positive_rate: 0.01, size: None,
275 num_hashes: 3,
276 }
277 }
278
279 pub fn expected_elements(mut self, n: usize) -> Self {
280 self.expected_elements = Some(n);
281 self
282 }
283
284 pub fn false_positive_rate(mut self, rate: f64) -> Self {
285 self.false_positive_rate = rate;
286 self
287 }
288
289 pub fn size(mut self, size: u32) -> Self {
290 self.size = Some(size);
291 self
292 }
293
294 pub fn num_hashes(mut self, n: u8) -> Self {
295 self.num_hashes = n;
296 self
297 }
298
299 pub fn build(self) -> BloomFilter {
300 if let Some(size) = self.size {
301 BloomFilter::new(size, self.num_hashes)
302 } else if let Some(elements) = self.expected_elements {
303 BloomFilter::with_capacity(elements, self.false_positive_rate)
304 } else {
305 BloomFilter::with_capacity(100_000, self.false_positive_rate)
307 }
308 }
309}
310
311#[cfg(test)]
312mod tests {
313 use super::*;
314
315 #[test]
316 fn test_bloom_basic() {
317 let mut bloom = BloomFilter::new(1000, 3);
318
319 bloom.insert(b"hello");
320 bloom.insert(b"world");
321
322 assert!(bloom.contains(b"hello"));
323 assert!(bloom.contains(b"world"));
324 assert!(!bloom.contains(b"foo"));
325 }
326
327 #[test]
328 fn test_bloom_with_capacity() {
329 let mut bloom = BloomFilter::with_capacity(1000, 0.01);
330
331 for i in 0..1000 {
332 let key = format!("key{}", i);
333 bloom.insert(key.as_bytes());
334 }
335
336 for i in 0..1000 {
338 let key = format!("key{}", i);
339 assert!(bloom.contains(key.as_bytes()));
340 }
341
342 let mut false_positives = 0;
344 for i in 1000..2000 {
345 let key = format!("key{}", i);
346 if bloom.contains(key.as_bytes()) {
347 false_positives += 1;
348 }
349 }
350
351 let fp_rate = false_positives as f64 / 1000.0;
352 println!("False positive rate: {:.2}%", fp_rate * 100.0);
353 assert!(fp_rate < 0.05); }
355
356 #[test]
357 fn test_bloom_serialization() {
358 let mut bloom1 = BloomFilter::new(1000, 3);
359 bloom1.insert(b"test1");
360 bloom1.insert(b"test2");
361
362 let bytes = bloom1.as_bytes().to_vec();
363 let bloom2 = BloomFilter::from_bytes(bytes, 3);
364
365 assert!(bloom2.contains(b"test1"));
366 assert!(bloom2.contains(b"test2"));
367 assert!(!bloom2.contains(b"test3"));
368 }
369
370 #[test]
371 fn test_bloom_clear() {
372 let mut bloom = BloomFilter::new(1000, 3);
373 bloom.insert(b"hello");
374 assert!(bloom.contains(b"hello"));
375
376 bloom.clear();
377 assert!(!bloom.contains(b"hello"));
378 }
379
380 #[test]
381 fn test_bloom_fill_ratio() {
382 let mut bloom = BloomFilter::new(1000, 3);
383 assert_eq!(bloom.fill_ratio(), 0.0);
384
385 for i in 0..100 {
386 bloom.insert(format!("key{}", i).as_bytes());
387 }
388
389 let ratio = bloom.fill_ratio();
390 println!("Fill ratio: {:.2}%", ratio * 100.0);
391 assert!(ratio > 0.0);
392 assert!(ratio < 1.0);
393 }
394
395 #[test]
396 fn test_bloom_builder() {
397 let bloom = BloomFilterBuilder::new()
398 .expected_elements(10000)
399 .false_positive_rate(0.01)
400 .build();
401
402 assert!(bloom.bit_size() > 0);
403 assert!(bloom.byte_size() > 0);
404 }
405
406 #[test]
407 fn test_hash_functions() {
408 let bloom = BloomFilter::new(1000, 3);
409 let key = b"test";
410
411 let h1 = bloom.hash_fnv1a(key);
412 let h2 = bloom.hash_murmur3(key);
413 let h3 = bloom.hash_djb2(key);
414
415 assert_ne!(h1, h2);
417 assert_ne!(h2, h3);
418 assert_ne!(h1, h3);
419
420 assert!(h1 < 1000);
422 assert!(h2 < 1000);
423 assert!(h3 < 1000);
424 }
425
426 #[test]
427 fn test_bloom_merge() {
428 let mut bloom1 = BloomFilter::new(1000, 3);
429 bloom1.insert(b"alpha");
430 bloom1.insert(b"beta");
431
432 let mut bloom2 = BloomFilter::new(1000, 3);
433 bloom2.insert(b"gamma");
434 bloom2.insert(b"delta");
435
436 let merged = bloom1.merge(&bloom2).unwrap();
437 assert!(merged.contains(b"alpha"));
438 assert!(merged.contains(b"beta"));
439 assert!(merged.contains(b"gamma"));
440 assert!(merged.contains(b"delta"));
441 assert!(!merged.contains(b"missing"));
442 }
443
444 #[test]
445 fn test_bloom_merge_incompatible() {
446 let bloom1 = BloomFilter::new(1000, 3);
447 let bloom2 = BloomFilter::new(2000, 3);
448 assert!(bloom1.merge(&bloom2).is_none());
449
450 let bloom3 = BloomFilter::new(1000, 4);
451 assert!(bloom1.merge(&bloom3).is_none());
452 }
453
454 #[test]
455 fn test_bloom_union_inplace() {
456 let mut bloom1 = BloomFilter::new(1000, 3);
457 bloom1.insert(b"one");
458
459 let mut bloom2 = BloomFilter::new(1000, 3);
460 bloom2.insert(b"two");
461
462 assert!(bloom1.union_inplace(&bloom2));
463 assert!(bloom1.contains(b"one"));
464 assert!(bloom1.contains(b"two"));
465 }
466
467 #[test]
468 fn test_optimal_sizing() {
469 let bloom = BloomFilter::with_capacity(1_000_000, 0.01);
470
471 println!("Bloom filter stats:");
472 println!(" Elements: 1,000,000");
473 println!(" FP rate: 1%");
474 println!(" Bits: {}", bloom.bit_size());
475 println!(" Bytes: {}", bloom.byte_size());
476 println!(" Hashes: {}", bloom.num_hashes);
477
478 let bits_per_element = bloom.bit_size() as f64 / 1_000_000.0;
480 println!(" Bits/element: {:.2}", bits_per_element);
481
482 assert!(bits_per_element > 8.0);
483 assert!(bits_per_element < 12.0);
484 }
485}