Skip to main content

reddb_server/storage/primitives/
bloom.rs

1// Bloom Filter Implementation (From Scratch)
2// Zero external dependencies - Rust std only
3
4/// Bloom filter for fast negative lookups
5/// False positive rate: ~1% with 3 hash functions
6pub struct BloomFilter {
7    bits: Vec<u8>,  // Bit array (size / 8 bytes)
8    num_hashes: u8, // Number of hash functions (3 optimal)
9    size: u32,      // Total bits
10}
11
12impl BloomFilter {
13    /// Create new bloom filter with given size in bits
14    pub fn new(size: u32, num_hashes: u8) -> Self {
15        let byte_size = size.div_ceil(8) as usize;
16        Self {
17            bits: vec![0u8; byte_size],
18            num_hashes,
19            size,
20        }
21    }
22
23    /// Create bloom filter optimized for expected number of elements
24    /// Formula: size = -n * ln(p) / (ln(2)^2)
25    /// Where n = elements, p = false positive rate (0.01 = 1%)
26    pub fn with_capacity(elements: usize, false_positive_rate: f64) -> Self {
27        let size = Self::optimal_size(elements, false_positive_rate);
28        let num_hashes = Self::optimal_hashes(elements, size);
29        Self::new(size, num_hashes)
30    }
31
32    fn optimal_size(elements: usize, fp_rate: f64) -> u32 {
33        let n = elements as f64;
34        let p = fp_rate;
35        let size = -(n * p.ln()) / (2.0_f64.ln().powi(2));
36        size.ceil() as u32
37    }
38
39    fn optimal_hashes(elements: usize, size: u32) -> u8 {
40        let n = elements as f64;
41        let m = size as f64;
42        let k = (m / n) * 2.0_f64.ln();
43        k.ceil().clamp(1.0, 10.0) as u8
44    }
45
46    /// Insert key into bloom filter
47    pub fn insert(&mut self, key: &[u8]) {
48        for i in 0..self.num_hashes {
49            let hash = self.hash(key, i);
50            self.set_bit(hash);
51        }
52    }
53
54    /// Check if key might exist (false positive possible, no false negative)
55    pub fn contains(&self, key: &[u8]) -> bool {
56        for i in 0..self.num_hashes {
57            let hash = self.hash(key, i);
58            if !self.get_bit(hash) {
59                return false; // Definitely not present
60            }
61        }
62        true // Might be present
63    }
64
65    /// Get hash value for key using hash function index
66    fn hash(&self, key: &[u8], index: u8) -> u32 {
67        match index {
68            0 => self.hash_fnv1a(key),
69            1 => self.hash_murmur3(key),
70            2 => self.hash_djb2(key),
71            3 => self.hash_sdbm(key),
72            4 => self.hash_lose(key),
73            _ => self.hash_fnv1a(key),
74        }
75    }
76
77    /// FNV-1a hash (fast, good distribution)
78    fn hash_fnv1a(&self, key: &[u8]) -> u32 {
79        let mut hash = 2166136261u32;
80        for &byte in key {
81            hash ^= byte as u32;
82            hash = hash.wrapping_mul(16777619);
83        }
84        hash % self.size
85    }
86
87    /// MurmurHash3-inspired (bit mixing for better distribution)
88    fn hash_murmur3(&self, key: &[u8]) -> u32 {
89        let mut hash = 0u32;
90        for &byte in key {
91            hash = hash.wrapping_add(byte as u32);
92            hash = hash.wrapping_add(hash << 10);
93            hash ^= hash >> 6;
94        }
95        hash = hash.wrapping_add(hash << 3);
96        hash ^= hash >> 11;
97        hash = hash.wrapping_add(hash << 15);
98        hash % self.size
99    }
100
101    /// DJB2 hash (simple and fast)
102    fn hash_djb2(&self, key: &[u8]) -> u32 {
103        let mut hash = 5381u32;
104        for &byte in key {
105            hash = hash.wrapping_mul(33).wrapping_add(byte as u32);
106        }
107        hash % self.size
108    }
109
110    /// SDBM hash (used in many hash tables)
111    fn hash_sdbm(&self, key: &[u8]) -> u32 {
112        let mut hash = 0u32;
113        for &byte in key {
114            hash = (byte as u32)
115                .wrapping_add(hash << 6)
116                .wrapping_add(hash << 16)
117                .wrapping_sub(hash);
118        }
119        hash % self.size
120    }
121
122    /// Lose Lose hash (simple XOR folding)
123    fn hash_lose(&self, key: &[u8]) -> u32 {
124        let mut hash = 0u32;
125        for chunk in key.chunks(4) {
126            let mut val = 0u32;
127            for (i, &byte) in chunk.iter().enumerate() {
128                val |= (byte as u32) << (i * 8);
129            }
130            hash ^= val;
131        }
132        hash % self.size
133    }
134
135    /// Set bit at position
136    fn set_bit(&mut self, pos: u32) {
137        let byte_index = (pos / 8) as usize;
138        let bit_offset = (pos % 8) as u8;
139        if byte_index < self.bits.len() {
140            self.bits[byte_index] |= 1 << bit_offset;
141        }
142    }
143
144    /// Get bit at position
145    fn get_bit(&self, pos: u32) -> bool {
146        let byte_index = (pos / 8) as usize;
147        let bit_offset = (pos % 8) as u8;
148        if byte_index < self.bits.len() {
149            (self.bits[byte_index] & (1 << bit_offset)) != 0
150        } else {
151            false
152        }
153    }
154
155    /// Get raw bytes for serialization
156    pub fn as_bytes(&self) -> &[u8] {
157        &self.bits
158    }
159
160    /// Load from raw bytes
161    pub fn from_bytes(bytes: Vec<u8>, num_hashes: u8) -> Self {
162        let size = (bytes.len() * 8) as u32;
163        Self {
164            bits: bytes,
165            num_hashes,
166            size,
167        }
168    }
169
170    /// Load from raw bytes with an explicit original bit size.
171    ///
172    /// Use this when `from_bytes` would lose the declared size (when the
173    /// original size was not a multiple of 8). The hash functions reduce
174    /// positions modulo `size`, so restoring with a different value would
175    /// produce false negatives.
176    pub fn from_bytes_with_size(bytes: Vec<u8>, num_hashes: u8, size: u32) -> Self {
177        Self {
178            bits: bytes,
179            num_hashes,
180            size,
181        }
182    }
183
184    /// Get size in bytes
185    pub fn byte_size(&self) -> usize {
186        self.bits.len()
187    }
188
189    /// Get number of bits
190    pub fn bit_size(&self) -> u32 {
191        self.size
192    }
193
194    /// Clear all bits
195    pub fn clear(&mut self) {
196        for byte in &mut self.bits {
197            *byte = 0;
198        }
199    }
200
201    /// Estimate false positive rate based on current fill
202    pub fn estimate_fp_rate(&self, inserted_count: usize) -> f64 {
203        let m = self.size as f64;
204        let n = inserted_count as f64;
205        let k = self.num_hashes as f64;
206
207        // Formula: (1 - e^(-kn/m))^k
208        let exp_term = (-k * n / m).exp();
209        (1.0 - exp_term).powf(k)
210    }
211
212    /// Get number of set bits
213    pub fn count_set_bits(&self) -> u32 {
214        let mut count = 0;
215        for &byte in &self.bits {
216            count += byte.count_ones();
217        }
218        count
219    }
220
221    /// Calculate fill ratio (0.0 to 1.0)
222    pub fn fill_ratio(&self) -> f64 {
223        self.count_set_bits() as f64 / self.size as f64
224    }
225
226    /// Merge two bloom filters via bitwise OR.
227    /// Returns `None` if the filters have different size or hash count.
228    pub fn merge(&self, other: &BloomFilter) -> Option<BloomFilter> {
229        if self.size != other.size || self.num_hashes != other.num_hashes {
230            return None;
231        }
232        let mut merged_bits = self.bits.clone();
233        for (i, &byte) in other.bits.iter().enumerate() {
234            merged_bits[i] |= byte;
235        }
236        Some(BloomFilter {
237            bits: merged_bits,
238            num_hashes: self.num_hashes,
239            size: self.size,
240        })
241    }
242
243    /// Merge another bloom filter into this one in-place.
244    /// Returns `false` if the filters are incompatible (different size/hashes).
245    pub fn union_inplace(&mut self, other: &BloomFilter) -> bool {
246        if self.size != other.size || self.num_hashes != other.num_hashes {
247            return false;
248        }
249        for (i, &byte) in other.bits.iter().enumerate() {
250            self.bits[i] |= byte;
251        }
252        true
253    }
254
255    /// Get the number of hash functions
256    pub fn num_hashes(&self) -> u8 {
257        self.num_hashes
258    }
259}
260
261/// Builder for creating bloom filters
262pub struct BloomFilterBuilder {
263    expected_elements: Option<usize>,
264    false_positive_rate: f64,
265    size: Option<u32>,
266    num_hashes: u8,
267}
268
269impl BloomFilterBuilder {
270    pub fn new() -> Self {
271        Self {
272            expected_elements: None,
273            false_positive_rate: 0.01, // 1% default
274            size: None,
275            num_hashes: 3,
276        }
277    }
278
279    pub fn expected_elements(mut self, n: usize) -> Self {
280        self.expected_elements = Some(n);
281        self
282    }
283
284    pub fn false_positive_rate(mut self, rate: f64) -> Self {
285        self.false_positive_rate = rate;
286        self
287    }
288
289    pub fn size(mut self, size: u32) -> Self {
290        self.size = Some(size);
291        self
292    }
293
294    pub fn num_hashes(mut self, n: u8) -> Self {
295        self.num_hashes = n;
296        self
297    }
298
299    pub fn build(self) -> BloomFilter {
300        if let Some(size) = self.size {
301            BloomFilter::new(size, self.num_hashes)
302        } else if let Some(elements) = self.expected_elements {
303            BloomFilter::with_capacity(elements, self.false_positive_rate)
304        } else {
305            // Default: 100K elements, 1% FP rate
306            BloomFilter::with_capacity(100_000, self.false_positive_rate)
307        }
308    }
309}
310
311#[cfg(test)]
312mod tests {
313    use super::*;
314
315    #[test]
316    fn test_bloom_basic() {
317        let mut bloom = BloomFilter::new(1000, 3);
318
319        bloom.insert(b"hello");
320        bloom.insert(b"world");
321
322        assert!(bloom.contains(b"hello"));
323        assert!(bloom.contains(b"world"));
324        assert!(!bloom.contains(b"foo"));
325    }
326
327    #[test]
328    fn test_bloom_with_capacity() {
329        let mut bloom = BloomFilter::with_capacity(1000, 0.01);
330
331        for i in 0..1000 {
332            let key = format!("key{}", i);
333            bloom.insert(key.as_bytes());
334        }
335
336        // Check all inserted keys are found
337        for i in 0..1000 {
338            let key = format!("key{}", i);
339            assert!(bloom.contains(key.as_bytes()));
340        }
341
342        // Check false positive rate
343        let mut false_positives = 0;
344        for i in 1000..2000 {
345            let key = format!("key{}", i);
346            if bloom.contains(key.as_bytes()) {
347                false_positives += 1;
348            }
349        }
350
351        let fp_rate = false_positives as f64 / 1000.0;
352        println!("False positive rate: {:.2}%", fp_rate * 100.0);
353        assert!(fp_rate < 0.05); // Should be under 5%
354    }
355
356    #[test]
357    fn test_bloom_serialization() {
358        let mut bloom1 = BloomFilter::new(1000, 3);
359        bloom1.insert(b"test1");
360        bloom1.insert(b"test2");
361
362        let bytes = bloom1.as_bytes().to_vec();
363        let bloom2 = BloomFilter::from_bytes(bytes, 3);
364
365        assert!(bloom2.contains(b"test1"));
366        assert!(bloom2.contains(b"test2"));
367        assert!(!bloom2.contains(b"test3"));
368    }
369
370    #[test]
371    fn test_bloom_clear() {
372        let mut bloom = BloomFilter::new(1000, 3);
373        bloom.insert(b"hello");
374        assert!(bloom.contains(b"hello"));
375
376        bloom.clear();
377        assert!(!bloom.contains(b"hello"));
378    }
379
380    #[test]
381    fn test_bloom_fill_ratio() {
382        let mut bloom = BloomFilter::new(1000, 3);
383        assert_eq!(bloom.fill_ratio(), 0.0);
384
385        for i in 0..100 {
386            bloom.insert(format!("key{}", i).as_bytes());
387        }
388
389        let ratio = bloom.fill_ratio();
390        println!("Fill ratio: {:.2}%", ratio * 100.0);
391        assert!(ratio > 0.0);
392        assert!(ratio < 1.0);
393    }
394
395    #[test]
396    fn test_bloom_builder() {
397        let bloom = BloomFilterBuilder::new()
398            .expected_elements(10000)
399            .false_positive_rate(0.01)
400            .build();
401
402        assert!(bloom.bit_size() > 0);
403        assert!(bloom.byte_size() > 0);
404    }
405
406    #[test]
407    fn test_hash_functions() {
408        let bloom = BloomFilter::new(1000, 3);
409        let key = b"test";
410
411        let h1 = bloom.hash_fnv1a(key);
412        let h2 = bloom.hash_murmur3(key);
413        let h3 = bloom.hash_djb2(key);
414
415        // Hashes should be different
416        assert_ne!(h1, h2);
417        assert_ne!(h2, h3);
418        assert_ne!(h1, h3);
419
420        // Hashes should be within range
421        assert!(h1 < 1000);
422        assert!(h2 < 1000);
423        assert!(h3 < 1000);
424    }
425
426    #[test]
427    fn test_bloom_merge() {
428        let mut bloom1 = BloomFilter::new(1000, 3);
429        bloom1.insert(b"alpha");
430        bloom1.insert(b"beta");
431
432        let mut bloom2 = BloomFilter::new(1000, 3);
433        bloom2.insert(b"gamma");
434        bloom2.insert(b"delta");
435
436        let merged = bloom1.merge(&bloom2).unwrap();
437        assert!(merged.contains(b"alpha"));
438        assert!(merged.contains(b"beta"));
439        assert!(merged.contains(b"gamma"));
440        assert!(merged.contains(b"delta"));
441        assert!(!merged.contains(b"missing"));
442    }
443
444    #[test]
445    fn test_bloom_merge_incompatible() {
446        let bloom1 = BloomFilter::new(1000, 3);
447        let bloom2 = BloomFilter::new(2000, 3);
448        assert!(bloom1.merge(&bloom2).is_none());
449
450        let bloom3 = BloomFilter::new(1000, 4);
451        assert!(bloom1.merge(&bloom3).is_none());
452    }
453
454    #[test]
455    fn test_bloom_union_inplace() {
456        let mut bloom1 = BloomFilter::new(1000, 3);
457        bloom1.insert(b"one");
458
459        let mut bloom2 = BloomFilter::new(1000, 3);
460        bloom2.insert(b"two");
461
462        assert!(bloom1.union_inplace(&bloom2));
463        assert!(bloom1.contains(b"one"));
464        assert!(bloom1.contains(b"two"));
465    }
466
467    #[test]
468    fn test_optimal_sizing() {
469        let bloom = BloomFilter::with_capacity(1_000_000, 0.01);
470
471        println!("Bloom filter stats:");
472        println!("  Elements: 1,000,000");
473        println!("  FP rate: 1%");
474        println!("  Bits: {}", bloom.bit_size());
475        println!("  Bytes: {}", bloom.byte_size());
476        println!("  Hashes: {}", bloom.num_hashes);
477
478        // Should be around 9.6 bits per element for 1% FP rate
479        let bits_per_element = bloom.bit_size() as f64 / 1_000_000.0;
480        println!("  Bits/element: {:.2}", bits_per_element);
481
482        assert!(bits_per_element > 8.0);
483        assert!(bits_per_element < 12.0);
484    }
485}