Skip to main content

dbx_core/storage/
index.rs

1//! Bloom Filter Index — Tier 4 Index
2//!
3//! Creates Bloom Filters per Parquet file to minimize I/O
4//! Simple bitmap-based implementation
5
6use std::collections::hash_map::DefaultHasher;
7use std::hash::{Hash, Hasher};
8
9/// Bloom Filter Index
10pub struct BloomIndex {
11    bitmap: Vec<u64>,
12    num_hashes: usize,
13    items_count: usize,
14    false_positive_rate: f64,
15}
16
17impl BloomIndex {
18    /// Creates a new Bloom Filter
19    pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
20        let bitmap_bits = Self::optimal_bitmap_size(expected_items, false_positive_rate);
21        let num_hashes = Self::optimal_num_hashes(expected_items, bitmap_bits);
22        let bitmap_size = bitmap_bits.div_ceil(64); // Round up to u64
23
24        Self {
25            bitmap: vec![0u64; bitmap_size],
26            num_hashes,
27            items_count: 0,
28            false_positive_rate,
29        }
30    }
31
32    /// Creates with default false positive rate (1%)
33    pub fn with_default_fpr(expected_items: usize) -> Self {
34        Self::new(expected_items, 0.01)
35    }
36
37    /// Inserts a key
38    pub fn insert(&mut self, key: &[u8]) {
39        for i in 0..self.num_hashes {
40            let hash = self.hash_with_seed(key, i);
41            let bit_index = (hash as usize) % (self.bitmap.len() * 64);
42            let word_index = bit_index / 64;
43            let bit_offset = bit_index % 64;
44            self.bitmap[word_index] |= 1u64 << bit_offset;
45        }
46        self.items_count += 1;
47    }
48
49    /// Checks if key may exist
50    pub fn may_contain(&self, key: &[u8]) -> bool {
51        for i in 0..self.num_hashes {
52            let hash = self.hash_with_seed(key, i);
53            let bit_index = (hash as usize) % (self.bitmap.len() * 64);
54            let word_index = bit_index / 64;
55            let bit_offset = bit_index % 64;
56            if (self.bitmap[word_index] & (1u64 << bit_offset)) == 0 {
57                return false;
58            }
59        }
60        true
61    }
62
63    /// Returns number of inserted items
64    pub fn len(&self) -> usize {
65        self.items_count
66    }
67
68    /// Checks if empty
69    pub fn is_empty(&self) -> bool {
70        self.items_count == 0
71    }
72
73    /// Returns Bloom Filter statistics
74    pub fn stats(&self) -> BloomStats {
75        BloomStats {
76            items_count: self.items_count,
77            bitmap_size: self.bitmap.len() * 64,
78            num_hashes: self.num_hashes,
79            target_fpr: self.false_positive_rate,
80        }
81    }
82
83    /// Hashes with seed
84    fn hash_with_seed(&self, key: &[u8], seed: usize) -> u64 {
85        let mut hasher = DefaultHasher::new();
86        seed.hash(&mut hasher);
87        key.hash(&mut hasher);
88        hasher.finish()
89    }
90
91    /// Calculates optimal bitmap size
92    fn optimal_bitmap_size(n: usize, p: f64) -> usize {
93        let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
94        let m = -(n as f64) * p.ln() / ln2_squared;
95        m.ceil() as usize
96    }
97
98    /// Calculates optimal number of hash functions
99    fn optimal_num_hashes(n: usize, m: usize) -> usize {
100        let k = (m as f64 / n as f64) * std::f64::consts::LN_2;
101        (k.ceil() as usize).max(1)
102    }
103}
104
105/// Bloom Filter statistics
106#[derive(Debug, Clone)]
107pub struct BloomStats {
108    pub items_count: usize,
109    pub bitmap_size: usize,
110    pub num_hashes: usize,
111    pub target_fpr: f64,
112}
113
114#[cfg(test)]
115mod tests {
116    use super::*;
117
118    #[test]
119    fn test_bloom_basic() {
120        let mut bloom = BloomIndex::with_default_fpr(1000);
121
122        bloom.insert(b"key1");
123        bloom.insert(b"key2");
124        bloom.insert(b"key3");
125
126        assert!(bloom.may_contain(b"key1"));
127        assert!(bloom.may_contain(b"key2"));
128        assert!(bloom.may_contain(b"key3"));
129        assert!(!bloom.may_contain(b"key4"));
130    }
131
132    #[test]
133    fn test_bloom_false_positive_rate() {
134        let mut bloom = BloomIndex::new(1000, 0.01);
135
136        for i in 0..1000 {
137            bloom.insert(format!("key{}", i).as_bytes());
138        }
139
140        let mut false_positives = 0;
141        for i in 1000..2000 {
142            if bloom.may_contain(format!("key{}", i).as_bytes()) {
143                false_positives += 1;
144            }
145        }
146
147        let actual_fpr = false_positives as f64 / 1000.0;
148        println!("Actual FPR: {:.4}", actual_fpr);
149
150        assert!(actual_fpr < 0.05);
151    }
152
153    #[test]
154    fn test_bloom_stats() {
155        let mut bloom = BloomIndex::with_default_fpr(1000);
156
157        for i in 0..100 {
158            bloom.insert(format!("key{}", i).as_bytes());
159        }
160
161        let stats = bloom.stats();
162        assert_eq!(stats.items_count, 100);
163        assert!(stats.bitmap_size > 0);
164        assert!(stats.num_hashes > 0);
165    }
166
167    #[test]
168    fn test_bloom_empty() {
169        let bloom = BloomIndex::with_default_fpr(1000);
170
171        assert!(bloom.is_empty());
172        assert_eq!(bloom.len(), 0);
173        assert!(!bloom.may_contain(b"key1"));
174    }
175}