dbx_core/storage/
index.rs1use std::collections::hash_map::DefaultHasher;
7use std::hash::{Hash, Hasher};
8
9pub struct BloomIndex {
11 bitmap: Vec<u64>,
12 num_hashes: usize,
13 items_count: usize,
14 false_positive_rate: f64,
15}
16
17impl BloomIndex {
18 pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
20 let bitmap_bits = Self::optimal_bitmap_size(expected_items, false_positive_rate);
21 let num_hashes = Self::optimal_num_hashes(expected_items, bitmap_bits);
22 let bitmap_size = bitmap_bits.div_ceil(64); Self {
25 bitmap: vec![0u64; bitmap_size],
26 num_hashes,
27 items_count: 0,
28 false_positive_rate,
29 }
30 }
31
32 pub fn with_default_fpr(expected_items: usize) -> Self {
34 Self::new(expected_items, 0.01)
35 }
36
37 pub fn insert(&mut self, key: &[u8]) {
39 for i in 0..self.num_hashes {
40 let hash = self.hash_with_seed(key, i);
41 let bit_index = (hash as usize) % (self.bitmap.len() * 64);
42 let word_index = bit_index / 64;
43 let bit_offset = bit_index % 64;
44 self.bitmap[word_index] |= 1u64 << bit_offset;
45 }
46 self.items_count += 1;
47 }
48
49 pub fn may_contain(&self, key: &[u8]) -> bool {
51 for i in 0..self.num_hashes {
52 let hash = self.hash_with_seed(key, i);
53 let bit_index = (hash as usize) % (self.bitmap.len() * 64);
54 let word_index = bit_index / 64;
55 let bit_offset = bit_index % 64;
56 if (self.bitmap[word_index] & (1u64 << bit_offset)) == 0 {
57 return false;
58 }
59 }
60 true
61 }
62
63 pub fn len(&self) -> usize {
65 self.items_count
66 }
67
68 pub fn is_empty(&self) -> bool {
70 self.items_count == 0
71 }
72
73 pub fn stats(&self) -> BloomStats {
75 BloomStats {
76 items_count: self.items_count,
77 bitmap_size: self.bitmap.len() * 64,
78 num_hashes: self.num_hashes,
79 target_fpr: self.false_positive_rate,
80 }
81 }
82
83 fn hash_with_seed(&self, key: &[u8], seed: usize) -> u64 {
85 let mut hasher = DefaultHasher::new();
86 seed.hash(&mut hasher);
87 key.hash(&mut hasher);
88 hasher.finish()
89 }
90
91 fn optimal_bitmap_size(n: usize, p: f64) -> usize {
93 let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
94 let m = -(n as f64) * p.ln() / ln2_squared;
95 m.ceil() as usize
96 }
97
98 fn optimal_num_hashes(n: usize, m: usize) -> usize {
100 let k = (m as f64 / n as f64) * std::f64::consts::LN_2;
101 (k.ceil() as usize).max(1)
102 }
103}
104
105#[derive(Debug, Clone)]
107pub struct BloomStats {
108 pub items_count: usize,
109 pub bitmap_size: usize,
110 pub num_hashes: usize,
111 pub target_fpr: f64,
112}
113
114#[cfg(test)]
115mod tests {
116 use super::*;
117
118 #[test]
119 fn test_bloom_basic() {
120 let mut bloom = BloomIndex::with_default_fpr(1000);
121
122 bloom.insert(b"key1");
123 bloom.insert(b"key2");
124 bloom.insert(b"key3");
125
126 assert!(bloom.may_contain(b"key1"));
127 assert!(bloom.may_contain(b"key2"));
128 assert!(bloom.may_contain(b"key3"));
129 assert!(!bloom.may_contain(b"key4"));
130 }
131
132 #[test]
133 fn test_bloom_false_positive_rate() {
134 let mut bloom = BloomIndex::new(1000, 0.01);
135
136 for i in 0..1000 {
137 bloom.insert(format!("key{}", i).as_bytes());
138 }
139
140 let mut false_positives = 0;
141 for i in 1000..2000 {
142 if bloom.may_contain(format!("key{}", i).as_bytes()) {
143 false_positives += 1;
144 }
145 }
146
147 let actual_fpr = false_positives as f64 / 1000.0;
148 println!("Actual FPR: {:.4}", actual_fpr);
149
150 assert!(actual_fpr < 0.05);
151 }
152
153 #[test]
154 fn test_bloom_stats() {
155 let mut bloom = BloomIndex::with_default_fpr(1000);
156
157 for i in 0..100 {
158 bloom.insert(format!("key{}", i).as_bytes());
159 }
160
161 let stats = bloom.stats();
162 assert_eq!(stats.items_count, 100);
163 assert!(stats.bitmap_size > 0);
164 assert!(stats.num_hashes > 0);
165 }
166
167 #[test]
168 fn test_bloom_empty() {
169 let bloom = BloomIndex::with_default_fpr(1000);
170
171 assert!(bloom.is_empty());
172 assert_eq!(bloom.len(), 0);
173 assert!(!bloom.may_contain(b"key1"));
174 }
175}