rustywallet_bloom/
bloom.rs1use crate::hash::{double_hash, fnv1a_64, fnv1a_64_seeded};
4
5pub struct BloomFilter {
27 bits: Vec<u64>,
28 num_bits: usize,
29 num_hashes: usize,
30 count: usize,
31}
32
33impl BloomFilter {
34 pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
51 let fpr = false_positive_rate.clamp(1e-15, 0.5); let n = expected_items.max(1) as f64;
53
54 let ln2_sq = std::f64::consts::LN_2 * std::f64::consts::LN_2;
56 let num_bits = ((-n * fpr.ln()) / ln2_sq).ceil() as usize;
57 let num_bits = num_bits.max(64);
58
59 let num_hashes = ((num_bits as f64 / n) * std::f64::consts::LN_2).ceil() as usize;
62 let num_hashes = num_hashes.clamp(3, 50);
63
64 let num_words = num_bits.div_ceil(64);
66
67 Self {
68 bits: vec![0u64; num_words],
69 num_bits,
70 num_hashes,
71 count: 0,
72 }
73 }
74
75 pub fn with_params(num_bits: usize, num_hashes: usize) -> Self {
84 let num_bits = num_bits.max(64);
85 let num_hashes = num_hashes.clamp(1, 16);
86 let num_words = num_bits.div_ceil(64);
87
88 Self {
89 bits: vec![0u64; num_words],
90 num_bits,
91 num_hashes,
92 count: 0,
93 }
94 }
95
96 #[inline]
100 pub fn insert<T: AsRef<[u8]>>(&mut self, item: T) {
101 let data = item.as_ref();
102 let h1 = fnv1a_64(data);
103 let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15); for i in 0..self.num_hashes {
106 let pos = double_hash(h1, h2, i, self.num_bits);
107 self.set_bit(pos);
108 }
109
110 self.count += 1;
111 }
112
113 #[inline]
118 pub fn contains<T: AsRef<[u8]>>(&self, item: T) -> bool {
119 let data = item.as_ref();
120 let h1 = fnv1a_64(data);
121 let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
122
123 for i in 0..self.num_hashes {
124 let pos = double_hash(h1, h2, i, self.num_bits);
125 if !self.get_bit(pos) {
126 return false;
127 }
128 }
129
130 true
131 }
132
133 pub fn len(&self) -> usize {
135 self.count
136 }
137
138 pub fn is_empty(&self) -> bool {
140 self.count == 0
141 }
142
143 pub fn memory_usage(&self) -> usize {
145 self.bits.len() * 8
146 }
147
148 pub fn num_bits(&self) -> usize {
150 self.num_bits
151 }
152
153 pub fn num_hashes(&self) -> usize {
155 self.num_hashes
156 }
157
158 pub fn estimated_fpr(&self) -> f64 {
160 let bits_set = self.bits.iter().map(|w| w.count_ones() as usize).sum::<usize>();
161 let fill_ratio = bits_set as f64 / self.num_bits as f64;
162 fill_ratio.powi(self.num_hashes as i32)
163 }
164
165 pub fn clear(&mut self) {
167 self.bits.fill(0);
168 self.count = 0;
169 }
170
171 #[inline]
172 fn set_bit(&mut self, pos: usize) {
173 let word = pos / 64;
174 let bit = pos % 64;
175 self.bits[word] |= 1u64 << bit;
176 }
177
178 #[inline]
179 fn get_bit(&self, pos: usize) -> bool {
180 let word = pos / 64;
181 let bit = pos % 64;
182 (self.bits[word] & (1u64 << bit)) != 0
183 }
184}
185
186impl std::fmt::Debug for BloomFilter {
187 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188 f.debug_struct("BloomFilter")
189 .field("items", &self.count)
190 .field("bits", &self.num_bits)
191 .field("hashes", &self.num_hashes)
192 .field("memory_mb", &(self.memory_usage() / 1_000_000))
193 .finish()
194 }
195}
196
197#[cfg(test)]
198mod tests {
199 use super::*;
200
201 #[test]
202 fn test_insert_and_contains() {
203 let mut bloom = BloomFilter::new(100, 0.01);
204
205 bloom.insert("test1");
206 bloom.insert("test2");
207 bloom.insert(b"binary_data");
208
209 assert!(bloom.contains("test1"));
210 assert!(bloom.contains("test2"));
211 assert!(bloom.contains(b"binary_data"));
212 assert_eq!(bloom.len(), 3);
213 }
214
215 #[test]
216 fn test_clear() {
217 let mut bloom = BloomFilter::new(100, 0.01);
218 bloom.insert("test");
219 assert!(bloom.contains("test"));
220
221 bloom.clear();
222 assert!(!bloom.contains("test"));
223 assert_eq!(bloom.len(), 0);
224 }
225
226 #[test]
227 fn test_with_params() {
228 let bloom = BloomFilter::with_params(1_000_000, 5);
229 assert_eq!(bloom.num_bits(), 1_000_000);
230 assert_eq!(bloom.num_hashes(), 5);
231 }
232
233 #[test]
234 fn test_large_dataset() {
235 let n = 100_000;
236 let mut bloom = BloomFilter::new(n, 0.01);
237
238 for i in 0..n {
239 bloom.insert(format!("addr_{}", i));
240 }
241
242 for i in 0..n {
244 assert!(bloom.contains(format!("addr_{}", i)));
245 }
246
247 assert_eq!(bloom.len(), n);
248 }
249}