oximedia_dedup/
bloom_prescreen.rs1#![allow(dead_code)]
17#![allow(clippy::cast_precision_loss)]
18#![allow(clippy::cast_possible_truncation)]
19
20#[derive(Debug, Clone)]
33pub struct BloomPrescreen {
34 pub bits: Vec<u64>,
36 pub k_hashes: u32,
38 pub num_bits: usize,
40 items_inserted: u64,
42}
43
44impl BloomPrescreen {
45 #[must_use]
54 pub fn new(capacity: usize, false_positive_rate: f64) -> Self {
55 let n = capacity.max(1) as f64;
56 let p = false_positive_rate.clamp(1e-15, 1.0 - f64::EPSILON);
57 let ln2 = std::f64::consts::LN_2;
58
59 let m_float = -n * p.ln() / (ln2 * ln2);
61 let m = (m_float.ceil() as usize).max(64);
62
63 let k_float = (m as f64 / n) * ln2;
65 let k = (k_float.round() as u32).clamp(1, 32);
66
67 let num_words = (m + 63) / 64;
69
70 Self {
71 bits: vec![0u64; num_words],
72 k_hashes: k,
73 num_bits: m,
74 items_inserted: 0,
75 }
76 }
77
78 pub fn insert(&mut self, key: &[u8]) {
83 for i in 0..self.k_hashes {
84 let bit_idx = self.bit_index(key, i);
85 let word = bit_idx / 64;
86 let bit = bit_idx % 64;
87 if word < self.bits.len() {
88 self.bits[word] |= 1u64 << bit;
89 }
90 }
91 self.items_inserted += 1;
92 }
93
94 #[must_use]
97 pub fn might_contain(&self, key: &[u8]) -> bool {
98 for i in 0..self.k_hashes {
99 let bit_idx = self.bit_index(key, i);
100 let word = bit_idx / 64;
101 let bit = bit_idx % 64;
102 match self.bits.get(word) {
103 Some(w) if w & (1u64 << bit) != 0 => continue,
104 _ => return false,
105 }
106 }
107 true
108 }
109
110 #[must_use]
116 pub fn false_positive_rate_estimate(&self) -> f64 {
117 if self.num_bits == 0 {
118 return 1.0;
119 }
120 let k = self.k_hashes as f64;
121 let n = self.items_inserted as f64;
122 let m = self.num_bits as f64;
123 let inner = (-k * n / m).exp();
124 (1.0 - inner).powf(k)
125 }
126
127 #[must_use]
129 pub fn items_inserted(&self) -> u64 {
130 self.items_inserted
131 }
132
133 pub fn clear(&mut self) {
135 for w in &mut self.bits {
136 *w = 0;
137 }
138 self.items_inserted = 0;
139 }
140
141 #[must_use]
143 pub fn set_bit_count(&self) -> u64 {
144 self.bits.iter().map(|w| u64::from(w.count_ones())).sum()
145 }
146
147 fn bit_index(&self, key: &[u8], i: u32) -> usize {
157 let h1 = fnv1a_64(key);
158 let seed = u64::from(i)
160 .wrapping_mul(0x9e37_79b9_7f4a_7c15)
161 .wrapping_add(0x6c62_272e_07bb_0142);
162 let h2 = fnv1a_64_seeded(key, seed);
163 let combined = h1.wrapping_add(u64::from(i).wrapping_mul(h2));
164 (combined % self.num_bits as u64) as usize
165 }
166}
167
168fn fnv1a_64(data: &[u8]) -> u64 {
172 let mut h: u64 = 0xcbf2_9ce4_8422_2325;
173 for &b in data {
174 h ^= u64::from(b);
175 h = h.wrapping_mul(0x0000_0100_0000_01b3);
176 }
177 h
178}
179
180fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
182 let mut h: u64 = 0xcbf2_9ce4_8422_2325 ^ seed.wrapping_mul(0x0000_0100_0000_01b3);
183 for &b in data {
184 h ^= u64::from(b);
185 h = h.wrapping_mul(0x0000_0100_0000_01b3);
186 }
187 h
188}
189
190#[cfg(test)]
193mod tests {
194 use super::*;
195
196 #[test]
199 fn test_new_allocates_bits() {
200 let bp = BloomPrescreen::new(1000, 0.01);
201 assert!(!bp.bits.is_empty());
202 assert!(bp.k_hashes >= 1);
203 assert!(bp.num_bits >= 64);
204 }
205
206 #[test]
207 fn test_new_k_clamp_lower() {
208 let bp = BloomPrescreen::new(1, 0.99);
210 assert!(bp.k_hashes >= 1);
211 }
212
213 #[test]
214 fn test_new_k_clamp_upper() {
215 let bp = BloomPrescreen::new(1_000_000, 1e-15);
217 assert!(bp.k_hashes <= 32);
218 }
219
220 #[test]
221 fn test_larger_capacity_more_bits() {
222 let small = BloomPrescreen::new(100, 0.01);
223 let large = BloomPrescreen::new(100_000, 0.01);
224 assert!(large.num_bits > small.num_bits);
225 }
226
227 #[test]
228 fn test_stricter_fpr_more_hashes() {
229 let loose = BloomPrescreen::new(1000, 0.1);
230 let strict = BloomPrescreen::new(1000, 0.0001);
231 assert!(strict.k_hashes >= loose.k_hashes);
232 }
233
234 #[test]
237 fn test_insert_then_might_contain() {
238 let mut bp = BloomPrescreen::new(100, 0.01);
239 let keys: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
240 for k in keys {
241 bp.insert(k);
242 }
243 for k in keys {
244 assert!(bp.might_contain(k), "no false negative for {:?}", k);
245 }
246 }
247
248 #[test]
249 fn test_empty_filter_returns_false() {
250 let bp = BloomPrescreen::new(500, 0.01);
251 assert!(!bp.might_contain(b"not_inserted"));
252 assert!(!bp.might_contain(b"oximedia"));
253 }
254
255 #[test]
256 fn test_single_insert() {
257 let mut bp = BloomPrescreen::new(50, 0.01);
258 bp.insert(b"only_one");
259 assert!(bp.might_contain(b"only_one"));
260 }
261
262 #[test]
263 fn test_bulk_no_false_negatives() {
264 let mut bp = BloomPrescreen::new(2000, 0.01);
265 let items: Vec<Vec<u8>> = (0u64..500).map(|i| i.to_le_bytes().to_vec()).collect();
266 for item in &items {
267 bp.insert(item);
268 }
269 for item in &items {
270 assert!(bp.might_contain(item), "false negative detected");
271 }
272 }
273
274 #[test]
277 fn test_fpr_estimate_zero_when_empty() {
278 let bp = BloomPrescreen::new(1000, 0.01);
279 assert_eq!(bp.false_positive_rate_estimate(), 0.0);
281 }
282
283 #[test]
284 fn test_fpr_estimate_increases_with_items() {
285 let mut bp = BloomPrescreen::new(100, 0.01);
286 let fpr_before = bp.false_positive_rate_estimate();
287 for i in 0u64..50 {
288 bp.insert(&i.to_le_bytes());
289 }
290 let fpr_after = bp.false_positive_rate_estimate();
291 assert!(fpr_after > fpr_before);
292 }
293
294 #[test]
295 fn test_fpr_estimate_at_design_capacity_near_design_rate() {
296 let n = 1000usize;
298 let design_fpr = 0.01f64;
299 let mut bp = BloomPrescreen::new(n, design_fpr);
300 for i in 0u64..n as u64 {
301 bp.insert(&i.to_le_bytes());
302 }
303 let est = bp.false_positive_rate_estimate();
304 assert!(est > 0.0, "FPR estimate must be positive");
306 assert!(est < 0.5, "FPR estimate must be reasonable");
307 }
308
309 #[test]
312 fn test_items_inserted_counter() {
313 let mut bp = BloomPrescreen::new(100, 0.01);
314 assert_eq!(bp.items_inserted(), 0);
315 bp.insert(b"one");
316 assert_eq!(bp.items_inserted(), 1);
317 bp.insert(b"two");
318 assert_eq!(bp.items_inserted(), 2);
319 }
320
321 #[test]
324 fn test_clear_resets_filter() {
325 let mut bp = BloomPrescreen::new(100, 0.01);
326 bp.insert(b"persistent");
327 assert!(bp.might_contain(b"persistent"));
328 bp.clear();
329 assert!(!bp.might_contain(b"persistent"));
330 assert_eq!(bp.items_inserted(), 0);
331 }
332
333 #[test]
334 fn test_clear_resets_set_bit_count() {
335 let mut bp = BloomPrescreen::new(200, 0.01);
336 for i in 0u64..50 {
337 bp.insert(&i.to_le_bytes());
338 }
339 assert!(bp.set_bit_count() > 0);
340 bp.clear();
341 assert_eq!(bp.set_bit_count(), 0);
342 }
343
344 #[test]
347 fn test_set_bit_count_zero_initially() {
348 let bp = BloomPrescreen::new(500, 0.01);
349 assert_eq!(bp.set_bit_count(), 0);
350 }
351
352 #[test]
353 fn test_set_bit_count_increases_with_inserts() {
354 let mut bp = BloomPrescreen::new(500, 0.01);
355 let before = bp.set_bit_count();
356 bp.insert(b"new_item");
357 let after = bp.set_bit_count();
358 assert!(after >= before);
359 }
360
361 #[test]
364 fn test_num_bits_at_least_64() {
365 let bp = BloomPrescreen::new(1, 0.5);
367 assert!(bp.num_bits >= 64);
368 assert_eq!(bp.bits.len(), bp.num_bits / 64);
369 }
370
371 #[test]
372 fn test_k_hashes_sensible_range() {
373 let bp = BloomPrescreen::new(10_000, 0.01);
375 assert!(bp.k_hashes >= 4, "k should be at least 4 for 1% FPR");
376 assert!(bp.k_hashes <= 20, "k should be at most 20 for 1% FPR");
377 }
378
379 #[test]
382 fn test_empty_key_insert_and_query() {
383 let mut bp = BloomPrescreen::new(100, 0.01);
384 bp.insert(b"");
385 assert!(bp.might_contain(b""));
386 }
387
388 #[test]
389 fn test_long_key_insert_and_query() {
390 let mut bp = BloomPrescreen::new(100, 0.01);
391 let long_key = vec![0xABu8; 4096];
392 bp.insert(&long_key);
393 assert!(bp.might_contain(&long_key));
394 }
395
396 #[test]
397 fn test_binary_keys() {
398 let mut bp = BloomPrescreen::new(200, 0.01);
399 let values: Vec<u64> = vec![0, 1, u64::MAX, 0xDEAD_BEEF, 0x0102_0304_0506_0708];
401 for v in &values {
402 bp.insert(&v.to_le_bytes());
403 }
404 for v in &values {
405 assert!(bp.might_contain(&v.to_le_bytes()), "no false neg for {v}");
406 }
407 }
408}