1use crate::{bitmap::CompressedBitmap, FilterSize, VecBitmap};
2use std::collections::hash_map::RandomState;
3use std::hash::{BuildHasher, Hash};
4use std::marker::PhantomData;
5
6pub trait Bitmap {
16 fn new_with_capacity(max_key: usize) -> Self;
19
20 fn set(&mut self, key: usize, value: bool);
22
23 fn get(&self, key: usize) -> bool;
25
26 fn byte_size(&self) -> usize;
28
29 fn or(&self, other: &Self) -> Self;
31}
32
33pub struct BloomFilterBuilder<H, B>
46where
47 H: BuildHasher,
48 B: Bitmap,
49{
50 hasher: H,
51 bitmap: B,
52 key_size: FilterSize,
53}
54
55impl std::default::Default for BloomFilterBuilder<RandomState, CompressedBitmap> {
63 fn default() -> BloomFilterBuilder<RandomState, CompressedBitmap> {
64 let size = FilterSize::KeyBytes2;
65 BloomFilterBuilder {
66 hasher: RandomState::default(),
67 bitmap: CompressedBitmap::new(key_size_to_bits(size)),
68 key_size: size,
69 }
70 }
71}
72
73impl<H, B> BloomFilterBuilder<H, B>
74where
75 H: BuildHasher,
76 B: Bitmap,
77{
78 pub fn with_bitmap_data(self, bitmap: B, key_size: FilterSize) -> Self {
89 let _ = bitmap.get(key_size as usize);
92
93 Self {
94 bitmap,
95 key_size,
96 ..self
97 }
98 }
99
100 pub fn with_bitmap<U>(self) -> BloomFilterBuilder<H, U>
101 where
102 U: Bitmap,
103 {
104 BloomFilterBuilder {
105 hasher: self.hasher,
106 bitmap: U::new_with_capacity(key_size_to_bits(self.key_size)),
107 key_size: self.key_size,
108 }
109 }
110
111 pub fn build<T: Hash>(self) -> Bloom2<H, B, T> {
113 Bloom2 {
114 hasher: self.hasher,
115 bitmap: self.bitmap,
116 key_size: self.key_size,
117 _key_type: PhantomData,
118 }
119 }
120
121 pub fn size(self, size: FilterSize) -> Self {
128 Self {
129 key_size: size,
130 bitmap: B::new_with_capacity(key_size_to_bits(size)),
131 ..self
132 }
133 }
134}
135
136impl<H> BloomFilterBuilder<H, CompressedBitmap>
137where
138 H: BuildHasher,
139{
140 pub fn hasher(hasher: H) -> Self {
145 let size = FilterSize::KeyBytes2;
146 Self {
147 hasher,
148 bitmap: CompressedBitmap::new(key_size_to_bits(size)),
149 key_size: size,
150 }
151 }
152}
153
154fn key_size_to_bits(k: FilterSize) -> usize {
155 2_usize.pow(8 * k as u32)
156}
157
158#[derive(Debug, Clone, PartialEq)]
185#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
186pub struct Bloom2<H, B, T>
187where
188 H: BuildHasher,
189 B: Bitmap,
190{
191 #[cfg_attr(feature = "serde", serde(skip))]
192 hasher: H,
193 bitmap: B,
194 key_size: FilterSize,
195
196 #[cfg_attr(feature = "serde", serde(skip))]
197 _key_type: PhantomData<T>,
198}
199
200impl<T> std::default::Default for Bloom2<RandomState, CompressedBitmap, T>
212where
213 T: Hash,
214{
215 fn default() -> Self {
216 crate::BloomFilterBuilder::default().build()
217 }
218}
219
220impl<H, B, T> Bloom2<H, B, T>
221where
222 H: BuildHasher,
223 B: Bitmap,
224 T: Hash,
225{
226 pub fn insert(&mut self, data: &'_ T) {
274 self.hasher
277 .hash_one(data)
278 .to_be_bytes()
279 .chunks(self.key_size as usize)
280 .for_each(|chunk| self.bitmap.set(bytes_to_usize_key(chunk), true));
281 }
282
283 pub fn contains(&self, data: &'_ T) -> bool {
289 self.hasher
291 .hash_one(data)
292 .to_be_bytes()
293 .chunks(self.key_size as usize)
294 .any(|chunk| self.bitmap.get(bytes_to_usize_key(chunk)))
295 }
296
297 pub fn union(&mut self, other: &Self) {
310 assert_eq!(self.key_size, other.key_size);
311 self.bitmap = self.bitmap.or(&other.bitmap);
312 }
313
314 pub fn byte_size(&mut self) -> usize {
316 self.bitmap.byte_size()
317 }
318}
319
320impl<H, T> Bloom2<H, CompressedBitmap, T>
321where
322 H: BuildHasher,
323{
324 pub fn shrink_to_fit(&mut self) {
327 self.bitmap.shrink_to_fit();
328 }
329}
330
331impl<H, T> Bloom2<H, VecBitmap, T>
332where
333 H: BuildHasher,
334{
335 pub fn compress(self) -> Bloom2<H, CompressedBitmap, T> {
341 Bloom2::from(self)
342 }
343}
344
345fn bytes_to_usize_key<'a, I: IntoIterator<Item = &'a u8>>(bytes: I) -> usize {
346 bytes
347 .into_iter()
348 .fold(0, |key, &byte| (key << 8) | byte as usize)
349}
350
351impl<H, T> From<Bloom2<H, VecBitmap, T>> for Bloom2<H, CompressedBitmap, T>
352where
353 H: BuildHasher,
354{
355 fn from(v: Bloom2<H, VecBitmap, T>) -> Self {
356 Self {
357 hasher: v.hasher,
358 bitmap: CompressedBitmap::from(v.bitmap),
359 key_size: v.key_size,
360 _key_type: PhantomData,
361 }
362 }
363}
364
365#[cfg(test)]
366mod tests {
367 use super::*;
368 use proptest::prelude::*;
369 use quickcheck_macros::quickcheck;
370
371 use std::{
372 cell::RefCell,
373 collections::HashSet,
374 hash::{BuildHasherDefault, Hasher},
375 };
376
377 #[derive(Debug, Clone, Default)]
378 struct MockHasher {
379 return_hash: u64,
380 }
381
382 impl Hasher for MockHasher {
383 fn write(&mut self, _bytes: &[u8]) {}
384 fn finish(&self) -> u64 {
385 self.return_hash
386 }
387 }
388
389 impl BuildHasher for MockHasher {
390 type Hasher = Self;
391 fn build_hasher(&self) -> MockHasher {
392 self.clone()
393 }
394 }
395
396 #[derive(Debug, Default)]
397 struct MockBitmap {
398 set_calls: Vec<(usize, bool)>,
399 get_calls: RefCell<Vec<usize>>,
400 }
401 impl Bitmap for MockBitmap {
402 fn set(&mut self, key: usize, value: bool) {
403 self.set_calls.push((key, value))
404 }
405 fn get(&self, key: usize) -> bool {
406 self.get_calls.borrow_mut().push(key);
407 false
408 }
409 fn byte_size(&self) -> usize {
410 42
411 }
412
413 fn or(&self, _other: &Self) -> Self {
414 unreachable!()
415 }
416
417 fn new_with_capacity(_max_key: usize) -> Self {
418 Self::default()
419 }
420 }
421
422 fn new_test_bloom<T: Hash>() -> Bloom2<MockHasher, MockBitmap, T> {
423 Bloom2 {
424 hasher: MockHasher::default(),
425 bitmap: MockBitmap::default(),
426 key_size: FilterSize::KeyBytes1,
427 _key_type: PhantomData,
428 }
429 }
430
431 #[test]
432 fn test_default() {
433 let mut b = Bloom2::default();
434 assert_eq!(b.key_size, FilterSize::KeyBytes2);
435
436 b.insert(&42);
437 assert!(b.contains(&42));
438 }
439
440 #[quickcheck]
441 fn test_default_prop(vals: Vec<u16>) {
442 let mut b = Bloom2::default();
443 for v in &vals {
444 b.insert(v);
445 }
446
447 for v in &vals {
448 assert!(b.contains(v));
449 }
450 }
451
452 #[test]
453 fn test_insert_contains_kb1() {
454 let mut b = new_test_bloom();
455 b.hasher.return_hash = 12345678901234567890;
456
457 b.insert(&[1, 2, 3, 4]);
458 assert_eq!(
459 b.bitmap.set_calls,
460 vec![
461 (171, true),
462 (84, true),
463 (169, true),
464 (140, true),
465 (235, true),
466 (31, true),
467 (10, true),
468 (210, true),
469 ]
470 );
471
472 b.contains(&[1, 2, 3, 4]);
473 assert_eq!(
474 b.bitmap.get_calls.into_inner(),
475 vec![171, 84, 169, 140, 235, 31, 10, 210]
476 );
477 }
478
479 #[test]
480 fn test_insert_contains_kb2() {
481 let mut b = new_test_bloom();
482 b.key_size = FilterSize::KeyBytes2;
483 b.hasher.return_hash = 12345678901234567890;
484
485 b.insert(&[1, 2, 3, 4]);
486
487 assert_eq!(
488 b.bitmap.set_calls,
489 vec![(43860, true), (43404, true), (60191, true), (2770, true),]
490 );
491 assert!(b.bitmap.get_calls.into_inner().is_empty());
492 }
493
494 #[test]
495 fn test_issue_3() {
496 let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, &str> =
497 BloomFilterBuilder::default()
498 .size(FilterSize::KeyBytes4)
499 .build();
500
501 bloom_filter.insert(&"a");
502 bloom_filter.insert(&"b");
503 bloom_filter.insert(&"c");
504 bloom_filter.insert(&"d");
505 }
506
507 #[test]
508 fn test_size_shrink() {
509 let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, _> =
510 BloomFilterBuilder::default()
511 .size(FilterSize::KeyBytes4)
512 .build();
513
514 for i in 0..10 {
515 bloom_filter.insert(&i);
516 }
517
518 assert_eq!(bloom_filter.byte_size(), 8388920);
519 bloom_filter.shrink_to_fit();
520 assert_eq!(bloom_filter.byte_size(), 8388824);
521 }
522
523 #[test]
524 fn set_hasher() {
525 let mut bloom_filter: Bloom2<
526 BuildHasherDefault<twox_hash::XxHash64>,
527 CompressedBitmap,
528 i32,
529 > = BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
530 .size(FilterSize::KeyBytes4)
531 .build();
532
533 for i in 0..10 {
534 bloom_filter.insert(&i);
535 }
536
537 for i in 0..10 {
538 assert!(bloom_filter.contains(&i), "did not contain {}", i);
539 }
540 }
541
542 #[quickcheck]
543 fn test_union(mut a: Vec<usize>, mut b: Vec<usize>, mut control: Vec<usize>) {
544 a.truncate(50);
546 b.truncate(50);
547 control.truncate(100);
548
549 let mut bitmap_a =
550 BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
551 .size(FilterSize::KeyBytes2)
552 .build();
553
554 let mut bitmap_b = bitmap_a.clone();
555
556 for v in &a {
558 bitmap_a.insert(v);
559 }
560 for v in &b {
561 bitmap_b.insert(v);
562 }
563
564 let mut merged = bitmap_a.clone();
566 merged.union(&bitmap_b);
567
568 for v in &a {
571 assert!(merged.contains(v));
572 }
573
574 for v in &b {
577 assert!(merged.contains(v));
578 }
579
580 for v in &control {
583 let input_maybe_contains = bitmap_a.contains(v) || bitmap_b.contains(v);
584 assert_eq!(input_maybe_contains, merged.contains(v));
585 }
586 }
587
588 #[cfg(feature = "serde")]
589 #[test]
590 fn serde() {
591 type MyBuildHasher = BuildHasherDefault<twox_hash::XxHash64>;
592
593 let mut bloom_filter: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
594 BloomFilterBuilder::hasher(MyBuildHasher::default())
595 .size(FilterSize::KeyBytes4)
596 .build();
597
598 for i in 0..10 {
599 bloom_filter.insert(&i);
600 }
601
602 let encoded = serde_json::to_string(&bloom_filter).unwrap();
603 let decoded: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
604 serde_json::from_str(&encoded).unwrap();
605
606 assert_eq!(bloom_filter.bitmap, decoded.bitmap);
607
608 for i in 0..10 {
609 assert!(decoded.contains(&i), "didn't contain {}", i);
610 }
611 }
612
613 pub fn arbitrary_value() -> impl Strategy<Value = usize> {
617 prop_oneof![
618 5 => 0_usize..100,
619 1 => any::<usize>(),
620 ]
621 }
622
623 #[derive(Debug, Clone)]
624 pub enum Op {
625 Insert(usize),
627 Contains(usize),
629 }
630
631 pub fn arbitrary_op(s: impl Strategy<Value = usize>) -> impl Strategy<Value = Op> {
632 s.prop_flat_map(|v| prop_oneof![Just(Op::Insert(v)), Just(Op::Contains(v))])
633 }
634
635 proptest! {
636 #[test]
637 fn prop_ops_compressed_bitmap(
638 ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
639 ) {
640 run_ops_fuzz::<CompressedBitmap>(ops);
641 }
642
643 #[test]
644 fn prop_ops_vec_bitmap(
645 ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
646 ) {
647 run_ops_fuzz::<VecBitmap>(ops);
648 }
649
650 #[test]
651 fn prop_ops_compress(
652 values in prop::collection::vec(arbitrary_value(), 1..100),
653 check in prop::collection::vec(arbitrary_value(), 1..100),
654 ) {
655 let mut b = BloomFilterBuilder::default().with_bitmap::<VecBitmap>().build();
656
657 let mut control: HashSet<usize, RandomState> = HashSet::default();
658 for v in values {
659 b.insert(&v);
660 control.insert(v);
661 }
662
663 let compressed = b.clone().compress();
664
665 for v in control {
668 assert!(compressed.contains(&v));
669 }
670
671 for v in check {
675 assert_eq!(compressed.contains(&v), b.contains(&v));
676 }
677 }
678 }
679
680 fn run_ops_fuzz<B>(ops: Vec<Op>)
681 where
682 B: Bitmap,
683 {
684 let mut b = BloomFilterBuilder::default().with_bitmap::<B>().build();
685
686 let mut control: HashSet<usize, RandomState> = HashSet::default();
687 for op in ops {
688 match op {
689 Op::Insert(v) => {
690 b.insert(&v);
691 control.insert(v);
692 }
693 Op::Contains(v) => {
694 if control.contains(&v) {
697 assert!(b.contains(&v));
698 }
699 }
700 }
701 }
702 }
703}