bloom2/
bloom.rs

1use crate::{bitmap::CompressedBitmap, FilterSize, VecBitmap};
2use std::collections::hash_map::RandomState;
3use std::hash::{BuildHasher, Hash};
4use std::marker::PhantomData;
5
6// TODO(dom): AND, XOR, NOT + examples
7
8// [`Bloom2`]: crate::bloom2::Bloom2
9// [`BloomFilterBuilder`]: crate::BloomFilterBuilder
10// [`hash`]: std::hash::Hash
11// [`FilterSize`]: crate::FilterSize
12
13/// A trait to abstract bit storage for use in a [`Bloom2`](crate::Bloom2)
14/// filter.
15pub trait Bitmap {
16    /// Construct a new [`Bitmap`] impl with capacity to hold at least `max_key`
17    /// number of bits.
18    fn new_with_capacity(max_key: usize) -> Self;
19
20    /// Set bit indexed by `key` to `value`.
21    fn set(&mut self, key: usize, value: bool);
22
23    /// Return `true` if the given bit index was previously set to `true`.
24    fn get(&self, key: usize) -> bool;
25
26    /// Return the size of the bitmap in bytes.
27    fn byte_size(&self) -> usize;
28
29    /// Return the bitwise OR of both `self` and `other`.`
30    fn or(&self, other: &Self) -> Self;
31}
32
33/// Construct [`Bloom2`] instances with varying parameters.
34///
35/// ```rust
36/// use std::collections::hash_map::RandomState;
37/// use bloom2::{BloomFilterBuilder, FilterSize};
38///
39/// let mut filter = BloomFilterBuilder::hasher(RandomState::default())
40///                     .size(FilterSize::KeyBytes2)
41///                     .build();
42///
43/// filter.insert(&"success!");
44/// ```
45pub struct BloomFilterBuilder<H, B>
46where
47    H: BuildHasher,
48    B: Bitmap,
49{
50    hasher: H,
51    bitmap: B,
52    key_size: FilterSize,
53}
54
55/// Initialise a `BloomFilterBuilder` that unless changed, will construct a
56/// `Bloom2` instance using a [2 byte key] and use Rust's [`DefaultHasher`]
57/// ([SipHash] at the time of writing).
58///
59/// [2 byte key]: crate::FilterSize::KeyBytes2
60/// [`DefaultHasher`]: std::collections::hash_map::RandomState
61/// [SipHash]: https://131002.net/siphash/
62impl std::default::Default for BloomFilterBuilder<RandomState, CompressedBitmap> {
63    fn default() -> BloomFilterBuilder<RandomState, CompressedBitmap> {
64        let size = FilterSize::KeyBytes2;
65        BloomFilterBuilder {
66            hasher: RandomState::default(),
67            bitmap: CompressedBitmap::new(key_size_to_bits(size)),
68            key_size: size,
69        }
70    }
71}
72
73impl<H, B> BloomFilterBuilder<H, B>
74where
75    H: BuildHasher,
76    B: Bitmap,
77{
78    /// Set the bit storage (bitmap) for the bloom filter.
79    ///
80    /// # Panics
81    ///
82    /// This method may panic if `bitmap` is too small to hold any value in the
83    /// range produced by the [key size](FilterSize).
84    ///
85    /// Providing a `bitmap` instance that is non-empty can be used to restore
86    /// the state of a [`Bloom2`] instance (although using `serde` can achieve
87    /// this safely too).
88    pub fn with_bitmap_data(self, bitmap: B, key_size: FilterSize) -> Self {
89        // Invariant: reading the last bit succeeds, ensuring it has sufficient
90        // capacity.
91        let _ = bitmap.get(key_size as usize);
92
93        Self {
94            bitmap,
95            key_size,
96            ..self
97        }
98    }
99
100    pub fn with_bitmap<U>(self) -> BloomFilterBuilder<H, U>
101    where
102        U: Bitmap,
103    {
104        BloomFilterBuilder {
105            hasher: self.hasher,
106            bitmap: U::new_with_capacity(key_size_to_bits(self.key_size)),
107            key_size: self.key_size,
108        }
109    }
110
111    /// Initialise the [`Bloom2`] instance with the provided parameters.
112    pub fn build<T: Hash>(self) -> Bloom2<H, B, T> {
113        Bloom2 {
114            hasher: self.hasher,
115            bitmap: self.bitmap,
116            key_size: self.key_size,
117            _key_type: PhantomData,
118        }
119    }
120
121    /// Control the in-memory size and false-positive probability of the filter.
122    ///
123    /// Setting the bitmap size replaces the current `Bitmap` instance with a
124    /// new `CompressedBitmap` of the appropriate size.
125    ///
126    /// See [`FilterSize`].
127    pub fn size(self, size: FilterSize) -> Self {
128        Self {
129            key_size: size,
130            bitmap: B::new_with_capacity(key_size_to_bits(size)),
131            ..self
132        }
133    }
134}
135
136impl<H> BloomFilterBuilder<H, CompressedBitmap>
137where
138    H: BuildHasher,
139{
140    /// Initialise a `BloomFilterBuilder` that unless changed, will construct a
141    /// `Bloom2` instance using a [2 byte key] and use the specified hasher.
142    ///
143    /// [2 byte key]: crate::FilterSize::KeyBytes2
144    pub fn hasher(hasher: H) -> Self {
145        let size = FilterSize::KeyBytes2;
146        Self {
147            hasher,
148            bitmap: CompressedBitmap::new(key_size_to_bits(size)),
149            key_size: size,
150        }
151    }
152}
153
154fn key_size_to_bits(k: FilterSize) -> usize {
155    2_usize.pow(8 * k as u32)
156}
157
158/// A fast, memory efficient, sparse bloom filter.
159///
160/// Most users can quickly initialise a `Bloom2` instance by calling
161/// `Bloom2::default()` and start inserting anything that implements the
162/// [`Hash`] trait:
163///
164/// ```rust
165/// use bloom2::Bloom2;
166///
167/// let mut b = Bloom2::default();
168/// b.insert(&"hello 🐐");
169/// assert!(b.contains(&"hello 🐐"));
170/// ```
171///
172/// Initialising a `Bloom2` this way uses some [sensible
173/// default](crate::BloomFilterBuilder) values for a easy-to-use, memory
174/// efficient filter. If you want to tune the probability of a false-positive
175/// lookup, change the hashing algorithm, memory size of the filter, etc, a
176/// [`BloomFilterBuilder`] can be used to initialise a `Bloom2` instance with
177/// the desired properties.
178///
179/// The sparse nature of this filter trades a small amount of insert performance
180/// for decreased memory usage. For filters initialised infrequently and held
181/// for a meaningful duration of time, this is almost always worth the
182/// marginally increased insert latency. When testing performance, be sure to
183/// use a release build - there's a significant performance difference!
184#[derive(Debug, Clone, PartialEq)]
185#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
186pub struct Bloom2<H, B, T>
187where
188    H: BuildHasher,
189    B: Bitmap,
190{
191    #[cfg_attr(feature = "serde", serde(skip))]
192    hasher: H,
193    bitmap: B,
194    key_size: FilterSize,
195
196    #[cfg_attr(feature = "serde", serde(skip))]
197    _key_type: PhantomData<T>,
198}
199
200/// Initialise a `Bloom2` instance using the default implementation of
201/// [`BloomFilterBuilder`].
202///
203/// It is the equivalent of:
204///
205/// ```rust
206/// use bloom2::BloomFilterBuilder;
207///
208/// let mut b = BloomFilterBuilder::default().build();
209/// # b.insert(&42);
210/// ```
211impl<T> std::default::Default for Bloom2<RandomState, CompressedBitmap, T>
212where
213    T: Hash,
214{
215    fn default() -> Self {
216        crate::BloomFilterBuilder::default().build()
217    }
218}
219
220impl<H, B, T> Bloom2<H, B, T>
221where
222    H: BuildHasher,
223    B: Bitmap,
224    T: Hash,
225{
226    /// Insert places `data` into the bloom filter.
227    ///
228    /// Any subsequent calls to [`contains`](Bloom2::contains) for the same
229    /// `data` will always return true.
230    ///
231    /// Insertion is significantly faster in release builds.
232    ///
233    /// The `data` provided can be anything that implements the [`Hash`] trait,
234    /// for example:
235    ///
236    /// ```rust
237    /// use bloom2::Bloom2;
238    ///
239    /// let mut b = Bloom2::default();
240    /// b.insert(&"hello 🐐");
241    /// assert!(b.contains(&"hello 🐐"));
242    ///
243    /// let mut b = Bloom2::default();
244    /// b.insert(&vec!["fox", "cat", "banana"]);
245    /// assert!(b.contains(&vec!["fox", "cat", "banana"]));
246    ///
247    /// let mut b = Bloom2::default();
248    /// let data: [u8; 4] = [1, 2, 3, 42];
249    /// b.insert(&data);
250    /// assert!(b.contains(&data));
251    /// ```
252    ///
253    /// As well as structs if they implement the [`Hash`] trait, which be
254    /// helpfully derived:
255    ///
256    /// ```rust
257    /// # use bloom2::Bloom2;
258    /// # let mut b = Bloom2::default();
259    /// #[derive(Hash)]
260    /// struct User {
261    ///     id: u64,
262    ///     email: String,
263    /// }
264    ///
265    /// let user = User{
266    ///     id: 42,
267    ///     email: "dom@itsallbroken.com".to_string(),
268    /// };
269    ///
270    /// b.insert(&&user);
271    /// assert!(b.contains(&&user));
272    /// ```
273    pub fn insert(&mut self, data: &'_ T) {
274        // Generate a hash (u64) value for data and split the u64 hash into
275        // several smaller values to use as unique indexes in the bitmap.
276        self.hasher
277            .hash_one(data)
278            .to_be_bytes()
279            .chunks(self.key_size as usize)
280            .for_each(|chunk| self.bitmap.set(bytes_to_usize_key(chunk), true));
281    }
282
283    /// Checks if `data` exists in the filter.
284    ///
285    /// If `contains` returns true, `hash` has **probably** been inserted
286    /// previously. If `contains` returns false, `hash` has **definitely not**
287    /// been inserted into the filter.
288    pub fn contains(&self, data: &'_ T) -> bool {
289        // Generate a hash (u64) value for data
290        self.hasher
291            .hash_one(data)
292            .to_be_bytes()
293            .chunks(self.key_size as usize)
294            .any(|chunk| self.bitmap.get(bytes_to_usize_key(chunk)))
295    }
296
297    /// Union two [`Bloom2`] instances (of identical configuration), returning
298    /// the merged combination of both.
299    ///
300    /// The returned filter will return "true" for all calls to
301    /// [`Bloom2::contains()`] for all values that would return true for one (or
302    /// both) of the inputs, and will return "false" for all values that return
303    /// false from both inputs.
304    ///
305    /// # Panics
306    ///
307    /// This method panics if the two [`Bloom2`] instances have different
308    /// configuration.
309    pub fn union(&mut self, other: &Self) {
310        assert_eq!(self.key_size, other.key_size);
311        self.bitmap = self.bitmap.or(&other.bitmap);
312    }
313
314    /// Return the byte size of this filter.
315    pub fn byte_size(&mut self) -> usize {
316        self.bitmap.byte_size()
317    }
318}
319
320impl<H, T> Bloom2<H, CompressedBitmap, T>
321where
322    H: BuildHasher,
323{
324    /// Minimise the memory usage of this instance by by shrinking the
325    /// underlying vectors, discarding their excess capacity.
326    pub fn shrink_to_fit(&mut self) {
327        self.bitmap.shrink_to_fit();
328    }
329}
330
331impl<H, T> Bloom2<H, VecBitmap, T>
332where
333    H: BuildHasher,
334{
335    /// Compress the bitmap to reduce memory consumption.
336    ///
337    /// The compressed representation is optimised for reads, but subsequent
338    /// inserts will be slower. This reduction is `O(n)` in time, and up to
339    /// `O(2n)` in space.
340    pub fn compress(self) -> Bloom2<H, CompressedBitmap, T> {
341        Bloom2::from(self)
342    }
343}
344
345fn bytes_to_usize_key<'a, I: IntoIterator<Item = &'a u8>>(bytes: I) -> usize {
346    bytes
347        .into_iter()
348        .fold(0, |key, &byte| (key << 8) | byte as usize)
349}
350
351impl<H, T> From<Bloom2<H, VecBitmap, T>> for Bloom2<H, CompressedBitmap, T>
352where
353    H: BuildHasher,
354{
355    fn from(v: Bloom2<H, VecBitmap, T>) -> Self {
356        Self {
357            hasher: v.hasher,
358            bitmap: CompressedBitmap::from(v.bitmap),
359            key_size: v.key_size,
360            _key_type: PhantomData,
361        }
362    }
363}
364
365#[cfg(test)]
366mod tests {
367    use super::*;
368    use proptest::prelude::*;
369    use quickcheck_macros::quickcheck;
370
371    use std::{
372        cell::RefCell,
373        collections::HashSet,
374        hash::{BuildHasherDefault, Hasher},
375    };
376
377    #[derive(Debug, Clone, Default)]
378    struct MockHasher {
379        return_hash: u64,
380    }
381
382    impl Hasher for MockHasher {
383        fn write(&mut self, _bytes: &[u8]) {}
384        fn finish(&self) -> u64 {
385            self.return_hash
386        }
387    }
388
389    impl BuildHasher for MockHasher {
390        type Hasher = Self;
391        fn build_hasher(&self) -> MockHasher {
392            self.clone()
393        }
394    }
395
396    #[derive(Debug, Default)]
397    struct MockBitmap {
398        set_calls: Vec<(usize, bool)>,
399        get_calls: RefCell<Vec<usize>>,
400    }
401    impl Bitmap for MockBitmap {
402        fn set(&mut self, key: usize, value: bool) {
403            self.set_calls.push((key, value))
404        }
405        fn get(&self, key: usize) -> bool {
406            self.get_calls.borrow_mut().push(key);
407            false
408        }
409        fn byte_size(&self) -> usize {
410            42
411        }
412
413        fn or(&self, _other: &Self) -> Self {
414            unreachable!()
415        }
416
417        fn new_with_capacity(_max_key: usize) -> Self {
418            Self::default()
419        }
420    }
421
422    fn new_test_bloom<T: Hash>() -> Bloom2<MockHasher, MockBitmap, T> {
423        Bloom2 {
424            hasher: MockHasher::default(),
425            bitmap: MockBitmap::default(),
426            key_size: FilterSize::KeyBytes1,
427            _key_type: PhantomData,
428        }
429    }
430
431    #[test]
432    fn test_default() {
433        let mut b = Bloom2::default();
434        assert_eq!(b.key_size, FilterSize::KeyBytes2);
435
436        b.insert(&42);
437        assert!(b.contains(&42));
438    }
439
440    #[quickcheck]
441    fn test_default_prop(vals: Vec<u16>) {
442        let mut b = Bloom2::default();
443        for v in &vals {
444            b.insert(v);
445        }
446
447        for v in &vals {
448            assert!(b.contains(v));
449        }
450    }
451
452    #[test]
453    fn test_insert_contains_kb1() {
454        let mut b = new_test_bloom();
455        b.hasher.return_hash = 12345678901234567890;
456
457        b.insert(&[1, 2, 3, 4]);
458        assert_eq!(
459            b.bitmap.set_calls,
460            vec![
461                (171, true),
462                (84, true),
463                (169, true),
464                (140, true),
465                (235, true),
466                (31, true),
467                (10, true),
468                (210, true),
469            ]
470        );
471
472        b.contains(&[1, 2, 3, 4]);
473        assert_eq!(
474            b.bitmap.get_calls.into_inner(),
475            vec![171, 84, 169, 140, 235, 31, 10, 210]
476        );
477    }
478
479    #[test]
480    fn test_insert_contains_kb2() {
481        let mut b = new_test_bloom();
482        b.key_size = FilterSize::KeyBytes2;
483        b.hasher.return_hash = 12345678901234567890;
484
485        b.insert(&[1, 2, 3, 4]);
486
487        assert_eq!(
488            b.bitmap.set_calls,
489            vec![(43860, true), (43404, true), (60191, true), (2770, true),]
490        );
491        assert!(b.bitmap.get_calls.into_inner().is_empty());
492    }
493
494    #[test]
495    fn test_issue_3() {
496        let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, &str> =
497            BloomFilterBuilder::default()
498                .size(FilterSize::KeyBytes4)
499                .build();
500
501        bloom_filter.insert(&"a");
502        bloom_filter.insert(&"b");
503        bloom_filter.insert(&"c");
504        bloom_filter.insert(&"d");
505    }
506
507    #[test]
508    fn test_size_shrink() {
509        let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, _> =
510            BloomFilterBuilder::default()
511                .size(FilterSize::KeyBytes4)
512                .build();
513
514        for i in 0..10 {
515            bloom_filter.insert(&i);
516        }
517
518        assert_eq!(bloom_filter.byte_size(), 8388920);
519        bloom_filter.shrink_to_fit();
520        assert_eq!(bloom_filter.byte_size(), 8388824);
521    }
522
523    #[test]
524    fn set_hasher() {
525        let mut bloom_filter: Bloom2<
526            BuildHasherDefault<twox_hash::XxHash64>,
527            CompressedBitmap,
528            i32,
529        > = BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
530            .size(FilterSize::KeyBytes4)
531            .build();
532
533        for i in 0..10 {
534            bloom_filter.insert(&i);
535        }
536
537        for i in 0..10 {
538            assert!(bloom_filter.contains(&i), "did not contain {}", i);
539        }
540    }
541
542    #[quickcheck]
543    fn test_union(mut a: Vec<usize>, mut b: Vec<usize>, mut control: Vec<usize>) {
544        // Reduce the test state space.
545        a.truncate(50);
546        b.truncate(50);
547        control.truncate(100);
548
549        let mut bitmap_a =
550            BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
551                .size(FilterSize::KeyBytes2)
552                .build();
553
554        let mut bitmap_b = bitmap_a.clone();
555
556        // Populate the bitmaps to be merged.
557        for v in &a {
558            bitmap_a.insert(v);
559        }
560        for v in &b {
561            bitmap_b.insert(v);
562        }
563
564        // Merge the two bitmaps.
565        let mut merged = bitmap_a.clone();
566        merged.union(&bitmap_b);
567
568        // Invariant 1: all of the values in "a" must appear in the merged
569        // result.
570        for v in &a {
571            assert!(merged.contains(v));
572        }
573
574        // Invariant 2: all of the values in "b" must appear in the merged
575        // result.
576        for v in &b {
577            assert!(merged.contains(v));
578        }
579
580        // Invariant 3: control values that appear in neither bitmap A nor
581        // bitmap B must not appear in the merged result.
582        for v in &control {
583            let input_maybe_contains = bitmap_a.contains(v) || bitmap_b.contains(v);
584            assert_eq!(input_maybe_contains, merged.contains(v));
585        }
586    }
587
588    #[cfg(feature = "serde")]
589    #[test]
590    fn serde() {
591        type MyBuildHasher = BuildHasherDefault<twox_hash::XxHash64>;
592
593        let mut bloom_filter: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
594            BloomFilterBuilder::hasher(MyBuildHasher::default())
595                .size(FilterSize::KeyBytes4)
596                .build();
597
598        for i in 0..10 {
599            bloom_filter.insert(&i);
600        }
601
602        let encoded = serde_json::to_string(&bloom_filter).unwrap();
603        let decoded: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
604            serde_json::from_str(&encoded).unwrap();
605
606        assert_eq!(bloom_filter.bitmap, decoded.bitmap);
607
608        for i in 0..10 {
609            assert!(decoded.contains(&i), "didn't contain {}", i);
610        }
611    }
612
613    /// Generate an arbitrary `usize` value.
614    ///
615    /// Prefers generating values from a small range to encourage collisions.
616    pub fn arbitrary_value() -> impl Strategy<Value = usize> {
617        prop_oneof![
618            5 => 0_usize..100,
619            1 => any::<usize>(),
620        ]
621    }
622
623    #[derive(Debug, Clone)]
624    pub enum Op {
625        /// Insert a random value.
626        Insert(usize),
627        /// Check a random value exists in the set.
628        Contains(usize),
629    }
630
631    pub fn arbitrary_op(s: impl Strategy<Value = usize>) -> impl Strategy<Value = Op> {
632        s.prop_flat_map(|v| prop_oneof![Just(Op::Insert(v)), Just(Op::Contains(v))])
633    }
634
635    proptest! {
636        #[test]
637        fn prop_ops_compressed_bitmap(
638            ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
639        ) {
640            run_ops_fuzz::<CompressedBitmap>(ops);
641        }
642
643        #[test]
644        fn prop_ops_vec_bitmap(
645            ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
646        ) {
647            run_ops_fuzz::<VecBitmap>(ops);
648        }
649
650        #[test]
651        fn prop_ops_compress(
652            values in prop::collection::vec(arbitrary_value(), 1..100),
653            check in prop::collection::vec(arbitrary_value(), 1..100),
654        ) {
655            let mut b = BloomFilterBuilder::default().with_bitmap::<VecBitmap>().build();
656
657            let mut control: HashSet<usize, RandomState> = HashSet::default();
658            for v in values {
659                b.insert(&v);
660                control.insert(v);
661            }
662
663            let compressed = b.clone().compress();
664
665            // Validate the control set is still contained within the
666            // now-compressed bloom filter.
667            for v in control {
668                assert!(compressed.contains(&v));
669            }
670
671            // A further set of random "check" values show equality between both
672            // the compressed and the uncompressed bitmap, ensuring equal false
673            // positive rates.
674            for v in check {
675                assert_eq!(compressed.contains(&v), b.contains(&v));
676            }
677        }
678    }
679
680    fn run_ops_fuzz<B>(ops: Vec<Op>)
681    where
682        B: Bitmap,
683    {
684        let mut b = BloomFilterBuilder::default().with_bitmap::<B>().build();
685
686        let mut control: HashSet<usize, RandomState> = HashSet::default();
687        for op in ops {
688            match op {
689                Op::Insert(v) => {
690                    b.insert(&v);
691                    control.insert(v);
692                }
693                Op::Contains(v) => {
694                    // This check cannot be an equality assert, as the bloom
695                    // filter may return a false positive.
696                    if control.contains(&v) {
697                        assert!(b.contains(&v));
698                    }
699                }
700            }
701        }
702    }
703}