1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
//! `BloomFilter` is a probabilistic data structure that can quickly and definitively conclude that it does
//! *not* contain an item. On the other hand, it can only conclude that it *probably* contains an
//! item, i.e., the data structure has an inherent false-positive rate greater than 0%.
//!
//! Items can be added to the Bloom filter, but cannot be removed - this would introduce false
//! negative cases. If this is required, an alternative might be to use a Counting Bloom filter
//! (not (yet) implemented in this crate).
//!
//! This implementation is backed by a Rust implementation of the [xxHash hashing
//! algorithm](https://github.com/Cyan4973/xxHash), [twox-hash](https://crates.io/crates/twox-hash).
//!
//! # References
//! - [Less Hashing, Same Performance: Building a Better Bloom
//! Filter](https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf)
//! - [Wikipedia article](https://en.wikipedia.org/wiki/Bloom_filter)
//! - [Bloom Filters by Example](https://llimllib.github.io/bloomfilter-tutorial/)
//! - [Bloom Filter Calculator](https://hur.st/bloomfilter/)
use bitvec::{bitvec, BitVec};
use std::f64::consts::{E, LN_2};
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;
use twox_hash::RandomXxHashBuilder;

const LN2_SQUARED: f64 = LN_2 * LN_2;

/// Represents a Bloom filter.
///
/// When constructing the filter using `new`, you need to specify the desired acceptable
/// false-positive rate, and the number of items you intend to store in the filter. Neither can be
/// adjusted after creation - create a new filter instead.
///
/// # Example
/// ```rust
/// use flit::BloomFilter;
///
/// let mut filter = BloomFilter::new(0.01, 10000);
/// filter.add(&"Hello, world!");
///
/// assert_eq!(filter.might_contain(&"Hello, world!"), true); // probably true
/// assert_eq!(filter.might_contain(&"Dogs are cool!"), false); // definitely false!
/// ```
pub struct BloomFilter<T> {
    n: u64,
    m: u64,
    k: u32,
    bit_vec: BitVec,
    build_hasher: RandomXxHashBuilder,
    _phantom: PhantomData<T>,
}

impl<T: Hash> BloomFilter<T> {
    /// Creates a new Bloom filter based on the required false positive rate and the estimated
    /// number of items that will be added to the filter.
    ///
    /// The parameters influence the size of the filter, as well as the number of
    /// hashes that must be applied to the items.
    ///
    /// # Panics
    ///
    /// This function will panic if `false_positive_rate` is not between 0 and 1 (non inclusive),
    /// or if `estimated_items` is not greater than 0.
    pub fn new(false_positive_rate: f64, estimated_items: usize) -> Self {
        assert!(
            false_positive_rate > 0_f64 && false_positive_rate < 1_f64,
            "False positive rate must be between 0 and 1 (non-inclusive)"
        );
        assert!(
            estimated_items > 0,
            "Number of estimated items must be greater than zero"
        );

        let num_bits = -(estimated_items as f64) * false_positive_rate.ln() / LN2_SQUARED;
        let num_hashes = (num_bits / estimated_items as f64) * LN_2;

        let num_bits = num_bits.ceil() as u64;
        let num_hashes = num_hashes.ceil() as u32;

        BloomFilter {
            n: 0,
            m: num_bits,
            k: num_hashes,
            bit_vec: bitvec![0; num_bits as usize],
            build_hasher: RandomXxHashBuilder::default(),
            _phantom: PhantomData,
        }
    }

    /// Adds the `item` to the filter by setting the appropriate bits in the filter to `true`.
    pub fn add(&mut self, item: &T) {
        for i in indices_for_hash(split_hash(item, &self.build_hasher), self.m, self.k) {
            self.bit_vec.set(i, true);
        }

        self.n += 1;
    }

    /// Checks if the filter *might* contain the `item`.
    ///
    /// If this function returns false, the filter definitely does not contain the item.
    /// If this function returns true, the filter *might* contain the item, but it might also be a
    /// false-positive.
    pub fn might_contain(&self, item: &T) -> bool {
        for i in indices_for_hash(split_hash(item, &self.build_hasher), self.m, self.k) {
            if !self.bit_vec[i] {
                return false;
            }
        }

        true
    }

    /// Calculates the current expected false positive rate given the number of items in the
    /// filter.
    pub fn false_positive_rate(&self) -> f64 {
        (1_f64 - E.powf(-1_f64 * f64::from(self.k) * self.n as f64 / self.m as f64))
            .powi(self.k as i32)
    }
}

/// Hashes `item` using a `Hasher`, and produces a two-element tuple.
/// The first element is the "upper half" of the `u64` produced by the hash function, and the second
/// element is the "lower half".
fn split_hash<T: Hash>(item: &T, hasher: &impl BuildHasher) -> (u32, u32) {
    let mut hasher = hasher.build_hasher();
    item.hash(&mut hasher);
    let hash = hasher.finish();

    (((hash >> 32) as u32), hash as u32)
}

/// Returns the indices to be set to "true" in a Bloom filter for a given hash.
///
/// `split_hash` is a tuple of two `u32` values produced by passing the item to be added through
/// the `split_hash` function.
/// `m` is the number of indices in the filter.
/// `k` is the number of hash functions that the item should be passed through.
fn indices_for_hash(split_hash: (u32, u32), m: u64, k: u32) -> impl Iterator<Item = usize> {
    (0..k).map(move |i| {
        (u64::from(split_hash.0.wrapping_add(split_hash.1.wrapping_mul(i))) % m) as usize
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_num_bits_and_hashes() {
        let filter = BloomFilter::<&str>::new(0.01_f64, 216553);

        assert_eq!(filter.m, 2_075_674);
        assert_eq!(filter.k, 7);
    }

    #[test]
    fn test_false_positive_rate_empty() {
        let filter = BloomFilter::<&str>::new(0.01_f64, 216553);

        // False positive rate with nothing added to the filter should be 0.
        assert_eq!(filter.false_positive_rate(), 0_f64);
    }

    #[test]
    fn test_add() {
        let mut filter = BloomFilter::new(0.03_f64, 10);

        filter.add(&"Hello, world!");

        assert!(filter.false_positive_rate() > 0.0);
        assert_eq!(filter.might_contain(&"Hello, world!"), true);
        assert_eq!(filter.might_contain(&"Dogs are cool!"), false);
    }
}