bloom_filter_rs/
bloom.rs

1use murmur3::murmur3_32;
2use bit_vec::BitVec;
3
4use std::io::{Cursor, Read};
5use std::f64::consts::{LN_2, E};
6
7/// A trait for hashing an arbitrary stream of bytes into a bloom filter.
8pub trait BloomHasher {
9    /// Returns the hashed value of the bytes given some seed.
10    #[inline]
11    fn hash(&self, seed: u32, bytes: &[u8]) -> u32;
12}
13
14/// A unit struct for the murmur3 hash function.
15pub struct Murmur3;
16
17impl BloomHasher for Murmur3 {
18    fn hash(&self, seed: u32, bytes: &[u8]) -> u32 {
19        let mut cursor = Cursor::new(bytes);
20        murmur3_32(cursor.by_ref(), seed)
21    }
22}
23
24/// BloomFilter
25///
26/// An implementation of a bloom filter
27pub struct BloomFilter<T> {
28    hasher: T,
29    k: u32,
30    bit_vec: BitVec,
31    insert_count: u64,
32}
33
34impl<T: BloomHasher> BloomFilter<T> {
35    /// Create a new `BloomFilter` given a `hasher`,
36    /// the number of hash functions to use,
37    /// and the size of the underlying bit array.
38    ///
39    /// Typically, this function should not be called directly unless,
40    /// the optimal number of hash functions and optimal array size are
41    /// already known.
42    pub fn new(hasher: T, k: u32, array_size: u64) -> Self {
43        Self {
44            hasher,
45            k,
46            bit_vec: BitVec::from_elem(array_size as usize, false),
47            insert_count: 0,
48        }
49    }
50
51    /// Create a `BloomFilter` by computing its optimal parameters.
52    ///
53    /// This function computes the optimal array size using
54    /// ```text
55    /// -(n * ln(p)) / ln(2) ^ 2
56    /// ```
57    /// and computes the optimal number of hash functions using
58    /// ```text
59    /// m / n * ln(2)
60    /// ```
61    pub fn optimal(hasher: T, max_elements: u64, error_rate: f64) -> Self {
62        // Check error rate is valid
63        if error_rate <= 0_f64 || error_rate >= 1_f64 {
64            panic!("Error rate must be 0 <= error_rate < 1");
65        }
66
67        // Calculate the length of the bit vector
68        let m = optimal_vec_size(max_elements as u64, error_rate);
69
70        // Calculate the number of hash functions to use
71        let k = optimal_hash_functions(m, max_elements as u64);
72
73        // Create the bloom filter
74        Self::new(hasher, k, m)
75    }
76
77    /// Insert a slice of bytes into the `BloomFilter`.
78    pub fn insert(&mut self, bytes: &[u8]) {
79        for seed in 0..self.k {
80            let hash = self.hasher.hash(seed, bytes) as usize % self.bit_vec.len();
81            self.bit_vec.set(hash, true);
82        }
83        self.insert_count += 1;
84    }
85
86    /// Insert a slice of slices of bytes into the `BloomFilter`.
87    pub fn insert_all<B: AsRef<[u8]>>(&mut self, slice: &[B]) {
88        for item in slice {
89            self.insert(item.as_ref());
90        }
91    }
92
93    /// Check whether a slice of bytes exists in the `BloomFilter`.
94    ///
95    /// This is a probabilistic function that may return a false positive but will
96    /// never return a false negative.
97    ///
98    /// # Examples
99    ///
100    /// ```
101    /// extern crate bloom_filter_rs as bloom_filter;
102    ///
103    /// use std::vec::Vec;
104    /// use bloom_filter::{BloomFilter, Murmur3};
105    ///
106    /// let words = vec!["Hello", "I", "am", "some", "words"];
107    ///
108    /// let mut bloom_filter = BloomFilter::optimal(Murmur3, words.len() as u64, 0.01);
109    ///
110    /// bloom_filter.insert_all(&words);
111    ///
112    /// for word in words.iter() {
113    ///     assert!(bloom_filter.contains(&word));
114    /// }
115    /// ```
116    pub fn contains<B: AsRef<[u8]>>(&self, bytes: B) -> bool {
117        for seed in 0..self.k {
118            let hash = self.hasher.hash(seed, bytes.as_ref()) as usize % self.bit_vec.len();
119            if !self.bit_vec[hash] {
120                return false;
121            }
122        }
123
124        true
125    }
126
127    /// Calculate the expected false positive rate given the current state of
128    /// the `BloomFilter`.
129    pub fn false_positive_rate(&self) -> f64 {
130        false_positive_rate(self.insert_count, self.bit_vec.len() as u64, self.k)
131    }
132}
133
134/// This function computes the false positive rate given n, m, and k.
135#[inline]
136fn false_positive_rate(n: u64, m: u64, k: u32) -> f64 {
137    (1_f64 - E.powf(-(n as f64) * m as f64 / k as f64)).powf(k as f64)
138}
139
140#[inline]
141fn optimal_hash_functions(m: u64, n: u64) -> u32 {
142    1_f64.max(m as f64 / n as f64 * LN_2).ceil() as u32
143}
144
145#[inline]
146fn optimal_vec_size(n: u64, p: f64) -> u64 {
147    (-(n as f64 * p.ln()) / LN_2.powi(2)).ceil() as u64
148}
149
150#[cfg(test)]
151mod tests {
152    use std::fs::File;
153    use std::io::{BufReader, BufRead};
154    use super::*;
155
156    #[test]
157    fn test_optimal_hash_functions() {
158        assert_eq!(1, optimal_hash_functions(1, 10));
159        assert_eq!(7, optimal_hash_functions(95851, 10000));
160        assert_eq!(7, optimal_hash_functions(9586, 1000));
161        assert_eq!(7, optimal_hash_functions(90000, 10000));
162    }
163
164    #[test]
165    fn test_optimal_vec_size() {
166        assert_eq!(95851, optimal_vec_size(10000, 0.01));
167        assert_eq!(9586, optimal_vec_size(1000, 0.01));
168    }
169
170    #[test]
171    #[should_panic]
172    fn test_error_rate_too_low() {
173        BloomFilter::optimal(Murmur3, 10000, 0_f64);
174    }
175
176    #[test]
177    #[should_panic]
178    fn test_error_rate_too_high() {
179        BloomFilter::optimal(Murmur3, 10000, 1_f64);
180    }
181
182    #[test]
183    fn test_no_false_negatives() {
184        let words: Vec<String> = BufReader::new(File::open("./resources/1000.txt").unwrap())
185            .lines()
186            .map(|s| s.unwrap())
187            .collect();
188
189        let mut bloom_filter = BloomFilter::optimal(Murmur3, words.len() as u64, 0.01);
190
191        bloom_filter.insert_all(&words);
192
193        for word in words.iter() {
194            assert!(bloom_filter.contains(&word));
195        }
196//
197//        let bloom_filter = BloomFilter::from_iter(words.iter(), Murmur3, 0.01);
198//
199//        for word in words.iter() {
200//            assert!(bloom_filter.contains(&word));
201//        }
202    }
203}