bloom_filter_wbj/
lib.rs

1
2extern crate bit_vec;
3use bit_vec::BitVec;
4
5use std::hash::Hash;
6use std::hash::Hasher;
7use std::hash::BuildHasher;
8
9use std::collections::hash_map::RandomState;
10
11/// Calculate the probability of getting a false positive
12///
13/// # Arguments
14/// * `n_buckets`: number of buckets
15/// * `n_hashers`: number of hashers
16/// * `n_elems`: number of elements
17fn false_positive_rate(n_buckets: usize, n_hashers: usize, n_elems: usize)
18    -> f32
19{
20    let k = n_hashers as f32;
21    let n = n_elems as f32;
22    let m = n_buckets as f32;
23        
24    (1. - ((-k * n) / m).exp()).powf(k)
25}
26
27fn min_n_buckets(n_elems: usize, fp_rate: f32) -> usize
28{
29    let n = n_elems as f32;
30
31    (-1. * n * fp_rate.ln() / (2f32.ln().powf(2.))).ceil() as usize
32}
33
34/// Calculate the optimal number of hashers
35///
36/// # Arguments
37/// * `n_buckets`: number of buckets
38/// * `n_elems`: number of elemements
39fn optimal_n_hashers(n_buckets: usize, n_elems: usize) -> usize
40{
41    let n = n_elems as f32;
42    let m = n_buckets as f32;
43
44    ((m / n) * 2f32.ln()).ceil() as usize
45}
46
47/// Space-efficient probabilistic hash set
48#[derive(Debug)]
49pub struct BloomFilter
50{
51    buffer: BitVec,
52    size: usize,
53    hashers: Vec<RandomState>
54}
55
56impl BloomFilter
57{
58    /// Build a Bloom Filter with a specified false positive rate
59    ///
60    /// # Arguments
61    /// * `n_elems`: expected number of elements
62    /// * `fp_rate`: desired false positive rate (0.0 -> 1.0)
63    pub fn new_with_fp(n_elems: usize, fp_rate: f32) -> BloomFilter
64    {
65        let min_buckets = min_n_buckets(n_elems, fp_rate);
66        let n_hashers = optimal_n_hashers(min_buckets, n_elems);
67
68        BloomFilter {
69            size: 0,
70            buffer: BitVec::from_elem(min_buckets, false),
71            hashers: (0..n_hashers).map(|_| RandomState::new()).collect()
72        }
73    }
74
75    /// Create a new Bloom Filter with specified buffer size
76    ///
77    /// # Arguments
78    /// * `n_elems`: expected number of elements
79    /// * `size`: desired buffer size
80    pub fn new_with_size(n_elems: usize, size: usize) -> BloomFilter
81    {
82        let n_hashers = optimal_n_hashers(size, n_elems);
83
84        BloomFilter {
85            size: 0,
86            buffer: BitVec::from_elem(size, false),
87            hashers: (0..n_hashers).map(|_| RandomState::new()).collect()
88        }
89    }
90
91    /// Add a member
92    pub fn add<T>(&mut self, e: &T)
93        where T: Hash
94    {
95        for idx in self.indexes(e) {
96            self.buffer.set(idx, true);
97        }
98
99        self.size += 1;
100    }
101
102    /// Check membership
103    pub fn may_contain<T>(&self, e: &T) -> bool
104        where T: Hash
105    {
106        let mut may_contain = true;
107
108        for idx in self.indexes(e) {
109            may_contain &= self.buffer.get(idx).unwrap();
110        }
111
112        may_contain
113    }
114
115    /// Number of elements in the `BloomFilter`
116    pub fn size(&self) -> usize
117    {
118        self.size
119    }
120
121    /// Number of buckets that a memebr can occupy
122    pub fn buckets(&self) -> usize
123    {
124        self.buffer.capacity()
125    }
126
127    /// Number of hashers being used
128    pub fn n_hashers(&self) -> usize
129    {
130        self.hashers.len()
131    }
132
133    /// False positive rate
134    pub fn fp_rate(&self) -> f32
135    {
136        false_positive_rate(self.buckets(), self.n_hashers(), self.size())
137    }
138
139    /// The indexes that a element hashes to
140    fn indexes<T>(&self, e: &T) -> Vec<usize>
141        where T: Hash
142    {
143        let mut idxs = vec![];
144        for h in &self.hashers {
145            let mut hasher = h.build_hasher();
146            e.hash(&mut hasher);
147            idxs.push(hasher.finish() as usize % self.buffer.len()); 
148        }
149        idxs
150    }
151}
152
153#[cfg(test)]
154mod test
155{
156    use super::*;
157
158    /// Test that the bloom filter will always return the same results
159    #[test]
160    fn test_is_deterministic()
161    {
162        let to_add = "do add this";
163        let dont_add = 123;
164        let mut filter = BloomFilter::new_with_size(1, 100);
165        filter.add(&to_add);
166
167        // Check membership twice to make sure that the results are reproducable
168        // even though the hashers are being reset
169        assert_eq!(true,  filter.may_contain(&to_add));
170        assert_eq!(true,  filter.may_contain(&to_add));
171
172        assert_eq!(false, filter.may_contain(&dont_add));
173        assert_eq!(false, filter.may_contain(&dont_add));
174
175    }
176
177    #[test]
178    fn test_size_increments()
179    {
180        let to_add = "do add this";
181
182        let mut filter = BloomFilter::new_with_size(3, 100);
183        filter.add(&to_add);
184        filter.add(&to_add);
185        filter.add(&to_add);
186
187        assert_eq!(3, filter.size());
188    }
189
190    #[test]
191    fn test_fp_rate_is_zero_no_elems()
192    {
193        let filter = BloomFilter::new_with_size(100, 100);
194        assert_eq!(0.0, filter.fp_rate());
195    }
196}