selectors_bloom/
lib.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5//! Simple counting bloom filters.
6
7extern crate fnv;
8
9use fnv::FnvHasher;
10use std::hash::Hasher;
11
12const KEY_SIZE: usize = 12;
13const ARRAY_SIZE: usize = 1 << KEY_SIZE;
14const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
15const KEY_SHIFT: usize = 16;
16
17/// A counting Bloom filter with 8-bit counters.  For now we assume
18/// that having two hash functions is enough, but we may revisit that
19/// decision later.
20///
21/// The filter uses an array with 2**KeySize entries.
22///
23/// Assuming a well-distributed hash function, a Bloom filter with
24/// array size M containing N elements and
25/// using k hash function has expected false positive rate exactly
26///
27/// $  (1 - (1 - 1/M)^{kN})^k  $
28///
29/// because each array slot has a
30///
31/// $  (1 - 1/M)^{kN}  $
32///
33/// chance of being 0, and the expected false positive rate is the
34/// probability that all of the k hash functions will hit a nonzero
35/// slot.
36///
37/// For reasonable assumptions (M large, kN large, which should both
38/// hold if we're worried about false positives) about M and kN this
39/// becomes approximately
40///
41/// $$  (1 - \exp(-kN/M))^k   $$
42///
43/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
44/// or in other words
45///
46/// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
47///
48/// where r is the false positive rate.  This can be used to compute
49/// the desired KeySize for a given load N and false positive rate r.
50///
51/// If N/M is assumed small, then the false positive rate can
52/// further be approximated as 4*N^2/M^2.  So increasing KeySize by
53/// 1, which doubles M, reduces the false positive rate by about a
54/// factor of 4, and a false positive rate of 1% corresponds to
55/// about M/N == 20.
56///
57/// What this means in practice is that for a few hundred keys using a
58/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
59///
60/// Similarly, using a KeySize of 10 would lead to a 4% false
61/// positive rate for N == 100 and to quite bad false positive
62/// rates for larger N.
63pub struct BloomFilter {
64    counters: [u8; ARRAY_SIZE],
65}
66
67impl Clone for BloomFilter {
68    #[inline]
69    fn clone(&self) -> BloomFilter {
70        BloomFilter {
71            counters: self.counters,
72        }
73    }
74}
75
76impl BloomFilter {
77    /// Creates a new bloom filter.
78    #[inline]
79    pub fn new() -> BloomFilter {
80        BloomFilter {
81            counters: [0; ARRAY_SIZE],
82        }
83    }
84
85    #[inline]
86    fn first_slot(&self, hash: u32) -> &u8 {
87        &self.counters[hash1(hash) as usize]
88    }
89
90    #[inline]
91    fn first_mut_slot(&mut self, hash: u32) -> &mut u8 {
92        &mut self.counters[hash1(hash) as usize]
93    }
94
95    #[inline]
96    fn second_slot(&self, hash: u32) -> &u8 {
97        &self.counters[hash2(hash) as usize]
98    }
99
100    #[inline]
101    fn second_mut_slot(&mut self, hash: u32) -> &mut u8 {
102        &mut self.counters[hash2(hash) as usize]
103    }
104
105    #[inline]
106    pub fn clear(&mut self) {
107        self.counters = [0; ARRAY_SIZE]
108    }
109
110    #[inline]
111    fn insert_hash(&mut self, hash: u32) {
112        {
113            let slot1 = self.first_mut_slot(hash);
114            if !full(slot1) {
115                *slot1 += 1
116            }
117        }
118        {
119            let slot2 = self.second_mut_slot(hash);
120            if !full(slot2) {
121                *slot2 += 1
122            }
123        }
124    }
125
126    /// Inserts an item into the bloom filter.
127    #[inline]
128    pub fn insert<T:BloomHash>(&mut self, elem: &T) {
129        self.insert_hash(elem.bloom_hash())
130
131    }
132
133    #[inline]
134    fn remove_hash(&mut self, hash: u32) {
135        {
136            let slot1 = self.first_mut_slot(hash);
137            if !full(slot1) {
138                *slot1 -= 1
139            }
140        }
141        {
142            let slot2 = self.second_mut_slot(hash);
143            if !full(slot2) {
144                *slot2 -= 1
145            }
146        }
147    }
148
149    /// Removes an item from the bloom filter.
150    #[inline]
151    pub fn remove<T:BloomHash>(&mut self, elem: &T) {
152        self.remove_hash(elem.bloom_hash())
153    }
154
155    #[inline]
156    fn might_contain_hash(&self, hash: u32) -> bool {
157        *self.first_slot(hash) != 0 && *self.second_slot(hash) != 0
158    }
159
160    /// Check whether the filter might contain an item.  This can
161    /// sometimes return true even if the item is not in the filter,
162    /// but will never return false for items that are actually in the
163    /// filter.
164    #[inline]
165    pub fn might_contain<T:BloomHash>(&self, elem: &T) -> bool {
166        self.might_contain_hash(elem.bloom_hash())
167    }
168}
169
170pub trait BloomHash {
171    fn bloom_hash(&self) -> u32;
172}
173
174impl BloomHash for u64 {
175    #[inline]
176    fn bloom_hash(&self) -> u32 {
177        ((*self >> 32) ^ *self) as u32
178    }
179}
180
181impl BloomHash for isize {
182    #[allow(exceeding_bitshifts)]
183    #[inline]
184    fn bloom_hash(&self) -> u32 {
185        ((*self >> 32) ^ *self) as u32
186    }
187}
188
189impl BloomHash for usize {
190    #[allow(exceeding_bitshifts)]
191    #[inline]
192    fn bloom_hash(&self) -> u32 {
193        ((*self >> 32) ^ *self) as u32
194    }
195}
196
197impl BloomHash for String {
198    #[inline]
199    fn bloom_hash(&self) -> u32 {
200        let mut h = FnvHasher::default();
201        h.write(self.as_bytes());
202        h.finish().bloom_hash()
203    }
204}
205
206#[inline]
207fn full(slot: &u8) -> bool {
208    *slot == 0xff
209}
210
211#[inline]
212fn hash1(hash: u32) -> u32 {
213    hash & KEY_MASK
214}
215
216#[inline]
217fn hash2(hash: u32) -> u32 {
218    (hash >> KEY_SHIFT) & KEY_MASK
219}
220
221#[test]
222fn create_and_insert_some_stuff() {
223    let mut bf = BloomFilter::new();
224
225    for i in 0_usize .. 1000 {
226        bf.insert(&i);
227    }
228
229    for i in 0_usize .. 1000 {
230        assert!(bf.might_contain(&i));
231    }
232
233    let false_positives =
234        (1001_usize .. 2000).filter(|i| bf.might_contain(i)).count();
235
236    assert!(false_positives < 10); // 1%.
237
238    for i in 0_usize .. 100 {
239        bf.remove(&i);
240    }
241
242    for i in 100_usize .. 1000 {
243        assert!(bf.might_contain(&i));
244    }
245
246    let false_positives = (0_usize .. 100).filter(|i| bf.might_contain(i)).count();
247
248    assert!(false_positives < 2); // 2%.
249
250    bf.clear();
251
252    for i in 0_usize .. 2000 {
253        assert!(!bf.might_contain(&i));
254    }
255}
256
257#[cfg(test)]
258mod bench {
259    extern crate test;
260
261    use std::hash::{Hash, Hasher, SipHasher};
262    use super::BloomFilter;
263
264    #[bench]
265    fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
266        b.iter(|| {
267            let mut bf = BloomFilter::new();
268            for i in 0_usize .. 1000 {
269                bf.insert(&i);
270            }
271            for i in 0_usize .. 100 {
272                bf.remove(&i);
273            }
274            for i in 100_usize .. 200 {
275                test::black_box(bf.might_contain(&i));
276            }
277        });
278    }
279
280    #[bench]
281    fn might_contain(b: &mut test::Bencher) {
282        let mut bf = BloomFilter::new();
283
284        for i in 0_usize .. 1000 {
285            bf.insert(&i);
286        }
287
288        let mut i = 0_usize;
289
290        b.bench_n(1000, |b| {
291            b.iter(|| {
292                test::black_box(bf.might_contain(&i));
293                i += 1;
294            });
295        });
296    }
297
298    #[bench]
299    fn insert(b: &mut test::Bencher) {
300        let mut bf = BloomFilter::new();
301
302        b.bench_n(1000, |b| {
303            let mut i = 0_usize;
304
305            b.iter(|| {
306                test::black_box(bf.insert(&i));
307                i += 1;
308            });
309        });
310    }
311
312    #[bench]
313    fn remove(b: &mut test::Bencher) {
314        let mut bf = BloomFilter::new();
315        for i in 0_usize .. 1000 {
316            bf.insert(&i);
317        }
318
319        b.bench_n(1000, |b| {
320            let mut i = 0_usize;
321
322            b.iter(|| {
323                bf.remove(&i);
324                i += 1;
325            });
326        });
327
328        test::black_box(bf.might_contain(&0_usize));
329    }
330
331    #[bench]
332    fn hash_a_uint(b: &mut test::Bencher) {
333        let mut i = 0_usize;
334        b.iter(|| {
335            let mut hasher = SipHasher::default();
336            i.hash(&mut hasher);
337            test::black_box(hasher.finish());
338            i += 1;
339        })
340    }
341}