stable_bloom_filter/
stable.rs

1use crate::buckets::Buckets;
2use crate::fnv::FnvHasher;
3use crate::Filter;
4use crate::{optimal_k, optimal_stable_p};
5use rand::{thread_rng, Rng};
6use std::hash::Hasher;
7
8pub struct StableBloomFilter {
9    /// filter data
10    cells: Buckets,
11    /// hash function (kernel for all k functions)
12    hash: FnvHasher,
13    /// number of cells
14    m: usize,
15    /// number of cells to decrement
16    p: usize,
17    /// number of hash functions
18    k: usize,
19    /// cell max value
20    max: u8,
21    /// buffer used to cache indices
22    index_buffer: Vec<usize>,
23}
24
25impl StableBloomFilter {
26    /// Creates a new Stable Bloom Filter with m cells and d
27    /// bits allocated per cell optimized for the target false-positive rate. Use
28    /// default if you don't want to calculate d.
29    pub fn new(m: usize, d: u8, fp_rate: f64) -> Self {
30        let mut k = optimal_k(fp_rate) / 2;
31        if k > m {
32            k = m;
33        } else if k == 0 {
34            k = 1;
35        }
36
37        let cells = Buckets::new(m, d);
38
39        StableBloomFilter {
40            hash: FnvHasher::default(),
41            m,
42            k,
43            p: optimal_stable_p(m, k, d, fp_rate),
44            max: cells.max_bucket_value(),
45            cells,
46            index_buffer: vec![0; k],
47        }
48    }
49
50    /// Creates a new Stable Bloom Filter with m 1-bit
51    /// cells and which is optimized for cases where there is no prior knowledge of
52    /// the input data stream while maintaining an upper bound using the provided
53    /// rate of false positives.
54    pub fn new_default(m: usize, fp_rate: f64) -> Self {
55        Self::new(m, 1, fp_rate)
56    }
57
58    /// NewUnstableBloomFilter creates a new special case of Stable Bloom Filter
59    /// which is a traditional Bloom filter with m bits and an optimal number of
60    /// hash functions for the target false-positive rate. Unlike the stable
61    /// variant, data is not evicted and a cell contains a maximum of 1 hash value.
62    pub fn new_unstable(m: usize, fp_rate: f64) -> Self {
63        let cells = Buckets::new(m, 1);
64        let k = optimal_k(fp_rate);
65
66        StableBloomFilter {
67            hash: FnvHasher::default(),
68            m,
69            k,
70            p: 0,
71            max: cells.max_bucket_value(),
72            cells,
73            index_buffer: vec![0; k],
74        }
75    }
76
77    /// Returns the number of cells in the Stable Bloom Filter.
78    pub fn cells(&self) -> usize {
79        self.m
80    }
81
82    /// Returns the number of hash functions.
83    pub fn k(&self) -> usize {
84        self.k
85    }
86
87    /// Returns the number of cells decremented on every add.
88    pub fn p(&self) -> usize {
89        self.p
90    }
91
92    pub fn max(&self) -> u8 {
93        self.max
94    }
95
96    /// Returns the limit of the expected fraction of zeros in the
97    /// Stable Bloom Filter when the number of iterations goes to infinity. When
98    /// this limit is reached, the Stable Bloom Filter is considered stable.
99    pub fn stable_point(&self) -> f64 {
100        let sub_denom = (self.p as f64) * ((1.0 / (self.k as f64)) - (1.0 / (self.m as f64)));
101        let denom = 1.0 + 1.0 / sub_denom;
102        let base = 1.0 / denom;
103
104        base.powf(f64::from(self.max))
105    }
106
107    /// Returns the upper bound on false positives when the filter
108    /// has become stable.
109    pub fn false_positive_rate(&self) -> f64 {
110        (1.0 - self.stable_point()).powf(self.k as f64)
111    }
112
113    pub fn hash_kernel(&self, data: &[u8]) -> (u32, u32) {
114        let mut hasher = self.hash.clone();
115        hasher.write(data);
116        let hash: u64 = hasher.finish();
117        let lower = hash as u32;
118        let upper = (hash >> 32) as u32;
119        (lower, upper)
120    }
121
122    /// Restores the Stable Bloom Filter to its original state. It returns the
123    /// filter to allow for chaining.
124    pub fn reset(&mut self) -> &Self {
125        self.cells.reset();
126        self
127    }
128
129    /// Will decrement a random cell and (p-1) adjacent cells by 1. This
130    /// is faster than generating p random numbers. Although the processes of
131    /// picking the p cells are not independent, each cell has a probability of p/m
132    /// for being picked at each iteration, which means the properties still hold.
133    pub fn decrement(&mut self) {
134        let mut rng = thread_rng();
135        let r: usize = rng.gen_range(0, self.m);
136
137        for i in 0..(self.p) {
138            let idx = (r + i) % self.m;
139            self.cells.decrease(idx, 1);
140        }
141    }
142}
143
144impl Filter for StableBloomFilter {
145    /// Will test for membership of the data and returns true if it is a
146    /// member, false if not. This is a probabilistic test, meaning there is a
147    /// non-zero probability of false positives and false negatives.
148    fn test(&self, data: &[u8]) -> bool {
149        let (lower, upper) = self.hash_kernel(data);
150        for i in 0..(self.k) {
151            if self
152                .cells
153                .get((lower as usize + upper as usize * i) % self.m)
154                == 0
155            {
156                return false;
157            }
158        }
159        true
160    }
161
162    /// Will add the data to the Stable Bloom Filter. It returns the filter to
163    /// allow for chaining.
164    fn add(&mut self, data: &[u8]) -> &Self {
165        // Randomly decrement p cells to make room for new elements.
166        self.decrement();
167        let (lower, upper) = self.hash_kernel(data);
168
169        for i in 0..(self.k) {
170            self.cells
171                .set((lower as usize + upper as usize * i) % self.m, self.max);
172        }
173
174        self
175    }
176
177    /// Is equivalent to calling Test followed by Add. It returns true if
178    /// the data is a member, false if not.
179    fn test_and_add(&mut self, data: &[u8]) -> bool {
180        let (lower, upper) = self.hash_kernel(data);
181        let mut member = true;
182
183        // If any of the K cells are 0, then it's not a member.
184        for i in 0..(self.k) {
185            self.index_buffer[i] = (lower as usize + upper as usize * i) % self.m;
186            if self.cells.get(self.index_buffer[i]) == 0 {
187                member = false;
188            }
189        }
190
191        // Randomly decrement p cells to make room for new elements.
192        self.decrement();
193        // Set the K cells to max.
194        for i in self.index_buffer.iter() {
195            self.cells.set(*i, self.max);
196        }
197
198        member
199    }
200}
201
202#[cfg(test)]
203mod tests {
204    use super::StableBloomFilter;
205    use crate::optimal_k;
206    use crate::Filter;
207    use float_cmp::ApproxEq;
208    use std::f64;
209
210    fn round(val: f64, round_on: f64, places: usize) -> f64 {
211        let pow = (10.0_f64).powf(places as f64);
212        let digit = pow * val;
213        let div = digit - digit.floor();
214        let round = if div >= round_on {
215            digit.ceil()
216        } else {
217            digit.floor()
218        };
219
220        round / pow
221    }
222
223    // Ensures that new_unstable creates a Stable Bloom Filter with p=0,
224    // max=1 and k hash functions.
225    #[test]
226    fn test_new_unstable() {
227        let f = StableBloomFilter::new_unstable(100, 0.1);
228        let k = optimal_k(0.1);
229
230        assert_eq!(f.k, k);
231        assert_eq!(f.m, 100);
232        assert_eq!(f.p(), 0);
233        assert_eq!(f.max(), 1);
234    }
235
236    // Ensures that Cells returns the number of cells, m, in the Stable Bloom
237    // Filter.
238    #[test]
239    fn test_cells() {
240        let f = StableBloomFilter::new(100, 1, 0.1);
241
242        assert_eq!(f.cells(), 100);
243    }
244
245    // Ensures that K returns the number of hash functions in the Stable Bloom
246    // Filter.
247    #[test]
248    fn test_k() {
249        let f = StableBloomFilter::new(100, 1, 0.01);
250        assert_eq!(f.k(), 3);
251    }
252
253    // Ensures that Test, Add, and TestAndAdd behave correctly.
254    #[test]
255    fn test_test_and_add() {
256        let mut f = StableBloomFilter::new_default(1_000, 0.01);
257        assert!(!f.test(b"a"));
258
259        f.add(b"a");
260        assert!(f.test(b"a"));
261
262        assert!(f.test_and_add(b"a"));
263
264        assert!(!f.test_and_add(b"b"));
265        assert!(f.test(b"a"));
266
267        assert!(f.test(b"b"));
268
269        assert!(!f.test(b"c"));
270
271        for i in 0..1_000_000 {
272            f.test_and_add(i.to_string().as_bytes());
273        }
274
275        // `a` should have been evicted.
276        assert!(!f.test(b"a"));
277    }
278
279    // Ensures that StablePoint returns the expected fraction of zeros for large
280    // iterations.
281    #[test]
282    fn test_stable_point() {
283        let mut f = StableBloomFilter::new(1000, 1, 0.1);
284        for i in 0..1_000_000 {
285            f.add(i.to_string().as_bytes());
286        }
287
288        let mut zero = 0;
289        for i in 0..(f.m) {
290            if f.cells.get(i) == 0 {
291                zero += 1;
292            }
293        }
294
295        let actual = round(f64::from(zero) / (f.m as f64), 0.5, 1);
296        let expected = round(f.stable_point(), 0.5, 1);
297
298        assert!(actual.approx_eq(expected, (f64::EPSILON, 1)));
299        // A classic Bloom filter is a special case of SBF where P is 0 and max is
300        // 1. It doesn't have a stable point.
301        let bf = StableBloomFilter::new_unstable(1000, 0.1);
302        assert!(bf.stable_point().approx_eq(0.0, (f64::EPSILON, 1)));
303    }
304
305    // Ensures that FalsePositiveRate returns the upper bound on false positives
306    // for stable filters.
307    #[test]
308    fn test_false_positive_rate() {
309        let f = StableBloomFilter::new_default(1000, 0.01);
310        let fps = round(f.false_positive_rate(), 0.5, 2);
311
312        assert!(fps.approx_eq(0.01, (f64::EPSILON, 1)));
313
314        // Classic Bloom filters have an unbounded rate of false positives. Once
315        // they become full, every query returns a false positive.
316        let bf = StableBloomFilter::new_unstable(1000, 0.1);
317        assert!(bf.false_positive_rate().approx_eq(1.0, (f64::EPSILON, 1)));
318    }
319
320    // Ensures that Reset sets every cell to zero.
321    #[test]
322    fn test_reset() {
323        let mut f = StableBloomFilter::new_default(1000, 0.01);
324
325        for i in 0..1000 {
326            f.add(i.to_string().as_bytes());
327        }
328
329        f.reset();
330
331        for i in 0..(f.m) {
332            assert_eq!(f.cells.get(i), 0);
333        }
334    }
335}