1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// StableBloomFilter implements a Stable Bloom Filter as described by Deng and
// Rafiei in Approximately Detecting Duplicates for Streaming Data using Stable
// Bloom Filters:
//
// http://webdocs.cs.ualberta.ca/~drafiei/papers/DupDet06Sigmod.pdf
//
// A Stable Bloom Filter (SBF) continuously evicts stale information so that it
// has room for more recent elements. Like traditional Bloom filters, an SBF
// has a non-zero probability of false positives, which is controlled by
// several parameters. Unlike the classic Bloom filter, an SBF has a tight
// upper bound on the rate of false positives while introducing a non-zero rate
// of false negatives. The false-positive rate of a classic Bloom filter
// eventually reaches 1, after which all queries result in a false positive.
// The stable-point property of an SBF means the false-positive rate
// asymptotically approaches a configurable fixed constant. A classic Bloom
// filter is actually a special case of SBF where the eviction rate is zero, so
// this package provides support for them as well.
//
// Stable Bloom Filters are useful for cases where the size of the data set
// isn't known a priori, which is a requirement for traditional Bloom filters,
// and memory is bounded.  For example, an SBF can be used to deduplicate
// events from an unbounded event stream with a specified upper bound on false
// positives and minimal false negatives.
pub mod buckets;
pub mod fnv;
pub mod stable;

pub trait Filter {
    fn test(&self, _data: &[u8]) -> bool;

    fn add(&mut self, _data: &[u8]) -> &Self;

    fn test_and_add(&mut self, _data: &[u8]) -> bool;
}

/// Calculates the optimal number of hash functions to use for a Bloom
/// filter based on the desired rate of false positives.
pub(crate) fn optimal_k(fp_rate: f64) -> usize {
    (1.0 / fp_rate).log2().ceil() as usize
}

/// Returns the optimal number of cells to decrement, p, per
/// iteration for the provided parameters of an SBF.
pub(crate) fn optimal_stable_p(m: usize, k: usize, d: u8, fp_rate: f64) -> usize {
    let max = (2_u64.pow(u32::from(d)) - 1) as f64;
    let sub_denom = (1.0 - fp_rate.powf(1.0 / (k as f64))).powf(1.0 / max);
    let denom = (1.0 / sub_denom - 1.0) * (1.0 / (k as f64) - 1.0 / (m as f64));

    let mut p = (1.0 / denom) as usize;

    if p == 0 {
        p = 1;
    }

    p
}