toolbox_rs/
bloom_filter.rs

1use bitvec::prelude::*;
2use xxhash_rust::xxh3::xxh3_64_with_seed;
3
4use crate::as_bytes::AsBytes;
5
6/// Straight-forward implementation of a bloom filter on u8 slices. It applies
7/// the result of Kirsch and Mitzenmacher [1] of using a simple linear
8/// combination of two hash functions without any loss in the asymptotic false
9/// positive rate.
10/// [1] Kirsch, Mitzenmacher. "Less Hashing, Same Performance: Building a Better Bloom Filter".
11pub struct BloomFilter {
12    bit_vector: BitVec,
13    number_of_functions: usize,
14}
15
16/// Result type for the contains(.) operation
17#[derive(Debug, Eq, PartialEq)]
18pub enum BloomResult {
19    No,
20    YesWhp,
21}
22
23impl BloomFilter {
24    // first hash function
25    fn fn1(&self, t: &[u8]) -> usize {
26        // hash operations are based on xxhash3 which is fast
27        xxh3_64_with_seed(t, 0xdeadbeef) as usize
28    }
29
30    // second hash function
31    fn fn2(&self, t: &[u8]) -> usize {
32        // hash operations are based on xxhash3 which is fast
33        xxh3_64_with_seed(t, 123) as usize
34    }
35
36    // constructs an empty filter with optimal length of bit vector and number
37    // of simulated hash functions.
38    pub fn new_from_size_and_probabilty(expected_set_size: usize, probability: f64) -> Self {
39        assert!(probability > 0.);
40        assert!(probability < 1.);
41
42        // calculate optimal values on size and function count
43        let ln_pfp = probability.log(std::f64::consts::E);
44        let ln_2 = 1. / std::f64::consts::LOG2_E;
45
46        let optimal_vector_legth =
47            (-(expected_set_size as f64 * ln_pfp / ln_2.powi(2)).ceil()) as usize;
48        assert!(optimal_vector_legth > 0);
49
50        let bit_vector: BitVec = (0..optimal_vector_legth).map(|_a| false).collect();
51        let number_of_functions = (-(ln_pfp / ln_2).ceil()) as usize;
52        assert!(number_of_functions > 0);
53
54        BloomFilter {
55            bit_vector,
56            number_of_functions,
57        }
58    }
59
60    // constructs a filter from a list of inputs
61    pub fn new_from_list<T: AsBytes>(list: &[T], probability: f64) -> Self {
62        let mut filter = BloomFilter::new_from_size_and_probabilty(list.len(), probability);
63        for i in list {
64            // add all the items to the filter
65            filter.add(i);
66        }
67        filter
68    }
69
70    // checks wether the given value is contained with high probability.
71    pub fn contains(&self, value: &[u8]) -> BloomResult {
72        let len = self.bit_vector.len();
73        let fn1_value = self.fn1(value) % len;
74
75        let all_hash_fns_matched = (0..self.number_of_functions).all(|i| {
76            let fn2_value = self.fn2(value) % len;
77            let index = (fn1_value + (i * fn2_value)) % len;
78            self.bit_vector[index]
79        });
80        if all_hash_fns_matched {
81            return BloomResult::YesWhp;
82        }
83        BloomResult::No
84    }
85
86    // adds a value via its byte representation to the filter.
87    pub fn add_bytes(&mut self, value: &[u8]) {
88        let len = self.bit_vector.len();
89        let fn1_value = self.fn1(value) % len;
90
91        (0..self.number_of_functions).for_each(|i| {
92            let fn2_value = self.fn2(value) % len;
93            let index = (fn1_value + (i * fn2_value)) % len;
94            self.bit_vector.set(index, true);
95        });
96    }
97
98    // add a value to the filter
99    pub fn add<T: AsBytes>(&mut self, value: &T) {
100        self.add_bytes(value.as_bytes())
101    }
102}
103
104#[cfg(test)]
105mod tests {
106    use crate::bloom_filter::BloomResult;
107
108    use super::BloomFilter;
109
110    #[test]
111    fn one_sentence() {
112        let sentence1 = "this is just a string of words with little meaning.";
113        let sentence2 = "and this is another one with equally little meaning.";
114
115        let mut filter = BloomFilter::new_from_size_and_probabilty(1, 0.001);
116        // assert!(filter.size > 10);
117        filter.add(&sentence1);
118        assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
119        assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::No);
120    }
121
122    #[test]
123    fn from_list() {
124        let sentence1 = "this is just a string of words with little meaning.";
125        let sentence2 = "and this is another one with equally little meaning.";
126        let list = vec![sentence1, sentence2];
127
128        let sentence3 = "I am running out of meaningless examples to write.";
129
130        let mut filter = BloomFilter::new_from_list(&list, 0.001);
131        // assert!(filter.size > 10);
132        filter.add(&sentence1);
133        assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
134        assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::YesWhp);
135        assert_eq!(filter.contains(sentence3.as_bytes()), BloomResult::No);
136    }
137}