toolbox_rs/
bloom_filter.rs1use bitvec::prelude::*;
2use xxhash_rust::xxh3::xxh3_64_with_seed;
3
4use crate::as_bytes::AsBytes;
5
6pub struct BloomFilter {
12 bit_vector: BitVec,
13 number_of_functions: usize,
14}
15
16#[derive(Debug, Eq, PartialEq)]
18pub enum BloomResult {
19 No,
20 YesWhp,
21}
22
23impl BloomFilter {
24 fn fn1(&self, t: &[u8]) -> usize {
26 xxh3_64_with_seed(t, 0xdeadbeef) as usize
28 }
29
30 fn fn2(&self, t: &[u8]) -> usize {
32 xxh3_64_with_seed(t, 123) as usize
34 }
35
36 pub fn new_from_size_and_probabilty(expected_set_size: usize, probability: f64) -> Self {
39 assert!(probability > 0.);
40 assert!(probability < 1.);
41
42 let ln_pfp = probability.log(std::f64::consts::E);
44 let ln_2 = 1. / std::f64::consts::LOG2_E;
45
46 let optimal_vector_legth =
47 (-(expected_set_size as f64 * ln_pfp / ln_2.powi(2)).ceil()) as usize;
48 assert!(optimal_vector_legth > 0);
49
50 let bit_vector: BitVec = (0..optimal_vector_legth).map(|_a| false).collect();
51 let number_of_functions = (-(ln_pfp / ln_2).ceil()) as usize;
52 assert!(number_of_functions > 0);
53
54 BloomFilter {
55 bit_vector,
56 number_of_functions,
57 }
58 }
59
60 pub fn new_from_list<T: AsBytes>(list: &[T], probability: f64) -> Self {
62 let mut filter = BloomFilter::new_from_size_and_probabilty(list.len(), probability);
63 for i in list {
64 filter.add(i);
66 }
67 filter
68 }
69
70 pub fn contains(&self, value: &[u8]) -> BloomResult {
72 let len = self.bit_vector.len();
73 let fn1_value = self.fn1(value) % len;
74
75 let all_hash_fns_matched = (0..self.number_of_functions).all(|i| {
76 let fn2_value = self.fn2(value) % len;
77 let index = (fn1_value + (i * fn2_value)) % len;
78 self.bit_vector[index]
79 });
80 if all_hash_fns_matched {
81 return BloomResult::YesWhp;
82 }
83 BloomResult::No
84 }
85
86 pub fn add_bytes(&mut self, value: &[u8]) {
88 let len = self.bit_vector.len();
89 let fn1_value = self.fn1(value) % len;
90
91 (0..self.number_of_functions).for_each(|i| {
92 let fn2_value = self.fn2(value) % len;
93 let index = (fn1_value + (i * fn2_value)) % len;
94 self.bit_vector.set(index, true);
95 });
96 }
97
98 pub fn add<T: AsBytes>(&mut self, value: &T) {
100 self.add_bytes(value.as_bytes())
101 }
102}
103
104#[cfg(test)]
105mod tests {
106 use crate::bloom_filter::BloomResult;
107
108 use super::BloomFilter;
109
110 #[test]
111 fn one_sentence() {
112 let sentence1 = "this is just a string of words with little meaning.";
113 let sentence2 = "and this is another one with equally little meaning.";
114
115 let mut filter = BloomFilter::new_from_size_and_probabilty(1, 0.001);
116 filter.add(&sentence1);
118 assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
119 assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::No);
120 }
121
122 #[test]
123 fn from_list() {
124 let sentence1 = "this is just a string of words with little meaning.";
125 let sentence2 = "and this is another one with equally little meaning.";
126 let list = vec![sentence1, sentence2];
127
128 let sentence3 = "I am running out of meaningless examples to write.";
129
130 let mut filter = BloomFilter::new_from_list(&list, 0.001);
131 filter.add(&sentence1);
133 assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
134 assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::YesWhp);
135 assert_eq!(filter.contains(sentence3.as_bytes()), BloomResult::No);
136 }
137}