1use murmur3::murmur3_32;
2use bit_vec::BitVec;
3
4use std::io::{Cursor, Read};
5use std::f64::consts::{LN_2, E};
6
7pub trait BloomHasher {
9 #[inline]
11 fn hash(&self, seed: u32, bytes: &[u8]) -> u32;
12}
13
14pub struct Murmur3;
16
17impl BloomHasher for Murmur3 {
18 fn hash(&self, seed: u32, bytes: &[u8]) -> u32 {
19 let mut cursor = Cursor::new(bytes);
20 murmur3_32(cursor.by_ref(), seed)
21 }
22}
23
24pub struct BloomFilter<T> {
28 hasher: T,
29 k: u32,
30 bit_vec: BitVec,
31 insert_count: u64,
32}
33
34impl<T: BloomHasher> BloomFilter<T> {
35 pub fn new(hasher: T, k: u32, array_size: u64) -> Self {
43 Self {
44 hasher,
45 k,
46 bit_vec: BitVec::from_elem(array_size as usize, false),
47 insert_count: 0,
48 }
49 }
50
51 pub fn optimal(hasher: T, max_elements: u64, error_rate: f64) -> Self {
62 if error_rate <= 0_f64 || error_rate >= 1_f64 {
64 panic!("Error rate must be 0 <= error_rate < 1");
65 }
66
67 let m = optimal_vec_size(max_elements as u64, error_rate);
69
70 let k = optimal_hash_functions(m, max_elements as u64);
72
73 Self::new(hasher, k, m)
75 }
76
77 pub fn insert(&mut self, bytes: &[u8]) {
79 for seed in 0..self.k {
80 let hash = self.hasher.hash(seed, bytes) as usize % self.bit_vec.len();
81 self.bit_vec.set(hash, true);
82 }
83 self.insert_count += 1;
84 }
85
86 pub fn insert_all<B: AsRef<[u8]>>(&mut self, slice: &[B]) {
88 for item in slice {
89 self.insert(item.as_ref());
90 }
91 }
92
93 pub fn contains<B: AsRef<[u8]>>(&self, bytes: B) -> bool {
117 for seed in 0..self.k {
118 let hash = self.hasher.hash(seed, bytes.as_ref()) as usize % self.bit_vec.len();
119 if !self.bit_vec[hash] {
120 return false;
121 }
122 }
123
124 true
125 }
126
127 pub fn false_positive_rate(&self) -> f64 {
130 false_positive_rate(self.insert_count, self.bit_vec.len() as u64, self.k)
131 }
132}
133
134#[inline]
136fn false_positive_rate(n: u64, m: u64, k: u32) -> f64 {
137 (1_f64 - E.powf(-(n as f64) * m as f64 / k as f64)).powf(k as f64)
138}
139
140#[inline]
141fn optimal_hash_functions(m: u64, n: u64) -> u32 {
142 1_f64.max(m as f64 / n as f64 * LN_2).ceil() as u32
143}
144
145#[inline]
146fn optimal_vec_size(n: u64, p: f64) -> u64 {
147 (-(n as f64 * p.ln()) / LN_2.powi(2)).ceil() as u64
148}
149
150#[cfg(test)]
151mod tests {
152 use std::fs::File;
153 use std::io::{BufReader, BufRead};
154 use super::*;
155
156 #[test]
157 fn test_optimal_hash_functions() {
158 assert_eq!(1, optimal_hash_functions(1, 10));
159 assert_eq!(7, optimal_hash_functions(95851, 10000));
160 assert_eq!(7, optimal_hash_functions(9586, 1000));
161 assert_eq!(7, optimal_hash_functions(90000, 10000));
162 }
163
164 #[test]
165 fn test_optimal_vec_size() {
166 assert_eq!(95851, optimal_vec_size(10000, 0.01));
167 assert_eq!(9586, optimal_vec_size(1000, 0.01));
168 }
169
170 #[test]
171 #[should_panic]
172 fn test_error_rate_too_low() {
173 BloomFilter::optimal(Murmur3, 10000, 0_f64);
174 }
175
176 #[test]
177 #[should_panic]
178 fn test_error_rate_too_high() {
179 BloomFilter::optimal(Murmur3, 10000, 1_f64);
180 }
181
182 #[test]
183 fn test_no_false_negatives() {
184 let words: Vec<String> = BufReader::new(File::open("./resources/1000.txt").unwrap())
185 .lines()
186 .map(|s| s.unwrap())
187 .collect();
188
189 let mut bloom_filter = BloomFilter::optimal(Murmur3, words.len() as u64, 0.01);
190
191 bloom_filter.insert_all(&words);
192
193 for word in words.iter() {
194 assert!(bloom_filter.contains(&word));
195 }
196}
203}