rust_bloomfilter/
lib.rs

1extern crate bigint;
2extern crate bit_vec;
3extern crate fasthash;
4
5use bigint::uint::U512;
6use bit_vec::BitVec;
7use fasthash::murmur3::hash128;
8use std::ops::Add;
9
10/// I have intentionally made the BloomFilter type opaque.
11/// It accepts any kind of input as long as it can be converted to bytes.
12
13pub struct BloomFilter {
14    capacity: usize,
15    bitvec: BitVec,
16    error_rate: f64,
17    num_of_hashfuncs: usize,
18    num_of_elements: usize,
19    dup_check: bool,
20}
21
22// The number of bits for bloom filter is given by the following formula
23// m = math.ceil((n * math.log(p)) / math.log(1.0 / (pow(2.0, math.log(2.0)))))
24#[inline(always)]
25fn nbits(n: usize, p: f64) -> usize {
26    let numerator = n as f64 * p.ln();
27    let denominator = (1.0_f64 / 2.0_f64.powf(2.0_f64.ln())).ln();
28    (numerator / denominator).ceil() as usize
29}
30
31// Iterations gives the number of hash functions to be used.
32// The formula is : k = round((m / n) * math.log(2));
33#[inline(always)]
34fn iterations(m: usize, n: usize) -> usize {
35    ((m as f64 / n as f64) * 2.0_f64.ln()).round() as usize
36}
37
38impl BloomFilter {
39    pub fn new(capacity: usize, error_rate: f64, dup_check: bool) -> BloomFilter {
40        if capacity == 0 {
41            panic!("capacity must be greater than zero");
42        }
43        if error_rate <= 0.0 || error_rate > 1.0 {
44            panic!("error_rate must be greater than 0.0 and less than 1.0");
45        }
46        let num_of_bits = nbits(capacity, error_rate);
47        let num_of_hashfuncs = iterations(num_of_bits, capacity);
48        BloomFilter {
49            bitvec: BitVec::from_elem(num_of_bits, false),
50            capacity,
51            error_rate,
52            num_of_hashfuncs,
53            num_of_elements: 0,
54            dup_check,
55        }
56    }
57    // Here the error rate is assumed as 0.01
58    pub fn new_with_default_error_rate(capacity:usize, dup_check:bool) -> BloomFilter{
59        if capacity == 0 {
60            panic!("capacity must be greater than zero");
61        }
62        BloomFilter::new(capacity, 0.01, dup_check)
63    }
64
65    pub fn bitvec_len(&self) -> usize {
66        self.bitvec.len()
67    }
68
69    pub fn capacity(&self) -> usize {
70        self.capacity
71    }
72
73    pub fn len(&self) -> usize {
74        self.num_of_elements
75    }
76
77    pub fn error_rate(&self) -> f64 {
78        self.error_rate
79    }
80    pub fn is_empty(&self) -> bool {
81        self.len() == 0
82    }
83
84    pub fn add(&mut self, data: &[u8]) -> Result<bool, &'static str> {
85        if self.num_of_elements == self.capacity {
86            return Err("You are adding to the bloom filter that is already full");
87        }
88        let hash = hash128(data);
89        let hash64_first = (hash & (2_u128.pow(64) - 1)) as u64;
90        let hash64_second = (hash >> 64) as u64;
91        let mut result_hash: U512 = hash64_first.into();
92        let mut exists = true;
93        for value in 0..self.num_of_hashfuncs {
94            let temp: U512 = U512::from(value) * U512::from(hash64_second);
95            result_hash = result_hash.add(temp);
96            let index = result_hash % U512::from(self.bitvec_len());
97            if self.dup_check && self.bitvec.get(index.as_u64() as usize) == Some(false) {
98                exists = false;
99            }
100            self.bitvec.set(index.as_u64() as usize, true);
101        }
102        if self.dup_check && exists {
103            return Ok(false);
104        }
105        self.num_of_elements += 1;
106        return Ok(true);
107    }
108
109    pub fn contains(&self, data: &[u8]) -> bool {
110        let hash = hash128(data);
111        let hash64_first = (hash & (2_u128.pow(64) - 1)) as u64;
112        let hash64_second = (hash >> 64) as u64;
113        let mut result_hash: U512 = hash64_first.into();
114        for value in 0..self.num_of_hashfuncs {
115            let temp: U512 = U512::from(value) * U512::from(hash64_second);
116            result_hash = result_hash.add(temp);
117            let index = result_hash % U512::from(self.bitvec_len());
118            if self.bitvec.get(index.as_u64() as usize) == Some(false) {
119                return false;
120            }
121        }
122        true
123    }
124}
125
126#[cfg(test)]
127mod tests {
128    use crate::BloomFilter;
129
130    #[test]
131    fn test_single_element() {
132        let mut b = BloomFilter::new(20000, 0.01, true);
133        assert_eq!(b.add("Test".as_bytes()).unwrap(), true);
134        assert!(b.contains("Test".as_bytes()));
135    }
136
137    #[test]
138    fn test_multiple_types() {
139        let mut b = BloomFilter::new(20000, 0.01, true);
140        assert_eq!(b.add("Test".as_bytes()).unwrap(), true);
141        assert!(b.contains("Test".as_bytes()));
142        assert_eq!(b.add(&1_u8.to_ne_bytes()).unwrap(), true);
143        assert!(b.contains(&1_u8.to_ne_bytes()));
144        assert_eq!(b.add(&1_i32.to_ne_bytes()).unwrap(), true);
145        assert!(b.contains(&1_i32.to_ne_bytes()));
146        assert_eq!(b.add(&1_i64.to_ne_bytes()).unwrap(), true);
147        assert!(b.contains(&1_i64.to_ne_bytes()));
148        assert_eq!(b.add(&1.0_f32.to_bits().to_be_bytes()).unwrap(), true);
149        assert!(b.contains(&1.0_f32.to_bits().to_be_bytes()));
150        assert_eq!(b.add(&12131231231231321123.0_f64.to_bits().to_be_bytes()).unwrap(), true);
151        assert!(b.contains(&12131231231231321123.0_f64.to_bits().to_be_bytes()));
152        assert_eq!(b.add(&1.33333333333333333_f32.to_bits().to_be_bytes()).unwrap(), true);
153        assert!(b.contains(&1.33333333333333333_f32.to_bits().to_be_bytes()));
154    }
155
156    #[test]
157    #[should_panic]
158    fn test_empty_bloom_zero_capacity_filter() {
159        let _b = BloomFilter::new(0, 0.01, true);
160    }
161    #[test]
162    #[should_panic]
163    fn test_empty_bloom_zero_error_rate_filter() {
164        let _b = BloomFilter::new(10, 0.000, true);
165    }
166
167    #[test]
168    #[should_panic]
169    fn test_empty_bloom_negative_error_rate_filter() {
170        let _b = BloomFilter::new(10, -0.010, true);
171    }
172
173    #[test]
174    fn test_full_bloom_filter() {
175        let mut b = BloomFilter::new(10, 0.01, true);
176        // Add 11 elements to the 10 capacity Bloomfilter
177        let elements = vec![
178            "April is the cruellest month, breeding",
179            "Lilacs out of the dead land, mixing",
180            "Memory and desire, stirring",
181            "Dull roots with spring rain.",
182            "Winter kept us warm, covering",
183            "Earth in forgetful snow, feeding",
184            "A little life with dried tubers.",
185            "Summer surprised us, coming over the Starnbergersee",
186            "With a shower of rain; we stopped in the colonnade,",
187            "And went on in sunlight, into the Hofgarten,",
188            "And drank coffee, and talked for an hour.",
189        ];
190        for element in &elements[..9] {
191            assert_eq!(b.add(element.as_bytes()).unwrap(), true);
192        }
193        assert!(
194            b.add((&elements[10]).as_ref()).unwrap(),
195            "You are adding to the bloom filter that is full"
196        );
197    }
198
199    #[test]
200    fn test_multiple_elements() {
201        let mut b = BloomFilter::new(20000, 0.01, true);
202        let elements = vec![
203            "Bin gar keine Russin, stamm’ aus Litauen, echt deutsch.",
204            "And when we were children, staying at the arch-duke’s,",
205            "My cousin’s, he took me out on a sled,",
206            "And I was frightened. He said, Marie,",
207            "Marie, hold on tight. And down we went.",
208            "In the mountains, there you feel free.",
209            "I read, much of the night, and go south in the winter.",
210        ];
211        for element in &elements {
212            b.add(element.as_bytes()).unwrap();
213        }
214        for element in &elements {
215            assert!(b.contains(element.as_bytes()));
216        }
217        assert_eq!(
218            b.contains("What are the roots that clutch, what branches grow".as_bytes()),
219            false
220        );
221        assert_eq!(elements.len(), b.len())
222    }
223    #[test]
224    fn test_multiple_duplicate_elements() {
225        let mut b = BloomFilter::new(20000, 0.01, true);
226        let elements = vec![
227            "Out of this stony rubbish? Son of man,",
228            "Out of this stony rubbish? Son of man,",
229            "You cannot say, or guess, for you know only",
230            "You cannot say, or guess, for you know only",
231        ];
232        assert_eq!(b.add(elements[0].as_bytes()).unwrap(), true);
233        assert_eq!(b.len(), 1);
234        assert_eq!(b.add(elements[1].as_bytes()).unwrap(), false);
235        assert_eq!(b.len(), 1);
236        assert_eq!(b.add(elements[2].as_bytes()).unwrap(), true);
237        assert_eq!(b.len(), 2);
238        assert_eq!(b.add(elements[3].as_bytes()).unwrap(), false);
239        assert_eq!(b.len(), 2);
240    }
241
242    #[test]
243    fn test_multiple_duplicate_elements_with_dup_check_false() {
244        let mut b = BloomFilter::new(20000, 0.01, false);
245        let elements = vec![
246            "A heap of broken images, where the sun beats,",
247            "A heap of broken images, where the sun beats,",
248            "And the dead tree gives no shelter, the cricket no relief,",
249            "And the dead tree gives no shelter, the cricket no relief,",
250        ];
251        assert_eq!(b.add(elements[0].as_bytes()).unwrap(), true);
252        assert_eq!(b.len(), 1);
253        assert_eq!(b.add(elements[1].as_bytes()).unwrap(), true);
254        assert_eq!(b.len(), 2);
255        assert_eq!(b.add(elements[2].as_bytes()).unwrap(), true);
256        assert_eq!(b.len(), 3);
257        assert_eq!(b.add(elements[3].as_bytes()).unwrap(), true);
258        assert_eq!(b.len(), 4);
259        for i in vec![
260            "A heap of broken images, where the sun beats,",
261            "And the dead tree gives no shelter, the cricket no relief,",
262        ] {
263            assert!(b.contains(i.as_bytes()))
264        }
265    }
266
267    #[test]
268    fn test_new_with_default_error_rate() {
269        let mut b = BloomFilter::new_with_default_error_rate(20000, true);
270        assert_eq!(b.add("Test".as_bytes()).unwrap(), true);
271        assert!(b.contains("Test".as_bytes()));
272    }
273}