bloomfilter_rs/filter.rs
1use crate::random_sequence;
2use core::iter::zip;
3use fasthash_fork;
4use rand::Rng;
5use std::collections::HashSet;
6use std::time;
7/// Bloom Filter
8pub struct Bloomfilter {
9 /// Storing the 1s if activated by a hash funtion, zero otherwise.
10 bitmap: Vec<u8>,
11 /// The family of hash function thats will be used.
12 seeds_for_hash_function: Vec<u64>,
13 /// The hash function that will be used
14 hash_function: fn(&[u8], u64) -> u64,
15}
16
17impl Bloomfilter {
18 /// Get a new bloom filter with a bitmap of length `n_bits` and `n_hash_functions`
19 /// hash functions.
20 ///
21 /// # Arguments
22 ///
23 /// * `n_bits`: How many bits should the bitmap of the bloomfilter have? More bits make
24 /// collision (False Positives) more unlikely.
25 ///
26 /// * `n_hash_functions`: How many hash functions should be used?
27 /// # Examples
28 ///
29 /// ```
30 /// use bloomfilter_rs::filter;
31 ///
32 /// let bloom_filter = filter::Bloomfilter::new(
33 /// 10,
34 /// 20
35 /// );
36 /// ```
37 pub fn new(n_bits: usize, n_hash_functions: usize) -> Self {
38 Bloomfilter {
39 bitmap: vec![0; n_bits],
40 seeds_for_hash_function: get_random_seeds(n_hash_functions),
41 hash_function: |element, seed| fasthash_fork::xx::hash64_with_seed(element, seed),
42 }
43 }
44 /// Generate a bloomfilter with non-random seeds.
45 ///
46 /// # Arguments
47 ///
48 /// * `n_bits`: How many bits should the bitmap of the bloomfilter have? More bits make
49 /// collision (False Positives) more unlikely.
50 ///
51 /// * `seeds`: Provide custom seeds to parameterize the hash functions. Given a different seed, the hash function
52 /// would hash the same objects to different hashes. You should not provide the same seeds twice.
53 ///
54 /// # Examples
55 ///
56 /// ```
57 /// use bloomfilter_rs::filter;
58 ///
59 /// let bloom_filter = filter::Bloomfilter::new_with_seeds(
60 /// 10,
61 /// vec![1, 10, 100],
62 /// );
63 /// ```
64 pub fn new_with_seeds(n_bits: usize, seeds: Vec<u64>) -> Self {
65 Bloomfilter {
66 bitmap: vec![0; n_bits],
67 seeds_for_hash_function: seeds,
68 hash_function: |element, seed| fasthash_fork::xx::hash64_with_seed(element, seed),
69 }
70 }
71 /// Generate a bloomfilter with non-random seeds and a custom hash function.
72 ///
73 /// # Arguments
74 ///
75 /// * `n_bits`: How many bits should the bitmap of the bloomfilter have? More bits make
76 /// collision (False Positives) more unlikely.
77 ///
78 /// * `seeds`: Provide custom seeds to parameterize the hash functions. Given a different seed, the hash function
79 /// would hash the same objects to different hashes. You should not provide the same seeds twice.
80 ///
81 /// * `hash_function`: Specify a custom hash function. Hash functions should take in the bytes of the data and a seed to
82 /// parmetrize the behaviour of the hash function. Given different hash values the function should show different behaviour.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use bloomfilter_rs::filter;
88 ///
89 /// let bloom_filter = filter::Bloomfilter::new_with_seeds_and_custom_hash_function(
90 /// 10,
91 /// vec![1, 10, 100],
92 /// |bytes, seed| -> u64
93 /// {(bytes.iter().map(|elem| *elem as u64).sum::<u64>() + seed).into()}
94 /// );
95 /// ```
96 pub fn new_with_seeds_and_custom_hash_function(
97 n_bits: usize,
98 seeds: Vec<u64>,
99 hash_function: fn(&[u8], u64) -> u64,
100 ) -> Self {
101 Bloomfilter {
102 bitmap: vec![0; n_bits],
103 seeds_for_hash_function: seeds,
104 hash_function,
105 }
106 }
107 /// Create a random bloomfilter given its supposed characteristics.
108 ///
109 /// Give the number of expected entries and the acceptable false positive rates, we can create a
110 /// hashfilter that will have the expected characteristic. If we use more entries then expected, the
111 /// false positive rate might decrease.
112 ///
113 /// # Arguments:
114 /// * expected_nubmber_of_entries: How many entries do you expect in the hash filter?
115 /// * acceptable_false_positive_rate: What false positive rate is acceptabel for your use-case?
116 ///
117 ///
118 /// # Examples
119 ///
120 /// ```
121 /// use bloomfilter_rs::filter;
122 ///
123 /// let my_hash_filter = filter::Bloomfilter::from_specification(10, 0.5);
124 /// ```
125 pub fn from_specification(
126 expected_number_of_entries: usize,
127 acceptable_false_positive_rate: f64,
128 ) -> Self {
129 // Formulas taken from https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions.
130 let optimal_number_of_bits = (-(expected_number_of_entries as f64
131 * f64::ln(acceptable_false_positive_rate)))
132 / (f64::powi(f64::ln(2.0), 2));
133 let optimal_number_hash_functions = -f64::ln(acceptable_false_positive_rate) / f64::ln(2.0);
134 Bloomfilter::new(
135 optimal_number_of_bits.ceil() as usize,
136 optimal_number_hash_functions.floor() as usize,
137 )
138 }
139 /// How many bits does the bitmap of the filter have?
140 /// # Examples
141 ///
142 /// ```
143 /// use bloomfilter_rs::filter;
144 ///
145 /// let bloom_filter = filter::Bloomfilter::new(
146 /// 10,
147 /// 20
148 /// );
149 /// println!("{}", bloom_filter.number_of_bits())
150 /// ```
151
152 pub fn number_of_bits(&self) -> usize {
153 self.bitmap.len()
154 }
155 /// Insert an object into the bloomfilter. The object should be a string.
156 ///
157 /// If you map higher-level objects (struct ect.) to string, make sure that the representation does exactly
158 /// identify each object (e.g. two different objects cannot map to the same string representation).
159 ///
160 /// # Arguments
161 ///
162 /// * `object`: The object to insert.
163 ///
164 /// # Examples
165 ///
166 /// ```
167 /// use bloomfilter_rs::filter;
168 ///
169 /// let mut bloom_filter = filter::Bloomfilter::new(
170 /// 10,
171 /// 20,
172 /// );
173 /// bloom_filter.insert("To be inserted");
174 /// ```
175
176 pub fn insert(&mut self, object_as_string: &str) {
177 let n_bits = self.number_of_bits();
178 for seed in &self.seeds_for_hash_function {
179 self.bitmap[bytes_to_position_in_bitmap(
180 *seed,
181 object_as_string.as_bytes(),
182 n_bits,
183 self.hash_function,
184 )] = 1;
185 }
186 }
187 /// Insert an object as bytes into the bloomfilter.
188 ///
189 /// # Arguments
190 ///
191 /// * `object_as_bytes`: The object to insert.
192 ///
193 /// # Examples
194 ///
195 /// ```
196 /// use bloomfilter_rs::filter;
197 ///
198 /// let mut bloom_filter = filter::Bloomfilter::new(
199 /// 10,
200 /// 20,
201 /// );
202 /// bloom_filter.insert_object_as_bytes("To be inserted".as_bytes());
203 /// ```
204 pub fn insert_object_as_bytes(&mut self, object_as_bytes: &[u8]) {
205 let n_bits = self.number_of_bits();
206 for seed in &self.seeds_for_hash_function {
207 self.bitmap
208 [bytes_to_position_in_bitmap(*seed, object_as_bytes, n_bits, self.hash_function)] =
209 1;
210 }
211 }
212 /// Test that an object is in the hash filter. The objects needs to be string.
213 ///
214 /// The bloomfilter is probabilistic in nature. False negatives cannot occur (if `bloomfilter.contains`
215 /// returns false, that will always be true), but false positives can occur.
216 ///
217 /// # Arguments
218 ///
219 /// * `object_as_string`: The object to check if it is contained in the bloomfilter.
220 ///
221 /// # Examples
222 ///
223 /// ```
224 /// use bloomfilter_rs::filter;
225 ///
226 /// let mut bloom_filter = filter::Bloomfilter::new(
227 /// 10,
228 /// 20,
229 /// );
230 /// bloom_filter.contains("To be inserted");
231 /// ```
232 pub fn contains(&self, object_as_string: &str) -> bool {
233 let mut bit_map_for_elem: Vec<u8> = vec![0; self.number_of_bits()];
234 for seed in &self.seeds_for_hash_function {
235 bit_map_for_elem[bytes_to_position_in_bitmap(
236 *seed,
237 object_as_string.as_bytes(),
238 self.number_of_bits(),
239 self.hash_function,
240 )] = 1
241 }
242
243 all_elements_bitmap_a_also_in_b(&self.bitmap, &bit_map_for_elem).unwrap()
244 }
245 /// Test that an object as byte-array is in the hash filter.
246 ///
247 /// # Arguments
248 ///
249 /// * `object_as_bytes`: The object to check if it is contained in the bloomfilter.
250 ///
251 /// # Examples
252 ///
253 /// ```
254 /// use bloomfilter_rs::filter;
255 ///
256 /// let mut bloom_filter = filter::Bloomfilter::new(
257 /// 10,
258 /// 20,
259 /// );
260 /// bloom_filter.contains_object_as_bytes("To be inserted".as_bytes());
261 /// ```
262 pub fn contains_object_as_bytes(&self, object_as_bytes: &[u8]) -> bool {
263 let mut bit_map_for_elem: Vec<u8> = vec![0; self.number_of_bits()];
264 for seed in &self.seeds_for_hash_function {
265 bit_map_for_elem[bytes_to_position_in_bitmap(
266 *seed,
267 object_as_bytes,
268 self.number_of_bits(),
269 self.hash_function,
270 )] = 1
271 }
272
273 all_elements_bitmap_a_also_in_b(&self.bitmap, &bit_map_for_elem).unwrap()
274 }
275}
276
277// Utilities used for the bloomfilter-implementation
278/// Generate seeds that are used within the bloom filter
279/// to parameterize hash-functions be have different behaviour.
280/// # Arguments
281///
282/// * `numbers of seeds` - How many seeds should be generated? Should be the same number of needed hash
283/// functions in the bloom filter.
284///
285fn get_random_seeds(number_of_seeds: usize) -> Vec<u64> {
286 (0..number_of_seeds)
287 .map(|_| rand::thread_rng().gen())
288 .collect()
289}
290/// The if bitmap `contains_all` contains all entries that are in `contains_some`
291/// # Arguments
292///
293/// * `contains_all` - Bitmap that should contain all entries.
294/// * `contains_some` - Bitmap that should contain some entries, but only entries from `contains_all`.
295///
296fn all_elements_bitmap_a_also_in_b(
297 contains_all: &Vec<u8>,
298 contains_some: &Vec<u8>,
299) -> Option<bool> {
300 if contains_all.len() != contains_some.len() {
301 return None;
302 }
303 for (elem_in_contains_all, elem_in_contains_some) in zip(contains_all, contains_some) {
304 if *elem_in_contains_all == 0 && *elem_in_contains_some == 1 {
305 return Some(false);
306 }
307 }
308 Some(true)
309}
310/// Given an object of generic type (needs to implement `as_bytes`) hash the elem and map it to a bitmap
311/// of length `number_of_bits`.
312///
313/// # Arguments:
314/// * `seed`: Seed that should be used for the hash function.
315/// * `bytes`: Bytes that should be added to the bloomfilter.
316/// * `number_of_bits`: The number of bits in the bitmap.
317/// * `hash_function`: The hash function that should be used for the mapping. It is assumed to take a reference to a bytes
318/// array (the data that should be hashed) and a seed.
319fn bytes_to_position_in_bitmap(
320 seed: u64,
321 bytes: &[u8],
322 number_of_bits: usize,
323 hash_function: fn(&[u8], u64) -> u64,
324) -> usize {
325 let elem_as_hash = hash_function(bytes, seed);
326 (elem_as_hash % number_of_bits as u64) as usize
327}
328/// This function computes the actual false_positive rate for a bloomfilter.
329/// The formula is taken from `https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives`
330pub fn compute_false_positive_rate_of_bloomfilter(
331 n_hash_functions: usize,
332 n_bits_in_bitmap: usize,
333 number_of_expected_entries: usize,
334) -> f64 {
335 let inner_most_bracket = 1.0 as f64 - ((1.0 as f64) / (n_bits_in_bitmap as f64));
336 let outer_bracket = 1.0 as f64
337 - f64::powf(
338 inner_most_bracket,
339 (n_hash_functions * number_of_expected_entries) as f64,
340 );
341 f64::powf(outer_bracket, n_hash_functions as f64)
342}
343/// Create a hash-filter given the specification of the function and test
344/// its false positives, false negatives rate and time to insert object /
345/// test that objects are in there.
346///
347/// # Arguments:
348/// * `n_samples_in_bloom_filter`: The number of samples that will be added to the bloomfilter.
349/// * `n_samples_not_in_bloom_filter`: The number of samples that are definitly not in the bloomfilter but
350/// that are used for testing if they the bloomfilter reports them as being in the filter.
351/// * `n_bits_of_bloomfilter`: The number of bits that the bloomfilter has.
352/// * `n_hash_functions_in_bloomfilter`: The number of hash functions that are used in the bloomfilter.
353///
354/// Returns:
355/// * (false_positive_rate, false_negative_rate, insertion_time_per_1000_entries, contains_time_per_1000_entry, expected_false_positive_rate)
356pub fn compute_mistakes_and_execution_time_bloomfilter(
357 n_samples_in_bloom_filter: usize,
358 n_samples_not_in_bloom_filter: usize,
359 n_bits_of_bloomfilter: usize,
360 n_hash_functions_in_bloomfilter: usize,
361) -> (f64, f64, u64, u64, f64) {
362 // Sampling 1000 sequence with 20 elements that will be added to bloom filer
363 let mut sequences_in_bloom_filter: HashSet<Vec<i32>> =
364 HashSet::with_capacity(n_samples_in_bloom_filter);
365 while sequences_in_bloom_filter.len() < n_samples_in_bloom_filter {
366 sequences_in_bloom_filter.insert(random_sequence::random_sequence_from_range(0..100, 20));
367 }
368
369 let mut bloom_filter = Bloomfilter::new(n_bits_of_bloomfilter, n_hash_functions_in_bloomfilter);
370
371 let before = time::Instant::now();
372 for sequence in &sequences_in_bloom_filter {
373 bloom_filter.insert(format!("{:?}", sequence).as_ref())
374 }
375 let total_insertion_time = duration_to_micro_seconds(before.elapsed());
376
377 // Test false negatives rate
378 let mut false_negatives = 0.0;
379 for sequence in &sequences_in_bloom_filter {
380 if !bloom_filter.contains(format!("{:?}", sequence).as_ref()) {
381 false_negatives += 1.0;
382 }
383 }
384
385 // Test false positives rate.
386 let mut false_positives = 0.0;
387 let mut total_contains_time = 0;
388 for _ in 0..n_samples_not_in_bloom_filter {
389 let new_sequence = random_sequence::random_sequence_from_range(0..100, 20);
390 if !sequences_in_bloom_filter.contains(&new_sequence) {
391 let before = time::Instant::now();
392 if bloom_filter.contains(format!("{:?}", new_sequence).as_ref()) {
393 false_positives += 1.0;
394 }
395 total_contains_time += duration_to_micro_seconds(before.elapsed())
396 }
397 }
398 let total_contains_time = total_contains_time;
399
400 (
401 false_positives / n_samples_not_in_bloom_filter as f64,
402 false_negatives / n_samples_in_bloom_filter as f64,
403 total_insertion_time / 1000,
404 total_contains_time / 1000,
405 compute_false_positive_rate_of_bloomfilter(
406 n_hash_functions_in_bloomfilter,
407 n_bits_of_bloomfilter,
408 n_samples_in_bloom_filter,
409 ),
410 )
411}
412/// Repeate the `compute_mistakes_and_execution_time_bloomfilter` and average the
413/// individual result to get more robust estimates.
414pub fn repeat_compute_mistakes_and_execution_time_bloomfilter(
415 n_samples_in_bloom_filter: usize,
416 n_samples_not_in_bloom_filter: usize,
417 n_bits_of_bloomfilter: usize,
418 n_hash_functions_in_bloomfilter: usize,
419 n_repititions: usize,
420) -> (f64, f64, f64, f64, f64) {
421 let mut total_false_positives_rate = 0.0;
422 let mut total_false_negatives_rate = 0.0;
423 let mut total_insertion_time = 0.0;
424 let mut total_contains_time = 0.0;
425 let mut expected_false_positive_rate = 0.0;
426
427 for _ in 0..n_repititions {
428 let (
429 false_positive_rate,
430 false_negative_rate,
431 _total_insertion_time,
432 _total_contains_time,
433 _expected_false_positive_rate,
434 ) = compute_mistakes_and_execution_time_bloomfilter(
435 n_samples_in_bloom_filter,
436 n_samples_not_in_bloom_filter,
437 n_bits_of_bloomfilter,
438 n_hash_functions_in_bloomfilter,
439 );
440 total_false_positives_rate += false_positive_rate;
441 total_false_negatives_rate += false_negative_rate;
442 total_insertion_time += _total_insertion_time as f64;
443 total_contains_time += _total_contains_time as f64;
444 expected_false_positive_rate = _expected_false_positive_rate;
445 }
446 let n_repititions = n_repititions as f64;
447 (
448 total_false_positives_rate / n_repititions,
449 total_false_negatives_rate / n_repititions,
450 total_insertion_time / n_repititions,
451 total_contains_time / n_repititions,
452 expected_false_positive_rate,
453 )
454}
455
456/// Compute the number of micro seconds in a `Duration`-object.
457fn duration_to_micro_seconds(duration: time::Duration) -> u64 {
458 let nanos = duration.subsec_nanos() as u64;
459 (1000 * 1000 * 1000 * duration.as_secs() + nanos) / (1000)
460}
461
462#[cfg(test)]
463pub mod tests {
464 mod test_utilities {
465
466 #[test]
467 fn test_duration_to_micro_seconds() {
468 use super::super::duration_to_micro_seconds;
469 use std::time;
470 assert_eq!(
471 duration_to_micro_seconds(time::Duration::from_micros(100.5 as u64)),
472 100.5 as u64
473 );
474 }
475
476 #[test]
477 /// Test `compute_false_positive_rate_of_bloomfilter` runs through with
478 /// reasonable input values.
479 fn test_compute_mistakes_and_execution_time_bloomfilter() {
480 use super::super::compute_false_positive_rate_of_bloomfilter;
481
482 _ = compute_false_positive_rate_of_bloomfilter(10, 10, 5);
483 }
484 #[test]
485 /// Test `repeate_compute_false_positive_rate_of_bloomfilter` runs through with
486 /// reasonable input values.
487 fn test_repeat_compute_mistakes_and_execution_time_bloomfilter() {
488 use super::super::repeat_compute_mistakes_and_execution_time_bloomfilter;
489
490 _ = repeat_compute_mistakes_and_execution_time_bloomfilter(10, 10, 5, 10, 5);
491 }
492 #[test]
493 fn test_get_random_seed() {
494 // Simply test the function runs through, not much more that can be test
495 // given it stochasticity.
496 use super::super::get_random_seeds;
497 let number_seeds_to_generate = 10;
498 let generated_random_seeds = get_random_seeds(number_seeds_to_generate);
499 assert_eq!(generated_random_seeds.len(), number_seeds_to_generate);
500 }
501 mod test_all_elements_bitmap_a_also_in_b {
502 use super::super::super::all_elements_bitmap_a_also_in_b;
503 #[test]
504 fn different_length() {
505 // Both bitmaps have different length, hence `None` is returned.
506 assert_eq!(
507 all_elements_bitmap_a_also_in_b(&vec![0, 0, 0, 0], &vec![0, 0, 0]),
508 None
509 )
510 }
511
512 #[test]
513 fn a_equals_b_both_empty() {
514 // Both bitmaps are entry, therefore all elements in b are also in a
515 assert_eq!(
516 all_elements_bitmap_a_also_in_b(&vec![0, 0, 0, 0], &vec![0, 0, 0, 0]),
517 Some(true)
518 )
519 }
520
521 #[test]
522 fn a_equals_b_both_full() {
523 // Both bitmaps are completely full, therefore all elements in b are also in a
524 assert_eq!(
525 all_elements_bitmap_a_also_in_b(&vec![1, 1, 1, 1], &vec![1, 1, 1, 1]),
526 Some(true)
527 )
528 }
529
530 #[test]
531 fn a_equals_b_mixed_entries() {
532 // Some elements in bitmap, but the same for a and b therefore all elements in b
533 // are also in a.
534 assert_eq!(
535 all_elements_bitmap_a_also_in_b(&vec![1, 0, 0, 1], &vec![1, 0, 0, 1]),
536 Some(true)
537 )
538 }
539
540 #[test]
541 fn a_more_entries_than_b() {
542 // A has more entries in its bitmap, therefore all elements in b are also in a.
543 assert_eq!(
544 all_elements_bitmap_a_also_in_b(&vec![1, 1, 0, 1], &vec![1, 0, 0, 1]),
545 Some(true)
546 )
547 }
548
549 #[test]
550 fn b_more_entries_than_a() {
551 // B has one element that, a does not have, therefore return Value is `Some(false)`
552 assert_eq!(
553 all_elements_bitmap_a_also_in_b(&vec![1, 1, 0, 1], &vec![1, 0, 1, 1]),
554 Some(false)
555 )
556 }
557 }
558 mod test_elem_to_position_in_bitmap {
559 use super::super::super::bytes_to_position_in_bitmap;
560 /// Small hash function that sums the integer in the bytes array and
561 /// adds the seed. This does not make sense from a hashing perspective,
562 /// and is only used in testing to make the results predictable.
563 fn hash_function_for_testing(bytes: &[u8], seed: u64) -> u64 {
564 (bytes.iter().map(|elem| *elem as u64).sum::<u64>() + seed).into()
565 }
566
567 #[test]
568 /// Test that the hash function just sums all entries and adds
569 /// the seed.
570 fn test_hash_function_for_testing() {
571 assert_eq!(hash_function_for_testing(&vec![1, 2, 3], 5), 11);
572 }
573 #[test]
574 /// Take adding seed to the entries in the vec gives you 3, which given a bit map of size three
575 /// should result in entry 3.
576 fn array_2_entries_sum_3() {
577 assert_eq!(
578 bytes_to_position_in_bitmap(1, &vec![1, 1], 4, hash_function_for_testing),
579 3
580 );
581 }
582 #[test]
583 /// Take adding seed to the entries in the vec gives you 3, which given a bit map of size three
584 fn array_3_entries_sum_3() {
585 assert_eq!(
586 bytes_to_position_in_bitmap(3, &vec![1, 4, 9], 4, hash_function_for_testing),
587 1
588 );
589 }
590 #[test]
591 /// Add a string as bytes.
592 fn test_with_string_to_bytes() {
593 bytes_to_position_in_bitmap(3, "testing".as_bytes(), 4, hash_function_for_testing);
594 }
595 #[test]
596 /// Add an integer as bytes.
597 fn test_with_integer_as_bytes() {
598 bytes_to_position_in_bitmap(3, &1_u32.to_ne_bytes(), 4, hash_function_for_testing);
599 }
600 }
601 mod test_constructors {
602 use super::super::super::Bloomfilter;
603
604 #[test]
605 /// Test `Bloomfilter::random`-constructor.
606 fn test_random() {
607 let bloom_filter = Bloomfilter::new(10, 20);
608 assert_eq!(bloom_filter.bitmap.len(), 10);
609 assert_eq!(bloom_filter.seeds_for_hash_function.len(), 20);
610 }
611 #[test]
612 /// Test `Bloomfilter::new_with_seeds`-constructor.
613 fn test_new_with_seeds() {
614 let seeds: Vec<u64> = vec![1, 2, 3];
615 let bloom_filter = Bloomfilter::new_with_seeds(10, seeds.clone());
616 assert_eq!(bloom_filter.bitmap.len(), 10);
617 assert_eq!(bloom_filter.seeds_for_hash_function, seeds);
618 }
619 #[test]
620 /// Test `Bloomfilter::new_with_seeds_and_custom_hash_function`-constructor.
621 fn test_new_with_seeds_and_custom_hash_function() {
622 let seeds: Vec<u64> = vec![1, 2, 3];
623 let hash_function = |_object_as_bytes: &[u8], _seed: u64| -> u64 { 2 };
624 let bloom_filter = Bloomfilter::new_with_seeds_and_custom_hash_function(
625 10,
626 seeds.clone(),
627 hash_function,
628 );
629 assert_eq!(bloom_filter.bitmap.len(), 10);
630 assert_eq!(bloom_filter.seeds_for_hash_function, seeds);
631 // Cannot compare closures, therefore call it and check it returns 2.
632 assert_eq!((bloom_filter.hash_function)(&[1, 0, 1,], 4), 2);
633 }
634 }
635 mod test_getter_functions {
636 use super::super::super::Bloomfilter;
637 #[test]
638 fn bloom_filter_with_5_bits() {
639 let bloom_filter = Bloomfilter::new(5, 10);
640 assert_eq!(bloom_filter.number_of_bits(), 5);
641 }
642 }
643
644 mod test_insertion_and_retrivial {
645 use super::super::super::Bloomfilter;
646
647 fn sum_up_bitmap(bitmap: &[u8]) -> u64 {
648 bitmap.iter().map(|indicator| *indicator as u64).sum()
649 }
650 #[test]
651 fn test_insert_string() {
652 let mut bloomfilter = Bloomfilter::new(100, 10);
653 bloomfilter.insert("This is in there");
654
655 // After the `insert`-function was called with 10 hash functions, the sum of the bitmap
656 // has to be between 1..=10. (1 = 10 collisions, e.g. all hash functions mapped to the
657 // same entry, 10 meaning all hash_function mapped this string to a different index
658 // in the bitmap.)
659 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
660 assert!((sum_of_entries > 0) & (sum_of_entries <= 10));
661
662 bloomfilter.insert("This is also in here!");
663
664 // After the second insertion there may be between 1 and 20 entries.
665 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
666 assert!((sum_of_entries > 0) & (sum_of_entries <= 20));
667 }
668 #[test]
669 fn test_insert_bytes() {
670 let mut bloomfilter = Bloomfilter::new(100, 10);
671 bloomfilter.insert_object_as_bytes(&1_u32.to_ne_bytes());
672
673 // After the `insert_object_as_bytes`-function was called with 10 hash functions, the sum of the bitmap
674 // has to be between 1..=10. (1 = 10 collisions, e.g. all hash functions mapped to the
675 // same entry, 10 meaning all hash_function mapped this string to a different index
676 // in the bitmap.)
677 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
678 assert!((sum_of_entries > 0) & (sum_of_entries <= 10));
679
680 bloomfilter.insert_object_as_bytes("This is also in here!".as_bytes());
681
682 // After the second insertion there may be between 1 and 20 entries.
683 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
684 assert!((sum_of_entries > 0) & (sum_of_entries <= 20));
685 }
686 #[test]
687 /// Mix the two interfaces `insert` and `insert_object_as_bytes` and test that the same
688 /// properties still hold for the populated bitmap.
689 fn test_insert_bytes_and_string() {
690 let mut bloomfilter = Bloomfilter::new(100, 10);
691 bloomfilter.insert_object_as_bytes(&1_u32.to_ne_bytes());
692
693 // After the `insert_object_as_bytes`-function was called with 10 hash functions, the sum of the bitmap
694 // has to be between 1..=10. (1 = 10 collisions, e.g. all hash functions mapped to the
695 // same entry, 10 meaning all hash_function mapped this string to a different index
696 // in the bitmap.)
697 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
698 assert!((sum_of_entries > 0) & (sum_of_entries <= 10));
699
700 bloomfilter.insert("This is also in here!");
701
702 // After the second insertion there may be between 1 and 20 entries.
703 let sum_of_entries: u64 = sum_up_bitmap(&bloomfilter.bitmap);
704 assert!((sum_of_entries > 0) & (sum_of_entries <= 20));
705 }
706 #[test]
707 fn test_contains() {
708 let mut bloomfilter = Bloomfilter::new(100, 10);
709 bloomfilter.insert("1");
710
711 // Only with the single entry we can guarantee that the bloomfilter does not have
712 // collision, e.g. two different objects get the same representation in the bitmap.
713 assert!(bloomfilter.contains("1"));
714 // Also test that a different object is not in here.
715 assert!(!bloomfilter.contains("2"));
716 }
717 #[test]
718 fn does_not_contain() {
719 let mut bloomfilter = Bloomfilter::new(100, 10);
720 for number_as_string in vec!["1", "2", "3", "4", "5", "6", "7", "8", "9"] {
721 bloomfilter.insert(number_as_string);
722 }
723 assert!(!bloomfilter.contains("10"));
724 }
725 #[test]
726 fn test_contains_bytes() {
727 let mut bloomfilter = Bloomfilter::new(100, 10);
728 bloomfilter.insert_object_as_bytes("1".as_bytes());
729
730 // Only with the single entry we can guarantee that the bloomfilter does not have
731 // collision, e.g. two different objects get the same representation in the bitmap.
732 assert!(bloomfilter.contains_object_as_bytes("1".as_bytes()));
733 // Also test that a different object is not in here.
734 assert!(!bloomfilter.contains_object_as_bytes("2".as_bytes()));
735 }
736 #[test]
737 fn does_not_contain_bytes() {
738 let mut bloomfilter = Bloomfilter::new(100, 10);
739 for number_as_string in vec!["1", "2", "3", "4", "5", "6", "7", "8", "9"] {
740 bloomfilter.insert_object_as_bytes(number_as_string.as_bytes());
741 }
742 assert!(!bloomfilter.contains_object_as_bytes("10".as_bytes()));
743 }
744 }
745 mod test_bloomfilter_from_specification {
746 use super::super::super::super::random_sequence;
747 use super::super::super::Bloomfilter;
748 use std::collections;
749
750 #[test]
751 /// Test that `from_specification` with reasonable values
752 /// achieves roughly the expected quality.
753 fn test_from_specification() {
754 let expected_false_positive_rate = 0.1;
755 let number_of_entries_in_bloom_filter = 1000;
756 let number_of_entries_not_in_bloom_filter = 2000;
757 let margin_of_error = 0.2;
758
759 let mut bloom_filter_to_test = Bloomfilter::from_specification(
760 number_of_entries_in_bloom_filter,
761 expected_false_positive_rate,
762 );
763
764 // Compute and insert 1000 sequences.
765 let mut sequences_in_bloom_filter: collections::HashSet<Vec<i32>> =
766 collections::HashSet::with_capacity(number_of_entries_in_bloom_filter);
767 while sequences_in_bloom_filter.len() < number_of_entries_in_bloom_filter {
768 sequences_in_bloom_filter
769 .insert(random_sequence::random_sequence_from_range(0..100, 20));
770 }
771 for sequence in &sequences_in_bloom_filter {
772 bloom_filter_to_test.insert(format!("{:?}", sequence).as_ref())
773 }
774
775 // Test false positives rate.
776 let mut false_positives = 0.0;
777 for _ in 0..number_of_entries_not_in_bloom_filter {
778 let new_sequence = random_sequence::random_sequence_from_range(0..100, 20);
779 if !sequences_in_bloom_filter.contains(&new_sequence) {
780 if bloom_filter_to_test.contains(format!("{:?}", new_sequence).as_ref()) {
781 false_positives += 1.0;
782 }
783 }
784 }
785 let actual_false_positive_rate =
786 false_positives / number_of_entries_not_in_bloom_filter as f64;
787 println!("{}", actual_false_positive_rate);
788 assert!(
789 (actual_false_positive_rate * (1.0 + margin_of_error)
790 > expected_false_positive_rate)
791 & (actual_false_positive_rate * (1.0 - margin_of_error)
792 < expected_false_positive_rate)
793 )
794 }
795 }
796 }
797}