1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
//! `BloomFilter` is a probabilistic data structure that can quickly and definitively conclude that it does //! *not* contain an item. On the other hand, it can only conclude that it *probably* contains an //! item, i.e., the data structure has an inherent false-positive rate greater than 0%. //! //! Items can be added to the Bloom filter, but cannot be removed - this would introduce false //! negative cases. If this is required, an alternative might be to use a Counting Bloom filter //! (not (yet) implemented in this crate). //! //! This implementation is backed by a Rust implementation of the [xxHash hashing //! algorithm](https://github.com/Cyan4973/xxHash), [twox-hash](https://crates.io/crates/twox-hash). //! //! # References //! - [Less Hashing, Same Performance: Building a Better Bloom //! Filter](https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf) //! - [Wikipedia article](https://en.wikipedia.org/wiki/Bloom_filter) //! - [Bloom Filters by Example](https://llimllib.github.io/bloomfilter-tutorial/) //! - [Bloom Filter Calculator](https://hur.st/bloomfilter/) use bitvec::{bitvec, BitVec}; use std::f64::consts::{E, LN_2}; use std::hash::{BuildHasher, Hash, Hasher}; use std::marker::PhantomData; use twox_hash::RandomXxHashBuilder; const LN2_SQUARED: f64 = LN_2 * LN_2; /// Represents a Bloom filter. /// /// When constructing the filter using `new`, you need to specify the desired acceptable /// false-positive rate, and the number of items you intend to store in the filter. Neither can be /// adjusted after creation - create a new filter instead. /// /// # Example /// ```rust /// use flit::BloomFilter; /// /// let mut filter = BloomFilter::new(0.01, 10000); /// filter.add(&"Hello, world!"); /// /// assert_eq!(filter.might_contain(&"Hello, world!"), true); // probably true /// assert_eq!(filter.might_contain(&"Dogs are cool!"), false); // definitely false! /// ``` pub struct BloomFilter<T> { n: u64, m: u64, k: u32, bit_vec: BitVec, build_hasher: RandomXxHashBuilder, _phantom: PhantomData<T>, } impl<T: Hash> BloomFilter<T> { /// Creates a new Bloom filter based on the required false positive rate and the estimated /// number of items that will be added to the filter. /// /// The parameters influence the size of the filter, as well as the number of /// hashes that must be applied to the items. /// /// # Panics /// /// This function will panic if `false_positive_rate` is not between 0 and 1 (non inclusive), /// or if `estimated_items` is not greater than 0. pub fn new(false_positive_rate: f64, estimated_items: usize) -> Self { assert!( false_positive_rate > 0_f64 && false_positive_rate < 1_f64, "False positive rate must be between 0 and 1 (non-inclusive)" ); assert!( estimated_items > 0, "Number of estimated items must be greater than zero" ); let num_bits = -(estimated_items as f64) * false_positive_rate.ln() / LN2_SQUARED; let num_hashes = (num_bits / estimated_items as f64) * LN_2; let num_bits = num_bits.ceil() as u64; let num_hashes = num_hashes.ceil() as u32; BloomFilter { n: 0, m: num_bits, k: num_hashes, bit_vec: bitvec![0; num_bits as usize], build_hasher: RandomXxHashBuilder::default(), _phantom: PhantomData, } } /// Adds the `item` to the filter by setting the appropriate bits in the filter to `true`. pub fn add(&mut self, item: &T) { for i in indices_for_hash(split_hash(item, &self.build_hasher), self.m, self.k) { self.bit_vec.set(i, true); } self.n += 1; } /// Checks if the filter *might* contain the `item`. /// /// If this function returns false, the filter definitely does not contain the item. /// If this function returns true, the filter *might* contain the item, but it might also be a /// false-positive. pub fn might_contain(&self, item: &T) -> bool { for i in indices_for_hash(split_hash(item, &self.build_hasher), self.m, self.k) { if !self.bit_vec[i] { return false; } } true } /// Calculates the current expected false positive rate given the number of items in the /// filter. pub fn false_positive_rate(&self) -> f64 { (1_f64 - E.powf(-1_f64 * f64::from(self.k) * self.n as f64 / self.m as f64)) .powi(self.k as i32) } } /// Hashes `item` using a `Hasher`, and produces a two-element tuple. /// The first element is the "upper half" of the `u64` produced by the hash function, and the second /// element is the "lower half". fn split_hash<T: Hash>(item: &T, hasher: &impl BuildHasher) -> (u32, u32) { let mut hasher = hasher.build_hasher(); item.hash(&mut hasher); let hash = hasher.finish(); (((hash >> 32) as u32), hash as u32) } /// Returns the indices to be set to "true" in a Bloom filter for a given hash. /// /// `split_hash` is a tuple of two `u32` values produced by passing the item to be added through /// the `split_hash` function. /// `m` is the number of indices in the filter. /// `k` is the number of hash functions that the item should be passed through. fn indices_for_hash(split_hash: (u32, u32), m: u64, k: u32) -> impl Iterator<Item = usize> { (0..k).map(move |i| { (u64::from(split_hash.0.wrapping_add(split_hash.1.wrapping_mul(i))) % m) as usize }) } #[cfg(test)] mod tests { use super::*; #[test] fn test_num_bits_and_hashes() { let filter = BloomFilter::<&str>::new(0.01_f64, 216553); assert_eq!(filter.m, 2_075_674); assert_eq!(filter.k, 7); } #[test] fn test_false_positive_rate_empty() { let filter = BloomFilter::<&str>::new(0.01_f64, 216553); // False positive rate with nothing added to the filter should be 0. assert_eq!(filter.false_positive_rate(), 0_f64); } #[test] fn test_add() { let mut filter = BloomFilter::new(0.03_f64, 10); filter.add(&"Hello, world!"); assert!(filter.false_positive_rate() > 0.0); assert_eq!(filter.might_contain(&"Hello, world!"), true); assert_eq!(filter.might_contain(&"Dogs are cool!"), false); } }