1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
use murmur3::murmur3_32;
use bit_vec::BitVec;

use std::io::{Cursor, Read};
use std::f64::consts::{LN_2, E};

/// A trait for hashing an arbitrary stream of bytes into a bloom filter.
pub trait BloomHasher {
    /// Returns the hashed value of the bytes given some seed.
    #[inline]
    fn hash(&self, seed: u32, bytes: &[u8]) -> u32;
}

/// A unit struct for the murmur3 hash function.
pub struct Murmur3;

impl BloomHasher for Murmur3 {
    fn hash(&self, seed: u32, bytes: &[u8]) -> u32 {
        let mut cursor = Cursor::new(bytes);
        murmur3_32(cursor.by_ref(), seed)
    }
}

/// BloomFilter
///
/// An implementation of a bloom filter
pub struct BloomFilter<T> {
    hasher: T,
    k: u32,
    bit_vec: BitVec,
    insert_count: u64,
}

impl<T: BloomHasher> BloomFilter<T> {
    /// Create a new `BloomFilter` given a `hasher`,
    /// the number of hash functions to use,
    /// and the size of the underlying bit array.
    ///
    /// Typically, this function should not be called directly unless,
    /// the optimal number of hash functions and optimal array size are
    /// already known.
    pub fn new(hasher: T, k: u32, array_size: u64) -> Self {
        Self {
            hasher,
            k,
            bit_vec: BitVec::from_elem(array_size as usize, false),
            insert_count: 0,
        }
    }

    /// Create a `BloomFilter` by computing its optimal parameters.
    ///
    /// This function computes the optimal array size using
    /// ```text
    /// -(n * ln(p)) / ln(2) ^ 2
    /// ```
    /// and computes the optimal number of hash functions using
    /// ```text
    /// m / n * ln(2)
    /// ```
    pub fn optimal(hasher: T, max_elements: u64, error_rate: f64) -> Self {
        // Check error rate is valid
        if error_rate <= 0_f64 || error_rate >= 1_f64 {
            panic!("Error rate must be 0 <= error_rate < 1");
        }

        // Calculate the length of the bit vector
        let m = optimal_vec_size(max_elements as u64, error_rate);

        // Calculate the number of hash functions to use
        let k = optimal_hash_functions(m, max_elements as u64);

        // Create the bloom filter
        Self::new(hasher, k, m)
    }

    /// Insert a slice of bytes into the `BloomFilter`.
    pub fn insert(&mut self, bytes: &[u8]) {
        for seed in 0..self.k {
            let hash = self.hasher.hash(seed, bytes) as usize % self.bit_vec.len();
            self.bit_vec.set(hash, true);
        }
        self.insert_count += 1;
    }

    /// Insert a slice of slices of bytes into the `BloomFilter`.
    pub fn insert_all<B: AsRef<[u8]>>(&mut self, slice: &[B]) {
        for item in slice {
            self.insert(item.as_ref());
        }
    }

    /// Check whether a slice of bytes exists in the `BloomFilter`.
    ///
    /// This is a probabilistic function that may return a false positive but will
    /// never return a false negative.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate bloom_filter_rs as bloom_filter;
    ///
    /// use std::vec::Vec;
    /// use bloom_filter::{BloomFilter, Murmur3};
    ///
    /// let words = vec!["Hello", "I", "am", "some", "words"];
    ///
    /// let mut bloom_filter = BloomFilter::optimal(Murmur3, words.len() as u64, 0.01);
    ///
    /// bloom_filter.insert_all(&words);
    ///
    /// for word in words.iter() {
    ///     assert!(bloom_filter.contains(&word));
    /// }
    /// ```
    pub fn contains<B: AsRef<[u8]>>(&self, bytes: B) -> bool {
        for seed in 0..self.k {
            let hash = self.hasher.hash(seed, bytes.as_ref()) as usize % self.bit_vec.len();
            if !self.bit_vec[hash] {
                return false;
            }
        }

        true
    }

    /// Calculate the expected false positive rate given the current state of
    /// the `BloomFilter`.
    pub fn false_positive_rate(&self) -> f64 {
        false_positive_rate(self.insert_count, self.bit_vec.len() as u64, self.k)
    }
}

/// This function computes the false positive rate given n, m, and k.
#[inline]
fn false_positive_rate(n: u64, m: u64, k: u32) -> f64 {
    (1_f64 - E.powf(-(n as f64) * m as f64 / k as f64)).powf(k as f64)
}

#[inline]
fn optimal_hash_functions(m: u64, n: u64) -> u32 {
    1_f64.max(m as f64 / n as f64 * LN_2).ceil() as u32
}

#[inline]
fn optimal_vec_size(n: u64, p: f64) -> u64 {
    (-(n as f64 * p.ln()) / LN_2.powi(2)).ceil() as u64
}

#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufReader, BufRead};
    use super::*;

    #[test]
    fn test_optimal_hash_functions() {
        assert_eq!(1, optimal_hash_functions(1, 10));
        assert_eq!(7, optimal_hash_functions(95851, 10000));
        assert_eq!(7, optimal_hash_functions(9586, 1000));
        assert_eq!(7, optimal_hash_functions(90000, 10000));
    }

    #[test]
    fn test_optimal_vec_size() {
        assert_eq!(95851, optimal_vec_size(10000, 0.01));
        assert_eq!(9586, optimal_vec_size(1000, 0.01));
    }

    #[test]
    #[should_panic]
    fn test_error_rate_too_low() {
        BloomFilter::optimal(Murmur3, 10000, 0_f64);
    }

    #[test]
    #[should_panic]
    fn test_error_rate_too_high() {
        BloomFilter::optimal(Murmur3, 10000, 1_f64);
    }

    #[test]
    fn test_no_false_negatives() {
        let words: Vec<String> = BufReader::new(File::open("./resources/1000.txt").unwrap())
            .lines()
            .map(|s| s.unwrap())
            .collect();

        let mut bloom_filter = BloomFilter::optimal(Murmur3, words.len() as u64, 0.01);

        bloom_filter.insert_all(&words);

        for word in words.iter() {
            assert!(bloom_filter.contains(&word));
        }
//
//        let bloom_filter = BloomFilter::from_iter(words.iter(), Murmur3, 0.01);
//
//        for word in words.iter() {
//            assert!(bloom_filter.contains(&word));
//        }
    }
}