cbloom/
lib.rs

1//! A concurrent implementation of Bloom filters.
2//!
3//! Bloom filters is a simple data structure, which is used in many different situations. It can
4//! neatly solve certain problems heaurustically without need for extreme memory usage.
5//!
6//! This implementation is fairly standard, except that it uses atomic integers to work
7//! concurrently.
8
9#![deny(missing_debug_implementations)]
10
11use std::cmp;
12use std::sync::atomic::{self, AtomicU64};
13
14/// The atomic ordering used throughout the crate.
15const ORDERING: atomic::Ordering = atomic::Ordering::Relaxed;
16
17/// Hash an integer.
18///
19/// This is a pseudorandom permutation of `u64` with high statistical quality. It can thus be used
20/// as a hash function.
21fn hash(mut x: u64) -> u64 {
22    // The following is copied from SeaHash.
23
24    x = x.wrapping_mul(0x6eed0e9da4d94a4f);
25    let a = x >> 32;
26    let b = x >> 60;
27    x ^= a >> b;
28    x = x.wrapping_mul(0x6eed0e9da4d94a4f);
29
30    // We XOR with some constant to make it zero-sensitive.
31    x ^ 0x11c92f7574d3e84f
32}
33
34/// A concurrent Bloom filter.
35///
36/// Bloom filters are a probabilistic data structure, which allows you to insert elements, and
37/// later test if they were inserted. The filter will either know it doesn't contain the element,
38/// or that it might. It will never be "sure", hence the name "filter".
39///
40/// It works by having an array of bits. Every element is hashed into a sequence of these bits. The
41/// bits of the inserted elements are set to 1. When testing for membership, we simply AND the
42/// bits.
43#[derive(Debug)]
44pub struct Filter {
45    /// The bit array.
46    ///
47    /// We use `u64` to improve performance of `Filter::clear()`.
48    bits: Vec<AtomicU64>,
49    /// The number of hash functions.
50    hashers: usize,
51}
52
53impl Filter {
54    /// Get the chunk of a particular hash.
55    #[inline]
56    fn get(&self, hash: u64) -> &AtomicU64 {
57        &self.bits[(hash as usize / 64) % self.bits.len()]
58    }
59
60    /// Create a new Bloom filter with the optimal number of hash functions.
61    ///
62    /// This creates a Bloom filter with `bytes` bytes of internal data, and optimal number (for
63    /// `expected_elements` number of elements) of hash functions.
64    pub fn new(bytes: usize, expected_elements: usize) -> Filter {
65        // The number of hashers are calculated by multiplying the bits per element by ln(2), which
66        // we approximate through multiplying by an integer, then shifting. To make things more
67        // precise, we add 0x8000 to round the shift.
68        let hashers = (bytes / expected_elements)
69            .saturating_mul(45426)
70            .saturating_add(0x8000)
71            >> 16;
72        Filter::with_size_and_hashers(bytes, hashers)
73    }
74
75    /// Create a new Bloom filter with some number of bytes and hashers.
76    ///
77    /// This creates a Bloom filter with at least `bytes` bytes of internal data and `hashers`
78    /// number of hash functions.
79    ///
80    /// If `hashers` is 0, it will be rounded to 1.
81    pub fn with_size_and_hashers(bytes: usize, hashers: usize) -> Filter {
82        // Convert `bytes` to number of `u64`s, and ceil to avoid case where the output is 0.
83        let len = (bytes.saturating_add(7)) / 8;
84        // Initialize a vector with zeros.
85        let mut vec = Vec::with_capacity(len);
86        for _ in 0..len {
87            vec.push(AtomicU64::new(0));
88        }
89
90        Filter {
91            bits: vec,
92            // Set hashers to 1, if it is 0, as there must be at least one hash function.
93            hashers: cmp::max(hashers, 1),
94        }
95    }
96
97    /// Clear the Bloom filter.
98    ///
99    /// This removes every element from the Bloom filter.
100    ///
101    /// Note that it will not do so atomically, and it can remove elements inserted simultaneously
102    /// to this function being called.
103    pub fn clear(&self) {
104        for i in &self.bits {
105            // Clear the bits of this chunk.
106            i.store(0, ORDERING);
107        }
108    }
109
110    /// Insert an element into the Bloom filter.
111    pub fn insert(&self, x: u64) {
112        // Start at `x`.
113        let mut h = x;
114        // Run over the hashers.
115        for _ in 0..self.hashers {
116            // We use the hash function to generate a pseudorandom sequence, defining the different
117            // hashes.
118            h = hash(h);
119            // Create a mask and OR the chunk chosen by `hash`.
120            self.get(h).fetch_or(1 << (h % 8), ORDERING);
121        }
122    }
123
124    /// Check if the Bloom filter potentially contains an element.
125    ///
126    /// This returns `true` if we're not sure if the filter contains `x` or not, and `false` if we
127    /// know that the filter does not contain `x`.
128    pub fn maybe_contains(&self, x: u64) -> bool {
129        // Start at `x`.
130        let mut h = x;
131
132        // Go over the hashers.
133        for _ in 0..self.hashers {
134            // Again, the hashes are defined by a cuckoo sequence of repeatedly hashing.
135            h = hash(h);
136            // Short-circuit if the bit is not set.
137            if self.get(h).load(ORDERING) & 1 << (h % 8) == 0 {
138                // Since the bit of this hash value was not set, it is impossible that the filter
139                // contains `x`, so we return `false`.
140                return false;
141            }
142        }
143
144        // Every bit was set, so the element might be in the filter.
145        true
146    }
147}
148
149#[cfg(test)]
150mod tests {
151    use super::*;
152
153    use std::sync::Arc;
154    use std::thread;
155
156    #[test]
157    fn insert() {
158        let filter = Filter::new(400, 4);
159        filter.insert(3);
160        filter.insert(5);
161        filter.insert(7);
162        filter.insert(13);
163
164        assert!(!filter.maybe_contains(0));
165        assert!(!filter.maybe_contains(1));
166        assert!(!filter.maybe_contains(2));
167        assert!(filter.maybe_contains(3));
168        assert!(filter.maybe_contains(5));
169        assert!(filter.maybe_contains(7));
170        assert!(filter.maybe_contains(13));
171
172        for i in 14..60 {
173            assert!(!filter.maybe_contains(!i));
174        }
175    }
176
177    #[test]
178    fn clear() {
179        let filter = Filter::new(400, 4);
180        filter.insert(3);
181        filter.insert(5);
182        filter.insert(7);
183        filter.insert(13);
184
185        filter.clear();
186
187        assert!(!filter.maybe_contains(0));
188        assert!(!filter.maybe_contains(1));
189        assert!(!filter.maybe_contains(2));
190        assert!(!filter.maybe_contains(3));
191        assert!(!filter.maybe_contains(5));
192        assert!(!filter.maybe_contains(7));
193        assert!(!filter.maybe_contains(13));
194    }
195
196    #[test]
197    fn spam() {
198        let filter = Arc::new(Filter::new(2000, 100));
199        let mut joins = Vec::new();
200
201        for _ in 0..16 {
202            let filter = filter.clone();
203            joins.push(thread::spawn(move || {
204                for i in 0..100 {
205                    filter.insert(i)
206                }
207            }));
208        }
209
210        for i in joins {
211            i.join().unwrap();
212        }
213
214        for i in 0..100 {
215            assert!(filter.maybe_contains(i));
216        }
217        for i in 100..200 {
218            assert!(!filter.maybe_contains(i));
219        }
220    }
221}