mmap_bitvec/
bloom.rs

1use std::hash::{Hash, Hasher};
2use std::io;
3use std::path::Path;
4
5use murmurhash3::murmurhash3_x64_128;
6
7use crate::bitvec::BitVector;
8use crate::mmap_bitvec::MmapBitVec;
9
10/// Newtype for murmur hashing.
11/// We don't use murmurhash3::Murmur3Hasher because it makes copies of the
12/// bytes to be hashed on every `hash` call
13#[derive(Default)]
14pub struct MurmurHasher(u64, u64);
15
16impl MurmurHasher {
17    #[allow(missing_docs)]
18    pub fn new() -> Self {
19        MurmurHasher(0, 0)
20    }
21    #[allow(missing_docs)]
22    pub fn values(&self) -> (u64, u64) {
23        (self.0, self.1)
24    }
25}
26
27impl Hasher for MurmurHasher {
28    #[inline]
29    fn write(&mut self, bytes: &[u8]) {
30        let hash = murmurhash3_x64_128(bytes, self.0);
31        *self = MurmurHasher(hash.0, hash.1);
32    }
33
34    fn finish(&self) -> u64 {
35        self.0
36    }
37}
38
39/// A simple implementation of a Bloom filter backed by `BitVec`
40pub struct BloomFilter {
41    bit_vec: MmapBitVec,
42    hashes: u8,
43}
44
45impl BloomFilter {
46    /// Creates a new `BloomFilter` (or opens an existing one, if the file
47    /// already exists) of a given size (in bits) and with a given number of
48    /// hash functions for each insert (`n_hashes`). If a filename is not
49    /// passed, the Bloom filter will be created in memory.
50    pub fn new<P>(filename: Option<P>, bits: usize, n_hashes: u8) -> Result<Self, io::Error>
51    where
52        P: AsRef<Path>,
53    {
54        let header = vec![];
55        let bitvec = match filename {
56            Some(filename) => {
57                if Path::exists(filename.as_ref()) {
58                    MmapBitVec::open(&filename, Some(b"!!"), false)?
59                } else {
60                    MmapBitVec::create(&filename, bits, Some(*b"!!"), &header)?
61                }
62            }
63            None => MmapBitVec::from_memory(bits)?,
64        };
65        Ok(BloomFilter {
66            bit_vec: bitvec,
67            hashes: n_hashes,
68        })
69    }
70
71    /// Insert an item into the Bloom filter.
72    pub fn insert<H>(&mut self, item: H)
73    where
74        H: Hash,
75    {
76        let size = self.bit_vec.size();
77        let mut hasher = MurmurHasher::new();
78        for _ in 0..self.hashes {
79            item.hash(&mut hasher);
80            let hash: u64 = hasher.finish();
81            self.bit_vec.set(hash as usize % size, true);
82        }
83    }
84
85    /// Check if an item is in the Bloom filter already.
86    pub fn contains<H>(&self, item: H) -> bool
87    where
88        H: Hash,
89    {
90        let size = self.bit_vec.size();
91        let mut hasher = MurmurHasher::new();
92        for _ in 0..self.hashes {
93            item.hash(&mut hasher);
94            let hash: u64 = hasher.finish();
95            if !self.bit_vec.get(hash as usize % size) {
96                return false;
97            }
98        }
99        true
100    }
101}
102
103#[cfg(test)]
104mod test {
105    use super::BloomFilter;
106
107    #[test]
108    fn test_bloom_filter() {
109        use std::fs::remove_file;
110        let mut filter = BloomFilter::new(Some("./test_bloom"), 100, 2).unwrap();
111        let (a, b) = (1, 2);
112        assert!(!filter.contains(a));
113        assert!(!filter.contains(b));
114        filter.insert(b);
115        assert!(!filter.contains(a));
116        assert!(filter.contains(b));
117
118        remove_file("./test_bloom").unwrap();
119    }
120}