use bit_vec::BitVec;
use fasthash::RandomState;
use fasthash::{murmur3, xx};
use std::hash::{BuildHasher, Hash, Hasher};
const LN_SQR: f64 = core::f64::consts::LN_2 * core::f64::consts::LN_2;
const SET_BIT: bool = true;
const UNSET_BIT: bool = false;
pub const DEFAULT_FALSE_POS: f64 = 0.01;
pub struct BloomFilter {
bit_vec: BitVec,
num_hashes: u64,
set_bits: u64,
murmur_hasher: RandomState<murmur3::Murmur3_x64_128>,
xx_hasher: RandomState<xx::XXHash64>,
}
impl BloomFilter {
pub fn new(approx_items: u64) -> BloomFilter {
BloomFilter::new_with_rate(approx_items, DEFAULT_FALSE_POS)
}
pub fn new_with_rate(approx_items: u64, fp_prob: f64) -> BloomFilter {
let num_bits = optimal_num_bits(approx_items, fp_prob);
let num_hashes = optimal_num_hashes(num_bits, approx_items);
BloomFilter {
bit_vec: BitVec::from_elem(num_bits as usize, UNSET_BIT),
num_hashes: num_hashes,
set_bits: 0,
murmur_hasher: RandomState::<murmur3::Murmur3_x64_128>::new(),
xx_hasher: RandomState::<xx::XXHash64>::new(),
}
}
}
impl BloomFilter {
pub fn set<T: Hash>(&mut self, obj: &T) {
let mut hasher_one = self.murmur_hasher.build_hasher();
let mut hasher_two = self.xx_hasher.build_hasher();
obj.hash(&mut hasher_one);
obj.hash(&mut hasher_two);
let h1 = hasher_one.finish();
let h2 = hasher_two.finish();
for i in 0..self.num_hashes {
let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize;
if self.bit_vec.get(bit_idx).unwrap() == UNSET_BIT {
self.set_bits += 1;
}
self.bit_vec.set(bit_idx, SET_BIT);
}
}
pub fn has<T: Hash>(&self, obj: &T) -> Option<bool> {
let mut hasher_one = self.murmur_hasher.build_hasher();
let mut hasher_two = self.xx_hasher.build_hasher();
obj.hash(&mut hasher_one);
obj.hash(&mut hasher_two);
let h1 = hasher_one.finish();
let h2 = hasher_two.finish();
for i in 0..self.num_hashes {
let bit_idx = self.enhanced_double_hash(h1, h2, i) as usize;
if self.bit_vec.get(bit_idx).unwrap() != SET_BIT {
return Some(false);
}
}
Some(true)
}
pub fn num_items_approx(&self) -> u64 {
let m = self.bit_vec.len() as f64;
let k = self.num_hashes as f64;
let x = self.set_bits as f64;
(-(m / k) * (1.0 - (x / m)).ln()) as u64
}
fn enhanced_double_hash(&self, h1: u64, h2: u64, i: u64) -> u64 {
let r = h1.wrapping_add(i.wrapping_mul(h2)).wrapping_add(i.pow(3));
r % self.bit_vec.len() as u64
}
}
fn optimal_num_bits(approx_items: u64, fp_prob: f64) -> u64 {
(-((fp_prob.ln() * (approx_items as f64)) / LN_SQR)).ceil() as u64
}
fn optimal_num_hashes(num_bits: u64, approx_items: u64) -> u64 {
(((num_bits / approx_items) as f64) * core::f64::consts::LN_2).ceil() as u64
}
#[cfg(test)]
mod tests {
use super::*;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use std::collections::HashSet;
fn random_str(len: usize) -> String {
thread_rng().sample_iter(&Alphanumeric).take(len).collect()
}
#[test]
fn test_bloom_filter() {
let n = 1000;
let mut items = HashSet::<String>::new();
for _ in 0..n {
items.insert(random_str(30));
}
let mut bf = BloomFilter::new(items.len() as u64);
for item in items.iter() {
bf.set(item);
let exists = bf.has(item).unwrap();
assert_eq!(
exists, true,
"item {} should result in a positive inclusion",
item,
);
}
for _ in 0..n {
let item = random_str(30);
let exists = bf.has(&item).unwrap();
if items.contains(&item) {
assert_eq!(exists, true, "item {} resulted in a false negative", item);
}
}
}
#[test]
fn test_optimal_num_bits() {
assert_eq!(optimal_num_bits(10, 0.04), 67);
assert_eq!(optimal_num_bits(5000, 0.01), 47926);
assert_eq!(optimal_num_bits(100000, 0.01), 958506);
}
#[test]
fn test_optimal_num_hashes() {
assert_eq!(optimal_num_hashes(67, 10), 5);
assert_eq!(optimal_num_hashes(47926, 5000), 7);
assert_eq!(optimal_num_hashes(958506, 100000), 7);
}
}