use bitvec::prelude::*;
use xxhash_rust::xxh3::xxh3_64_with_seed;
use crate::as_bytes::AsBytes;
pub struct BloomFilter {
bit_vector: BitVec,
number_of_functions: usize,
}
#[derive(Debug, Eq, PartialEq)]
pub enum BloomResult {
No,
YesWhp,
}
impl BloomFilter {
fn fn1(&self, t: &[u8]) -> usize {
xxh3_64_with_seed(t, 0xdeadbeef) as usize
}
fn fn2(&self, t: &[u8]) -> usize {
xxh3_64_with_seed(t, 123) as usize
}
pub fn new_from_size_and_probabilty(expected_set_size: usize, probability: f64) -> Self {
assert!(probability > 0.);
assert!(probability < 1.);
let ln_pfp = probability.log(std::f64::consts::E);
let ln_2 = 1. / std::f64::consts::LOG2_E;
let optimal_vector_legth =
(-(expected_set_size as f64 * ln_pfp / ln_2.powi(2)).ceil()) as usize;
assert!(optimal_vector_legth > 0);
let bit_vector: BitVec = (0..optimal_vector_legth).map(|_a| false).collect();
let number_of_functions = (-(ln_pfp / ln_2).ceil()) as usize;
assert!(number_of_functions > 0);
BloomFilter {
bit_vector,
number_of_functions,
}
}
pub fn new_from_list<T: AsBytes>(list: &[T], probability: f64) -> Self {
let mut filter = BloomFilter::new_from_size_and_probabilty(list.len(), probability);
for i in list {
filter.add(i);
}
filter
}
pub fn contains(&self, value: &[u8]) -> BloomResult {
let len = self.bit_vector.len();
let fn1_value = self.fn1(value) % len;
let all_hash_fns_matched = (0..self.number_of_functions).all(|i| {
let fn2_value = self.fn2(value) % len;
let index = (fn1_value + (i * fn2_value)) % len;
self.bit_vector[index]
});
if all_hash_fns_matched {
return BloomResult::YesWhp;
}
BloomResult::No
}
pub fn add_bytes(&mut self, value: &[u8]) {
let len = self.bit_vector.len();
let fn1_value = self.fn1(value) % len;
(0..self.number_of_functions).for_each(|i| {
let fn2_value = self.fn2(value) % len;
let index = (fn1_value + (i * fn2_value)) % len;
self.bit_vector.set(index, true);
});
}
pub fn add<T: AsBytes>(&mut self, value: &T) {
self.add_bytes(value.as_bytes())
}
}
#[cfg(test)]
mod tests {
use crate::bloom_filter::BloomResult;
use super::BloomFilter;
#[test]
fn one_sentence() {
let sentence1 = "this is just a string of words with little meaning.";
let sentence2 = "and this is another one with equally little meaning.";
let mut filter = BloomFilter::new_from_size_and_probabilty(1, 0.001);
filter.add(&sentence1);
assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::No);
}
#[test]
fn from_list() {
let sentence1 = "this is just a string of words with little meaning.";
let sentence2 = "and this is another one with equally little meaning.";
let list = vec![sentence1, sentence2];
let sentence3 = "I am running out of meaningless examples to write.";
let mut filter = BloomFilter::new_from_list(&list, 0.001);
filter.add(&sentence1);
assert_eq!(filter.contains(sentence1.as_bytes()), BloomResult::YesWhp);
assert_eq!(filter.contains(sentence2.as_bytes()), BloomResult::YesWhp);
assert_eq!(filter.contains(sentence3.as_bytes()), BloomResult::No);
}
}