#![deny(unsafe_code)]
#![feature(test)]
extern crate test;
use bitvec::prelude::BitVec;
use seahash::SeaHasher;
use serde_derive::{Deserialize, Serialize};
use std::{
hash::{Hash, Hasher},
iter::Iterator,
};
#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
struct Bloom {
field: BitVec,
num_slices: usize,
slice_len: usize,
seed: u64,
}
impl Bloom {
fn new(num_slices: usize, slice_len: usize, seed: u64) -> Bloom {
debug_assert!(slice_len >= 1);
debug_assert!(num_slices > 0);
let bitvec_size = num_slices * slice_len;
let mut field = BitVec::with_capacity(bitvec_size);
for _ in 0..bitvec_size {
field.push(false);
}
field.shrink_to_fit();
Bloom {
field,
num_slices,
slice_len,
seed,
}
}
fn index_iterator<'a, T: Hash>(&self, item: &'a T) -> impl Iterator<Item = usize> + 'a {
let slice_len = self.slice_len;
let (k1, k2, k3, k4) = generate_seed(self.seed);
let mut hasher = SeaHasher::with_seeds(k1, k2, k3, k4);
(0..self.num_slices).map(move |curr_slice| {
item.hash(&mut hasher);
let hash = hasher.finish();
hasher.write_u64(hash);
(hash as usize % slice_len) + curr_slice * slice_len
})
}
fn insert<T: Hash>(&mut self, item: &T) {
for index in self.index_iterator(item) {
self.field.set(index, true)
}
}
fn contains<T: Hash>(&self, item: &T) -> bool {
self.index_iterator(item)
.all(|index| self.field.get(index).unwrap())
}
fn fill_ratio_gte(&self, lower_bound: f64) -> bool {
let len = (self.slice_len * self.num_slices) as f64;
(self.field.count_ones() as f64 / len) >= lower_bound
}
}
fn hash_u64(i: u64) -> u64 {
seahash::hash(&i.to_be_bytes())
}
#[inline]
fn generate_seed(base_seed: u64) -> (u64, u64, u64, u64) {
let h1 = hash_u64(base_seed);
let h2 = hash_u64(h1);
let h3 = hash_u64(h2);
let mut hasher = SeaHasher::with_seeds(base_seed, h1, h2, h3);
hasher.write_u64(0);
let a = hasher.finish();
hasher.write_u64(a);
let b = hasher.finish();
hasher.write_u64(b);
let c = hasher.finish();
hasher.write_u64(c);
let d = hasher.finish();
(a, b, c, d)
}
#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
pub struct GrowableBloom {
blooms: Vec<Bloom>,
curr_num_slice: usize,
slice_size: usize,
curr_seed: u64,
}
impl GrowableBloom {
const GROWTH_FACTOR: usize = 2;
pub fn new(desired_error_prob: f64, est_insertions: usize) -> GrowableBloom {
assert!(0.0 < desired_error_prob && desired_error_prob <= 1.0);
let opt_num_slices = ((1.0 / desired_error_prob).log2()).ceil();
let opt_total_bits = (desired_error_prob.ln().abs() * est_insertions as f64
/ 2f64.ln().powi(2))
.ceil() as usize;
let opt_num_slices = opt_num_slices as usize;
let slice_size = opt_total_bits / opt_num_slices;
let curr_seed = 0;
let first_bloom = Bloom::new(opt_num_slices, slice_size, curr_seed);
debug_assert!(opt_num_slices > 0);
GrowableBloom {
blooms: vec![first_bloom],
curr_num_slice: opt_num_slices,
slice_size,
curr_seed,
}
}
pub fn contains<T: Hash>(&self, item: T) -> bool {
debug_assert!(!self.blooms.is_empty());
self.blooms.iter().any(|bloom| bloom.contains(&item))
}
pub fn insert<T: Hash>(&mut self, item: T) {
debug_assert!(!self.blooms.is_empty());
if self.contains(&item) {
return;
}
let curr_bloom = self.blooms.last_mut().unwrap();
curr_bloom.insert(&item);
if curr_bloom.fill_ratio_gte(0.8) {
self.grow();
}
}
fn grow(&mut self) {
self.curr_num_slice += 1;
self.slice_size *= GrowableBloom::GROWTH_FACTOR;
self.curr_seed ^= hash_u64(self.curr_seed);
let new_bloom = Bloom::new(self.curr_num_slice, self.slice_size, self.curr_seed);
self.blooms.push(new_bloom)
}
}
#[cfg(test)]
mod growable_bloom_tests {
mod test_bloom {
use crate::Bloom;
#[test]
fn can_insert_bloom() {
let mut b = Bloom::new(2, 1024, 10);
let item = 20;
b.insert(&item);
assert!(b.contains(&item))
}
#[test]
fn can_insert_string_bloom() {
let mut b = Bloom::new(2, 1024, 10);
let item: String = "hello world".to_owned();
b.insert(&item);
assert!(b.contains(&item))
}
#[test]
fn test_slice_bloom() {
let mut b = Bloom::new(3, 5, 10);
let item: String = "hello world".to_owned();
b.insert(&item);
assert_eq!(b.field.count_ones(), 3);
}
#[test]
fn does_not_contain() {
let mut b = Bloom::new(2, 1024, 10);
let upper = 100;
for i in (0..upper).step_by(2) {
b.insert(&i);
assert_eq!(b.contains(&i), true);
}
for i in (1..upper).step_by(2) {
assert_eq!(b.contains(&i), false);
}
}
#[test]
fn test_seeds() {
let mut b1 = Bloom::new(2, 10, 0);
let mut b2 = Bloom::new(2, 10, 1);
b1.insert(&0);
b2.insert(&0);
assert_ne!(b1.field, b2.field);
}
#[test]
fn can_insert_lots() {
let mut b = Bloom::new(2, 1024, 10);
for i in 0..1024 {
b.insert(&i);
assert!(b.contains(&i))
}
}
#[test]
fn test_fill_ratio() {
let mut b = Bloom::new(2, 2, 0);
let item: String = "hello world".to_owned();
b.insert(&item);
assert_eq!(b.fill_ratio_gte(0.5), true, "There's only two bits set!");
assert_eq!(b.fill_ratio_gte(0.1), true);
assert_eq!(b.fill_ratio_gte(0.50000000001), false);
}
#[test]
fn slices_are_different() {
let slice_len = 128;
let mut b = Bloom::new(2, slice_len, 0);
let item: String = "hello world".to_owned();
b.insert(&item);
assert_eq!(b.field[0..slice_len].len(), b.field[slice_len..].len());
assert_ne!(b.field[0..slice_len], b.field[slice_len..]);
}
}
mod test_growable {
use crate::GrowableBloom;
use serde_json;
#[test]
fn can_insert() {
let mut b = GrowableBloom::new(0.05, 1000);
let item = 20;
b.insert(&item);
assert!(b.contains(&item))
}
#[test]
fn can_insert_string() {
let mut b = GrowableBloom::new(0.05, 1000);
let item: String = "hello world".to_owned();
b.insert(&item);
assert!(b.contains(&item))
}
#[test]
fn does_not_contain() {
let mut b = GrowableBloom::new(0.05, 1000);
assert_eq!(b.contains(&"hello"), false);
b.insert(&0);
assert_eq!(b.contains(&"hello"), false);
b.insert(&1);
assert_eq!(b.contains(&"hello"), false);
b.insert(&2);
assert_eq!(b.contains(&"hello"), false);
}
#[test]
fn can_insert_a_lot_of_elements() {
let mut b = GrowableBloom::new(0.05, 1000);
for i in 0..1000 {
b.insert(&i);
assert!(b.contains(&i));
}
}
#[test]
fn can_serialize_deserialize() {
let mut b = GrowableBloom::new(0.05, 1000);
b.insert(&0);
let s = serde_json::to_string(&b).unwrap();
let b_s: GrowableBloom = serde_json::from_str(&s).unwrap();
assert!(b_s.contains(&0));
assert_ne!(b_s.contains(&1), true);
assert_ne!(b_s.contains(&1000), true);
}
#[test]
fn verify_saturation() {
let mut b = GrowableBloom::new(0.50, 100);
for i in 0..1000 {
b.insert(&i);
}
assert_eq!(b.contains(&10001), false)
}
#[test]
fn test_types_saturation() {
let mut b = GrowableBloom::new(0.50, 100);
b.insert(&vec![1, 2, 3]);
b.insert("hello");
b.insert(&-1);
b.insert(&0);
}
}
mod bench {
use crate::GrowableBloom;
use test::Bencher;
#[bench]
fn bench_insert_normal_prob(b: &mut Bencher) {
let mut gbloom = GrowableBloom::new(0.05, 1000);
b.iter(|| gbloom.insert(10));
}
#[bench]
fn bench_insert_small_prob(b: &mut Bencher) {
let mut gbloom = GrowableBloom::new(0.0005, 1000);
b.iter(|| gbloom.insert(10));
}
#[bench]
fn bench_many(b: &mut Bencher) {
let mut gbloom = GrowableBloom::new(0.05, 100000);
b.iter(|| gbloom.insert(10));
}
#[bench]
fn bench_insert_large(b: &mut Bencher) {
let s: String = (0..10000).map(|_| 'X').collect();
let mut gbloom = GrowableBloom::new(0.05, 100000);
b.iter(|| gbloom.insert(&s))
}
#[bench]
fn bench_insert_large_very_small_prob(b: &mut Bencher) {
let s: String = (0..10000).map(|_| 'X').collect();
let mut gbloom = GrowableBloom::new(0.000005, 100000);
b.iter(|| gbloom.insert(&s))
}
#[bench]
fn bench_grow(b: &mut Bencher) {
let mut gbloom = GrowableBloom::new(0.90, 1);
b.iter(|| {
for i in 0..100 {
gbloom.insert(&i);
assert!(gbloom.contains(&i));
}
})
}
}
}