use crate::{bitmap::CompressedBitmap, FilterSize};
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;
pub trait Bitmap {
fn set(&mut self, key: usize, value: bool);
fn get(&self, key: usize) -> bool;
fn byte_size(&self) -> usize;
}
pub struct BloomFilterBuilder<H, B>
where
H: BuildHasher,
B: Bitmap,
{
hasher: H,
bitmap: B,
key_size: FilterSize,
}
impl std::default::Default for BloomFilterBuilder<RandomState, CompressedBitmap> {
fn default() -> BloomFilterBuilder<RandomState, CompressedBitmap> {
let size = FilterSize::KeyBytes2;
BloomFilterBuilder {
hasher: RandomState::default(),
bitmap: CompressedBitmap::new(key_size_to_bits(size)),
key_size: size,
}
}
}
impl<H, B> BloomFilterBuilder<H, B>
where
H: BuildHasher,
B: Bitmap,
{
pub fn hasher(self, hasher: H) -> Self {
Self { hasher, ..self }
}
pub unsafe fn bitmap(self, bitmap: B, key_size: FilterSize) -> Self {
Self {
bitmap,
key_size,
..self
}
}
pub fn build<T: Hash>(self) -> Bloom2<H, B, T> {
Bloom2 {
hasher: self.hasher,
bitmap: self.bitmap,
key_size: self.key_size,
_key_type: PhantomData,
}
}
}
impl<H> BloomFilterBuilder<H, CompressedBitmap>
where
H: BuildHasher,
{
pub fn size(self, size: FilterSize) -> Self {
Self {
key_size: size,
bitmap: CompressedBitmap::new(key_size_to_bits(size)),
..self
}
}
}
fn key_size_to_bits(k: FilterSize) -> usize {
(2 as usize).pow(8 * k as u32)
}
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Bloom2<H, B, T>
where
H: BuildHasher,
B: Bitmap,
{
hasher: H,
bitmap: B,
key_size: FilterSize,
_key_type: PhantomData<T>,
}
impl<T> std::default::Default for Bloom2<RandomState, CompressedBitmap, T>
where
T: Hash,
{
fn default() -> Self {
crate::BloomFilterBuilder::default().build()
}
}
impl<H, B, T> Bloom2<H, B, T>
where
H: BuildHasher,
B: Bitmap,
T: Hash,
{
pub fn insert(&mut self, data: &'_ T) {
let mut hasher = self.hasher.build_hasher();
data.hash(&mut hasher);
hasher
.finish()
.to_be_bytes()
.chunks(self.key_size as usize)
.for_each(|chunk| self.bitmap.set(bytes_to_usize_key(chunk), true));
}
pub fn contains(&mut self, data: &'_ T) -> bool {
let mut hasher = self.hasher.build_hasher();
data.hash(&mut hasher);
hasher
.finish()
.to_be_bytes()
.chunks(self.key_size as usize)
.any(|chunk| self.bitmap.get(bytes_to_usize_key(chunk)))
}
pub fn byte_size(&mut self) -> usize {
self.bitmap.byte_size()
}
}
impl<H, T> Bloom2<H, CompressedBitmap, T>
where
H: BuildHasher,
{
pub fn shrink_to_fit(&mut self) {
self.bitmap.shrink_to_fit();
}
}
fn bytes_to_usize_key<'a, I: IntoIterator<Item = &'a u8>>(bytes: I) -> usize {
bytes
.into_iter()
.fold(0, |key, &byte| (key << 8) | byte as usize)
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck_macros::quickcheck;
use std::cell::RefCell;
#[derive(Debug, Clone, Default)]
struct MockHasher {
return_hash: u64,
}
impl Hasher for MockHasher {
fn write(&mut self, _bytes: &[u8]) {}
fn finish(&self) -> u64 {
self.return_hash
}
}
impl BuildHasher for MockHasher {
type Hasher = Self;
fn build_hasher(&self) -> MockHasher {
self.clone()
}
}
#[derive(Debug, Default)]
struct MockBitmap {
set_calls: Vec<(usize, bool)>,
get_calls: RefCell<Vec<usize>>,
}
impl Bitmap for MockBitmap {
fn set(&mut self, key: usize, value: bool) {
self.set_calls.push((key, value))
}
fn get(&self, key: usize) -> bool {
self.get_calls.borrow_mut().push(key);
false
}
fn byte_size(&self) -> usize {
42
}
}
fn new_test_bloom<T: Hash>() -> Bloom2<MockHasher, MockBitmap, T> {
Bloom2 {
hasher: MockHasher::default(),
bitmap: MockBitmap::default(),
key_size: FilterSize::KeyBytes1,
_key_type: PhantomData,
}
}
#[test]
fn test_default() {
let mut b = Bloom2::default();
assert_eq!(b.key_size, FilterSize::KeyBytes2);
b.insert(&42);
assert!(b.contains(&42));
}
#[quickcheck]
fn test_default_prop(vals: Vec<u16>) {
let mut b = Bloom2::default();
for v in &vals {
b.insert(&*v);
}
for v in &vals {
assert!(b.contains(&*v));
}
}
#[test]
fn test_insert_contains_kb1() {
let mut b = new_test_bloom();
b.hasher.return_hash = 12345678901234567890;
b.insert(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.set_calls,
vec![
(171, true),
(84, true),
(169, true),
(140, true),
(235, true),
(31, true),
(10, true),
(210, true),
]
);
b.contains(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.get_calls.into_inner(),
vec![171, 84, 169, 140, 235, 31, 10, 210]
);
}
#[test]
fn test_insert_contains_kb2() {
let mut b = new_test_bloom();
b.key_size = FilterSize::KeyBytes2;
b.hasher.return_hash = 12345678901234567890;
b.insert(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.set_calls,
vec![(43860, true), (43404, true), (60191, true), (2770, true),]
);
assert!(b.bitmap.get_calls.into_inner().is_empty());
}
#[test]
fn test_issue_3() {
let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, &str> =
BloomFilterBuilder::default()
.size(FilterSize::KeyBytes4)
.build();
bloom_filter.insert(&"a");
bloom_filter.insert(&"b");
bloom_filter.insert(&"c");
bloom_filter.insert(&"d");
}
#[test]
fn test_size_shrink() {
let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, _> =
BloomFilterBuilder::default()
.size(FilterSize::KeyBytes4)
.build();
for i in 0..10 {
bloom_filter.insert(&i);
}
assert_eq!(bloom_filter.byte_size(), 8388920);
bloom_filter.shrink_to_fit();
assert_eq!(bloom_filter.byte_size(), 8388824);
}
}