use crate::{bitmap::CompressedBitmap, FilterSize, VecBitmap};
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hash};
use std::marker::PhantomData;
pub trait Bitmap {
fn new_with_capacity(max_key: usize) -> Self;
fn set(&mut self, key: usize, value: bool);
fn get(&self, key: usize) -> bool;
fn byte_size(&self) -> usize;
fn or(&self, other: &Self) -> Self;
}
pub struct BloomFilterBuilder<H, B>
where
H: BuildHasher,
B: Bitmap,
{
hasher: H,
bitmap: B,
key_size: FilterSize,
}
impl std::default::Default for BloomFilterBuilder<RandomState, CompressedBitmap> {
fn default() -> BloomFilterBuilder<RandomState, CompressedBitmap> {
let size = FilterSize::KeyBytes2;
BloomFilterBuilder {
hasher: RandomState::default(),
bitmap: CompressedBitmap::new(key_size_to_bits(size)),
key_size: size,
}
}
}
impl<H, B> BloomFilterBuilder<H, B>
where
H: BuildHasher,
B: Bitmap,
{
pub fn with_bitmap_data(self, bitmap: B, key_size: FilterSize) -> Self {
let _ = bitmap.get(key_size as usize);
Self {
bitmap,
key_size,
..self
}
}
pub fn with_bitmap<U>(self) -> BloomFilterBuilder<H, U>
where
U: Bitmap,
{
BloomFilterBuilder {
hasher: self.hasher,
bitmap: U::new_with_capacity(key_size_to_bits(self.key_size)),
key_size: self.key_size,
}
}
pub fn build<T: Hash>(self) -> Bloom2<H, B, T> {
Bloom2 {
hasher: self.hasher,
bitmap: self.bitmap,
key_size: self.key_size,
_key_type: PhantomData,
}
}
pub fn size(self, size: FilterSize) -> Self {
Self {
key_size: size,
bitmap: B::new_with_capacity(key_size_to_bits(size)),
..self
}
}
}
impl<H> BloomFilterBuilder<H, CompressedBitmap>
where
H: BuildHasher,
{
pub fn hasher(hasher: H) -> Self {
let size = FilterSize::KeyBytes2;
Self {
hasher,
bitmap: CompressedBitmap::new(key_size_to_bits(size)),
key_size: size,
}
}
}
fn key_size_to_bits(k: FilterSize) -> usize {
2_usize.pow(8 * k as u32)
}
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Bloom2<H, B, T>
where
H: BuildHasher,
B: Bitmap,
{
#[cfg_attr(feature = "serde", serde(skip))]
hasher: H,
bitmap: B,
key_size: FilterSize,
#[cfg_attr(feature = "serde", serde(skip))]
_key_type: PhantomData<T>,
}
impl<T> std::default::Default for Bloom2<RandomState, CompressedBitmap, T>
where
T: Hash,
{
fn default() -> Self {
crate::BloomFilterBuilder::default().build()
}
}
impl<H, B, T> Bloom2<H, B, T>
where
H: BuildHasher,
B: Bitmap,
T: Hash,
{
pub fn insert(&mut self, data: &'_ T) {
self.hasher
.hash_one(data)
.to_be_bytes()
.chunks(self.key_size as usize)
.for_each(|chunk| self.bitmap.set(bytes_to_usize_key(chunk), true));
}
pub fn contains(&self, data: &'_ T) -> bool {
self.hasher
.hash_one(data)
.to_be_bytes()
.chunks(self.key_size as usize)
.any(|chunk| self.bitmap.get(bytes_to_usize_key(chunk)))
}
pub fn union(&mut self, other: &Self) {
assert_eq!(self.key_size, other.key_size);
self.bitmap = self.bitmap.or(&other.bitmap);
}
pub fn byte_size(&mut self) -> usize {
self.bitmap.byte_size()
}
}
impl<H, T> Bloom2<H, CompressedBitmap, T>
where
H: BuildHasher,
{
pub fn shrink_to_fit(&mut self) {
self.bitmap.shrink_to_fit();
}
}
impl<H, T> Bloom2<H, VecBitmap, T>
where
H: BuildHasher,
{
pub fn compress(self) -> Bloom2<H, CompressedBitmap, T> {
Bloom2::from(self)
}
}
fn bytes_to_usize_key<'a, I: IntoIterator<Item = &'a u8>>(bytes: I) -> usize {
bytes
.into_iter()
.fold(0, |key, &byte| (key << 8) | byte as usize)
}
impl<H, T> From<Bloom2<H, VecBitmap, T>> for Bloom2<H, CompressedBitmap, T>
where
H: BuildHasher,
{
fn from(v: Bloom2<H, VecBitmap, T>) -> Self {
Self {
hasher: v.hasher,
bitmap: CompressedBitmap::from(v.bitmap),
key_size: v.key_size,
_key_type: PhantomData,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use proptest::prelude::*;
use quickcheck_macros::quickcheck;
use std::{
cell::RefCell,
collections::HashSet,
hash::{BuildHasherDefault, Hasher},
};
#[derive(Debug, Clone, Default)]
struct MockHasher {
return_hash: u64,
}
impl Hasher for MockHasher {
fn write(&mut self, _bytes: &[u8]) {}
fn finish(&self) -> u64 {
self.return_hash
}
}
impl BuildHasher for MockHasher {
type Hasher = Self;
fn build_hasher(&self) -> MockHasher {
self.clone()
}
}
#[derive(Debug, Default)]
struct MockBitmap {
set_calls: Vec<(usize, bool)>,
get_calls: RefCell<Vec<usize>>,
}
impl Bitmap for MockBitmap {
fn set(&mut self, key: usize, value: bool) {
self.set_calls.push((key, value))
}
fn get(&self, key: usize) -> bool {
self.get_calls.borrow_mut().push(key);
false
}
fn byte_size(&self) -> usize {
42
}
fn or(&self, _other: &Self) -> Self {
unreachable!()
}
fn new_with_capacity(_max_key: usize) -> Self {
Self::default()
}
}
fn new_test_bloom<T: Hash>() -> Bloom2<MockHasher, MockBitmap, T> {
Bloom2 {
hasher: MockHasher::default(),
bitmap: MockBitmap::default(),
key_size: FilterSize::KeyBytes1,
_key_type: PhantomData,
}
}
#[test]
fn test_default() {
let mut b = Bloom2::default();
assert_eq!(b.key_size, FilterSize::KeyBytes2);
b.insert(&42);
assert!(b.contains(&42));
}
#[quickcheck]
fn test_default_prop(vals: Vec<u16>) {
let mut b = Bloom2::default();
for v in &vals {
b.insert(v);
}
for v in &vals {
assert!(b.contains(v));
}
}
#[test]
fn test_insert_contains_kb1() {
let mut b = new_test_bloom();
b.hasher.return_hash = 12345678901234567890;
b.insert(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.set_calls,
vec![
(171, true),
(84, true),
(169, true),
(140, true),
(235, true),
(31, true),
(10, true),
(210, true),
]
);
b.contains(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.get_calls.into_inner(),
vec![171, 84, 169, 140, 235, 31, 10, 210]
);
}
#[test]
fn test_insert_contains_kb2() {
let mut b = new_test_bloom();
b.key_size = FilterSize::KeyBytes2;
b.hasher.return_hash = 12345678901234567890;
b.insert(&[1, 2, 3, 4]);
assert_eq!(
b.bitmap.set_calls,
vec![(43860, true), (43404, true), (60191, true), (2770, true),]
);
assert!(b.bitmap.get_calls.into_inner().is_empty());
}
#[test]
fn test_issue_3() {
let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, &str> =
BloomFilterBuilder::default()
.size(FilterSize::KeyBytes4)
.build();
bloom_filter.insert(&"a");
bloom_filter.insert(&"b");
bloom_filter.insert(&"c");
bloom_filter.insert(&"d");
}
#[test]
fn test_size_shrink() {
let mut bloom_filter: Bloom2<RandomState, CompressedBitmap, _> =
BloomFilterBuilder::default()
.size(FilterSize::KeyBytes4)
.build();
for i in 0..10 {
bloom_filter.insert(&i);
}
assert_eq!(bloom_filter.byte_size(), 8388920);
bloom_filter.shrink_to_fit();
assert_eq!(bloom_filter.byte_size(), 8388824);
}
#[test]
fn set_hasher() {
let mut bloom_filter: Bloom2<
BuildHasherDefault<twox_hash::XxHash64>,
CompressedBitmap,
i32,
> = BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
.size(FilterSize::KeyBytes4)
.build();
for i in 0..10 {
bloom_filter.insert(&i);
}
for i in 0..10 {
assert!(bloom_filter.contains(&i), "did not contain {}", i);
}
}
#[quickcheck]
fn test_union(mut a: Vec<usize>, mut b: Vec<usize>, mut control: Vec<usize>) {
a.truncate(50);
b.truncate(50);
control.truncate(100);
let mut bitmap_a =
BloomFilterBuilder::hasher(BuildHasherDefault::<twox_hash::XxHash64>::default())
.size(FilterSize::KeyBytes2)
.build();
let mut bitmap_b = bitmap_a.clone();
for v in &a {
bitmap_a.insert(v);
}
for v in &b {
bitmap_b.insert(v);
}
let mut merged = bitmap_a.clone();
merged.union(&bitmap_b);
for v in &a {
assert!(merged.contains(v));
}
for v in &b {
assert!(merged.contains(v));
}
for v in &control {
let input_maybe_contains = bitmap_a.contains(v) || bitmap_b.contains(v);
assert_eq!(input_maybe_contains, merged.contains(v));
}
}
#[cfg(feature = "serde")]
#[test]
fn serde() {
type MyBuildHasher = BuildHasherDefault<twox_hash::XxHash64>;
let mut bloom_filter: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
BloomFilterBuilder::hasher(MyBuildHasher::default())
.size(FilterSize::KeyBytes4)
.build();
for i in 0..10 {
bloom_filter.insert(&i);
}
let encoded = serde_json::to_string(&bloom_filter).unwrap();
let decoded: Bloom2<MyBuildHasher, CompressedBitmap, i32> =
serde_json::from_str(&encoded).unwrap();
assert_eq!(bloom_filter.bitmap, decoded.bitmap);
for i in 0..10 {
assert!(decoded.contains(&i), "didn't contain {}", i);
}
}
pub fn arbitrary_value() -> impl Strategy<Value = usize> {
prop_oneof![
5 => 0_usize..100,
1 => any::<usize>(),
]
}
#[derive(Debug, Clone)]
pub enum Op {
Insert(usize),
Contains(usize),
}
pub fn arbitrary_op(s: impl Strategy<Value = usize>) -> impl Strategy<Value = Op> {
s.prop_flat_map(|v| prop_oneof![Just(Op::Insert(v)), Just(Op::Contains(v))])
}
proptest! {
#[test]
fn prop_ops_compressed_bitmap(
ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
) {
run_ops_fuzz::<CompressedBitmap>(ops);
}
#[test]
fn prop_ops_vec_bitmap(
ops in prop::collection::vec(arbitrary_op(arbitrary_value()), 1..100),
) {
run_ops_fuzz::<VecBitmap>(ops);
}
#[test]
fn prop_ops_compress(
values in prop::collection::vec(arbitrary_value(), 1..100),
check in prop::collection::vec(arbitrary_value(), 1..100),
) {
let mut b = BloomFilterBuilder::default().with_bitmap::<VecBitmap>().build();
let mut control: HashSet<usize, RandomState> = HashSet::default();
for v in values {
b.insert(&v);
control.insert(v);
}
let compressed = b.clone().compress();
for v in control {
assert!(compressed.contains(&v));
}
for v in check {
assert_eq!(compressed.contains(&v), b.contains(&v));
}
}
}
fn run_ops_fuzz<B>(ops: Vec<Op>)
where
B: Bitmap,
{
let mut b = BloomFilterBuilder::default().with_bitmap::<B>().build();
let mut control: HashSet<usize, RandomState> = HashSet::default();
for op in ops {
match op {
Op::Insert(v) => {
b.insert(&v);
control.insert(v);
}
Op::Contains(v) => {
if control.contains(&v) {
assert!(b.contains(&v));
}
}
}
}
}
}