#![allow(rustdoc::bare_urls)]
#![warn(unreachable_pub)]
#![doc = include_str!("../README.md")]
#![cfg_attr(not(feature = "std"), no_std)]
extern crate alloc;
use alloc::vec::Vec;
use core::hash::{BuildHasher, Hash, Hasher};
use core::iter::repeat;
mod hasher;
pub use hasher::DefaultHasher;
mod builder;
pub use builder::{
AtomicBuilderWithBits, AtomicBuilderWithFalsePositiveRate, BuilderWithBits,
BuilderWithFalsePositiveRate,
};
mod bit_vector;
use bit_vector::{AtomicBitVec, BitVec};
#[cfg(feature = "loom")]
pub(crate) use loom::sync::atomic::AtomicU64;
#[cfg(not(feature = "loom"))]
pub(crate) use core::sync::atomic::AtomicU64;
#[cfg(all(feature = "loom", feature = "serde"))]
compile_error!("features `loom` and `serde` are mutually exclusive");
macro_rules! impl_bloom {
($name:ident, $builder_bits:ident, $builder_fp:ident, $bitvec:ident, $bits:ty, $ismut:literal) => {
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let ", $ismut, "filter = ", stringify!($name), "::with_num_bits(1024).expected_items(2);")]
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let filter = ", stringify!($name), "::with_false_pos(0.001).items([\"42\", \"🦀\"]);")]
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let filter = ", stringify!($name), "::with_num_bits(1024)")]
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct $name<S = DefaultHasher> {
bits: $bitvec,
num_hashes: u32,
hasher: S,
}
impl $name {
fn new_builder(num_bits: usize) -> $builder_bits {
assert!(num_bits > 0);
let num_u64s = (num_bits + 64 - 1) / 64;
$builder_bits {
data: repeat(0).take(num_u64s).collect(),
hasher: Default::default(),
}
}
fn new_from_vec(vec: Vec<u64>) -> $builder_bits {
assert!(!vec.is_empty());
$builder_bits {
data: vec,
hasher: Default::default(),
}
}
fn new_with_false_pos(fp: f64) -> $builder_fp {
assert!(fp > 0.0);
$builder_fp {
desired_fp_rate: fp,
hasher: Default::default(),
}
}
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let filter = ", stringify!($name), "::with_false_pos(0.001).expected_items(1000);")]
pub fn with_false_pos(fp: f64) -> $builder_fp {
$name::new_with_false_pos(fp)
}
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let filter = ", stringify!($name), "::with_num_bits(1024).hashes(4);")]
pub fn with_num_bits(num_bits: usize) -> $builder_bits {
$name::new_builder(num_bits)
}
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let orig = ", stringify!($name), "::with_false_pos(0.001).seed(&42).items([1, 2]);")]
#[doc = concat!("let new = ", stringify!($name), "::from_vec(orig.iter().collect()).seed(&42).hashes(num_hashes);")]
pub fn from_vec(bit_vec: Vec<u64>) -> $builder_bits {
$name::new_from_vec(bit_vec)
}
}
impl<S: BuildHasher> $name<S> {
#[doc = concat!("use fastbloom::", stringify!($name), ";")]
#[doc = concat!("let bloom = ", stringify!($name), "::with_num_bits(1024).items([1, 2, 3]);")]
#[inline]
pub fn contains(&self, val: &(impl Hash + ?Sized)) -> bool {
let source_hash = self.source_hash(val);
self.contains_hash(source_hash)
}
#[inline]
pub fn contains_hash(&self, source_hash: u64) -> bool {
let [mut h1, h2] = starting_hashes(source_hash);
(0..self.num_hashes).all(|_| {
let index = block_index(self.bits.len(), h1);
let h = next_hash(&mut h1, h2);
self.bits.check(index, h)
})
}
#[inline]
pub fn num_hashes(&self) -> u32 {
self.num_hashes
}
pub fn num_bits(&self) -> usize {
self.bits.num_bits()
}
#[inline]
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.bits.iter()
}
#[inline]
pub fn as_slice(&self) -> &[$bits] {
self.bits.as_slice()
}
#[inline]
pub fn source_hash(&self, val: &(impl Hash + ?Sized)) -> u64 {
let mut state = self.hasher.build_hasher();
val.hash(&mut state);
state.finish()
}
}
impl<T, S: BuildHasher> Extend<T> for $name<S>
where
T: Hash,
{
#[inline]
fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
for val in iter {
self.insert(&val);
}
}
}
impl<S: BuildHasher> PartialEq for $name<S> {
fn eq(&self, other: &Self) -> bool {
self.bits == other.bits && self.num_hashes == other.num_hashes
}
}
impl<S: BuildHasher> Eq for $name<S> {}
};
}
impl_bloom!(
BloomFilter,
BuilderWithBits,
BuilderWithFalsePositiveRate,
BitVec,
u64,
"mut "
);
impl_bloom!(
AtomicBloomFilter,
AtomicBuilderWithBits,
AtomicBuilderWithFalsePositiveRate,
AtomicBitVec,
AtomicU64,
""
);
impl<S: BuildHasher> BloomFilter<S> {
#[inline]
pub fn insert(&mut self, val: &(impl Hash + ?Sized)) -> bool {
let source_hash = self.source_hash(val);
self.insert_hash(source_hash)
}
#[inline]
pub fn insert_hash(&mut self, hash: u64) -> bool {
let [mut h1, h2] = starting_hashes(hash);
let mut previously_contained = true;
for _ in 0..self.num_hashes {
let index = block_index(self.bits.len(), h1);
let h = next_hash(&mut h1, h2);
previously_contained &= self.bits.set(index, h);
}
previously_contained
}
#[inline]
pub fn insert_all<T: Hash, I: IntoIterator<Item = T>>(&mut self, iter: I) {
for val in iter {
self.insert(&val);
}
}
#[inline]
pub fn clear(&mut self) {
self.bits.clear();
}
}
impl<S: BuildHasher> AtomicBloomFilter<S> {
#[inline]
pub fn insert(&self, val: &(impl Hash + ?Sized)) -> bool {
let source_hash = self.source_hash(val);
self.insert_hash(source_hash)
}
#[inline]
pub fn insert_hash(&self, hash: u64) -> bool {
let [mut h1, h2] = starting_hashes(hash);
let mut previously_contained = true;
for _ in 0..self.num_hashes {
let index = block_index(self.bits.len(), h1);
let h = next_hash(&mut h1, h2);
previously_contained &= self.bits.set(index, h);
}
previously_contained
}
#[inline]
pub fn insert_all<T: Hash, I: IntoIterator<Item = T>>(&self, iter: I) {
for val in iter {
self.insert(&val);
}
}
#[inline]
pub fn clear(&self) {
self.bits.clear();
}
}
#[inline]
pub(crate) fn starting_hashes(hash: u64) -> [u64; 2] {
let h2 = hash
.wrapping_shr(32)
.wrapping_mul(0x51_7c_c1_b7_27_22_0a_95); [hash, h2]
}
#[inline]
pub(crate) fn block_index(num_blocks: usize, hash: u64) -> usize {
(((hash >> 32).wrapping_mul(num_blocks as u64)) >> 32) as usize
}
#[inline]
fn next_hash(h1: &mut u64, h2: u64) -> u64 {
*h1 = h1.wrapping_add(h2).rotate_left(5);
*h1
}
macro_rules! impl_tests {
($modname:ident, $name:ident) => {
#[allow(unused_mut)]
#[cfg(not(feature = "loom"))]
#[cfg(test)]
mod $modname {
use super::*;
use alloc::format;
use rand::{rngs::StdRng, Rng, SeedableRng};
trait Seeded: BuildHasher {
fn seeded(seed: &[u8; 16]) -> Self;
}
impl Seeded for DefaultHasher {
fn seeded(seed: &[u8; 16]) -> Self {
Self::seeded(seed)
}
}
const TRIALS: usize = 1_000_000;
fn false_pos_rate<H: BuildHasher>(filter: &$name<H>) -> f64 {
let mut total = 0;
let mut false_positives = 0;
for x in non_member_nums() {
total += 1;
false_positives += filter.contains(&x) as usize;
}
(false_positives as f64) / (total as f64)
}
fn member_nums(num: usize) -> impl Iterator<Item = u64> {
random_numbers(num, 5)
}
fn non_member_nums() -> impl Iterator<Item = u64> {
random_numbers(TRIALS, 7).map(|x| x + u32::MAX as u64)
}
fn random_numbers(num: usize, seed: u64) -> impl Iterator<Item = u64> {
let mut rng = StdRng::seed_from_u64(seed);
(0..=num).map(move |_| rng.random::<u32>() as u64)
}
#[test]
fn test_to_from_vec() {
fn to_from_(size: usize) {
let mut b = $name::new_builder(size).seed(&1).hashes(3);
b.extend(member_nums(1000));
let b2 = $name::new_from_vec(b.iter().collect())
.seed(&1)
.hashes(3);
assert_eq!(b, b2);
assert_eq!(b.num_bits(), b.bits.len() * 64);
assert!(size <= b.bits.len() * 64);
assert!((size + u64::BITS as usize) > b.bits.len() * 64);
}
for size in 1..=10009 {
to_from_(size);
}
}
#[test]
fn first_insert_false() {
let mut filter = $name::with_num_bits(1202).expected_items(4);
assert!(!filter.insert(&5));
}
#[test]
fn target_fp_is_accurate() {
let thresh = 15.0f64;
for mag in 1..=5 {
let fp = 1.0f64 / 10u64.pow(mag) as f64;
for num_items_mag in 1..6 {
let num_items = 10usize.pow(num_items_mag);
let mut filter = $name::new_with_false_pos(fp)
.seed(&42)
.expected_items(num_items);
filter.extend(member_nums(num_items));
let sample_fp = false_pos_rate(&filter);
if sample_fp > 0.0 {
let score = sample_fp / fp;
assert!(score <= thresh, "score {score:}, thresh {thresh:}, size: {num_items:}, fp: {fp:}, sample fp: {sample_fp:}");
}
}
}
}
#[test]
fn nothing_after_clear() {
for mag in 1..6 {
let size = 10usize.pow(mag);
for bloom_size_mag in 6..10 {
let num_blocks_bytes = 1 << bloom_size_mag;
let num_bits = num_blocks_bytes * 8;
let mut filter = $name::new_builder(num_bits)
.seed(&7)
.expected_items(size);
filter.extend(member_nums(size));
assert!(filter.num_hashes() > 0);
filter.clear();
assert!(member_nums(size).all(|x| !filter.contains(&x)));
}
}
}
#[test]
fn random_inserts_always_contained() {
for mag in 1..6 {
let size = 10usize.pow(mag);
for bloom_size_mag in 6..10 {
let num_blocks_bytes = 1 << bloom_size_mag;
let num_bits = num_blocks_bytes * 8;
let mut filter = $name::new_builder(num_bits).expected_items(size);
filter.extend(member_nums(size));
assert!(member_nums(size).all(|x| filter.contains(&x)));
assert!(member_nums(size).all(|x| filter.insert(&x)));
}
}
}
#[test]
fn test_optimal_hashes_is_optimal() {
fn test_optimal_hashes_is_optimal_<H: Seeded>() {
let sizes = [1000, 2000, 5000, 6000, 8000, 10000];
let mut wins = 0;
for num_items in sizes {
let num_bits = 65000 * 8;
let mut filter = $name::new_builder(num_bits)
.hasher(H::seeded(&[42; 16]))
.expected_items(num_items);
filter.extend(member_nums(num_items));
let fp_to_beat = false_pos_rate(&filter);
let optimal_hashes = filter.num_hashes();
for num_hashes in [optimal_hashes - 1, optimal_hashes + 1] {
let mut test_filter = $name::new_builder(num_bits)
.hasher(H::seeded(&[42; 16]))
.hashes(num_hashes);
test_filter.extend(member_nums(num_items));
let fp = false_pos_rate(&test_filter);
wins += (fp_to_beat <= fp) as usize;
}
}
assert!(wins > sizes.len() / 2);
}
test_optimal_hashes_is_optimal_::<DefaultHasher>();
}
#[test]
fn seeded_is_same() {
let num_bits = 1 << 10;
let sample_vals = member_nums(1000).collect::<Vec<_>>();
for x in 0u8..32 {
let seed = x as u128;
assert_eq!(
$name::new_builder(num_bits)
.seed(&seed)
.items(sample_vals.iter()),
$name::new_builder(num_bits)
.seed(&seed)
.items(sample_vals.iter())
);
assert!(
!($name::new_builder(num_bits)
.seed(&(seed + 1))
.items(sample_vals.iter())
== $name::new_builder(num_bits)
.seed(&seed)
.items(sample_vals.iter()))
);
}
}
#[test]
fn false_pos_decrease_with_size() {
for mag in 5..6 {
let size = 10usize.pow(mag);
let mut prev_fp = 1.0;
let mut prev_prev_fp = 1.0;
for num_bits_mag in 9..22 {
let num_bits = 1 << num_bits_mag;
let mut filter = $name::new_builder(num_bits).expected_items(size);
filter.extend(member_nums(size));
let fp = false_pos_rate(&filter);
let err = format!(
"size: {size:}, num_bits: {num_bits:}, {:.6}, {:?}",
fp,
filter.num_hashes(),
);
assert!(
fp <= prev_fp || prev_fp <= prev_prev_fp || fp < 0.01,
"{}",
err
); prev_prev_fp = prev_fp;
prev_fp = fp;
}
}
}
fn assert_even_distribution(distr: &[u64], err: f64) {
assert!(err > 0.0 && err < 1.0);
let expected: i64 = (distr.iter().sum::<u64>() / (distr.len() as u64)) as i64;
let thresh = (expected as f64 * err) as i64;
for x in distr {
let diff = (*x as i64 - expected).abs();
assert!(
diff <= thresh,
"{x:?} deviates from {expected:?}\nDistribution: {distr:?}"
);
}
}
#[test]
fn test_seeded_hash_from_hashes_depth() {
for size in [1, 10, 100, 1000] {
let mut rng = StdRng::seed_from_u64(524323);
let mut h1 = rng.random_range(0..u64::MAX);
let h2 = rng.random_range(0..u64::MAX);
let mut seeded_hash_counts: Vec<_> = repeat(0).take(size).collect();
for _ in 0..(size * 10_000) {
let hi = next_hash(&mut h1, h2);
seeded_hash_counts[(hi as usize) % size] += 1;
}
assert_even_distribution(&seeded_hash_counts, 0.05);
}
}
#[test]
fn test_debug() {
let filter = $name::with_num_bits(1).hashes(1);
assert!(!format!("{:?}", filter).is_empty());
}
#[test]
fn test_clone() {
let filter = $name::with_num_bits(4).hashes(4);
let mut cloned = filter.clone();
assert_eq!(filter, cloned);
cloned.insert(&42);
assert!(filter != cloned);
}
#[test]
fn eq_constructors_num_bits() {
assert_eq!(
$name::with_num_bits(4).hashes(4),
$name::new_builder(4).hashes(4),
);
}
#[test]
fn eq_constructors_false_pos() {
assert_eq!(
$name::with_false_pos(0.4),
$name::new_with_false_pos(0.4),
);
}
#[test]
fn eq_constructors_from_vec() {
assert_eq!(
$name::from_vec(repeat(42).take(42).collect()),
$name::new_from_vec(repeat(42).take(42).collect()),
);
}
#[test]
fn test_rebuilt_from_vec() {
for num in [1, 10, 1000, 100_000] {
for fp in [0.1, 0.01, 0.0001, 0.0000001] {
let mut b = $name::with_false_pos(fp)
.seed(&42)
.expected_items(num);
b.extend(member_nums(num));
let orig_hashes = b.num_hashes();
let new = $name::from_vec(b.iter().collect())
.seed(&42)
.hashes(orig_hashes);
assert!(member_nums(num).all(|x| new.contains(&x)));
}
}
}
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
for num in [1, 10, 1000, 100_000] {
for fp in [0.1, 0.01, 0.0001, 0.0000001] {
let mut before = $name::with_false_pos(fp)
.seed(&42)
.expected_items(num);
before.extend(member_nums(num));
let s = serde_cbor::to_vec(&before).unwrap();
let mut after: $name = serde_cbor::from_slice(&s).unwrap();
assert_eq!(before, after);
before.extend(member_nums(num * 2));
after.extend(member_nums(num * 2));
assert_eq!(before, after);
}
}
}
}
};
}
impl_tests!(non_atomic, BloomFilter);
impl_tests!(atomic, AtomicBloomFilter);
#[cfg(feature = "loom")]
#[cfg(test)]
mod loom_tests {
use super::*;
#[test]
fn test_loom() {
loom::model(|| {
let b = loom::sync::Arc::new(AtomicBloomFilter::with_num_bits(128).seed(&42).hashes(2));
let expected = AtomicBloomFilter::with_num_bits(128).seed(&42).hashes(2);
expected.insert_all(1..=3);
let handles: Vec<_> = [(1..=2), (2..=3)]
.into_iter()
.map(|data| {
let v = b.clone();
loom::thread::spawn(move || v.insert_all(data))
})
.collect();
for handle in handles {
handle.join().unwrap();
}
let res = b.iter().collect::<Vec<_>>();
assert_eq!(res, expected.iter().collect::<Vec<_>>());
});
}
}