use core::sync::atomic::{AtomicU64, Ordering};
const LN_2: f64 = std::f64::consts::LN_2;
struct Size {
size: u64,
exp: u64,
}
fn get_size(n: u64) -> Size {
let mut n = n;
if n < 512 {
n = 512;
}
let mut size = 1u64;
let mut exp = 0u64;
while size < n {
size <<= 1;
exp += 1;
}
Size { size, exp }
}
struct EntriesLocs {
entries: u64,
locs: u64,
}
fn calc_size_by_wrong_positives(num_entries: usize, wrongs: f64) -> EntriesLocs {
let num_entries = num_entries as f64;
let size = -num_entries * wrongs.ln() / LN_2.powf(2f64);
let locs = (LN_2 * size / num_entries).ceil();
EntriesLocs {
entries: size as u64,
locs: locs as u64,
}
}
#[repr(C)]
pub(crate) struct Bloom {
bitset: Vec<AtomicU64>,
size_exp: u64,
size: u64,
set_locs: u64,
shift: u64,
}
const MAX_ENTRIES: u64 = 1 << 40;
const MAX_LOCS: u64 = 64;
impl Bloom {
pub fn new(cap: usize, false_positive_ratio: f64) -> Self {
assert!(
false_positive_ratio.is_finite() && false_positive_ratio > 0.0,
"false_positive_ratio must be finite and > 0; got {false_positive_ratio}"
);
let cap = cap.max(1);
let entries_locs = {
if false_positive_ratio < 1f64 {
calc_size_by_wrong_positives(cap, false_positive_ratio)
} else {
EntriesLocs {
entries: cap as u64,
locs: false_positive_ratio as u64,
}
}
};
assert!(
entries_locs.entries > 0 && entries_locs.entries <= MAX_ENTRIES,
"bloom entries out of range: {} (cap={cap}, ratio={false_positive_ratio}); \
max is {MAX_ENTRIES}",
entries_locs.entries
);
assert!(
entries_locs.locs > 0 && entries_locs.locs <= MAX_LOCS,
"bloom set_locs out of range: {} (ratio={false_positive_ratio}); \
max is {MAX_LOCS}",
entries_locs.locs
);
let size = get_size(entries_locs.entries);
let words = (size.size >> 6) as usize;
let mut bitset = Vec::with_capacity(words);
bitset.resize_with(words, || AtomicU64::new(0));
Self {
bitset,
size: size.size - 1,
size_exp: size.exp,
set_locs: entries_locs.locs,
shift: 64 - size.exp,
}
}
pub fn reset(&self) {
for word in &self.bitset {
word.store(0, Ordering::Relaxed);
}
}
#[cfg_attr(not(tarpaulin), inline(always))]
#[allow(dead_code)]
pub fn size_exp(&self) -> u64 {
self.size_exp
}
pub fn clear(&self) {
for word in &self.bitset {
word.store(0, Ordering::Relaxed);
}
}
pub fn set(&self, idx: usize) {
let word = idx >> 6;
let mask = 1u64 << (idx & 63);
self.bitset[word].fetch_or(mask, Ordering::Relaxed);
}
pub fn is_set(&self, idx: usize) -> bool {
let word = idx >> 6;
let mask = 1u64 << (idx & 63);
self.bitset[word].load(Ordering::Relaxed) & mask != 0
}
pub fn add(&self, hash: u64) {
let h = hash >> self.shift;
let l = (hash << self.shift) >> self.shift;
(0..self.set_locs).for_each(|i| {
self.set(((h + i * l) & self.size) as usize);
});
}
pub fn contains(&self, hash: u64) -> bool {
let h = hash >> self.shift;
let l = (hash << self.shift) >> self.shift;
for i in 0..self.set_locs {
if !self.is_set(((h + i * l) & self.size) as usize) {
return false;
}
}
true
}
pub fn contains_or_add(&self, hash: u64) -> bool {
if self.contains(hash) {
false
} else {
self.add(hash);
true
}
}
#[allow(dead_code)]
#[cfg_attr(not(tarpaulin), inline(always))]
pub fn total_size(&self) -> usize {
self.bitset.len() * 8 + 4 * 8
}
}
#[cfg(test)]
mod test {
use crate::bbloom::Bloom;
use rand::{RngExt, distr::Alphanumeric};
use std::{
collections::hash_map::DefaultHasher,
hash::{Hash, Hasher},
};
const N: usize = 1 << 16;
fn get_word_list() -> Vec<Vec<u8>> {
let mut word_list = Vec::<Vec<u8>>::with_capacity(N);
(0..N).for_each(|_| {
let rand_string: String = rand::rng()
.sample_iter(Alphanumeric)
.take(30)
.map(char::from)
.collect();
word_list.push(rand_string.as_bytes().to_vec());
});
word_list
}
fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[test]
fn test_number_of_wrongs() {
let bf = Bloom::new(N * 10, 7f64);
let mut cnt = 0;
get_word_list().into_iter().for_each(|words| {
let hash = calculate_hash(&words);
if !bf.contains_or_add(hash) {
cnt += 1;
}
});
eprintln!(
"Bloomfilter(size = {}) Check for 'false positives': {} wrong positive 'Has' results on 2^16 entries => {}%",
bf.bitset.len() << 6,
cnt,
cnt as f64 / N as f64
);
}
#[test]
fn test_total_size() {
let bf = Bloom::new(10, 7f64);
assert_eq!(bf.total_size(), 96);
}
#[test]
fn test_size_exp() {
let bf = Bloom::new(10, 7f64);
assert_eq!(bf.size_exp(), 9);
}
#[test]
#[should_panic(expected = "false_positive_ratio must be finite")]
fn test_new_rejects_zero_ratio() {
let _ = Bloom::new(10, 0.0);
}
#[test]
#[should_panic(expected = "false_positive_ratio must be finite")]
fn test_new_rejects_nan_ratio() {
let _ = Bloom::new(10, f64::NAN);
}
#[test]
#[should_panic(expected = "false_positive_ratio must be finite")]
fn test_new_rejects_negative_ratio() {
let _ = Bloom::new(10, -0.5);
}
#[test]
fn test_new_clamps_zero_cap() {
let bf = Bloom::new(0, 0.01);
assert!(bf.set_locs > 0);
}
#[test]
#[should_panic(expected = "bloom entries out of range")]
fn test_new_rejects_saturating_entries() {
let _ = Bloom::new(usize::MAX, 1e-300);
}
#[test]
#[should_panic(expected = "bloom set_locs out of range")]
fn test_new_rejects_huge_locs() {
let _ = Bloom::new(10, 1e15);
}
#[test]
fn test_bloom_concurrent_add_distinct_keys() {
use std::{sync::Arc, thread};
let bf = Arc::new(Bloom::new(8192, 0.01));
let mut handles = Vec::with_capacity(16);
for t in 0..16u64 {
let bf = Arc::clone(&bf);
handles.push(thread::spawn(move || {
let base = t * 1_000_000;
for i in 0..256u64 {
bf.add(base + i);
}
}));
}
for h in handles {
h.join().unwrap();
}
for t in 0..16u64 {
let base = t * 1_000_000;
for i in 0..256u64 {
assert!(
bf.contains(base + i),
"hash {} (thread {}) not present after concurrent add",
base + i,
t
);
}
}
}
#[test]
fn test_bloom_concurrent_add_same_key() {
use std::{sync::Arc, thread};
let bf = Arc::new(Bloom::new(1024, 0.01));
let mut handles = Vec::with_capacity(8);
for _ in 0..8 {
let bf = Arc::clone(&bf);
handles.push(thread::spawn(move || {
for _ in 0..1000 {
bf.add(0xdead_beef);
}
}));
}
for h in handles {
h.join().unwrap();
}
assert!(bf.contains(0xdead_beef));
}
#[test]
fn test_bloom_concurrent_add_and_reset() {
use std::{
sync::{
Arc,
atomic::{AtomicBool, Ordering},
},
thread,
};
let bf = Arc::new(Bloom::new(2048, 0.01));
let stop = Arc::new(AtomicBool::new(false));
let adder = {
let bf = Arc::clone(&bf);
let stop = Arc::clone(&stop);
thread::spawn(move || {
let mut h = 1u64;
while !stop.load(Ordering::Relaxed) {
bf.add(h);
h = h.wrapping_mul(0x9E37_79B9_7F4A_7C15).wrapping_add(1);
}
})
};
let resetter = {
let bf = Arc::clone(&bf);
let stop = Arc::clone(&stop);
thread::spawn(move || {
for _ in 0..1000 {
if stop.load(Ordering::Relaxed) {
break;
}
bf.reset();
}
})
};
resetter.join().unwrap();
stop.store(true, Ordering::Relaxed);
adder.join().unwrap();
let _ = bf.contains(0);
}
}