use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicU64, Ordering};
pub struct BloomFilter {
bits: Box<[AtomicU64]>,
num_hashes: usize,
num_bits: usize,
}
impl BloomFilter {
pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
let ln2 = std::f64::consts::LN_2;
let ln2_sq = ln2 * ln2;
let num_bits = (-(expected_items as f64) * false_positive_rate.ln() / ln2_sq).ceil() as usize;
let num_bits = num_bits.max(64);
let num_hashes = ((num_bits as f64 / expected_items as f64) * ln2).ceil() as usize;
let num_hashes = num_hashes.clamp(1, 16);
let num_u64s = (num_bits + 63) / 64;
let actual_bits = num_u64s * 64;
let bits: Box<[AtomicU64]> = (0..num_u64s)
.map(|_| AtomicU64::new(0))
.collect::<Vec<_>>()
.into_boxed_slice();
Self {
bits,
num_hashes,
num_bits: actual_bits,
}
}
pub fn with_size(num_bits: usize, num_hashes: usize) -> Self {
let num_u64s = (num_bits + 63) / 64;
let actual_bits = num_u64s * 64;
let bits: Box<[AtomicU64]> = (0..num_u64s)
.map(|_| AtomicU64::new(0))
.collect::<Vec<_>>()
.into_boxed_slice();
Self {
bits,
num_hashes,
num_bits: actual_bits,
}
}
pub fn insert(&self, key: &str) {
for i in 0..self.num_hashes {
let bit_idx = self.hash_index(key, i);
let word_idx = bit_idx / 64;
let bit_pos = bit_idx % 64;
self.bits[word_idx].fetch_or(1 << bit_pos, Ordering::Relaxed);
}
}
pub fn might_contain(&self, key: &str) -> bool {
for i in 0..self.num_hashes {
let bit_idx = self.hash_index(key, i);
let word_idx = bit_idx / 64;
let bit_pos = bit_idx % 64;
if self.bits[word_idx].load(Ordering::Relaxed) & (1 << bit_pos) == 0 {
return false;
}
}
true
}
pub fn clear(&self) {
for word in self.bits.iter() {
word.store(0, Ordering::Relaxed);
}
}
pub fn num_bits(&self) -> usize {
self.num_bits
}
pub fn num_hashes(&self) -> usize {
self.num_hashes
}
fn hash_index(&self, key: &str, hash_num: usize) -> usize {
let mut hasher1 = std::collections::hash_map::DefaultHasher::new();
key.hash(&mut hasher1);
let h1 = hasher1.finish();
let mut hasher2 = std::collections::hash_map::DefaultHasher::new();
(key, 0x517cc1b727220a95u64).hash(&mut hasher2);
let h2 = hasher2.finish();
let combined = h1.wrapping_add((hash_num as u64).wrapping_mul(h2));
(combined as usize) % self.num_bits
}
}
impl Clone for BloomFilter {
fn clone(&self) -> Self {
let bits: Box<[AtomicU64]> = self
.bits
.iter()
.map(|b| AtomicU64::new(b.load(Ordering::Relaxed)))
.collect::<Vec<_>>()
.into_boxed_slice();
Self {
bits,
num_hashes: self.num_hashes,
num_bits: self.num_bits,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_insert_and_check() {
let filter = BloomFilter::new(1000, 0.01);
filter.insert("key1");
filter.insert("key2");
filter.insert("key3");
assert!(filter.might_contain("key1"));
assert!(filter.might_contain("key2"));
assert!(filter.might_contain("key3"));
}
#[test]
fn test_negative_lookup() {
let filter = BloomFilter::new(100, 0.01);
for i in 0..50 {
filter.insert(&format!("key:{}", i));
}
let mut false_count = 0;
for i in 1000..1100 {
if !filter.might_contain(&format!("key:{}", i)) {
false_count += 1;
}
}
assert!(false_count > 90, "False count was {}", false_count);
}
#[test]
fn test_clear() {
let filter = BloomFilter::new(100, 0.01);
filter.insert("key1");
assert!(filter.might_contain("key1"));
filter.clear();
assert!(!filter.might_contain("key1"));
}
#[test]
fn test_parameters() {
let filter = BloomFilter::new(1000, 0.01);
assert!(filter.num_bits() > 0);
assert!(filter.num_hashes() > 0);
}
}