pub struct BloomFilter {
bits: Vec<u8>, num_hashes: u8, size: u32, }
impl BloomFilter {
pub fn new(size: u32, num_hashes: u8) -> Self {
let byte_size = size.div_ceil(8) as usize;
Self {
bits: vec![0u8; byte_size],
num_hashes,
size,
}
}
pub fn with_capacity(elements: usize, false_positive_rate: f64) -> Self {
let size = Self::optimal_size(elements, false_positive_rate);
let num_hashes = Self::optimal_hashes(elements, size);
Self::new(size, num_hashes)
}
fn optimal_size(elements: usize, fp_rate: f64) -> u32 {
let n = elements as f64;
let p = fp_rate;
let size = -(n * p.ln()) / (2.0_f64.ln().powi(2));
size.ceil() as u32
}
fn optimal_hashes(elements: usize, size: u32) -> u8 {
let n = elements as f64;
let m = size as f64;
let k = (m / n) * 2.0_f64.ln();
k.ceil().clamp(1.0, 10.0) as u8
}
pub fn insert(&mut self, key: &[u8]) {
for i in 0..self.num_hashes {
let hash = self.hash(key, i);
self.set_bit(hash);
}
}
pub fn contains(&self, key: &[u8]) -> bool {
for i in 0..self.num_hashes {
let hash = self.hash(key, i);
if !self.get_bit(hash) {
return false; }
}
true }
fn hash(&self, key: &[u8], index: u8) -> u32 {
match index {
0 => self.hash_fnv1a(key),
1 => self.hash_murmur3(key),
2 => self.hash_djb2(key),
3 => self.hash_sdbm(key),
4 => self.hash_lose(key),
_ => self.hash_fnv1a(key),
}
}
fn hash_fnv1a(&self, key: &[u8]) -> u32 {
let mut hash = 2166136261u32;
for &byte in key {
hash ^= byte as u32;
hash = hash.wrapping_mul(16777619);
}
hash % self.size
}
fn hash_murmur3(&self, key: &[u8]) -> u32 {
let mut hash = 0u32;
for &byte in key {
hash = hash.wrapping_add(byte as u32);
hash = hash.wrapping_add(hash << 10);
hash ^= hash >> 6;
}
hash = hash.wrapping_add(hash << 3);
hash ^= hash >> 11;
hash = hash.wrapping_add(hash << 15);
hash % self.size
}
fn hash_djb2(&self, key: &[u8]) -> u32 {
let mut hash = 5381u32;
for &byte in key {
hash = hash.wrapping_mul(33).wrapping_add(byte as u32);
}
hash % self.size
}
fn hash_sdbm(&self, key: &[u8]) -> u32 {
let mut hash = 0u32;
for &byte in key {
hash = (byte as u32)
.wrapping_add(hash << 6)
.wrapping_add(hash << 16)
.wrapping_sub(hash);
}
hash % self.size
}
fn hash_lose(&self, key: &[u8]) -> u32 {
let mut hash = 0u32;
for chunk in key.chunks(4) {
let mut val = 0u32;
for (i, &byte) in chunk.iter().enumerate() {
val |= (byte as u32) << (i * 8);
}
hash ^= val;
}
hash % self.size
}
fn set_bit(&mut self, pos: u32) {
let byte_index = (pos / 8) as usize;
let bit_offset = (pos % 8) as u8;
if byte_index < self.bits.len() {
self.bits[byte_index] |= 1 << bit_offset;
}
}
fn get_bit(&self, pos: u32) -> bool {
let byte_index = (pos / 8) as usize;
let bit_offset = (pos % 8) as u8;
if byte_index < self.bits.len() {
(self.bits[byte_index] & (1 << bit_offset)) != 0
} else {
false
}
}
pub fn as_bytes(&self) -> &[u8] {
&self.bits
}
pub fn from_bytes(bytes: Vec<u8>, num_hashes: u8) -> Self {
let size = (bytes.len() * 8) as u32;
Self {
bits: bytes,
num_hashes,
size,
}
}
pub fn from_bytes_with_size(bytes: Vec<u8>, num_hashes: u8, size: u32) -> Self {
Self {
bits: bytes,
num_hashes,
size,
}
}
pub fn byte_size(&self) -> usize {
self.bits.len()
}
pub fn bit_size(&self) -> u32 {
self.size
}
pub fn clear(&mut self) {
for byte in &mut self.bits {
*byte = 0;
}
}
pub fn estimate_fp_rate(&self, inserted_count: usize) -> f64 {
let m = self.size as f64;
let n = inserted_count as f64;
let k = self.num_hashes as f64;
let exp_term = (-k * n / m).exp();
(1.0 - exp_term).powf(k)
}
pub fn count_set_bits(&self) -> u32 {
let mut count = 0;
for &byte in &self.bits {
count += byte.count_ones();
}
count
}
pub fn fill_ratio(&self) -> f64 {
self.count_set_bits() as f64 / self.size as f64
}
pub fn merge(&self, other: &BloomFilter) -> Option<BloomFilter> {
if self.size != other.size || self.num_hashes != other.num_hashes {
return None;
}
let mut merged_bits = self.bits.clone();
for (i, &byte) in other.bits.iter().enumerate() {
merged_bits[i] |= byte;
}
Some(BloomFilter {
bits: merged_bits,
num_hashes: self.num_hashes,
size: self.size,
})
}
pub fn union_inplace(&mut self, other: &BloomFilter) -> bool {
if self.size != other.size || self.num_hashes != other.num_hashes {
return false;
}
for (i, &byte) in other.bits.iter().enumerate() {
self.bits[i] |= byte;
}
true
}
pub fn num_hashes(&self) -> u8 {
self.num_hashes
}
}
pub struct BloomFilterBuilder {
expected_elements: Option<usize>,
false_positive_rate: f64,
size: Option<u32>,
num_hashes: u8,
}
impl BloomFilterBuilder {
pub fn new() -> Self {
Self {
expected_elements: None,
false_positive_rate: 0.01, size: None,
num_hashes: 3,
}
}
pub fn expected_elements(mut self, n: usize) -> Self {
self.expected_elements = Some(n);
self
}
pub fn false_positive_rate(mut self, rate: f64) -> Self {
self.false_positive_rate = rate;
self
}
pub fn size(mut self, size: u32) -> Self {
self.size = Some(size);
self
}
pub fn num_hashes(mut self, n: u8) -> Self {
self.num_hashes = n;
self
}
pub fn build(self) -> BloomFilter {
if let Some(size) = self.size {
BloomFilter::new(size, self.num_hashes)
} else if let Some(elements) = self.expected_elements {
BloomFilter::with_capacity(elements, self.false_positive_rate)
} else {
BloomFilter::with_capacity(100_000, self.false_positive_rate)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bloom_basic() {
let mut bloom = BloomFilter::new(1000, 3);
bloom.insert(b"hello");
bloom.insert(b"world");
assert!(bloom.contains(b"hello"));
assert!(bloom.contains(b"world"));
assert!(!bloom.contains(b"foo"));
}
#[test]
fn test_bloom_with_capacity() {
let mut bloom = BloomFilter::with_capacity(1000, 0.01);
for i in 0..1000 {
let key = format!("key{}", i);
bloom.insert(key.as_bytes());
}
for i in 0..1000 {
let key = format!("key{}", i);
assert!(bloom.contains(key.as_bytes()));
}
let mut false_positives = 0;
for i in 1000..2000 {
let key = format!("key{}", i);
if bloom.contains(key.as_bytes()) {
false_positives += 1;
}
}
let fp_rate = false_positives as f64 / 1000.0;
println!("False positive rate: {:.2}%", fp_rate * 100.0);
assert!(fp_rate < 0.05); }
#[test]
fn test_bloom_serialization() {
let mut bloom1 = BloomFilter::new(1000, 3);
bloom1.insert(b"test1");
bloom1.insert(b"test2");
let bytes = bloom1.as_bytes().to_vec();
let bloom2 = BloomFilter::from_bytes(bytes, 3);
assert!(bloom2.contains(b"test1"));
assert!(bloom2.contains(b"test2"));
assert!(!bloom2.contains(b"test3"));
}
#[test]
fn test_bloom_clear() {
let mut bloom = BloomFilter::new(1000, 3);
bloom.insert(b"hello");
assert!(bloom.contains(b"hello"));
bloom.clear();
assert!(!bloom.contains(b"hello"));
}
#[test]
fn test_bloom_fill_ratio() {
let mut bloom = BloomFilter::new(1000, 3);
assert_eq!(bloom.fill_ratio(), 0.0);
for i in 0..100 {
bloom.insert(format!("key{}", i).as_bytes());
}
let ratio = bloom.fill_ratio();
println!("Fill ratio: {:.2}%", ratio * 100.0);
assert!(ratio > 0.0);
assert!(ratio < 1.0);
}
#[test]
fn test_bloom_builder() {
let bloom = BloomFilterBuilder::new()
.expected_elements(10000)
.false_positive_rate(0.01)
.build();
assert!(bloom.bit_size() > 0);
assert!(bloom.byte_size() > 0);
}
#[test]
fn test_hash_functions() {
let bloom = BloomFilter::new(1000, 3);
let key = b"test";
let h1 = bloom.hash_fnv1a(key);
let h2 = bloom.hash_murmur3(key);
let h3 = bloom.hash_djb2(key);
assert_ne!(h1, h2);
assert_ne!(h2, h3);
assert_ne!(h1, h3);
assert!(h1 < 1000);
assert!(h2 < 1000);
assert!(h3 < 1000);
}
#[test]
fn test_bloom_merge() {
let mut bloom1 = BloomFilter::new(1000, 3);
bloom1.insert(b"alpha");
bloom1.insert(b"beta");
let mut bloom2 = BloomFilter::new(1000, 3);
bloom2.insert(b"gamma");
bloom2.insert(b"delta");
let merged = bloom1.merge(&bloom2).unwrap();
assert!(merged.contains(b"alpha"));
assert!(merged.contains(b"beta"));
assert!(merged.contains(b"gamma"));
assert!(merged.contains(b"delta"));
assert!(!merged.contains(b"missing"));
}
#[test]
fn test_bloom_merge_incompatible() {
let bloom1 = BloomFilter::new(1000, 3);
let bloom2 = BloomFilter::new(2000, 3);
assert!(bloom1.merge(&bloom2).is_none());
let bloom3 = BloomFilter::new(1000, 4);
assert!(bloom1.merge(&bloom3).is_none());
}
#[test]
fn test_bloom_union_inplace() {
let mut bloom1 = BloomFilter::new(1000, 3);
bloom1.insert(b"one");
let mut bloom2 = BloomFilter::new(1000, 3);
bloom2.insert(b"two");
assert!(bloom1.union_inplace(&bloom2));
assert!(bloom1.contains(b"one"));
assert!(bloom1.contains(b"two"));
}
#[test]
fn test_optimal_sizing() {
let bloom = BloomFilter::with_capacity(1_000_000, 0.01);
println!("Bloom filter stats:");
println!(" Elements: 1,000,000");
println!(" FP rate: 1%");
println!(" Bits: {}", bloom.bit_size());
println!(" Bytes: {}", bloom.byte_size());
println!(" Hashes: {}", bloom.num_hashes);
let bits_per_element = bloom.bit_size() as f64 / 1_000_000.0;
println!(" Bits/element: {:.2}", bits_per_element);
assert!(bits_per_element > 8.0);
assert!(bits_per_element < 12.0);
}
}