const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325u64;
const FNV_PRIME: u64 = 0x00000100000001b3u64;
fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
let mut hash = seed;
for &byte in data {
hash ^= u64::from(byte);
hash = hash.wrapping_mul(FNV_PRIME);
}
hash
}
#[inline]
fn h1(data: &[u8]) -> u64 {
fnv1a_64_seeded(data, FNV_OFFSET_BASIS)
}
#[inline]
fn h2(data: &[u8]) -> u64 {
let seed = FNV_OFFSET_BASIS ^ 0xdeadbeefcafe1337u64;
fnv1a_64_seeded(data, seed) | 1
}
#[inline]
fn double_hash_position(h1_val: u64, h2_val: u64, i: u64, num_bits: usize) -> usize {
let nb = num_bits as u64;
(h1_val.wrapping_add(i.wrapping_mul(h2_val)) % nb) as usize
}
fn optimal_num_bits(expected_items: usize, false_positive_rate: f64) -> usize {
let n = expected_items as f64;
let p = false_positive_rate.clamp(1e-15, 1.0 - f64::EPSILON);
let ln2_sq = std::f64::consts::LN_2 * std::f64::consts::LN_2;
let m = (-n * p.ln() / ln2_sq).ceil() as usize;
m.max(8)
}
fn optimal_num_hash_functions(num_bits: usize, expected_items: usize) -> u8 {
if expected_items == 0 {
return 1;
}
let m = num_bits as f64;
let n = expected_items as f64;
let k = ((m / n) * std::f64::consts::LN_2).round() as u64;
k.clamp(1, 255) as u8
}
#[derive(Debug, Clone)]
pub struct BloomFilter {
bit_array: Vec<u8>,
num_bits: usize,
num_hash_functions: u8,
num_items: u64,
}
impl BloomFilter {
pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
assert!(expected_items > 0, "expected_items must be > 0");
assert!(
false_positive_rate > 0.0 && false_positive_rate < 1.0,
"false_positive_rate must be in (0, 1)"
);
let num_bits = optimal_num_bits(expected_items, false_positive_rate);
let num_hash_functions = optimal_num_hash_functions(num_bits, expected_items);
let byte_count = (num_bits + 7) / 8;
Self {
bit_array: vec![0u8; byte_count],
num_bits,
num_hash_functions,
num_items: 0,
}
}
fn set_bit(&mut self, pos: usize) {
let byte_idx = pos / 8;
let bit_idx = pos % 8;
if let Some(byte) = self.bit_array.get_mut(byte_idx) {
*byte |= 1u8 << bit_idx;
}
}
fn get_bit(&self, pos: usize) -> bool {
let byte_idx = pos / 8;
let bit_idx = pos % 8;
self.bit_array
.get(byte_idx)
.map(|byte| (byte >> bit_idx) & 1 == 1)
.unwrap_or(false)
}
pub fn insert(&mut self, item: &[u8]) {
let h1_val = h1(item);
let h2_val = h2(item);
for i in 0..self.num_hash_functions as u64 {
let pos = double_hash_position(h1_val, h2_val, i, self.num_bits);
self.set_bit(pos);
}
self.num_items += 1;
}
pub fn contains(&self, item: &[u8]) -> bool {
let h1_val = h1(item);
let h2_val = h2(item);
for i in 0..self.num_hash_functions as u64 {
let pos = double_hash_position(h1_val, h2_val, i, self.num_bits);
if !self.get_bit(pos) {
return false;
}
}
true
}
pub fn estimate_false_positive_rate(&self) -> f64 {
let k = self.num_hash_functions as f64;
let n = self.num_items as f64;
let m = self.num_bits as f64;
if m == 0.0 {
return 1.0;
}
(1.0_f64 - (-k * n / m).exp()).powf(k)
}
pub fn item_count(&self) -> u64 {
self.num_items
}
pub fn num_bits(&self) -> usize {
self.num_bits
}
pub fn num_hash_functions(&self) -> u8 {
self.num_hash_functions
}
}
#[derive(Debug, Clone)]
pub struct CountingBloomFilter {
counts: Vec<u8>,
num_counters: usize,
num_hash_functions: u8,
num_items: u64,
}
impl CountingBloomFilter {
pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
assert!(expected_items > 0, "expected_items must be > 0");
assert!(
false_positive_rate > 0.0 && false_positive_rate < 1.0,
"false_positive_rate must be in (0, 1)"
);
let num_bits = optimal_num_bits(expected_items, false_positive_rate);
let num_hash_functions = optimal_num_hash_functions(num_bits, expected_items);
let byte_count = (num_bits + 1) / 2;
Self {
counts: vec![0u8; byte_count],
num_counters: num_bits,
num_hash_functions,
num_items: 0,
}
}
fn get_nibble(&self, pos: usize) -> u8 {
let byte_idx = pos / 2;
let nibble_shift = (pos % 2) * 4;
self.counts
.get(byte_idx)
.map(|b| (b >> nibble_shift) & 0x0F)
.unwrap_or(0)
}
fn increment_nibble(&mut self, pos: usize) {
let byte_idx = pos / 2;
let nibble_shift = (pos % 2) * 4;
if let Some(byte) = self.counts.get_mut(byte_idx) {
let nibble = (*byte >> nibble_shift) & 0x0F;
if nibble < 0x0F {
*byte += 1u8 << nibble_shift;
}
}
}
fn decrement_nibble(&mut self, pos: usize) -> bool {
let byte_idx = pos / 2;
let nibble_shift = (pos % 2) * 4;
if let Some(byte) = self.counts.get_mut(byte_idx) {
let nibble = (*byte >> nibble_shift) & 0x0F;
if nibble == 0x0F {
return false;
}
if nibble > 0 {
*byte -= 1u8 << nibble_shift;
return true;
}
}
false
}
pub fn insert(&mut self, item: &[u8]) {
let h1_val = h1(item);
let h2_val = h2(item);
for i in 0..self.num_hash_functions as u64 {
let pos = double_hash_position(h1_val, h2_val, i, self.num_counters);
self.increment_nibble(pos);
}
self.num_items += 1;
}
pub fn contains(&self, item: &[u8]) -> bool {
let h1_val = h1(item);
let h2_val = h2(item);
for i in 0..self.num_hash_functions as u64 {
let pos = double_hash_position(h1_val, h2_val, i, self.num_counters);
if self.get_nibble(pos) == 0 {
return false;
}
}
true
}
pub fn remove(&mut self, item: &[u8]) -> bool {
if !self.contains(item) {
return false;
}
let h1_val = h1(item);
let h2_val = h2(item);
let positions: Vec<usize> = (0..self.num_hash_functions as u64)
.map(|i| double_hash_position(h1_val, h2_val, i, self.num_counters))
.collect();
for &pos in &positions {
let nibble = self.get_nibble(pos);
if nibble == 0 || nibble == 0x0F {
return false;
}
}
for &pos in &positions {
self.decrement_nibble(pos);
}
if self.num_items > 0 {
self.num_items -= 1;
}
true
}
pub fn item_count(&self) -> u64 {
self.num_items
}
pub fn estimate_false_positive_rate(&self) -> f64 {
let k = self.num_hash_functions as f64;
let n = self.num_items as f64;
let m = self.num_counters as f64;
if m == 0.0 {
return 1.0;
}
(1.0_f64 - (-k * n / m).exp()).powf(k)
}
}
#[derive(Debug, Clone)]
pub struct ScalableBloomFilter {
layers: Vec<BloomFilter>,
initial_capacity: usize,
max_fpr_per_layer: f64,
growth_factor: f64,
tightening_ratio: f64,
total_items: u64,
}
impl ScalableBloomFilter {
pub fn new(initial_capacity: usize, target_fpr: f64, growth_factor: f64) -> Self {
assert!(initial_capacity > 0, "initial_capacity must be > 0");
assert!(
target_fpr > 0.0 && target_fpr < 1.0,
"target_fpr must be in (0, 1)"
);
let gf = if growth_factor > 1.0 {
growth_factor
} else {
2.0
};
let first_layer = BloomFilter::new(initial_capacity, target_fpr);
Self {
layers: vec![first_layer],
initial_capacity,
max_fpr_per_layer: target_fpr,
growth_factor: gf,
tightening_ratio: 0.8,
total_items: 0,
}
}
pub fn insert(&mut self, item: &[u8]) {
if let Some(active) = self.layers.last() {
if active.estimate_false_positive_rate() > self.max_fpr_per_layer {
self.add_layer();
}
}
if let Some(active) = self.layers.last_mut() {
active.insert(item);
}
self.total_items += 1;
}
pub fn contains(&self, item: &[u8]) -> bool {
self.layers.iter().any(|layer| layer.contains(item))
}
pub fn estimate_false_positive_rate(&self) -> f64 {
let product: f64 = self
.layers
.iter()
.map(|l| 1.0 - l.estimate_false_positive_rate())
.product();
1.0 - product
}
pub fn total_item_count(&self) -> u64 {
self.total_items
}
pub fn layer_count(&self) -> usize {
self.layers.len()
}
fn add_layer(&mut self) {
let layer_idx = self.layers.len();
let capacity =
(self.initial_capacity as f64 * self.growth_factor.powi(layer_idx as i32)) as usize;
let capacity = capacity.max(1);
let fpr = self.max_fpr_per_layer * self.tightening_ratio.powi(layer_idx as i32);
let fpr = fpr.clamp(1e-15, 1.0 - f64::EPSILON);
self.layers.push(BloomFilter::new(capacity, fpr));
}
pub fn set_tightening_ratio(&mut self, ratio: f64) {
if ratio > 0.0 && ratio < 1.0 {
self.tightening_ratio = ratio;
}
}
pub fn tightening_ratio(&self) -> f64 {
self.tightening_ratio
}
pub fn growth_factor(&self) -> f64 {
self.growth_factor
}
pub fn layer_stats(&self) -> Vec<(u64, usize, f64)> {
self.layers
.iter()
.map(|l| {
(
l.item_count(),
l.num_bits(),
l.estimate_false_positive_rate(),
)
})
.collect()
}
pub fn estimated_capacity_remaining(&self) -> usize {
let active = match self.layers.last() {
Some(l) => l,
None => return 0,
};
let current_fpr = active.estimate_false_positive_rate();
if current_fpr >= self.max_fpr_per_layer {
return 0;
}
let k = active.num_hash_functions() as f64;
let m = active.num_bits() as f64;
let n = active.item_count() as f64;
let theoretical_max = (m / k) * std::f64::consts::LN_2;
let remaining = (theoretical_max - n).max(0.0);
remaining as usize
}
pub fn clear(&mut self) {
self.layers.clear();
self.total_items = 0;
let first_layer = BloomFilter::new(self.initial_capacity, self.max_fpr_per_layer);
self.layers.push(first_layer);
}
pub fn total_bits(&self) -> usize {
self.layers.iter().map(|l| l.num_bits()).sum()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fnv1a_deterministic() {
let a = h1(b"hello");
let b = h1(b"hello");
assert_eq!(a, b);
}
#[test]
fn test_h1_h2_differ() {
let v = h1(b"test");
let v2 = h2(b"test");
assert_ne!(v, v2, "h1 and h2 should produce different hashes");
}
#[test]
fn test_h2_always_odd() {
for seed in [b"a".as_ref(), b"hello", b"oximedia", b"\x00\xff"] {
assert_eq!(h2(seed) & 1, 1, "h2 must be odd for {seed:?}");
}
}
#[test]
fn test_new_bloom_filter() {
let bf = BloomFilter::new(1000, 0.01);
assert!(bf.num_bits() > 0);
assert!(bf.num_hash_functions() > 0);
assert_eq!(bf.item_count(), 0);
}
#[test]
fn test_optimal_num_bits_reasonable() {
let m = optimal_num_bits(10_000, 0.01);
assert!(m > 90_000 && m < 110_000, "unexpected m={m}");
}
#[test]
fn test_optimal_k_reasonable() {
let m = optimal_num_bits(10_000, 0.01);
let k = optimal_num_hash_functions(m, 10_000);
assert!(k >= 6 && k <= 8, "unexpected k={k}");
}
#[test]
fn test_insert_then_contains() {
let mut bf = BloomFilter::new(100, 0.01);
bf.insert(b"key1");
assert!(bf.contains(b"key1"));
}
#[test]
fn test_contains_absent_item() {
let bf = BloomFilter::new(100, 0.01);
assert!(!bf.contains(b"ghost"));
}
#[test]
fn test_no_false_negatives() {
let mut bf = BloomFilter::new(500, 0.01);
let items: Vec<Vec<u8>> = (0u32..200).map(|i| i.to_le_bytes().to_vec()).collect();
for item in &items {
bf.insert(item);
}
for item in &items {
assert!(bf.contains(item), "false negative detected for {:?}", item);
}
}
#[test]
fn test_item_count() {
let mut bf = BloomFilter::new(100, 0.05);
bf.insert(b"a");
bf.insert(b"b");
bf.insert(b"c");
assert_eq!(bf.item_count(), 3);
}
#[test]
fn test_estimate_fpr_empty() {
let bf = BloomFilter::new(1000, 0.01);
assert_eq!(bf.estimate_false_positive_rate(), 0.0);
}
#[test]
fn test_estimate_fpr_increases_with_fill() {
let mut bf = BloomFilter::new(100, 0.01);
let fpr_empty = bf.estimate_false_positive_rate();
for i in 0u32..50 {
bf.insert(&i.to_le_bytes());
}
let fpr_half = bf.estimate_false_positive_rate();
assert!(fpr_half > fpr_empty, "FPR should increase as filter fills");
}
#[test]
fn test_empirical_fpr_at_n10000_p001() {
let n = 10_000usize;
let p = 0.01_f64;
let mut bf = BloomFilter::new(n, p);
for i in 0u32..n as u32 {
let key = format!("inserted_{i}");
bf.insert(key.as_bytes());
}
let mut false_positives = 0usize;
let probes = 10_000usize;
for i in 0u32..probes as u32 {
let key = format!("absent_{i}");
if bf.contains(key.as_bytes()) {
false_positives += 1;
}
}
let observed_fpr = false_positives as f64 / probes as f64;
assert!(
observed_fpr <= p * 3.0,
"observed FPR {observed_fpr:.4} exceeded 3× target ({:.4})",
p * 3.0
);
}
#[test]
fn test_counting_bf_insert_contains() {
let mut cbf = CountingBloomFilter::new(200, 0.01);
cbf.insert(b"alpha");
assert!(cbf.contains(b"alpha"));
}
#[test]
fn test_counting_bf_remove() {
let mut cbf = CountingBloomFilter::new(200, 0.01);
cbf.insert(b"remove_me");
assert!(cbf.contains(b"remove_me"));
let removed = cbf.remove(b"remove_me");
assert!(removed, "remove should succeed");
assert!(!cbf.contains(b"remove_me"));
}
#[test]
fn test_counting_bf_remove_absent() {
let mut cbf = CountingBloomFilter::new(200, 0.01);
let removed = cbf.remove(b"never_inserted");
assert!(!removed, "cannot remove item that was never inserted");
}
#[test]
fn test_counting_bf_item_count() {
let mut cbf = CountingBloomFilter::new(100, 0.05);
cbf.insert(b"x");
cbf.insert(b"y");
assert_eq!(cbf.item_count(), 2);
cbf.remove(b"x");
assert_eq!(cbf.item_count(), 1);
}
#[test]
fn test_counting_bf_no_false_negatives() {
let mut cbf = CountingBloomFilter::new(300, 0.01);
let items: Vec<Vec<u8>> = (0u32..100).map(|i| i.to_le_bytes().to_vec()).collect();
for item in &items {
cbf.insert(item);
}
for item in &items {
assert!(cbf.contains(item), "false negative for {item:?}");
}
}
#[test]
fn test_counting_bf_multiple_inserts_then_single_remove() {
let mut cbf = CountingBloomFilter::new(200, 0.01);
cbf.insert(b"double");
cbf.insert(b"double");
cbf.remove(b"double");
assert!(cbf.contains(b"double"));
}
#[test]
fn test_double_hash_position_range() {
let item = b"test_item";
let h1_val = h1(item);
let h2_val = h2(item);
let num_bits = 1024;
for i in 0..10u64 {
let pos = double_hash_position(h1_val, h2_val, i, num_bits);
assert!(pos < num_bits, "position {pos} out of range");
}
}
#[test]
fn test_bloom_filter_clone() {
let mut bf = BloomFilter::new(100, 0.01);
bf.insert(b"cloned");
let bf2 = bf.clone();
assert!(bf2.contains(b"cloned"));
assert_eq!(bf2.item_count(), bf.item_count());
}
#[test]
fn test_scalable_bf_insert_contains() {
let mut sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
sbf.insert(b"hello");
assert!(sbf.contains(b"hello"));
}
#[test]
fn test_scalable_bf_absent_item() {
let sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
assert!(!sbf.contains(b"missing"));
}
#[test]
fn test_scalable_bf_no_false_negatives() {
let mut sbf = ScalableBloomFilter::new(50, 0.05, 2.0);
let items: Vec<Vec<u8>> = (0u32..200).map(|i| i.to_le_bytes().to_vec()).collect();
for item in &items {
sbf.insert(item);
}
for item in &items {
assert!(sbf.contains(item), "false negative for {item:?}");
}
}
#[test]
fn test_scalable_bf_grows_layers() {
let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
for i in 0u32..500 {
sbf.insert(&i.to_le_bytes());
}
assert!(
sbf.layer_count() > 1,
"should have grown beyond 1 layer, got {}",
sbf.layer_count()
);
}
#[test]
fn test_scalable_bf_total_item_count() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
sbf.insert(b"a");
sbf.insert(b"b");
sbf.insert(b"c");
assert_eq!(sbf.total_item_count(), 3);
}
#[test]
fn test_scalable_bf_fpr_bounded() {
let mut sbf = ScalableBloomFilter::new(1000, 0.01, 2.0);
for i in 0u32..1000 {
sbf.insert(&i.to_le_bytes());
}
let fpr = sbf.estimate_false_positive_rate();
assert!(fpr < 0.10, "aggregate FPR {fpr:.4} is too high");
}
#[test]
fn test_scalable_bf_clone() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
sbf.insert(b"test");
let sbf2 = sbf.clone();
assert!(sbf2.contains(b"test"));
assert_eq!(sbf2.total_item_count(), 1);
assert_eq!(sbf2.layer_count(), sbf.layer_count());
}
#[test]
fn test_scalable_bf_empty_fpr() {
let sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
assert_eq!(sbf.estimate_false_positive_rate(), 0.0);
}
#[test]
fn test_scalable_bf_set_tightening_ratio() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
sbf.set_tightening_ratio(0.5);
assert!((sbf.tightening_ratio() - 0.5).abs() < f64::EPSILON);
}
#[test]
fn test_scalable_bf_invalid_tightening_ratio_ignored() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
let original = sbf.tightening_ratio();
sbf.set_tightening_ratio(0.0); assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
sbf.set_tightening_ratio(1.0); assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
sbf.set_tightening_ratio(-0.5); assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
}
#[test]
fn test_scalable_bf_growth_factor() {
let sbf = ScalableBloomFilter::new(100, 0.01, 3.0);
assert!((sbf.growth_factor() - 3.0).abs() < f64::EPSILON);
}
#[test]
fn test_scalable_bf_growth_factor_default_when_invalid() {
let sbf = ScalableBloomFilter::new(100, 0.01, 0.5);
assert!((sbf.growth_factor() - 2.0).abs() < f64::EPSILON);
}
#[test]
fn test_scalable_bf_layer_stats() {
let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
for i in 0u32..200 {
sbf.insert(&i.to_le_bytes());
}
let stats = sbf.layer_stats();
assert!(!stats.is_empty());
assert!(stats[0].0 > 0, "first layer should have items");
for (_, bits, _) in &stats {
assert!(*bits > 0);
}
}
#[test]
fn test_scalable_bf_estimated_capacity_remaining() {
let sbf = ScalableBloomFilter::new(1000, 0.01, 2.0);
let remaining = sbf.estimated_capacity_remaining();
assert!(remaining > 0, "fresh filter should have remaining capacity");
}
#[test]
fn test_scalable_bf_estimated_capacity_decreases() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
let before = sbf.estimated_capacity_remaining();
for i in 0u32..50 {
sbf.insert(&i.to_le_bytes());
}
let after = sbf.estimated_capacity_remaining();
assert!(
after < before,
"remaining capacity should decrease after inserts"
);
}
#[test]
fn test_scalable_bf_clear() {
let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
for i in 0u32..50 {
sbf.insert(&i.to_le_bytes());
}
sbf.clear();
assert_eq!(sbf.total_item_count(), 0);
assert_eq!(sbf.layer_count(), 1);
assert!(!sbf.contains(&0u32.to_le_bytes()));
}
#[test]
fn test_scalable_bf_total_bits() {
let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
let bits_single = sbf.total_bits();
for i in 0u32..500 {
sbf.insert(&i.to_le_bytes());
}
let bits_multi = sbf.total_bits();
assert!(
bits_multi > bits_single,
"total bits should increase with layers"
);
}
#[test]
fn test_scalable_bf_tighter_ratio_more_layers() {
let mut sbf_tight = ScalableBloomFilter::new(10, 0.1, 2.0);
sbf_tight.set_tightening_ratio(0.5);
let mut sbf_loose = ScalableBloomFilter::new(10, 0.1, 2.0);
sbf_loose.set_tightening_ratio(0.9);
for i in 0u32..200 {
sbf_tight.insert(&i.to_le_bytes());
sbf_loose.insert(&i.to_le_bytes());
}
for i in 0u32..200 {
assert!(sbf_tight.contains(&i.to_le_bytes()));
assert!(sbf_loose.contains(&i.to_le_bytes()));
}
}
#[test]
fn test_scalable_bf_empirical_fpr() {
let mut sbf = ScalableBloomFilter::new(1000, 0.05, 2.0);
for i in 0u32..1000 {
sbf.insert(&i.to_le_bytes());
}
let mut fps = 0usize;
for i in 10000u32..20000 {
if sbf.contains(&i.to_le_bytes()) {
fps += 1;
}
}
let observed_fpr = fps as f64 / 10000.0;
assert!(
observed_fpr < 0.20,
"observed FPR {observed_fpr:.4} is too high"
);
}
#[test]
fn test_scalable_bf_stress_many_inserts() {
let mut sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
for i in 0u32..10000 {
sbf.insert(&i.to_le_bytes());
}
assert_eq!(sbf.total_item_count(), 10000);
for i in [0u32, 999, 5000, 9999] {
assert!(sbf.contains(&i.to_le_bytes()), "false negative for {i}");
}
}
}