use bit_vec::BitVec;
use super::hash::{hash_pair, nth_hash};
use crate::ZiftError;
#[derive(Debug, Clone)]
pub struct BloomFilter {
pub(crate) bits: BitVec,
pub(crate) num_hashes: u32,
pub(crate) num_bits: usize,
}
impl BloomFilter {
#[must_use]
#[allow(clippy::cast_precision_loss, clippy::cast_sign_loss)]
pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
let n = expected_items.max(1) as f64;
let p = false_positive_rate.clamp(0.0001, 0.9999);
let m_f64 = (-n * p.ln() / (2.0_f64.ln().powi(2))).ceil();
let m = (if m_f64 > usize::MAX as f64 {
usize::MAX
} else {
#[allow(clippy::cast_possible_truncation)]
let val = m_f64 as usize;
val
})
.max(64);
let k_f64 = ((m as f64 / n) * 2.0_f64.ln()).round();
let k = (if k_f64 > f64::from(u32::MAX) {
u32::MAX
} else {
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
let val = k_f64 as u32;
val
})
.clamp(1, 32);
Self {
bits: BitVec::from_elem(m, false),
num_hashes: k,
num_bits: m,
}
}
#[must_use]
pub fn with_params(num_bits: usize, num_hashes: u32) -> Self {
Self {
bits: BitVec::from_elem(num_bits, false),
num_hashes: num_hashes.clamp(1, 32),
num_bits,
}
}
pub fn insert(&mut self, item: &[u8]) {
let (h1, h2) = hash_pair(item);
for i in 0..self.num_hashes {
let idx = nth_hash(h1, h2, i, self.num_bits);
self.bits.set(idx, true);
}
}
#[must_use]
pub fn may_contain(&self, item: &[u8]) -> bool {
let (h1, h2) = hash_pair(item);
for i in 0..self.num_hashes {
let idx = nth_hash(h1, h2, i, self.num_bits);
if !self.bits.get(idx).unwrap_or(false) {
return false;
}
}
true
}
#[must_use]
pub fn may_contain_any(&self, patterns: &[&[u8]]) -> bool {
patterns.iter().any(|p| self.may_contain(p))
}
pub fn clear(&mut self) {
self.bits.clear();
self.bits.grow(self.num_bits, false);
}
#[must_use]
pub fn num_bits(&self) -> usize {
self.num_bits
}
#[must_use]
pub fn num_hashes(&self) -> u32 {
self.num_hashes
}
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn fill_ratio(&self) -> f64 {
let set_bits = self.bits.iter().filter(|b| *b).count();
set_bits as f64 / self.num_bits as f64
}
#[must_use]
#[allow(clippy::cast_precision_loss, clippy::cast_possible_wrap)]
pub fn estimated_fpr(&self) -> f64 {
let k = f64::from(self.num_hashes);
let m = self.num_bits as f64;
let n = -m / k * (1.0 - self.fill_ratio()).ln();
let pow_hashes = i32::try_from(self.num_hashes).unwrap_or(i32::MAX);
(1.0 - (-k * n / m).exp()).powi(pow_hashes)
}
#[must_use]
pub fn bits(&self) -> &BitVec {
&self.bits
}
pub fn from_bits(bits: BitVec, num_hashes: u32) -> Result<Self, ZiftError> {
let num_bits = bits.len();
if num_bits == 0 {
return Err(ZiftError::InvalidData {
offset: 0,
reason: "BloomFilter requires at least 1 bit".to_string(),
});
}
Ok(Self {
bits,
num_hashes: num_hashes.clamp(1, 32),
num_bits,
})
}
}