pub struct BloomFilter {
bits: Vec<u8>,
num_bits: usize,
num_hash_functions: usize,
}
pub struct PrefixBloomFilter {
filter: BloomFilter,
max_prefix_len: usize,
}
impl PrefixBloomFilter {
pub fn new(bits_per_key: usize, num_keys: usize, max_prefix_len: usize) -> Self {
let effective_keys = num_keys * max_prefix_len;
Self {
filter: BloomFilter::new(bits_per_key, effective_keys),
max_prefix_len,
}
}
pub fn with_rate(
false_positive_rate: f64,
expected_items: usize,
max_prefix_len: usize,
) -> Self {
let effective_items = expected_items * max_prefix_len;
Self {
filter: BloomFilter::with_rate(false_positive_rate, effective_items),
max_prefix_len,
}
}
#[inline]
pub fn insert(&mut self, key: &[u8]) {
let max_len = std::cmp::min(key.len(), self.max_prefix_len);
for len in 1..=max_len {
self.filter.insert(&key[..len]);
}
}
#[inline]
pub fn may_contain_prefix(&self, prefix: &[u8]) -> bool {
if prefix.is_empty() {
return true; }
let check_len = std::cmp::min(prefix.len(), self.max_prefix_len);
self.filter.contains(&prefix[..check_len])
}
pub fn as_bytes(&self) -> &[u8] {
self.filter.as_bytes()
}
pub fn from_bytes(bits: Vec<u8>, bits_per_key: usize, max_prefix_len: usize) -> Self {
Self {
filter: BloomFilter::from_bytes(bits, bits_per_key),
max_prefix_len,
}
}
pub fn metadata(&self) -> (usize, usize, usize) {
let (num_hash, num_bits) = self.filter.metadata();
(num_hash, num_bits, self.max_prefix_len)
}
}
impl BloomFilter {
pub fn new(bits_per_key: usize, num_keys: usize) -> Self {
let num_bits = std::cmp::max(64, bits_per_key * num_keys);
let num_bytes = (num_bits + 7) / 8;
let num_hash_functions = std::cmp::max(1, (bits_per_key as f64 * 0.69) as usize);
Self {
bits: vec![0; num_bytes],
num_bits,
num_hash_functions,
}
}
pub fn with_rate(false_positive_rate: f64, expected_items: usize) -> Self {
let bits_per_key = (-false_positive_rate.ln() / (2.0_f64.ln().powi(2))) * 1.44;
Self::new(bits_per_key.ceil() as usize, expected_items)
}
#[inline]
pub fn insert(&mut self, key: &[u8]) {
let h = gxhash::gxhash64(key, 0);
let h1 = h as usize;
let h2 = (h >> 32) as usize;
for i in 0..self.num_hash_functions {
let bit_pos = h1.wrapping_add(i.wrapping_mul(h2)) % self.num_bits;
self.bits[bit_pos / 8] |= 1 << (bit_pos % 8);
}
}
#[inline]
pub fn contains(&self, key: &[u8]) -> bool {
let h = gxhash::gxhash64(key, 0);
let h1 = h as usize;
let h2 = (h >> 32) as usize;
for i in 0..self.num_hash_functions {
let bit_pos = h1.wrapping_add(i.wrapping_mul(h2)) % self.num_bits;
if (self.bits[bit_pos / 8] & (1 << (bit_pos % 8))) == 0 {
return false;
}
}
true
}
pub fn as_bytes(&self) -> &[u8] {
&self.bits
}
pub fn from_bytes(bits: Vec<u8>, bits_per_key: usize) -> Self {
let num_bits = bits.len() * 8;
let num_hash_functions = std::cmp::max(1, (bits_per_key as f64 * 0.69) as usize);
Self {
bits,
num_bits,
num_hash_functions,
}
}
pub fn metadata(&self) -> (usize, usize) {
(self.num_hash_functions, self.num_bits)
}
}