use crate::bloom::BloomFilter;
use crate::{CompressedBlock, CompressionFormat};
#[derive(Debug)]
pub(crate) struct BlockWithBloom {
pub(crate) block: CompressedBlock,
pub(crate) bloom: BloomFilter,
}
#[derive(Debug)]
pub struct CompressedIndex {
pub(crate) format: CompressionFormat,
pub(crate) blocks: Vec<BlockWithBloom>,
}
impl CompressedIndex {
fn pattern_might_contain(bloom: &BloomFilter, pattern: &[u8]) -> bool {
if pattern.len() <= 4 {
bloom.may_contain(pattern)
} else {
pattern.windows(4).any(|window| bloom.may_contain(window))
}
}
#[must_use]
pub fn candidate_blocks(&self, pattern: &[u8]) -> Vec<usize> {
if pattern.is_empty() {
return (0..self.blocks.len()).collect();
}
self.blocks
.iter()
.enumerate()
.filter(|(_, bwb)| Self::pattern_might_contain(&bwb.bloom, pattern))
.map(|(idx, _)| idx)
.collect()
}
pub fn candidate_blocks_iter<'a>(
&'a self,
pattern: &'a [u8],
) -> impl Iterator<Item = usize> + 'a {
self.blocks
.iter()
.enumerate()
.filter(move |(_, bwb)| Self::pattern_might_contain(&bwb.bloom, pattern))
.map(|(idx, _)| idx)
}
#[must_use]
pub fn get_block(&self, idx: usize) -> Option<&CompressedBlock> {
self.blocks.get(idx).map(|bwb| &bwb.block)
}
#[must_use]
pub fn block_count(&self) -> usize {
self.blocks.len()
}
#[must_use]
pub fn format(&self) -> CompressionFormat {
self.format
}
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn bloom_stats(&self) -> Option<BloomStats> {
let num_blocks = self.blocks.len();
if num_blocks == 0 {
return None;
}
let total_bits: usize = self.blocks.iter().map(|bwb| bwb.bloom.num_bits()).sum();
let total_hashes: u32 = self
.blocks
.iter()
.map(|bwb| bwb.bloom.num_hashes())
.sum::<u32>()
/ u32::try_from(num_blocks).unwrap_or(u32::MAX);
let avg_fill: f64 = self
.blocks
.iter()
.map(|bwb| bwb.bloom.fill_ratio())
.sum::<f64>()
/ num_blocks as f64;
let avg_fpr: f64 = self
.blocks
.iter()
.map(|bwb| bwb.bloom.estimated_fpr())
.sum::<f64>()
/ num_blocks as f64;
Some(BloomStats {
num_bits: total_bits,
num_hashes: total_hashes,
fill_ratio: avg_fill,
estimated_fpr: avg_fpr,
})
}
#[must_use]
#[allow(clippy::cast_precision_loss, clippy::cast_possible_wrap)]
pub fn estimated_fpr(&self, num_items: usize) -> f64 {
if self.blocks.is_empty() {
return 0.0;
}
let k = f64::from(self.blocks[0].bloom.num_hashes());
let m = self.blocks[0].bloom.num_bits() as f64;
let n = num_items as f64;
let pow_hashes = i32::try_from(self.blocks[0].bloom.num_hashes()).unwrap_or(i32::MAX);
(1.0 - (-k * n / m).exp()).powi(pow_hashes)
}
}
#[derive(Debug, Clone, Copy)]
pub struct BloomStats {
pub num_bits: usize,
pub num_hashes: u32,
pub fill_ratio: f64,
pub estimated_fpr: f64,
}
#[cfg(test)]
mod tests {
use super::*;
use crate::bloom::BloomFilter;
#[test]
fn test_pattern_might_contain_false_negative_limitation() {
let mut bloom = BloomFilter::new(100, 0.01);
bloom.insert(b"AB");
bloom.insert(b"CD");
assert!(!CompressedIndex::pattern_might_contain(&bloom, b"ABCD"));
}
}