use crate::bloom::NgramBloom;
use crate::error::{Error, Result};
use crate::histogram::ByteHistogram;
use crate::index::BlockIndex;
/// Block size, in bytes, that [`BlockIndexBuilder::new`] starts with (256 KiB).
const DEFAULT_BLOCK_SIZE: usize = 256 * 1024;
/// `bloom_bits` value that [`BlockIndexBuilder::new`] starts with (65,536).
/// Presumably the number of bits in each per-block Bloom filter — confirm
/// against `NgramBloom`.
const DEFAULT_BLOOM_BITS: usize = 65_536;
/// Configurable builder that produces a [`BlockIndex`] from raw bytes.
///
/// Settings are applied with the chaining methods `block_size` and
/// `bloom_bits`; the index itself is produced by `build` or `build_streaming`.
#[derive(Clone, Debug)]
pub struct BlockIndexBuilder {
/// Length in bytes of each block the input is divided into. Building fails
/// unless this is a power of two and at least 256.
block_size: usize,
/// Value handed to `NgramBloom::from_block` for every block; building fails
/// if it is zero. Presumably the per-filter bit count — confirm against
/// `NgramBloom`.
bloom_bits: usize,
}
impl BlockIndexBuilder {
#[must_use]
pub fn new() -> Self {
Self {
block_size: DEFAULT_BLOCK_SIZE,
bloom_bits: DEFAULT_BLOOM_BITS,
}
}
#[must_use]
pub fn block_size(mut self, size: usize) -> Self {
self.block_size = size;
self
}
#[must_use]
pub fn bloom_bits(mut self, bits: usize) -> Self {
self.bloom_bits = bits;
self
}
pub fn build(&self, data: &[u8]) -> Result<BlockIndex> {
validate_block_size(self.block_size)?;
if self.bloom_bits == 0 {
return Err(Error::ZeroBloomBits);
}
let mut histograms = Vec::new();
let mut blooms = Vec::new();
let mut prev_byte = None;
for block in data.chunks(self.block_size) {
histograms.push(ByteHistogram::from_block(block));
let mut bloom = NgramBloom::from_block(block, self.bloom_bits)?;
if let Some(b) = prev_byte {
if let Some(&first) = block.first() {
bloom.insert_ngram(b, first);
}
}
prev_byte = block.last().copied();
blooms.push(bloom);
}
Ok(BlockIndex::new(
self.block_size,
data.len(),
histograms,
blooms,
))
}
pub fn build_streaming<I: Iterator<Item = Vec<u8>>>(&self, blocks: I) -> Result<BlockIndex> {
validate_block_size(self.block_size)?;
if self.bloom_bits == 0 {
return Err(Error::ZeroBloomBits);
}
let mut histograms = Vec::new();
let mut blooms = Vec::new();
let mut total_len = 0_usize;
let mut prev_byte = None;
for block in blocks {
if block.len() != self.block_size {
return Err(Error::UnalignedData {
data_len: block.len(),
block_size: self.block_size,
});
}
total_len += block.len();
histograms.push(ByteHistogram::from_block(&block));
let mut bloom = NgramBloom::from_block(&block, self.bloom_bits)?;
if let Some(b) = prev_byte {
if let Some(&first) = block.first() {
bloom.insert_ngram(b, first);
}
}
prev_byte = block.last().copied();
blooms.push(bloom);
}
Ok(BlockIndex::new(
self.block_size,
total_len,
histograms,
blooms,
))
}
}
/// `Default` yields the same configuration as [`BlockIndexBuilder::new`].
impl Default for BlockIndexBuilder {
/// Returns a builder with the default settings by delegating to `Self::new`.
fn default() -> Self {
Self::new()
}
}
fn validate_block_size(size: usize) -> Result<()> {
if size < 256 || !size.is_power_of_two() {
return Err(Error::InvalidBlockSize { size });
}
Ok(())
}