use crate::bloom::hash::hash_pair;
#[cfg(test)]
use crate::histogram::ByteHistogram;
#[derive(Clone, Copy, Debug)]
pub struct ByteHistogramRef<'a> {
pub(crate) data: &'a [u8],
}
impl ByteHistogramRef<'_> {
#[must_use]
pub fn count(&self, byte: u8) -> u32 {
let offset = usize::from(byte) * std::mem::size_of::<u32>();
u32::from_le_bytes([
self.data[offset],
self.data[offset + 1],
self.data[offset + 2],
self.data[offset + 3],
])
}
#[cfg(test)]
pub(crate) fn to_owned(self) -> ByteHistogram {
let mut counts = [0_u32; 256];
for byte in u8::MIN..=u8::MAX {
counts[usize::from(byte)] = self.count(byte);
}
ByteHistogram::from_raw_counts(counts)
}
}
#[derive(Clone, Copy, Debug)]
pub struct NgramBloomRef<'a> {
pub(crate) bloom_data: &'a [u8],
pub(crate) exact_pairs_data: Option<&'a [u8]>,
pub(crate) num_bits: usize,
}
impl NgramBloomRef<'_> {
#[must_use]
#[inline]
pub fn num_bits(&self) -> usize {
self.num_bits
}
#[must_use]
#[inline(always)]
pub fn maybe_contains_exact(&self, first: u8, second: u8) -> bool {
if let Some(exact_pairs) = self.exact_pairs_data {
let pair = (usize::from(first) << 8) | usize::from(second);
let word_index = pair >> 6;
let bit_offset = pair & 63;
let offset = word_index * std::mem::size_of::<u64>();
let word = u64::from_le_bytes([
exact_pairs[offset],
exact_pairs[offset + 1],
exact_pairs[offset + 2],
exact_pairs[offset + 3],
exact_pairs[offset + 4],
exact_pairs[offset + 5],
exact_pairs[offset + 6],
exact_pairs[offset + 7],
]);
return (word & (1_u64 << bit_offset)) != 0;
}
self.maybe_contains_bloom(first, second)
}
#[must_use]
#[inline(always)]
#[allow(clippy::cast_possible_truncation)]
pub fn maybe_contains_bloom(&self, first: u8, second: u8) -> bool {
let (h1, h2) = hash_pair(first, second);
let mask = (self.num_bits as u64).wrapping_sub(1);
let idx0 = (h1 & mask) as usize;
let idx1 = (h1.wrapping_add(h2) & mask) as usize;
let idx2 = (h1.wrapping_add(h2.wrapping_mul(2)) & mask) as usize;
self.bit_is_set(idx0) && self.bit_is_set(idx1) && self.bit_is_set(idx2)
}
#[must_use]
#[inline]
pub fn maybe_contains_any(&self, ngrams: &[(u8, u8)]) -> bool {
if ngrams.is_empty() {
return false;
}
if self.uses_exact_pairs() {
ngrams.iter().any(|&(a, b)| self.maybe_contains_exact(a, b))
} else {
ngrams.iter().any(|&(a, b)| self.maybe_contains_bloom(a, b))
}
}
#[must_use]
#[inline]
pub fn maybe_contains_all(&self, ngrams: &[(u8, u8)]) -> bool {
if ngrams.is_empty() {
return true;
}
if self.uses_exact_pairs() {
ngrams.iter().all(|&(a, b)| self.maybe_contains_exact(a, b))
} else {
ngrams.iter().all(|&(a, b)| self.maybe_contains_bloom(a, b))
}
}
#[must_use]
#[inline]
pub(crate) fn uses_exact_pairs(&self) -> bool {
self.exact_pairs_data.is_some()
}
fn bit_is_set(&self, bit_index: usize) -> bool {
let word_index = bit_index / 64;
let bit_offset = bit_index % 64;
(self.bloom_word(word_index) & (1_u64 << bit_offset)) != 0
}
fn bloom_word(&self, word_index: usize) -> u64 {
let offset = word_index * std::mem::size_of::<u64>();
u64::from_le_bytes([
self.bloom_data[offset],
self.bloom_data[offset + 1],
self.bloom_data[offset + 2],
self.bloom_data[offset + 3],
self.bloom_data[offset + 4],
self.bloom_data[offset + 5],
self.bloom_data[offset + 6],
self.bloom_data[offset + 7],
])
}
}