#![deny(unsafe_op_in_unsafe_fn)]
pub struct BigramBloom {
bits: Box<[u64; 1024]>,
saturated: bool,
}
const SATURATION_NUMERATOR: u32 = 3;
const SATURATION_DENOMINATOR: u32 = 5;
const TABLE_SLOTS: u32 = 65536;
impl Clone for BigramBloom {
fn clone(&self) -> Self {
Self {
bits: Box::new(*self.bits),
saturated: self.saturated,
}
}
}
impl BigramBloom {
pub fn empty() -> Self {
Self {
bits: Box::new([0; 1024]),
saturated: false,
}
}
pub fn insert_all(&mut self, bytes: &[u8]) {
for window in bytes.windows(2) {
self.insert(window[0], window[1]);
}
self.recompute_saturation();
}
#[inline]
fn insert(&mut self, a: u8, b: u8) {
let idx = bigram_slot(a, b);
self.bits[idx >> 6] |= 1u64 << (idx & 63);
}
#[inline]
fn insert_row(&mut self, a: u8) {
let word = (a as usize) << 2;
self.bits[word] = u64::MAX;
self.bits[word + 1] = u64::MAX;
self.bits[word + 2] = u64::MAX;
self.bits[word + 3] = u64::MAX;
}
pub fn from_literal_prefixes(literals: &[String]) -> Self {
let mut bloom = Self::empty();
for literal in literals {
let bytes = literal.as_bytes();
if bytes.is_empty() {
continue;
}
if bytes.len() < 2 {
bloom.insert_row(bytes[0]);
continue;
}
bloom.insert_all(bytes);
let Some(&last) = bytes.last() else { continue };
bloom.insert_row(last);
}
bloom.recompute_saturation();
bloom
}
fn recompute_saturation(&mut self) {
self.saturated = self.popcount() as u64 * SATURATION_DENOMINATOR as u64
>= TABLE_SLOTS as u64 * SATURATION_NUMERATOR as u64;
}
pub fn maybe_overlaps(&self, chunk: &[u8]) -> bool {
if chunk.len() < 2 {
return true;
}
if self.saturated {
return true;
}
let bits = self.bits.as_ref();
let last_start = chunk.len() - 2; let probe = |i: usize| -> bool {
let idx = bigram_slot(chunk[i], chunk[i + 1]);
bits[idx >> 6] & (1u64 << (idx & 63)) != 0
};
let mut i = 0usize;
while i + 4 <= last_start + 1 {
if probe(i) | probe(i + 1) | probe(i + 2) | probe(i + 3) {
return true;
}
i += 4;
}
while i <= last_start {
if probe(i) {
return true;
}
i += 1;
}
false
}
pub fn popcount(&self) -> u32 {
self.bits.iter().map(|w| w.count_ones()).sum()
}
pub fn is_saturated(&self) -> bool {
self.saturated
}
#[doc(hidden)]
pub fn scalar_overlaps_reference(&self, chunk: &[u8]) -> bool {
if chunk.len() < 2 {
return true;
}
chunk.windows(2).any(|w| {
let idx = bigram_slot(w[0], w[1]);
self.bits[idx >> 6] & (1u64 << (idx & 63)) != 0
})
}
#[doc(hidden)]
pub fn saturated_for_test() -> Self {
let mut bloom = Self::empty();
for a in 0u16..158 {
bloom.insert_row(a as u8);
}
bloom.recompute_saturation();
bloom
}
}
#[inline(always)]
fn bigram_slot(a: u8, b: u8) -> usize {
((a as usize) << 8) | (b as usize)
}