use crate::bloom::NgramBloom;
use crate::histogram::ByteHistogram;
#[derive(Clone, Debug)]
pub struct ByteFilter {
required: [bool; 256],
required_count: usize,
compact_requirements: Vec<Box<[u8]>>,
max_pattern_bytes: usize,
}
impl ByteFilter {
#[must_use]
pub fn new() -> Self {
Self {
required: [false; 256],
required_count: 0,
compact_requirements: Vec::new(),
max_pattern_bytes: 0,
}
}
#[must_use]
#[allow(clippy::cast_possible_truncation)]
pub fn from_patterns(patterns: &[&[u8]]) -> Self {
let mut filter = Self::new();
let mut max_pattern_bytes = 0_usize;
for &pattern in patterns {
if pattern.is_empty() {
continue;
}
max_pattern_bytes = max_pattern_bytes.max(pattern.len());
let single = Self::from_single_pattern(pattern);
for (index, required) in single.required.iter().enumerate() {
if *required && !filter.required[index] {
filter.required[index] = true;
filter.required_count += 1;
}
}
let compact: Box<[u8]> = single
.required
.iter()
.enumerate()
.filter(|(_, &r)| r)
.map(|(i, _)| i as u8)
.collect::<Vec<_>>()
.into_boxed_slice();
filter.compact_requirements.push(compact);
}
filter.max_pattern_bytes = max_pattern_bytes;
filter
}
#[must_use]
#[allow(clippy::cast_possible_truncation)]
pub fn from_single_pattern(pattern: &[u8]) -> Self {
let mut required = [false; 256];
let mut required_count = 0_usize;
for &byte in pattern {
let slot = &mut required[usize::from(byte)];
if !*slot {
*slot = true;
required_count += 1;
}
}
let compact: Box<[u8]> = required
.iter()
.enumerate()
.filter(|(_, &r)| r)
.map(|(i, _)| i as u8)
.collect();
Self {
required,
required_count,
compact_requirements: vec![compact],
max_pattern_bytes: pattern.len(),
}
}
#[must_use]
pub fn matches_histogram(&self, histogram: &ByteHistogram) -> bool {
if self.compact_requirements.is_empty() {
return false;
}
self.compact_requirements
.iter()
.any(|required_bytes| required_bytes.iter().all(|&b| histogram.count(b) > 0))
}
#[must_use]
pub fn required_count(&self) -> usize {
self.required_count
}
#[must_use]
pub fn matches_histogram_pair(&self, h1: &ByteHistogram, h2: &ByteHistogram) -> bool {
if self.compact_requirements.is_empty() {
return false;
}
self.compact_requirements.iter().any(|required_bytes| {
required_bytes
.iter()
.all(|&b| h1.count(b) > 0 || h2.count(b) > 0)
})
}
#[must_use]
pub fn matches_histogram_multi(&self, histograms: &[ByteHistogram]) -> bool {
if self.compact_requirements.is_empty() {
return false;
}
self.compact_requirements.iter().any(|required_bytes| {
required_bytes
.iter()
.all(|&b| histograms.iter().any(|h| h.count(b) > 0))
})
}
pub(crate) fn compact_requirements(&self) -> &[Box<[u8]>] {
&self.compact_requirements
}
#[allow(dead_code)]
pub(crate) fn max_pattern_bytes(&self) -> usize {
self.max_pattern_bytes
}
}
impl Default for ByteFilter {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone, Debug)]
pub struct NgramFilter {
pattern_ngrams: Vec<Vec<(u8, u8)>>,
union_ngrams: Vec<(u8, u8)>,
max_pattern_bytes: usize,
}
impl NgramFilter {
#[must_use]
pub fn from_patterns(patterns: &[&[u8]]) -> Self {
let mut pattern_ngrams = Vec::with_capacity(patterns.len());
let mut prev_pattern: &[u8] = b"";
let mut prev_raw_ngrams: Vec<(u8, u8)> = Vec::new();
for &pattern in patterns {
if pattern.is_empty() {
continue;
}
let lcp = pattern
.iter()
.zip(prev_pattern.iter())
.take_while(|(a, b)| a == b)
.count();
let mut raw_ngrams = if lcp >= 2 {
prev_raw_ngrams[..lcp - 1].to_vec()
} else {
Vec::new()
};
if pattern.len() >= 2 {
let start_idx = if lcp >= 2 { lcp - 1 } else { 0 };
for window in pattern[start_idx..].windows(2) {
raw_ngrams.push((window[0], window[1]));
}
}
prev_pattern = pattern;
prev_raw_ngrams.clone_from(&raw_ngrams);
let mut ngrams = raw_ngrams;
ngrams.sort_unstable();
ngrams.dedup();
pattern_ngrams.push(ngrams);
}
let mut union_ngrams: Vec<(u8, u8)> = pattern_ngrams.iter().flatten().copied().collect();
union_ngrams.sort_unstable();
union_ngrams.dedup();
let max_pattern_bytes = patterns.iter().map(|p| p.len()).max().unwrap_or(0);
Self {
pattern_ngrams,
union_ngrams,
max_pattern_bytes,
}
}
#[inline]
#[must_use]
pub fn matches_bloom(&self, bloom: &NgramBloom) -> bool {
if self.pattern_ngrams.is_empty() {
return false;
}
let any_pattern_has_no_ngrams = self.pattern_ngrams.iter().any(Vec::is_empty);
if !any_pattern_has_no_ngrams
&& !self.union_ngrams.is_empty()
&& !bloom.maybe_contains_any(&self.union_ngrams)
{
return false;
}
if bloom.uses_exact_pairs() {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams
.iter()
.all(|&(first, second)| bloom.maybe_contains_exact(first, second))
})
} else {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams
.iter()
.all(|&(first, second)| bloom.maybe_contains_bloom(first, second))
})
}
}
#[must_use]
pub fn quick_reject(&self, data: &[u8]) -> bool {
if self.pattern_ngrams.is_empty() {
return false;
}
let prefix_len = data.len().min(4096);
let prefix = &data[..prefix_len];
if let Ok(bloom) = crate::bloom::NgramBloom::from_block(prefix, 4096) {
self.matches_bloom(&bloom)
} else {
true
}
}
#[inline]
#[must_use]
pub fn matches_bloom_pair(&self, b1: &NgramBloom, b2: &NgramBloom) -> bool {
if self.pattern_ngrams.is_empty() {
return false;
}
if b1.uses_exact_pairs() && b2.uses_exact_pairs() {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams.iter().all(|&(first, second)| {
b1.maybe_contains_exact(first, second) || b2.maybe_contains_exact(first, second)
})
})
} else {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams.iter().all(|&(first, second)| {
b1.maybe_contains_bloom(first, second) || b2.maybe_contains_bloom(first, second)
})
})
}
}
#[inline]
#[must_use]
pub fn matches_bloom_multi(&self, blooms: &[NgramBloom]) -> bool {
if self.pattern_ngrams.is_empty() || blooms.is_empty() {
return false;
}
let use_exact = blooms.first().is_some_and(NgramBloom::uses_exact_pairs);
if use_exact {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams.iter().all(|&(first, second)| {
blooms
.iter()
.any(|bloom| bloom.maybe_contains_exact(first, second))
})
})
} else {
self.pattern_ngrams.iter().any(|ngrams| {
ngrams.iter().all(|&(first, second)| {
blooms
.iter()
.any(|bloom| bloom.maybe_contains_bloom(first, second))
})
})
}
}
pub(crate) fn pattern_ngrams(&self) -> &[Vec<(u8, u8)>] {
&self.pattern_ngrams
}
pub(crate) fn union_ngrams(&self) -> &[(u8, u8)] {
&self.union_ngrams
}
pub(crate) fn max_pattern_bytes(&self) -> usize {
self.max_pattern_bytes
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum FilterOp {
And,
Or,
}
#[derive(Clone, Debug)]
pub enum CompositeFilter {
Byte(ByteFilter),
Ngram(NgramFilter),
Combine {
left: Box<CompositeFilter>,
right: Box<CompositeFilter>,
op: FilterOp,
},
}
impl CompositeFilter {
#[must_use]
pub fn combine_byte(a: ByteFilter, b: ByteFilter, op: FilterOp) -> Self {
Self::Combine {
left: Box::new(Self::Byte(a)),
right: Box::new(Self::Byte(b)),
op,
}
}
#[must_use]
pub fn combine_ngram(a: NgramFilter, b: NgramFilter, op: FilterOp) -> Self {
Self::Combine {
left: Box::new(Self::Ngram(a)),
right: Box::new(Self::Ngram(b)),
op,
}
}
#[must_use]
pub fn combine(a: Self, b: Self, op: FilterOp) -> Self {
Self::Combine {
left: Box::new(a),
right: Box::new(b),
op,
}
}
#[must_use]
pub fn matches(&self, histogram: &ByteHistogram, bloom: &NgramBloom) -> bool {
match self {
Self::Byte(filter) => filter.matches_histogram(histogram),
Self::Ngram(filter) => filter.matches_bloom(bloom),
Self::Combine { left, right, op } => {
let l = left.matches(histogram, bloom);
match op {
FilterOp::And => l && right.matches(histogram, bloom),
FilterOp::Or => l || right.matches(histogram, bloom),
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ngram_filter_union_short_circuit_skips_when_any_pattern_has_no_bigrams() {
let filter = NgramFilter::from_patterns(&[b"x".as_slice(), b"hello".as_slice()]);
let bloom = NgramBloom::from_block(b"x", 1024).unwrap();
assert!(
filter.matches_bloom(&bloom),
"short pattern should match; long pattern's union must not force rejection"
);
}
#[test]
fn ngram_filter_from_patterns_with_lcp_optimization() {
let pattern1 = b"test_pattern_a".as_slice();
let pattern2 = b"test_pattern_b".as_slice();
let filter = NgramFilter::from_patterns(&[pattern1, pattern2]);
let mut expected_ngrams1: Vec<_> = pattern1.windows(2).map(|w| (w[0], w[1])).collect();
expected_ngrams1.sort_unstable();
expected_ngrams1.dedup();
let mut expected_ngrams2: Vec<_> = pattern2.windows(2).map(|w| (w[0], w[1])).collect();
expected_ngrams2.sort_unstable();
expected_ngrams2.dedup();
assert_eq!(filter.pattern_ngrams().len(), 2);
assert_eq!(filter.pattern_ngrams()[0], expected_ngrams1);
assert_eq!(filter.pattern_ngrams()[1], expected_ngrams2);
}
}