use std::collections::HashMap;
use std::hash::{BuildHasher, BuildHasherDefault, Hasher};
use crate::compiler::{CompiledDetection, CompiledRule};
use crate::event::{Event, EventValue};
use crate::matcher::CompiledMatcher;
pub(crate) const NGRAM_SIZE: usize = 3;
pub(crate) const MAX_BLOOM_SCAN_BYTES: usize = 4096;
pub(crate) const DEFAULT_MAX_TOTAL_BYTES: usize = 1024 * 1024;
const TARGET_FPR: f64 = 0.01;
const MIN_BITS_PER_PATTERN: usize = 16;
const MAX_BYTES_PER_FIELD: usize = 64 * 1024;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum BloomVerdict {
DefinitelyNoMatch,
MaybeMatch,
}
pub(crate) trait BloomLookup {
fn verdict_for_field(&self, field: &str) -> BloomVerdict;
}
pub(crate) struct NoBloom;
impl BloomLookup for NoBloom {
#[inline(always)]
fn verdict_for_field(&self, _field: &str) -> BloomVerdict {
BloomVerdict::MaybeMatch
}
}
struct FieldBloom {
bits: Vec<u64>,
num_bits: usize,
num_hashes: u32,
hasher_factory: BuildHasherDefault<ahash::AHasher>,
}
impl FieldBloom {
fn new_with_capacity(n_items: usize) -> Option<Self> {
if n_items == 0 {
return None;
}
let m_ideal = (-(n_items as f64) * TARGET_FPR.ln() / 2.0_f64.ln().powi(2)).ceil() as usize;
let mut num_bits = m_ideal.max(MIN_BITS_PER_PATTERN * n_items);
num_bits = num_bits.div_ceil(64) * 64;
let max_bits = MAX_BYTES_PER_FIELD * 8;
if num_bits > max_bits {
num_bits = max_bits;
}
if num_bits / 8 < n_items.div_ceil(8) {
return None;
}
let k_ideal = ((num_bits as f64 / n_items as f64) * 2.0_f64.ln()).round() as u32;
let num_hashes = k_ideal.clamp(2, 12);
Some(Self {
bits: vec![0u64; num_bits / 64],
num_bits,
num_hashes,
hasher_factory: BuildHasherDefault::default(),
})
}
fn byte_size(&self) -> usize {
self.bits.len() * 8
}
fn insert_trigram(&mut self, trigram: &[u8]) {
let (h1, h2) = self.hash_pair(trigram);
for i in 0..self.num_hashes as u64 {
let pos = (h1.wrapping_add(i.wrapping_mul(h2))) as usize % self.num_bits;
self.bits[pos / 64] |= 1 << (pos % 64);
}
}
fn contains_trigram(&self, trigram: &[u8]) -> bool {
let (h1, h2) = self.hash_pair(trigram);
for i in 0..self.num_hashes as u64 {
let pos = (h1.wrapping_add(i.wrapping_mul(h2))) as usize % self.num_bits;
if self.bits[pos / 64] & (1 << (pos % 64)) == 0 {
return false;
}
}
true
}
fn hash_pair(&self, trigram: &[u8]) -> (u64, u64) {
let mut h1 = self.hasher_factory.build_hasher();
h1.write_u8(0xA1);
h1.write(trigram);
let mut h2 = self.hasher_factory.build_hasher();
h2.write_u8(0xB2);
h2.write(trigram);
(h1.finish(), h2.finish())
}
}
pub(crate) struct FieldBloomIndex {
filters: HashMap<String, FieldBloom>,
}
impl FieldBloomIndex {
pub(crate) fn empty() -> Self {
Self {
filters: HashMap::new(),
}
}
pub(crate) fn build(rules: &[CompiledRule]) -> Self {
Self::build_with_budget(rules, DEFAULT_MAX_TOTAL_BYTES)
}
pub(crate) fn build_with_budget(rules: &[CompiledRule], max_total_bytes: usize) -> Self {
let mut field_needles: HashMap<String, Vec<String>> = HashMap::new();
for rule in rules {
for detection in rule.detections.values() {
collect_positive_substring_needles(detection, &mut field_needles);
}
}
struct Built {
field: String,
bloom: FieldBloom,
n_patterns: usize,
}
let mut built: Vec<Built> = field_needles
.into_iter()
.filter_map(|(field, mut needles)| {
needles.sort();
needles.dedup();
let mut trigram_set: std::collections::HashSet<[u8; NGRAM_SIZE]> =
std::collections::HashSet::new();
for needle in &needles {
let bytes = needle.as_bytes();
if bytes.len() < NGRAM_SIZE {
continue;
}
for window in bytes.windows(NGRAM_SIZE) {
let mut buf = [0u8; NGRAM_SIZE];
buf.copy_from_slice(window);
trigram_set.insert(buf);
}
}
let n_trigrams = trigram_set.len();
let mut bloom = FieldBloom::new_with_capacity(n_trigrams)?;
for needle in &needles {
insert_needle_trigrams(&mut bloom, needle);
}
Some(Built {
field,
bloom,
n_patterns: needles.len(),
})
})
.collect();
let mut total: usize = built.iter().map(|b| b.bloom.byte_size()).sum();
if total > max_total_bytes {
built.sort_by(|a, b| {
let da = a.bloom.byte_size() as f64 / a.n_patterns.max(1) as f64;
let db = b.bloom.byte_size() as f64 / b.n_patterns.max(1) as f64;
db.partial_cmp(&da).unwrap_or(std::cmp::Ordering::Equal)
});
while total > max_total_bytes {
if let Some(victim) = built.pop() {
total = total.saturating_sub(victim.bloom.byte_size());
} else {
break;
}
}
}
let filters = built
.into_iter()
.map(|b| (b.field, b.bloom))
.collect::<HashMap<_, _>>();
Self { filters }
}
#[cfg(test)]
pub(crate) fn field_count(&self) -> usize {
self.filters.len()
}
#[cfg(test)]
pub(crate) fn total_bytes(&self) -> usize {
self.filters.values().map(FieldBloom::byte_size).sum()
}
pub(crate) fn probe(&self, field: &str, value: &str) -> BloomVerdict {
let Some(bloom) = self.filters.get(field) else {
return BloomVerdict::MaybeMatch;
};
if value.len() < NGRAM_SIZE {
return BloomVerdict::MaybeMatch;
}
if value.len() > MAX_BLOOM_SCAN_BYTES {
return BloomVerdict::MaybeMatch;
}
let lowered = crate::matcher::ascii_lowercase_cow(value);
let bytes = lowered.as_bytes();
for window in bytes.windows(NGRAM_SIZE) {
if bloom.contains_trigram(window) {
return BloomVerdict::MaybeMatch;
}
}
BloomVerdict::DefinitelyNoMatch
}
}
pub(crate) struct BloomCache<'a, E: Event> {
index: &'a FieldBloomIndex,
event: &'a E,
cache: std::cell::RefCell<HashMap<String, BloomVerdict>>,
}
impl<'a, E: Event> BloomCache<'a, E> {
pub(crate) fn new(index: &'a FieldBloomIndex, event: &'a E) -> Self {
Self {
index,
event,
cache: std::cell::RefCell::new(HashMap::new()),
}
}
}
impl<E: Event> BloomLookup for BloomCache<'_, E> {
fn verdict_for_field(&self, field: &str) -> BloomVerdict {
if let Some(v) = self.cache.borrow().get(field) {
return *v;
}
let verdict = match self.event.get_field(field) {
Some(EventValue::Str(s)) => self.index.probe(field, &s),
_ => BloomVerdict::MaybeMatch,
};
self.cache.borrow_mut().insert(field.to_string(), verdict);
verdict
}
}
pub(crate) fn is_positive_substring_matcher(matcher: &CompiledMatcher) -> bool {
match matcher {
CompiledMatcher::Contains { .. }
| CompiledMatcher::StartsWith { .. }
| CompiledMatcher::EndsWith { .. }
| CompiledMatcher::AhoCorasickSet { .. } => true,
CompiledMatcher::AnyOf(children) => children.iter().all(is_positive_substring_matcher),
CompiledMatcher::CaseInsensitiveGroup { children, .. } => {
children.iter().all(is_positive_substring_matcher)
}
_ => false,
}
}
fn collect_positive_substring_needles(
detection: &CompiledDetection,
out: &mut HashMap<String, Vec<String>>,
) {
match detection {
CompiledDetection::AllOf(items) => {
for item in items {
if let Some(field) = &item.field {
extract_from_matcher(&item.matcher, field, false, out);
}
}
}
CompiledDetection::AnyOf(subs) => {
for sub in subs {
collect_positive_substring_needles(sub, out);
}
}
CompiledDetection::Keywords(_) => {
}
}
}
fn extract_from_matcher(
m: &CompiledMatcher,
field: &str,
negated: bool,
out: &mut HashMap<String, Vec<String>>,
) {
if negated {
return;
}
match m {
CompiledMatcher::Contains { value, .. }
| CompiledMatcher::StartsWith { value, .. }
| CompiledMatcher::EndsWith { value, .. } => {
out.entry(field.to_string())
.or_default()
.push(value.clone());
}
CompiledMatcher::AhoCorasickSet { needles, .. } => {
let entry = out.entry(field.to_string()).or_default();
entry.extend(needles.iter().cloned());
}
CompiledMatcher::AnyOf(children) | CompiledMatcher::AllOf(children) => {
for child in children {
extract_from_matcher(child, field, negated, out);
}
}
CompiledMatcher::CaseInsensitiveGroup { children, .. } => {
for child in children {
extract_from_matcher(child, field, negated, out);
}
}
CompiledMatcher::Not(inner) => {
extract_from_matcher(inner, field, true, out);
}
_ => {}
}
}
fn insert_needle_trigrams(bloom: &mut FieldBloom, needle: &str) {
let bytes = needle.as_bytes();
if bytes.len() < NGRAM_SIZE {
return;
}
for window in bytes.windows(NGRAM_SIZE) {
bloom.insert_trigram(window);
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Engine;
use rsigma_parser::parse_sigma_yaml;
fn engine_from(yaml: &str) -> Engine {
let collection = parse_sigma_yaml(yaml).unwrap();
let mut engine = Engine::new();
engine.add_collection(&collection).unwrap();
engine
}
fn bloom_from(yaml: &str) -> FieldBloomIndex {
let engine = engine_from(yaml);
FieldBloomIndex::build(engine.rules())
}
#[test]
fn empty_when_no_positive_substring_rules() {
let yaml = r#"
title: Exact Only
logsource:
product: windows
detection:
selection:
EventType: 'login'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.field_count(), 0);
}
#[test]
fn populates_filter_for_contains_field() {
let yaml = r#"
title: Contains Field
logsource:
product: windows
detection:
selection:
CommandLine|contains:
- 'whoami'
- 'mimikatz'
- 'powershell'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.field_count(), 1);
assert_eq!(
bloom.probe("CommandLine", "execute whoami /all"),
BloomVerdict::MaybeMatch
);
let mut rejected = 0usize;
let mut total = 0usize;
for a in b'0'..=b'9' {
for b in b'0'..=b'9' {
for c in b'0'..=b'9' {
total += 1;
let s = std::str::from_utf8(&[a, b, c]).unwrap().to_string();
if bloom.probe("CommandLine", &s) == BloomVerdict::DefinitelyNoMatch {
rejected += 1;
}
}
}
}
assert!(
rejected * 100 >= total * 95,
"expected >= 95% rejection on digit-only trigrams; got {rejected}/{total}"
);
}
#[test]
fn negated_contains_does_not_contribute_needles() {
let yaml = r#"
title: Negated Contains
logsource:
product: windows
detection:
selection:
CommandLine|contains|not: 'whoami'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.field_count(), 0);
}
#[test]
fn unrelated_field_falls_through_to_maybe_match() {
let yaml = r#"
title: Some Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.probe("User", "anything"), BloomVerdict::MaybeMatch);
}
#[test]
fn skips_haystacks_below_ngram_size() {
let yaml = r#"
title: Some Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'foo'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.probe("CommandLine", "ab"), BloomVerdict::MaybeMatch);
}
#[test]
fn skips_haystacks_above_max_scan_bytes() {
let yaml = r#"
title: Some Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'foo'
condition: selection
"#;
let bloom = bloom_from(yaml);
let huge = "x".repeat(MAX_BLOOM_SCAN_BYTES + 1);
assert_eq!(bloom.probe("CommandLine", &huge), BloomVerdict::MaybeMatch);
}
#[test]
fn ahocorasick_needles_contribute_to_bloom() {
let yaml = r#"
title: AC Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains:
- 'mimikatz'
- 'powershell'
- 'rundll32'
- 'regsvr32'
- 'certutil'
- 'bitsadmin'
- 'mshta'
- 'wscript'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(bloom.field_count(), 1);
assert_eq!(
bloom.probe("CommandLine", "rundll32.exe foo"),
BloomVerdict::MaybeMatch
);
assert_eq!(
bloom.probe("CommandLine", "012"),
BloomVerdict::DefinitelyNoMatch
);
}
#[test]
fn case_insensitive_probe() {
let yaml = r#"
title: CI
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
"#;
let bloom = bloom_from(yaml);
assert_eq!(
bloom.probe("CommandLine", "execute WHOAMI /all"),
BloomVerdict::MaybeMatch
);
}
#[test]
fn memory_budget_drops_lowest_density_fields_first() {
let mut rules = String::new();
for i in 0..50 {
rules.push_str(&format!(
"title: R{i}\n\
id: r-{i:03}\n\
logsource:\n\
\x20 product: windows\n\
detection:\n\
\x20 selection:\n\
\x20 Field{i}|contains: 'foo'\n\
\x20 condition: selection\n\
---\n",
));
}
let collection = parse_sigma_yaml(&rules).unwrap();
let mut engine = Engine::new();
engine.add_collection(&collection).unwrap();
let bloom = FieldBloomIndex::build_with_budget(engine.rules(), 100);
assert!(bloom.total_bytes() <= 100);
assert!(
bloom.field_count() < 50,
"expected eviction; got {} fields, {} bytes",
bloom.field_count(),
bloom.total_bytes()
);
let big = FieldBloomIndex::build_with_budget(engine.rules(), 1024);
assert_eq!(big.field_count(), 50);
}
}