use std::collections::HashMap;
use daachorse::DoubleArrayAhoCorasick;
use rsigma_parser::ConditionExpr;
use crate::compiler::{CompiledDetection, CompiledRule};
use crate::event::{Event, EventValue};
use crate::matcher::CompiledMatcher;
/// Upper bound on the number of distinct needles indexed for a single field.
///
/// `CrossRuleAcIndex::build` skips (does not build an automaton for) any field
/// whose distinct needle count is strictly greater than this value.
pub(crate) const MAX_PATTERNS_PER_FIELD: usize = 100_000;
/// Aho-Corasick automaton for one event field, plus the mapping from each
/// pattern back to the rules that contributed it.
struct FieldAc {
/// Automaton over every needle collected for the field. The value reported
/// for a match is used as an index into `pattern_to_rules`.
automaton: DoubleArrayAhoCorasick<u32>,
/// For each pattern id, the sorted, de-duplicated indices of the rules that
/// use that pattern on this field.
pattern_to_rules: Vec<Vec<u32>>,
}
/// Index of substring needles shared across rules, grouped by event field.
///
/// One automaton is kept per field so that a single scan of a field value
/// reports every rule that has at least one needle occurring in it.
pub(crate) struct CrossRuleAcIndex {
/// Field name -> automaton and pattern-to-rule mapping for that field.
per_field: HashMap<String, FieldAc>,
/// Number of rules the index was built from; `mark_hits` debug-asserts that
/// the caller's hit buffer has exactly this length.
rule_count: usize,
}
impl CrossRuleAcIndex {
    /// Creates an index that holds no automata and covers zero rules.
    pub(crate) fn empty() -> Self {
        let per_field = HashMap::new();
        Self {
            per_field,
            rule_count: 0,
        }
    }

    /// Returns `true` when no field has an automaton.
    pub(crate) fn is_empty(&self) -> bool {
        self.per_field.is_empty()
    }

    /// Builds the index from `rules`.
    ///
    /// Needles are gathered per field from every detection of every rule, then
    /// one automaton is built per field. A field is left out of the index when
    /// it has more than `MAX_PATTERNS_PER_FIELD` distinct needles or when the
    /// automaton fails to build; both cases are logged.
    pub(crate) fn build(rules: &[CompiledRule]) -> Self {
        // field name -> needle -> indices of the rules that use the needle
        let mut needles_by_field: HashMap<String, HashMap<String, Vec<u32>>> = HashMap::new();
        for (position, rule) in rules.iter().enumerate() {
            // Positions that do not fit in a u32 saturate to u32::MAX.
            let rule_id = u32::try_from(position).unwrap_or(u32::MAX);
            for detection in rule.detections.values() {
                collect_rule_needles(detection, rule_id, &mut needles_by_field);
            }
        }

        let mut automata: HashMap<String, FieldAc> = HashMap::new();
        for (field, needle_to_rules) in needles_by_field {
            let pattern_count = needle_to_rules.len();
            if pattern_count == 0 {
                continue;
            }
            if pattern_count > MAX_PATTERNS_PER_FIELD {
                log::debug!(
                    "cross-rule AC: field '{field}' has {} patterns (> {MAX_PATTERNS_PER_FIELD}); falling back",
                    pattern_count
                );
                continue;
            }

            // Sort by needle so pattern ids do not depend on hash-map order.
            let mut sorted: Vec<(String, Vec<u32>)> = needle_to_rules.into_iter().collect();
            sorted.sort_by(|(lhs, _), (rhs, _)| lhs.cmp(rhs));

            // Position i in `patterns` is the pattern id whose rules live at
            // position i in `pattern_to_rules`.
            let (patterns, pattern_to_rules): (Vec<String>, Vec<Vec<u32>>) = sorted
                .into_iter()
                .map(|(pattern, mut rule_ids)| {
                    rule_ids.sort_unstable();
                    rule_ids.dedup();
                    (pattern, rule_ids)
                })
                .unzip();

            let automaton = match DoubleArrayAhoCorasick::<u32>::new(&patterns) {
                Ok(automaton) => automaton,
                Err(e) => {
                    log::warn!(
                        "cross-rule AC: failed to build automaton for field '{field}' ({} patterns): {e}",
                        patterns.len()
                    );
                    continue;
                }
            };
            automata.insert(
                field,
                FieldAc {
                    automaton,
                    pattern_to_rules,
                },
            );
        }

        Self {
            per_field: automata,
            rule_count: rules.len(),
        }
    }

    /// Number of fields that have an automaton (test-only helper).
    #[cfg(test)]
    pub(crate) fn field_count(&self) -> usize {
        self.per_field.len()
    }

    /// Sets `hits[i] = true` for every rule `i` that has at least one indexed
    /// needle occurring in the matching field of `event`.
    ///
    /// Only string field values are scanned, after ASCII lowercasing. Entries
    /// of `hits` are never reset to `false`, and rule indices outside `hits`
    /// are ignored. In debug builds `hits.len()` must equal the number of rules
    /// the index was built from.
    pub(crate) fn mark_hits<E: Event>(&self, event: &E, hits: &mut [bool]) {
        debug_assert_eq!(hits.len(), self.rule_count);
        for (field, ac) in &self.per_field {
            if let Some(EventValue::Str(text)) = event.get_field(field) {
                let haystack = crate::matcher::ascii_lowercase_cow(&text);
                // Overlapping search: every needle occurrence counts, not just
                // a leftmost/longest one.
                let matched_rules = ac
                    .automaton
                    .find_overlapping_iter(haystack.as_bytes())
                    .filter_map(|found| ac.pattern_to_rules.get(found.value() as usize))
                    .flatten();
                for &rule_id in matched_rules {
                    if let Some(slot) = hits.get_mut(rule_id as usize) {
                        *slot = true;
                    }
                }
            }
        }
    }
}
/// Walks `detection` and records, in `out`, every indexable needle it
/// contains, attributed to `rule_idx`.
///
/// `out` maps field name -> needle -> rule indices. Keyword detections and
/// items without a field name contribute nothing.
fn collect_rule_needles(
    detection: &CompiledDetection,
    rule_idx: u32,
    out: &mut HashMap<String, HashMap<String, Vec<u32>>>,
) {
    match detection {
        // Keywords are not tied to a field, so there is nothing to index.
        CompiledDetection::Keywords(_) => {}
        CompiledDetection::AnyOf(subs) => {
            for nested in subs {
                collect_rule_needles(nested, rule_idx, out);
            }
        }
        CompiledDetection::AllOf(items) => {
            let fielded = items
                .iter()
                .filter_map(|item| item.field.as_ref().map(|field| (field, &item.matcher)));
            for (field, matcher) in fielded {
                extract_from_matcher(matcher, field, false, rule_idx, out);
            }
        }
    }
}
/// Records the substring needles found in matcher `m` for `field`.
///
/// Needles come from `Contains`, `StartsWith`, `EndsWith` and
/// `AhoCorasickSet` matchers, reached through `AnyOf`, `AllOf` and
/// `CaseInsensitiveGroup` containers. Anything beneath a `Not` is ignored, as
/// is every other matcher kind.
fn extract_from_matcher(
    m: &CompiledMatcher,
    field: &str,
    negated: bool,
    rule_idx: u32,
    out: &mut HashMap<String, HashMap<String, Vec<u32>>>,
) {
    // Past this guard `negated` is known to be false, so recursion into
    // containers passes `false` explicitly.
    if !negated {
        match m {
            // Entering a negation: the recursive call returns immediately.
            CompiledMatcher::Not(inner) => {
                extract_from_matcher(inner, field, true, rule_idx, out);
            }
            CompiledMatcher::AhoCorasickSet { needles, .. } => {
                for needle in needles {
                    push_needle(out, field, needle, rule_idx);
                }
            }
            CompiledMatcher::Contains { value, .. }
            | CompiledMatcher::StartsWith { value, .. }
            | CompiledMatcher::EndsWith { value, .. } => {
                push_needle(out, field, value, rule_idx);
            }
            CompiledMatcher::CaseInsensitiveGroup { children, .. } => {
                for child in children {
                    extract_from_matcher(child, field, false, rule_idx, out);
                }
            }
            CompiledMatcher::AnyOf(children) | CompiledMatcher::AllOf(children) => {
                for child in children {
                    extract_from_matcher(child, field, false, rule_idx, out);
                }
            }
            _ => {}
        }
    }
}
/// Registers `needle` as a pattern for `field`, owned by rule `rule_idx`.
///
/// Empty needles are ignored. The needle is stored ASCII-lowercased because
/// the haystack is ASCII-lowercased before it is scanned: a needle that kept
/// an uppercase ASCII letter could never be found. Lowercasing is a no-op for
/// needles that are already lowercase, and for any others it only widens the
/// set of candidate hits, never narrows it.
///
/// * `out` - field name -> needle -> rule indices accumulator.
/// * `field` - event field the needle applies to.
/// * `needle` - substring to index.
/// * `rule_idx` - index of the rule that uses the needle.
fn push_needle(
    out: &mut HashMap<String, HashMap<String, Vec<u32>>>,
    field: &str,
    needle: &str,
    rule_idx: u32,
) {
    if needle.is_empty() {
        return;
    }
    out.entry(field.to_string())
        .or_default()
        .entry(needle.to_ascii_lowercase())
        .or_default()
        .push(rule_idx);
}
/// Decides whether `rule` may be skipped when the cross-rule index reports no
/// hit for it.
///
/// Returns `true` only when all of the following hold:
/// * the rule has at least one detection;
/// * every detection consists solely of positive substring matchers;
/// * at least one substring matcher was seen;
/// * no condition expression contains a negation.
pub(crate) fn rule_is_ac_prunable(rule: &CompiledRule) -> bool {
    let mut saw_substring = false;
    let detections_are_pure = !rule.detections.is_empty()
        && rule.detections.values().all(|detection| {
            let mut seen_here = false;
            let pure = detection_is_pure_positive_substring(detection, &mut seen_here);
            saw_substring |= seen_here;
            pure
        });

    detections_are_pure
        && saw_substring
        && rule.conditions.iter().all(condition_is_negation_free)
}
/// Returns `true` when `detection` is built exclusively from positive
/// substring matchers on named fields.
///
/// Empty `AllOf`/`AnyOf` groups, keyword detections, items without a field and
/// items carrying an `exists` check all make the detection impure.
/// `found_positive_substring` is set to `true` as soon as a substring matcher
/// is visited; it is never cleared here.
fn detection_is_pure_positive_substring(
    detection: &CompiledDetection,
    found_positive_substring: &mut bool,
) -> bool {
    match detection {
        CompiledDetection::Keywords(_) => false,
        CompiledDetection::AnyOf(subs) => {
            !subs.is_empty()
                && subs.iter().all(|nested| {
                    detection_is_pure_positive_substring(nested, found_positive_substring)
                })
        }
        CompiledDetection::AllOf(items) => {
            !items.is_empty()
                && items.iter().all(|item| {
                    item.field.is_some()
                        && item.exists.is_none()
                        && matcher_is_pure_positive_substring(
                            &item.matcher,
                            found_positive_substring,
                        )
                })
        }
    }
}
/// Returns `true` when `matcher` consists only of positive substring matchers
/// whose needles are all non-empty.
///
/// A matcher with an empty needle is rejected: an empty substring occurs in
/// every string, but empty needles are not added to the cross-rule index, so
/// the index could never report a hit for it. Treating such a rule as
/// prunable would skip it on events it actually matches.
///
/// `found_positive_substring` is set to `true` when an accepted substring
/// matcher is visited; it is never cleared here. Empty `AnyOf`, `AllOf` and
/// `CaseInsensitiveGroup` containers, and every other matcher kind, yield
/// `false`.
fn matcher_is_pure_positive_substring(
    matcher: &CompiledMatcher,
    found_positive_substring: &mut bool,
) -> bool {
    match matcher {
        CompiledMatcher::Contains { value, .. }
        | CompiledMatcher::StartsWith { value, .. }
        | CompiledMatcher::EndsWith { value, .. } => {
            if value.is_empty() {
                // Matches everything but has no pattern in the index.
                return false;
            }
            *found_positive_substring = true;
            true
        }
        CompiledMatcher::AhoCorasickSet { needles, .. } => {
            for needle in needles {
                if needle.is_empty() {
                    // One empty needle makes the whole set match everything.
                    return false;
                }
            }
            *found_positive_substring = true;
            true
        }
        CompiledMatcher::AnyOf(children) | CompiledMatcher::AllOf(children) => {
            !children.is_empty()
                && children
                    .iter()
                    .all(|c| matcher_is_pure_positive_substring(c, found_positive_substring))
        }
        CompiledMatcher::CaseInsensitiveGroup { children, .. } => {
            !children.is_empty()
                && children
                    .iter()
                    .all(|c| matcher_is_pure_positive_substring(c, found_positive_substring))
        }
        _ => false,
    }
}
/// Returns `true` when the condition expression tree contains no `Not` node.
///
/// Identifiers and selectors are leaves and always pass; `And`/`Or` pass when
/// every operand passes.
fn condition_is_negation_free(cond: &ConditionExpr) -> bool {
    match cond {
        ConditionExpr::Identifier(_) | ConditionExpr::Selector { .. } => true,
        ConditionExpr::And(parts) | ConditionExpr::Or(parts) => {
            !parts.iter().any(|part| !condition_is_negation_free(part))
        }
        ConditionExpr::Not(_) => false,
    }
}
// Unit tests for the cross-rule index and the prunability classifier. Each
// test compiles Sigma YAML through the engine and inspects the result.
#[cfg(test)]
mod tests {
use super::*;
use crate::Engine;
use crate::event::JsonEvent;
use rsigma_parser::parse_sigma_yaml;
use serde_json::json;
// Parses `yaml` and loads the resulting collection into a fresh engine.
// Panics (via `unwrap`) if parsing or loading fails.
fn engine_from(yaml: &str) -> Engine {
let collection = parse_sigma_yaml(yaml).unwrap();
let mut engine = Engine::new();
engine.add_collection(&collection).unwrap();
engine
}
// Builds an engine from `yaml` together with an index over its rules.
fn build_index(yaml: &str) -> (Engine, CrossRuleAcIndex) {
let engine = engine_from(yaml);
let index = CrossRuleAcIndex::build(engine.rules());
(engine, index)
}
// A rule with only an exact-value match contributes no needles.
#[test]
fn empty_when_no_substring_patterns() {
let yaml = r#"
title: Exact Only
logsource:
product: windows
detection:
selection:
EventType: 'login'
condition: selection
"#;
let (_, index) = build_index(yaml);
assert!(index.is_empty());
}
// Several `contains` values on one field yield exactly one per-field automaton.
#[test]
fn populates_per_field_automaton() {
let yaml = r#"
title: Contains Heavy
logsource:
product: windows
detection:
selection:
CommandLine|contains:
- 'whoami'
- 'mimikatz'
- 'powershell'
condition: selection
"#;
let (_, index) = build_index(yaml);
assert_eq!(index.field_count(), 1);
}
// Only the rule whose needle occurs in the event is marked.
#[test]
fn marks_hits_for_matching_rule() {
let yaml = r#"
title: Whoami Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
---
title: Mimikatz Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'mimikatz'
condition: selection
"#;
let (engine, index) = build_index(yaml);
let mut hits = vec![false; engine.rule_count()];
let ev = json!({"CommandLine": "execute whoami /all"});
index.mark_hits(&JsonEvent::borrow(&ev), &mut hits);
assert!(hits[0], "first rule should hit on 'whoami'");
assert!(!hits[1], "second rule should not hit");
}
// An upper-case occurrence in the event still hits a lower-case needle.
#[test]
fn marks_hits_case_insensitive() {
let yaml = r#"
title: Whoami Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
"#;
let (engine, index) = build_index(yaml);
let mut hits = vec![false; engine.rule_count()];
let ev = json!({"CommandLine": "execute WHOAMI /all"});
index.mark_hits(&JsonEvent::borrow(&ev), &mut hits);
assert!(hits[0], "haystack lowering must match upper-case input");
}
// A negated substring match must not add a needle to the index.
#[test]
fn negated_substring_excluded_from_index() {
let yaml = r#"
title: Negated
logsource:
product: windows
detection:
selection:
CommandLine|contains|not: 'whoami'
condition: selection
"#;
let (_, index) = build_index(yaml);
assert!(index.is_empty());
}
// Eight `contains` values on one field; one of them occurs in the event.
// NOTE(review): presumably this many values makes the compiler emit an
// `AhoCorasickSet` matcher — confirm the threshold against the compiler.
#[test]
fn ahocorasick_needles_indexed() {
let yaml = r#"
title: AC Rule
logsource:
product: windows
detection:
selection:
CommandLine|contains:
- 'mimikatz'
- 'powershell'
- 'rundll32'
- 'regsvr32'
- 'certutil'
- 'bitsadmin'
- 'mshta'
- 'wscript'
condition: selection
"#;
let (engine, index) = build_index(yaml);
let mut hits = vec![false; engine.rule_count()];
let ev = json!({"CommandLine": "rundll32.exe foo"});
index.mark_hits(&JsonEvent::borrow(&ev), &mut hits);
assert!(hits[0]);
}
// Two rules sharing the same needle are both marked by one occurrence.
#[test]
fn shared_pattern_marks_multiple_rules() {
let yaml = r#"
title: Rule A
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
---
title: Rule B
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
"#;
let (engine, index) = build_index(yaml);
let mut hits = vec![false; engine.rule_count()];
let ev = json!({"CommandLine": "whoami /all"});
index.mark_hits(&JsonEvent::borrow(&ev), &mut hits);
assert!(hits[0] && hits[1]);
}
// A rule made of a single `contains` match is prunable.
#[test]
fn ac_prunable_pure_substring_rule() {
let yaml = r#"
title: Pure Contains
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
condition: selection
"#;
let engine = engine_from(yaml);
assert!(rule_is_ac_prunable(&engine.rules()[0]));
}
// Mixing an exact match with a substring match makes the rule non-prunable.
#[test]
fn ac_prunable_rejects_mixed_exact_and_substring() {
let yaml = r#"
title: Mixed
logsource:
product: windows
detection:
selection:
EventType: 'process_create'
CommandLine|contains: 'whoami'
condition: selection
"#;
let engine = engine_from(yaml);
assert!(!rule_is_ac_prunable(&engine.rules()[0]));
}
// A `not` in the condition makes the rule non-prunable.
#[test]
fn ac_prunable_rejects_negation_in_condition() {
let yaml = r#"
title: Negated Condition
logsource:
product: windows
detection:
selection:
CommandLine|contains: 'whoami'
other:
CommandLine|contains: 'admin'
condition: selection and not other
"#;
let engine = engine_from(yaml);
assert!(!rule_is_ac_prunable(&engine.rules()[0]));
}
// Keyword detections are never prunable.
#[test]
fn ac_prunable_rejects_keywords() {
let yaml = r#"
title: Keyword Only
logsource:
product: windows
detection:
keywords:
- 'suspicious'
- 'malware'
condition: keywords
"#;
let engine = engine_from(yaml);
assert!(!rule_is_ac_prunable(&engine.rules()[0]));
}
// A list of `contains` values on one field is still prunable.
#[test]
fn ac_prunable_accepts_anyof_substring_values() {
let yaml = r#"
title: AnyOf Substrings
logsource:
product: windows
detection:
selection:
CommandLine|contains:
- 'whoami'
- 'mimikatz'
- 'powershell'
condition: selection
"#;
let engine = engine_from(yaml);
assert!(rule_is_ac_prunable(&engine.rules()[0]));
}
// Generates MAX_PATTERNS_PER_FIELD + 5 rules, each with a unique needle on
// the same field, and expects the over-cap field to be left out of the index.
#[test]
fn cap_drops_overflowing_field() {
let mut yaml = String::new();
let n = MAX_PATTERNS_PER_FIELD + 5;
for i in 0..n {
yaml.push_str(&format!(
"title: R{i}\n\
id: r-{i:08}\n\
logsource:\n\
\x20 product: windows\n\
detection:\n\
\x20 selection:\n\
\x20 CommandLine|contains: 'pat-{i:08}'\n\
\x20 condition: selection\n\
---\n",
));
}
let engine = engine_from(&yaml);
let index = CrossRuleAcIndex::build(engine.rules());
assert!(index.is_empty());
}
}