rsigma_eval/matcher/mod.rs
1//! Compiled matchers for zero-allocation hot-path evaluation.
2//!
3//! Each `CompiledMatcher` variant is pre-compiled at rule load time.
4//! At evaluation time, `matches()` performs the comparison against an
5//! [`EventValue`](crate::event::EventValue) from the event with no
6//! dynamic dispatch or allocation.
7
8mod helpers;
9mod matching;
10
11pub use helpers::{ascii_lowercase_cow, parse_expand_template, sigma_string_to_regex};
12
13use aho_corasick::AhoCorasick;
14use regex::{Regex, RegexSet};
15
16use crate::event::Event;
17use ipnet::IpNet;
18
/// A pre-compiled matcher for a single value comparison.
///
/// All string matchers store their values in the form needed for comparison
/// (Unicode-lowercased for case-insensitive). The `case_insensitive` flag
/// controls whether the input is lowercased before comparison.
///
/// NOTE(review): the `AhoCorasickSet` docs below say the haystack is lowered
/// via `ascii_lowercase_cow`, whereas this header says "Unicode-lowercased".
/// Confirm which lowering the stored values and the hot path actually use.
///
/// Variants are grouped by section comments: string, network, numeric,
/// special, expand, timestamp, negation and composite matchers.
#[derive(Debug, Clone)]
pub enum CompiledMatcher {
    // -- String matchers --
    /// Exact string equality.
    Exact {
        /// String to compare against, stored in the form needed for
        /// comparison (see the enum-level docs).
        value: String,
        /// Whether the input is lowercased before comparison.
        case_insensitive: bool,
    },
    /// Substring containment.
    Contains {
        /// Substring to look for, stored in the form needed for comparison.
        value: String,
        /// Whether the input is lowercased before comparison.
        case_insensitive: bool,
    },
    /// String starts with prefix.
    StartsWith {
        /// Prefix to test for, stored in the form needed for comparison.
        value: String,
        /// Whether the input is lowercased before comparison.
        case_insensitive: bool,
    },
    /// String ends with suffix.
    EndsWith {
        /// Suffix to test for, stored in the form needed for comparison.
        value: String,
        /// Whether the input is lowercased before comparison.
        case_insensitive: bool,
    },
    /// Compiled regex pattern (flags baked in at compile time).
    Regex(Regex),

    /// Multi-pattern substring match via Aho-Corasick automaton.
    ///
    /// Built by the optimizer when an `AnyOf` group contains
    /// `AHO_CORASICK_THRESHOLD` or more plain `Contains` matchers with the
    /// same case sensitivity. Replaces the sequential O(N * haystack_len)
    /// scan of `AnyOf([Contains, ...])` with a single linear pass.
    ///
    /// **Invariant**: this variant only encodes `AnyOf` (OR) semantics.
    /// `AllOf(Contains)` (`|all` modifier) MUST NOT be collapsed into this
    /// variant - the optimizer enforces this.
    ///
    /// **Case insensitivity**: when `case_insensitive` is true, needles are
    /// stored pre-lowered (matching the `Contains` invariant) and the hot
    /// path lowers the haystack via [`ascii_lowercase_cow`] before searching.
    /// The `AhoCorasick` automaton itself is built case-sensitively.
    AhoCorasickSet {
        /// Automaton built from `needles`.
        automaton: AhoCorasick,
        /// Whether the haystack is lowered before searching.
        case_insensitive: bool,
        /// Pre-lowered needles in the same order they were fed to
        /// [`AhoCorasick::new`]. Retained so downstream consumers (e.g. the
        /// engine's per-field bloom builder) can recover the pattern set
        /// without parsing the automaton's internal state.
        needles: Vec<String>,
    },

    /// Multi-pattern regex match via [`regex::RegexSet`].
    ///
    /// Built by the optimizer when an `AnyOf` group contains
    /// `REGEX_SET_THRESHOLD` or more individual `Regex` matchers. Compiles
    /// every pattern into a single combined DFA so one traversal of the
    /// haystack tests all patterns at once.
    ///
    /// **Pattern reconstruction**: the optimizer rebuilds the set from each
    /// matcher's [`Regex::as_str`], which preserves any inline flags the
    /// compiler inlined (e.g. `(?i)`, `(?ims)`). This relies on the eval
    /// crate's regex builder always inlining flags into the pattern string
    /// rather than configuring them via `RegexBuilder`. A unit test guards
    /// against future drift in that contract.
    ///
    /// **Fields**: `set` is the combined pattern set; `mode` is the
    /// [`GroupMode`] reduction applied to it.
    /// NOTE(review): the text above only describes construction from `AnyOf`
    /// groups - confirm when `GroupMode::All` is produced for this variant.
    RegexSetMatch { set: RegexSet, mode: GroupMode },

    // -- Network --
    /// CIDR network match for IP addresses.
    Cidr(IpNet),

    // -- Numeric --
    /// Numeric equality.
    NumericEq(f64),
    /// Numeric greater-than.
    NumericGt(f64),
    /// Numeric greater-than-or-equal.
    NumericGte(f64),
    /// Numeric less-than.
    NumericLt(f64),
    /// Numeric less-than-or-equal.
    NumericLte(f64),

    // -- Special --
    /// Field existence check. `true` = field must exist, `false` = must not exist.
    Exists(bool),
    /// Compare against another field's value.
    FieldRef {
        /// Name of the other field whose value is compared against.
        field: String,
        /// Case-insensitivity flag for the comparison (see the enum-level docs).
        case_insensitive: bool,
    },
    /// Match null / missing values.
    Null,
    /// Boolean equality.
    BoolEq(bool),

    // -- Expand --
    /// Placeholder expansion: `%fieldname%` is resolved from the event at match time.
    Expand {
        /// Template as an ordered list of literal and placeholder parts.
        template: Vec<ExpandPart>,
        /// Case-insensitivity flag for the comparison (see the enum-level docs).
        case_insensitive: bool,
    },

    // -- Timestamp --
    /// Extract a time component from a timestamp field value and match it.
    TimestampPart {
        /// Which component of the timestamp to extract.
        part: TimePart,
        /// Matcher applied to the extracted component.
        inner: Box<CompiledMatcher>,
    },

    // -- Negation --
    /// Negated matcher: matches if the inner matcher does NOT match.
    Not(Box<CompiledMatcher>),

    // -- Composite --
    /// Match if ANY child matches (OR).
    AnyOf(Vec<CompiledMatcher>),
    /// Match if ALL children match (AND).
    AllOf(Vec<CompiledMatcher>),

    /// A composite of case-insensitive string matchers that lowers the haystack
    /// once before dispatching to children.
    ///
    /// Built by the optimizer when an `AnyOf` or `AllOf` group is composed
    /// entirely of case-insensitive string matchers (`Contains`, `StartsWith`,
    /// `EndsWith`, `Exact`, `AhoCorasickSet`, plus regexes that carry the
    /// `(?i)` flag, plus `Not` / nested `AnyOf` / `AllOf` whose every leaf
    /// satisfies these rules).
    ///
    /// **Invariant**: every child must be pre-lowerable. The optimizer's
    /// `is_pre_lowerable` validator enforces this; `matches_pre_lowered`
    /// `debug_assert!`s on violation.
    ///
    /// **Why this exists**: a mixed `AnyOf([Contains, StartsWith, EndsWith])`
    /// previously called `s.to_lowercase()` once per child. Pre-lowering the
    /// haystack a single time and dispatching to a CI-aware match path
    /// eliminates the redundant allocations.
    CaseInsensitiveGroup {
        /// Child matchers; each must satisfy the pre-lowerable invariant above.
        children: Vec<CompiledMatcher>,
        /// Whether one matching child suffices (`Any`) or all must match (`All`).
        mode: GroupMode,
    },
}
165
/// How a composite matcher folds the results of its members into one verdict.
///
/// Carried by the `mode` field of `CompiledMatcher::CaseInsensitiveGroup` and
/// `CompiledMatcher::RegexSetMatch`. `Any` is OR semantics, `All` is AND
/// semantics.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GroupMode {
    /// OR reduction (`AnyOf` semantics): at least one child must match.
    Any,
    /// AND reduction (`AllOf` semantics): every child must match.
    All,
}
178
/// One segment of a parsed expand template.
///
/// A template is an ordered sequence of these segments (see the `template`
/// field of `CompiledMatcher::Expand`).
#[derive(Clone, Debug)]
pub enum ExpandPart {
    /// Text that appears in the template verbatim.
    Literal(String),
    /// Name of a field referenced as `%name%`; the payload is the text
    /// between the `%` delimiters.
    Placeholder(String),
}
187
/// Selects the component of a timestamp that should be extracted.
///
/// Used by the `part` field of `CompiledMatcher::TimestampPart`.
#[derive(Clone, Copy, Debug)]
pub enum TimePart {
    /// The minute component.
    Minute,
    /// The hour component.
    Hour,
    /// The day component.
    Day,
    /// The week component.
    /// NOTE(review): the numbering convention (e.g. ISO week) is decided by
    /// the extraction code, which is not visible here - confirm.
    Week,
    /// The month component.
    Month,
    /// The year component.
    Year,
}
198
199impl CompiledMatcher {
200 /// Check if this matcher matches any string value in the event.
201 /// Used for keyword detection (field-less matching).
202 ///
203 /// Avoids allocating a `Vec` of all strings and a `String` per value by
204 /// using `matches_str` with a short-circuiting traversal.
205 #[inline]
206 pub fn matches_keyword(&self, event: &impl Event) -> bool {
207 event.any_string_value(&|s| self.matches_str(s))
208 }
209}