rsigma_eval/matcher/mod.rs
1//! Compiled matchers for zero-allocation hot-path evaluation.
2//!
3//! Each `CompiledMatcher` variant is pre-compiled at rule load time.
4//! At evaluation time, `matches()` performs the comparison against an
5//! [`EventValue`] from the event with no dynamic dispatch or allocation.
6
7mod helpers;
8mod matching;
9
10pub use helpers::{ascii_lowercase_cow, parse_expand_template, sigma_string_to_regex};
11
12use aho_corasick::AhoCorasick;
13use regex::{Regex, RegexSet};
14
15use crate::event::Event;
16use ipnet::IpNet;
17
18/// A pre-compiled matcher for a single value comparison.
19///
20/// All string matchers store their values in the form needed for comparison
21/// (Unicode-lowercased for case-insensitive). The `case_insensitive` flag
22/// controls whether the input is lowercased before comparison.
23#[derive(Debug, Clone)]
24pub enum CompiledMatcher {
25 // -- String matchers --
26 /// Exact string equality.
27 Exact {
28 value: String,
29 case_insensitive: bool,
30 },
31 /// Substring containment.
32 Contains {
33 value: String,
34 case_insensitive: bool,
35 },
36 /// String starts with prefix.
37 StartsWith {
38 value: String,
39 case_insensitive: bool,
40 },
41 /// String ends with suffix.
42 EndsWith {
43 value: String,
44 case_insensitive: bool,
45 },
46 /// Compiled regex pattern (flags baked in at compile time).
47 Regex(Regex),
48
49 /// Multi-pattern substring match via Aho-Corasick automaton.
50 ///
51 /// Built by the optimizer when an `AnyOf` group contains
52 /// `AHO_CORASICK_THRESHOLD` or more plain `Contains` matchers with the
53 /// same case sensitivity. Replaces the sequential O(N * haystack_len)
54 /// scan of `AnyOf([Contains, ...])` with a single linear pass.
55 ///
56 /// **Invariant**: this variant only encodes `AnyOf` (OR) semantics.
57 /// `AllOf(Contains)` (`|all` modifier) MUST NOT be collapsed into this
58 /// variant - the optimizer enforces this.
59 ///
60 /// **Case insensitivity**: when `case_insensitive` is true, needles are
61 /// stored pre-lowered (matching the `Contains` invariant) and the hot
62 /// path lowers the haystack via [`ascii_lowercase_cow`] before searching.
63 /// The `AhoCorasick` automaton itself is built case-sensitively.
64 AhoCorasickSet {
65 automaton: AhoCorasick,
66 case_insensitive: bool,
67 /// Pre-lowered needles in the same order they were fed to
68 /// [`AhoCorasick::new`]. Retained so downstream consumers (e.g. the
69 /// engine's per-field bloom builder) can recover the pattern set
70 /// without parsing the automaton's internal state.
71 needles: Vec<String>,
72 },
73
74 /// Multi-pattern regex match via [`regex::RegexSet`].
75 ///
76 /// Built by the optimizer when an `AnyOf` group contains
77 /// `REGEX_SET_THRESHOLD` or more individual `Regex` matchers. Compiles
78 /// every pattern into a single combined DFA so one traversal of the
79 /// haystack tests all patterns at once.
80 ///
81 /// **Pattern reconstruction**: the optimizer rebuilds the set from each
82 /// matcher's [`Regex::as_str`], which preserves any inline flags the
83 /// compiler inlined (e.g. `(?i)`, `(?ims)`). This relies on the eval
84 /// crate's regex builder always inlining flags into the pattern string
85 /// rather than configuring them via `RegexBuilder`. A unit test guards
86 /// against future drift in that contract.
87 RegexSetMatch { set: RegexSet, mode: GroupMode },
88
89 // -- Network --
90 /// CIDR network match for IP addresses.
91 Cidr(IpNet),
92
93 // -- Numeric --
94 /// Numeric equality.
95 NumericEq(f64),
96 /// Numeric greater-than.
97 NumericGt(f64),
98 /// Numeric greater-than-or-equal.
99 NumericGte(f64),
100 /// Numeric less-than.
101 NumericLt(f64),
102 /// Numeric less-than-or-equal.
103 NumericLte(f64),
104
105 // -- Special --
106 /// Field existence check. `true` = field must exist, `false` = must not exist.
107 Exists(bool),
108 /// Compare against another field's value.
109 FieldRef {
110 field: String,
111 case_insensitive: bool,
112 },
113 /// Match null / missing values.
114 Null,
115 /// Boolean equality.
116 BoolEq(bool),
117
118 // -- Expand --
119 /// Placeholder expansion: `%fieldname%` is resolved from the event at match time.
120 Expand {
121 template: Vec<ExpandPart>,
122 case_insensitive: bool,
123 },
124
125 // -- Timestamp --
126 /// Extract a time component from a timestamp field value and match it.
127 TimestampPart {
128 part: TimePart,
129 inner: Box<CompiledMatcher>,
130 },
131
132 // -- Negation --
133 /// Negated matcher: matches if the inner matcher does NOT match.
134 Not(Box<CompiledMatcher>),
135
136 // -- Composite --
137 /// Match if ANY child matches (OR).
138 AnyOf(Vec<CompiledMatcher>),
139 /// Match if ALL children match (AND).
140 AllOf(Vec<CompiledMatcher>),
141
142 /// A composite of case-insensitive string matchers that lowers the haystack
143 /// once before dispatching to children.
144 ///
145 /// Built by the optimizer when an `AnyOf` or `AllOf` group is composed
146 /// entirely of case-insensitive string matchers (`Contains`, `StartsWith`,
147 /// `EndsWith`, `Exact`, `AhoCorasickSet`, plus regexes that carry the
148 /// `(?i)` flag, plus `Not` / nested `AnyOf` / `AllOf` whose every leaf
149 /// satisfies these rules).
150 ///
151 /// **Invariant**: every child must be pre-lowerable. The optimizer's
152 /// `is_pre_lowerable` validator enforces this; `matches_pre_lowered`
153 /// `debug_assert!`s on violation.
154 ///
155 /// **Why this exists**: a mixed `AnyOf([Contains, StartsWith, EndsWith])`
156 /// previously called `s.to_lowercase()` once per child. Pre-lowering the
157 /// haystack a single time and dispatching to a CI-aware match path
158 /// eliminates the redundant allocations.
159 CaseInsensitiveGroup {
160 children: Vec<CompiledMatcher>,
161 mode: GroupMode,
162 },
163}
164
/// How a composite matcher reduces the results of its members.
///
/// `Any` is OR, `All` is AND. Both the `CaseInsensitiveGroup` and
/// `RegexSetMatch` variants of `CompiledMatcher` carry a `GroupMode` to
/// record whether one match is enough or every member has to match.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GroupMode {
    /// A single matching child is sufficient (`AnyOf` semantics).
    Any,
    /// All children are required to match (`AllOf` semantics).
    All,
}
177
/// One segment of an expand template.
///
/// A template is a sequence of these segments (see the `template` field of
/// `CompiledMatcher::Expand`).
#[derive(Clone, Debug)]
pub enum ExpandPart {
    /// Text that is taken literally.
    Literal(String),
    /// The name of a placeholder field, i.e. the text found between `%`
    /// delimiters.
    Placeholder(String),
}
186
/// Which time component to extract from a timestamp.
///
/// Used by `CompiledMatcher::TimestampPart` to select the component that is
/// handed to the inner matcher.
///
/// Besides `Debug`/`Clone`/`Copy`, this field-less enum also derives
/// `PartialEq`, `Eq` and `Hash` (as the sibling `GroupMode` does for
/// equality) so values can be compared with `==`, asserted on in tests and
/// used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TimePart {
    /// The minute component.
    Minute,
    /// The hour component.
    Hour,
    /// The day component.
    Day,
    /// The week component.
    Week,
    /// The month component.
    Month,
    /// The year component.
    Year,
}
197
198impl CompiledMatcher {
199 /// Check if this matcher matches any string value in the event.
200 /// Used for keyword detection (field-less matching).
201 ///
202 /// Avoids allocating a `Vec` of all strings and a `String` per value by
203 /// using `matches_str` with a short-circuiting traversal.
204 #[inline]
205 pub fn matches_keyword(&self, event: &impl Event) -> bool {
206 event.any_string_value(&|s| self.matches_str(s))
207 }
208}