// rsigma_eval/matcher/mod.rs

//! Compiled matchers for zero-allocation hot-path evaluation.
//!
//! Each `CompiledMatcher` variant is pre-compiled at rule load time.
//! At evaluation time, `matches()` performs the comparison against an
//! [`EventValue`] from the event with no dynamic dispatch or allocation.

7mod helpers;
8mod matching;
9
10pub use helpers::{ascii_lowercase_cow, parse_expand_template, sigma_string_to_regex};
11
12use aho_corasick::AhoCorasick;
13use regex::{Regex, RegexSet};
14
15use crate::event::Event;
16use ipnet::IpNet;
17
18/// A pre-compiled matcher for a single value comparison.
19///
20/// All string matchers store their values in the form needed for comparison
21/// (Unicode-lowercased for case-insensitive). The `case_insensitive` flag
22/// controls whether the input is lowercased before comparison.
23#[derive(Debug, Clone)]
24pub enum CompiledMatcher {
25    // -- String matchers --
26    /// Exact string equality.
27    Exact {
28        value: String,
29        case_insensitive: bool,
30    },
31    /// Substring containment.
32    Contains {
33        value: String,
34        case_insensitive: bool,
35    },
36    /// String starts with prefix.
37    StartsWith {
38        value: String,
39        case_insensitive: bool,
40    },
41    /// String ends with suffix.
42    EndsWith {
43        value: String,
44        case_insensitive: bool,
45    },
46    /// Compiled regex pattern (flags baked in at compile time).
47    Regex(Regex),
48
49    /// Multi-pattern substring match via Aho-Corasick automaton.
50    ///
51    /// Built by the optimizer when an `AnyOf` group contains
52    /// `AHO_CORASICK_THRESHOLD` or more plain `Contains` matchers with the
53    /// same case sensitivity. Replaces the sequential O(N * haystack_len)
54    /// scan of `AnyOf([Contains, ...])` with a single linear pass.
55    ///
56    /// **Invariant**: this variant only encodes `AnyOf` (OR) semantics.
57    /// `AllOf(Contains)` (`|all` modifier) MUST NOT be collapsed into this
58    /// variant - the optimizer enforces this.
59    ///
60    /// **Case insensitivity**: when `case_insensitive` is true, needles are
61    /// stored pre-lowered (matching the `Contains` invariant) and the hot
62    /// path lowers the haystack via [`ascii_lowercase_cow`] before searching.
63    /// The `AhoCorasick` automaton itself is built case-sensitively.
64    AhoCorasickSet {
65        automaton: AhoCorasick,
66        case_insensitive: bool,
67        /// Pre-lowered needles in the same order they were fed to
68        /// [`AhoCorasick::new`]. Retained so downstream consumers (e.g. the
69        /// engine's per-field bloom builder) can recover the pattern set
70        /// without parsing the automaton's internal state.
71        needles: Vec<String>,
72    },
73
74    /// Multi-pattern regex match via [`regex::RegexSet`].
75    ///
76    /// Built by the optimizer when an `AnyOf` group contains
77    /// `REGEX_SET_THRESHOLD` or more individual `Regex` matchers. Compiles
78    /// every pattern into a single combined DFA so one traversal of the
79    /// haystack tests all patterns at once.
80    ///
81    /// **Pattern reconstruction**: the optimizer rebuilds the set from each
82    /// matcher's [`Regex::as_str`], which preserves any inline flags the
83    /// compiler inlined (e.g. `(?i)`, `(?ims)`). This relies on the eval
84    /// crate's regex builder always inlining flags into the pattern string
85    /// rather than configuring them via `RegexBuilder`. A unit test guards
86    /// against future drift in that contract.
87    RegexSetMatch { set: RegexSet, mode: GroupMode },
88
89    // -- Network --
90    /// CIDR network match for IP addresses.
91    Cidr(IpNet),
92
93    // -- Numeric --
94    /// Numeric equality.
95    NumericEq(f64),
96    /// Numeric greater-than.
97    NumericGt(f64),
98    /// Numeric greater-than-or-equal.
99    NumericGte(f64),
100    /// Numeric less-than.
101    NumericLt(f64),
102    /// Numeric less-than-or-equal.
103    NumericLte(f64),
104
105    // -- Special --
106    /// Field existence check. `true` = field must exist, `false` = must not exist.
107    Exists(bool),
108    /// Compare against another field's value.
109    FieldRef {
110        field: String,
111        case_insensitive: bool,
112    },
113    /// Match null / missing values.
114    Null,
115    /// Boolean equality.
116    BoolEq(bool),
117
118    // -- Expand --
119    /// Placeholder expansion: `%fieldname%` is resolved from the event at match time.
120    Expand {
121        template: Vec<ExpandPart>,
122        case_insensitive: bool,
123    },
124
125    // -- Timestamp --
126    /// Extract a time component from a timestamp field value and match it.
127    TimestampPart {
128        part: TimePart,
129        inner: Box<CompiledMatcher>,
130    },
131
132    // -- Negation --
133    /// Negated matcher: matches if the inner matcher does NOT match.
134    Not(Box<CompiledMatcher>),
135
136    // -- Composite --
137    /// Match if ANY child matches (OR).
138    AnyOf(Vec<CompiledMatcher>),
139    /// Match if ALL children match (AND).
140    AllOf(Vec<CompiledMatcher>),
141
142    /// A composite of case-insensitive string matchers that lowers the haystack
143    /// once before dispatching to children.
144    ///
145    /// Built by the optimizer when an `AnyOf` or `AllOf` group is composed
146    /// entirely of case-insensitive string matchers (`Contains`, `StartsWith`,
147    /// `EndsWith`, `Exact`, `AhoCorasickSet`, plus regexes that carry the
148    /// `(?i)` flag, plus `Not` / nested `AnyOf` / `AllOf` whose every leaf
149    /// satisfies these rules).
150    ///
151    /// **Invariant**: every child must be pre-lowerable. The optimizer's
152    /// `is_pre_lowerable` validator enforces this; `matches_pre_lowered`
153    /// `debug_assert!`s on violation.
154    ///
155    /// **Why this exists**: a mixed `AnyOf([Contains, StartsWith, EndsWith])`
156    /// previously called `s.to_lowercase()` once per child. Pre-lowering the
157    /// haystack a single time and dispatching to a CI-aware match path
158    /// eliminates the redundant allocations.
159    CaseInsensitiveGroup {
160        children: Vec<CompiledMatcher>,
161        mode: GroupMode,
162    },
163}
164
/// Reduction mode for composite matchers.
///
/// `Any` corresponds to OR semantics; `All` to AND. Carried by the
/// `CaseInsensitiveGroup` and `RegexSetMatch` variants of `CompiledMatcher`
/// to encode whether a single match suffices or every child/pattern must
/// match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GroupMode {
    /// At least one child must match (`AnyOf` semantics).
    Any,
    /// Every child must match (`AllOf` semantics).
    All,
}

/// One segment of a parsed expand template.
///
/// A template is stored as a sequence of these parts (see the
/// `CompiledMatcher::Expand` variant's `template` field).
#[derive(Debug, Clone)]
pub enum ExpandPart {
    /// Literal text.
    Literal(String),
    /// A placeholder field name (between `%` delimiters).
    Placeholder(String),
}

/// Which time component to extract from a timestamp.
///
/// How each component is computed (time zone, week numbering, ...) is decided
/// by the extraction code, which is not part of this definition.
#[derive(Debug, Clone, Copy)]
pub enum TimePart {
    /// The minute component.
    Minute,
    /// The hour component.
    Hour,
    /// The day component.
    Day,
    /// The week component.
    Week,
    /// The month component.
    Month,
    /// The year component.
    Year,
}

198impl CompiledMatcher {
199    /// Check if this matcher matches any string value in the event.
200    /// Used for keyword detection (field-less matching).
201    ///
202    /// Avoids allocating a `Vec` of all strings and a `String` per value by
203    /// using `matches_str` with a short-circuiting traversal.
204    #[inline]
205    pub fn matches_keyword(&self, event: &impl Event) -> bool {
206        event.any_string_value(&|s| self.matches_str(s))
207    }
208}