Skip to main content

rsigma_eval/matcher/
mod.rs

//! Compiled matchers for zero-allocation hot-path evaluation.
//!
//! Each `CompiledMatcher` variant is pre-compiled at rule load time.
//! At evaluation time, `matches()` performs the comparison against an
//! [`EventValue`](crate::event::EventValue) from the event with no
//! dynamic dispatch or allocation.
7
8mod helpers;
9mod matching;
10
11pub use helpers::{ascii_lowercase_cow, parse_expand_template, sigma_string_to_regex};
12
13use aho_corasick::AhoCorasick;
14use regex::{Regex, RegexSet};
15
16use crate::event::Event;
17use ipnet::IpNet;
18
19/// A pre-compiled matcher for a single value comparison.
20///
21/// All string matchers store their values in the form needed for comparison
22/// (Unicode-lowercased for case-insensitive). The `case_insensitive` flag
23/// controls whether the input is lowercased before comparison.
24#[derive(Debug, Clone)]
25pub enum CompiledMatcher {
26    // -- String matchers --
27    /// Exact string equality.
28    Exact {
29        value: String,
30        case_insensitive: bool,
31    },
32    /// Substring containment.
33    Contains {
34        value: String,
35        case_insensitive: bool,
36    },
37    /// String starts with prefix.
38    StartsWith {
39        value: String,
40        case_insensitive: bool,
41    },
42    /// String ends with suffix.
43    EndsWith {
44        value: String,
45        case_insensitive: bool,
46    },
47    /// Compiled regex pattern (flags baked in at compile time).
48    Regex(Regex),
49
50    /// Multi-pattern substring match via Aho-Corasick automaton.
51    ///
52    /// Built by the optimizer when an `AnyOf` group contains
53    /// `AHO_CORASICK_THRESHOLD` or more plain `Contains` matchers with the
54    /// same case sensitivity. Replaces the sequential O(N * haystack_len)
55    /// scan of `AnyOf([Contains, ...])` with a single linear pass.
56    ///
57    /// **Invariant**: this variant only encodes `AnyOf` (OR) semantics.
58    /// `AllOf(Contains)` (`|all` modifier) MUST NOT be collapsed into this
59    /// variant - the optimizer enforces this.
60    ///
61    /// **Case insensitivity**: when `case_insensitive` is true, needles are
62    /// stored pre-lowered (matching the `Contains` invariant) and the hot
63    /// path lowers the haystack via [`ascii_lowercase_cow`] before searching.
64    /// The `AhoCorasick` automaton itself is built case-sensitively.
65    AhoCorasickSet {
66        automaton: AhoCorasick,
67        case_insensitive: bool,
68        /// Pre-lowered needles in the same order they were fed to
69        /// [`AhoCorasick::new`]. Retained so downstream consumers (e.g. the
70        /// engine's per-field bloom builder) can recover the pattern set
71        /// without parsing the automaton's internal state.
72        needles: Vec<String>,
73    },
74
75    /// Multi-pattern regex match via [`regex::RegexSet`].
76    ///
77    /// Built by the optimizer when an `AnyOf` group contains
78    /// `REGEX_SET_THRESHOLD` or more individual `Regex` matchers. Compiles
79    /// every pattern into a single combined DFA so one traversal of the
80    /// haystack tests all patterns at once.
81    ///
82    /// **Pattern reconstruction**: the optimizer rebuilds the set from each
83    /// matcher's [`Regex::as_str`], which preserves any inline flags the
84    /// compiler inlined (e.g. `(?i)`, `(?ims)`). This relies on the eval
85    /// crate's regex builder always inlining flags into the pattern string
86    /// rather than configuring them via `RegexBuilder`. A unit test guards
87    /// against future drift in that contract.
88    RegexSetMatch { set: RegexSet, mode: GroupMode },
89
90    // -- Network --
91    /// CIDR network match for IP addresses.
92    Cidr(IpNet),
93
94    // -- Numeric --
95    /// Numeric equality.
96    NumericEq(f64),
97    /// Numeric greater-than.
98    NumericGt(f64),
99    /// Numeric greater-than-or-equal.
100    NumericGte(f64),
101    /// Numeric less-than.
102    NumericLt(f64),
103    /// Numeric less-than-or-equal.
104    NumericLte(f64),
105
106    // -- Special --
107    /// Field existence check. `true` = field must exist, `false` = must not exist.
108    Exists(bool),
109    /// Compare against another field's value.
110    FieldRef {
111        field: String,
112        case_insensitive: bool,
113    },
114    /// Match null / missing values.
115    Null,
116    /// Boolean equality.
117    BoolEq(bool),
118
119    // -- Expand --
120    /// Placeholder expansion: `%fieldname%` is resolved from the event at match time.
121    Expand {
122        template: Vec<ExpandPart>,
123        case_insensitive: bool,
124    },
125
126    // -- Timestamp --
127    /// Extract a time component from a timestamp field value and match it.
128    TimestampPart {
129        part: TimePart,
130        inner: Box<CompiledMatcher>,
131    },
132
133    // -- Negation --
134    /// Negated matcher: matches if the inner matcher does NOT match.
135    Not(Box<CompiledMatcher>),
136
137    // -- Composite --
138    /// Match if ANY child matches (OR).
139    AnyOf(Vec<CompiledMatcher>),
140    /// Match if ALL children match (AND).
141    AllOf(Vec<CompiledMatcher>),
142
143    /// A composite of case-insensitive string matchers that lowers the haystack
144    /// once before dispatching to children.
145    ///
146    /// Built by the optimizer when an `AnyOf` or `AllOf` group is composed
147    /// entirely of case-insensitive string matchers (`Contains`, `StartsWith`,
148    /// `EndsWith`, `Exact`, `AhoCorasickSet`, plus regexes that carry the
149    /// `(?i)` flag, plus `Not` / nested `AnyOf` / `AllOf` whose every leaf
150    /// satisfies these rules).
151    ///
152    /// **Invariant**: every child must be pre-lowerable. The optimizer's
153    /// `is_pre_lowerable` validator enforces this; `matches_pre_lowered`
154    /// `debug_assert!`s on violation.
155    ///
156    /// **Why this exists**: a mixed `AnyOf([Contains, StartsWith, EndsWith])`
157    /// previously called `s.to_lowercase()` once per child. Pre-lowering the
158    /// haystack a single time and dispatching to a CI-aware match path
159    /// eliminates the redundant allocations.
160    CaseInsensitiveGroup {
161        children: Vec<CompiledMatcher>,
162        mode: GroupMode,
163    },
164}
165
/// Reduction mode for composite matchers.
///
/// `Any` corresponds to OR semantics; `All` to AND. Carried by both
/// `CompiledMatcher::CaseInsensitiveGroup` and `CompiledMatcher::RegexSetMatch`
/// to encode whether a single match suffices or every child/pattern must
/// match.
///
/// The type is a fieldless `Copy` enum, so it also derives `Hash` to be
/// usable as a map/set key alongside the existing equality traits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GroupMode {
    /// At least one child must match (`AnyOf` semantics).
    Any,
    /// Every child must match (`AllOf` semantics).
    All,
}
178
/// A part of an expand template.
///
/// A template is stored as a sequence of these parts (see the `template`
/// field of `CompiledMatcher::Expand`). Equality is derived so parsed
/// templates can be compared directly, e.g. in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpandPart {
    /// Literal text.
    Literal(String),
    /// A placeholder field name (between `%` delimiters).
    Placeholder(String),
}
187
/// Which time component to extract from a timestamp.
///
/// Used as the `part` selector of `CompiledMatcher::TimestampPart`. Being a
/// fieldless `Copy` enum, it derives the full set of comparison and hashing
/// traits so selectors can be compared and used as keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TimePart {
    /// The minute component.
    Minute,
    /// The hour component.
    Hour,
    /// The day component.
    ///
    /// NOTE(review): presumably day-of-month - confirm in the extraction code.
    Day,
    /// The week component.
    ///
    /// NOTE(review): the numbering scheme (e.g. ISO week) is not visible
    /// here - confirm in the extraction code.
    Week,
    /// The month component.
    Month,
    /// The year component.
    Year,
}
198
199impl CompiledMatcher {
200    /// Check if this matcher matches any string value in the event.
201    /// Used for keyword detection (field-less matching).
202    ///
203    /// Avoids allocating a `Vec` of all strings and a `String` per value by
204    /// using `matches_str` with a short-circuiting traversal.
205    #[inline]
206    pub fn matches_keyword(&self, event: &impl Event) -> bool {
207        event.any_string_value(&|s| self.matches_str(s))
208    }
209}