Skip to main content

skill_veil_core/adapters/
regex_matcher.rs

1//! Pattern matcher implementation using the regex crate
2
3use crate::ports::{Captures, CompiledPattern, PatternError, PatternMatch, PatternMatcher};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7
8/// Hard upper bound on the compiled NFA size for a single regex pattern,
9/// in bytes. Mitigates ReDoS / state-explosion from user-supplied
10/// rule-pack patterns by bounding the work the regex crate will accept
11/// at compile time. The default in the `regex` crate is 10 MiB; we
12/// tighten to 8 MiB so genuinely-complex curated IOC patterns
13/// (`URL_PATTERN`, multi-MB IOC alternations) still compile while
14/// catastrophic patterns (`a{1000000}{2}`) are rejected.
15const REGEX_NFA_SIZE_LIMIT: usize = 8 * (1 << 20); // 8 MiB
16
17/// Hard upper bound on the lazy DFA cache for a single compiled regex.
18/// Bounds runtime memory growth on adversarial inputs in addition to
19/// `REGEX_NFA_SIZE_LIMIT`, which only bounds compile-time NFA size.
20const REGEX_DFA_SIZE_LIMIT: usize = 8 * (1 << 20); // 8 MiB
21
22/// Compile `pattern` with the size-bounded `RegexBuilder`. Centralising
23/// the construction in one helper means every code path — the cached
24/// trait-method calls and the `compile()` port — applies the same
25/// limits without forgetting one.
26fn build_bounded_regex(pattern: &str) -> Result<Regex, regex::Error> {
27    RegexBuilder::new(pattern)
28        .size_limit(REGEX_NFA_SIZE_LIMIT)
29        .dfa_size_limit(REGEX_DFA_SIZE_LIMIT)
30        .build()
31}
32
33/// Pattern matcher implementation using the regex crate.
34///
35/// # Compile-cache contract
36///
37/// Each unique `pattern: &str` argument MUST be compiled at most once
38/// per `RegexPatternMatcher` instance. Pre-fix the trait methods
39/// (`find_matches`, `is_match`, `captures_iter`) called `Regex::new`
40/// on every invocation, which on the rule-engine hot path recompiles
41/// the same regex hundreds of times per scan and, with adversarial
42/// rule packs, scales to a viable denial-of-service. The internal
43/// `cache` is a `Mutex<HashMap>` so concurrent scanners observe a
44/// single compiled `Arc<Regex>` per pattern; contention is minimal
45/// because the cache is consulted on the (rare) cold path only.
46#[derive(Debug, Default, Clone)]
47pub struct RegexPatternMatcher {
48    cache: Arc<Mutex<HashMap<String, Arc<Regex>>>>,
49}
50
51impl RegexPatternMatcher {
52    /// Create a new regex-based pattern matcher
53    #[must_use]
54    pub fn new() -> Self {
55        Self::default()
56    }
57
58    /// Look up `pattern` in the compile cache, compiling it on miss.
59    /// Returns `None` (and emits `tracing::warn`) when the pattern is
60    /// invalid OR exceeds [`REGEX_NFA_SIZE_LIMIT`] /
61    /// [`REGEX_DFA_SIZE_LIMIT`]; the trait methods then fall back to
62    /// returning empty results to keep the rest of the rule pass alive.
63    fn cached(&self, pattern: &str) -> Option<Arc<Regex>> {
64        // Recover from poison: a panicked thread holding the lock should
65        // not permanently degrade the cache from O(1) lookup to O(n)
66        // recompilation per call. The data inside is still valid —
67        // poison only signals that a thread panicked *while* holding
68        // the lock, not that the HashMap is corrupt.
69        //
70        // NOTE: We deliberately use `into_inner()` without clearing the
71        // poison flag. Rust's `Mutex` does not expose `clear_poison()`
72        // on the guard, and `std::sync::RecoverableError` is unstable.
73        // After recovery, subsequent lock acquisitions will also recover
74        // via `into_inner()`, which is correct but takes the slow path.
75        // A true poison-clear would require `parking_lot::Mutex` or the
76        // unstable `std::sync::RecoverableError` API.
77        let recover_poison = |e: std::sync::PoisonError<_>| e.into_inner();
78        // Quick read path: most requests hit an already-compiled entry.
79        if let Some(re) = self
80            .cache
81            .lock()
82            .unwrap_or_else(recover_poison)
83            .get(pattern)
84        {
85            return Some(Arc::clone(re));
86        }
87        // Cold path: compile through the bounded builder, then insert
88        // into the cache so subsequent calls hit the warm path. We
89        // re-acquire the lock here rather than holding it across the
90        // potentially-expensive compile so a stuck compile cannot
91        // serialise the entire matcher.
92        let re = match build_bounded_regex(pattern) {
93            Ok(re) => Arc::new(re),
94            Err(e) => {
95                tracing::warn!("Invalid or oversized regex pattern '{pattern}': {e}");
96                return None;
97            }
98        };
99        // If a concurrent compile inserted between our read and write,
100        // the existing entry wins — both compiles produce semantically
101        // identical `Regex` values, but reusing the existing Arc keeps
102        // reference counts predictable.
103        Some(Arc::clone(
104            self.cache
105                .lock()
106                .unwrap_or_else(recover_poison)
107                .entry(pattern.to_string())
108                .or_insert_with(|| Arc::clone(&re)),
109        ))
110    }
111}
112
113fn match_to_pattern(m: regex::Match<'_>) -> PatternMatch {
114    PatternMatch {
115        start: m.start(),
116        end: m.end(),
117        matched_text: m.as_str().to_string(),
118    }
119}
120
121fn captures_to_groups(caps: &regex::Captures<'_>) -> Captures {
122    let groups = caps
123        .iter()
124        .map(|opt| opt.map(match_to_pattern))
125        .collect::<Vec<_>>();
126    Captures::new(groups)
127}
128
129impl PatternMatcher for RegexPatternMatcher {
130    fn find_matches(&self, pattern: &str, text: &str) -> Vec<PatternMatch> {
131        match self.cached(pattern) {
132            Some(re) => re.find_iter(text).map(match_to_pattern).collect(),
133            None => Vec::new(),
134        }
135    }
136
137    fn is_match(&self, pattern: &str, text: &str) -> bool {
138        match self.cached(pattern) {
139            Some(re) => re.is_match(text),
140            None => false,
141        }
142    }
143
144    fn captures_iter(&self, pattern: &str, text: &str) -> Vec<Captures> {
145        match self.cached(pattern) {
146            Some(re) => re
147                .captures_iter(text)
148                .map(|c| captures_to_groups(&c))
149                .collect(),
150            None => Vec::new(),
151        }
152    }
153
154    fn compile(&self, pattern: &str) -> Result<CompiledPattern, PatternError> {
155        let re = Arc::new(
156            build_bounded_regex(pattern)
157                .map_err(|e| PatternError::InvalidPattern(e.to_string()))?,
158        );
159        let re_find = Arc::clone(&re);
160        let re_is_match = Arc::clone(&re);
161        let re_captures = re;
162
163        Ok(CompiledPattern::new(
164            Box::new(move |text: &str| re_find.find_iter(text).map(match_to_pattern).collect()),
165            Box::new(move |text: &str| re_is_match.is_match(text)),
166            Box::new(move |text: &str| {
167                re_captures
168                    .captures_iter(text)
169                    .map(|c| captures_to_groups(&c))
170                    .collect()
171            }),
172        ))
173    }
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179
180    /// # Contract
181    /// `find_matches` returns every regex hit in source order with byte
182    /// offsets and the matched substring preserved verbatim.
183    #[test]
184    fn find_matches_returns_every_hit_in_source_order() {
185        let matcher = RegexPatternMatcher::new();
186        let matches = matcher.find_matches(r"\d+", "abc 123 def 456");
187
188        assert_eq!(matches.len(), 2);
189        assert_eq!(matches[0].matched_text, "123");
190        assert_eq!(matches[1].matched_text, "456");
191    }
192
193    /// # Contract
194    /// `find_matches` returns an empty vector when the pattern does not
195    /// occur in the text — never a sentinel match.
196    #[test]
197    fn find_matches_returns_empty_when_pattern_absent() {
198        let matcher = RegexPatternMatcher::new();
199        let matches = matcher.find_matches(r"\d+", "no numbers here");
200
201        assert!(matches.is_empty());
202    }
203
204    /// # Contract
205    /// A `CompiledPattern` reuses one compilation across `find_matches`,
206    /// `is_match`, and `captures_iter`; all three answer consistently.
207    #[test]
208    fn compile_shares_state_across_three_operations() {
209        let matcher = RegexPatternMatcher::new();
210        let compiled = matcher.compile(r"hello\s+world").unwrap();
211
212        let matches = compiled.find_matches("say hello   world!");
213        assert_eq!(matches.len(), 1);
214        assert_eq!(matches[0].matched_text, "hello   world");
215
216        assert!(compiled.is_match("say hello   world!"));
217        assert!(!compiled.is_match("say goodbye"));
218
219        let caps = compiled.captures_iter("say hello   world!");
220        assert_eq!(caps.len(), 1);
221        assert_eq!(caps[0].get(0).unwrap().matched_text, "hello   world");
222    }
223
224    /// # Contract
225    /// `compile` surfaces invalid regex syntax as `PatternError`; it must
226    /// never panic so tests can drive validation via `is_err()`.
227    #[test]
228    fn compile_returns_err_for_invalid_regex_syntax() {
229        let matcher = RegexPatternMatcher::new();
230        let result = matcher.compile(r"[invalid");
231
232        assert!(result.is_err());
233    }
234
235    /// # Contract
236    ///
237    /// Repeated calls to a trait method with the same `pattern` MUST
238    /// reuse the same compiled `Regex` — no re-compilation per call.
239    /// Pre-fix `find_matches` / `is_match` / `captures_iter` invoked
240    /// `Regex::new` on every call, scaling to O(rules × calls × scans)
241    /// regex compilations and producing a viable DoS surface for
242    /// malicious rule packs. The cache stores compiled `Arc<Regex>`
243    /// keyed by the pattern string; this test pins the contract by
244    /// observing that the cache size grows by exactly one entry no
245    /// matter how many times the same pattern is matched.
246    #[test]
247    fn repeated_calls_with_same_pattern_use_cached_regex() {
248        let matcher = RegexPatternMatcher::new();
249        let pattern = r"\b(curl|wget)\b\s+https?://";
250        for _ in 0..16 {
251            matcher.find_matches(pattern, "run curl https://example.com/install.sh");
252            matcher.is_match(pattern, "fetch wget https://example.com/x");
253            matcher.captures_iter(pattern, "exec curl https://attacker/x");
254        }
255        let cache = matcher.cache.lock().expect("cache mutex poisoned");
256        assert_eq!(
257            cache.len(),
258            1,
259            "trait methods must reuse one cached compile per pattern; got {} entries",
260            cache.len()
261        );
262        assert!(cache.contains_key(pattern));
263    }
264
265    /// # Contract
266    ///
267    /// Different patterns produce distinct cache entries. Pins that the
268    /// cache key is the pattern string, not the matcher instance.
269    #[test]
270    fn cache_keys_each_unique_pattern_separately() {
271        let matcher = RegexPatternMatcher::new();
272        matcher.find_matches(r"\d+", "a 1 b");
273        matcher.find_matches(r"[a-z]+", "abc");
274        matcher.find_matches(r"\d+", "c 2 d");
275        let cache = matcher.cache.lock().expect("cache mutex poisoned");
276        assert_eq!(
277            cache.len(),
278            2,
279            "two distinct patterns must produce two cache entries; got {} entries",
280            cache.len()
281        );
282    }
283
284    /// # Contract
285    ///
286    /// `compile` and the trait methods MUST go through
287    /// `build_bounded_regex`, which applies `RegexBuilder::size_limit`.
288    /// Catastrophic patterns (`a{100000}{2}`) that exceed the bound MUST
289    /// surface as `PatternError` from `compile` and as an empty result
290    /// (with a tracing warning) from the trait methods. Pre-fix
291    /// `Regex::new` had no size limit configured, so a malicious rule
292    /// pack could ship a pattern that compiled but blew up the regex
293    /// state machine at match time.
294    #[test]
295    fn compile_rejects_oversized_patterns() {
296        let matcher = RegexPatternMatcher::new();
297        // `a{1000000}{2}` requests an automaton with ~10^12 states; the
298        // size_limit (1 MiB) rejects it well before that.
299        let pathological = "a{1000000}{2}";
300        let result = matcher.compile(pathological);
301        assert!(
302            result.is_err(),
303            "compile MUST reject oversized regexes via size_limit"
304        );
305        // Trait methods fall back to empty / false on the same input.
306        assert!(matcher.find_matches(pathological, "aaa").is_empty());
307        assert!(!matcher.is_match(pathological, "aaa"));
308    }
309
310    /// # Contract
311    /// `is_match` and `captures_iter` on the trait must agree with
312    /// `find_matches` on whether a pattern occurs.
313    #[test]
314    fn is_match_and_captures_agree_with_find_matches() {
315        let matcher = RegexPatternMatcher::new();
316        let text = "user@example.com talks to admin@example.com";
317        let pattern = r"(\w+)@example\.com";
318
319        assert!(matcher.is_match(pattern, text));
320        assert_eq!(matcher.find_matches(pattern, text).len(), 2);
321
322        let caps = matcher.captures_iter(pattern, text);
323        assert_eq!(caps.len(), 2);
324        assert_eq!(caps[0].get(1).unwrap().matched_text, "user");
325        assert_eq!(caps[1].get(1).unwrap().matched_text, "admin");
326    }
327
328    /// # Contract
329    ///
330    /// After a thread panics while holding the cache mutex, `cached` MUST
331    /// recover from the poisoned lock rather than permanently degrading to
332    /// recompiling every pattern on every call. Pre-fix the `if let Ok(map)`
333    /// guards silently skipped both reads and writes on `Err(PoisonError)`,
334    /// causing an O(n×rules) recompilation per scan instead of O(n) total.
335    #[test]
336    fn cached_recovers_from_poisoned_mutex() {
337        let matcher = RegexPatternMatcher::new();
338        // Warm the cache with a pattern.
339        matcher.find_matches(r"\d+", "abc 123");
340        assert_eq!(
341            matcher
342                .cache
343                .lock()
344                .expect("cache should be healthy after warm-up")
345                .len(),
346            1,
347            "one pattern should be cached after first lookup"
348        );
349        // Poison the mutex by causing a panic while holding it.
350        let cache_clone = Arc::clone(&matcher.cache);
351        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
352            let _guard = cache_clone.lock().expect("lock before poison");
353            panic!("intentional test panic to poison mutex");
354        }));
355        assert!(result.is_err(), "inner panic should have propagated");
356        // Verify the mutex is poisoned.
357        assert!(
358            matcher.cache.is_poisoned(),
359            "mutex must be poisoned after a panic while holding it"
360        );
361        // `cached` must still return a valid result despite the poison.
362        let matches = matcher.find_matches(r"\d+", "xyz 456");
363        assert_eq!(matches.len(), 1, "cached must recover from poison");
364        assert_eq!(matches[0].matched_text, "456");
365        // A new pattern must also work post-poison.
366        let alpha_matches = matcher.find_matches(r"[a-z]+", "abc");
367        assert_eq!(alpha_matches.len(), 1);
368        assert_eq!(alpha_matches[0].matched_text, "abc");
369    }
370}