skill_veil_core/adapters/regex_matcher.rs
1//! Pattern matcher implementation using the regex crate
2
3use crate::ports::{Captures, CompiledPattern, PatternError, PatternMatch, PatternMatcher};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::sync::{Arc, Mutex};
7
8/// Hard upper bound on the compiled NFA size for a single regex pattern,
9/// in bytes. Mitigates ReDoS / state-explosion from user-supplied
10/// rule-pack patterns by bounding the work the regex crate will accept
11/// at compile time. The default in the `regex` crate is 10 MiB; we
12/// tighten to 8 MiB so genuinely-complex curated IOC patterns
13/// (`URL_PATTERN`, multi-MB IOC alternations) still compile while
14/// catastrophic patterns (`a{1000000}{2}`) are rejected.
15const REGEX_NFA_SIZE_LIMIT: usize = 8 * (1 << 20); // 8 MiB
16
17/// Hard upper bound on the lazy DFA cache for a single compiled regex.
18/// Bounds runtime memory growth on adversarial inputs in addition to
19/// `REGEX_NFA_SIZE_LIMIT`, which only bounds compile-time NFA size.
20const REGEX_DFA_SIZE_LIMIT: usize = 8 * (1 << 20); // 8 MiB
21
22/// Compile `pattern` with the size-bounded `RegexBuilder`. Centralising
23/// the construction in one helper means every code path — the cached
24/// trait-method calls and the `compile()` port — applies the same
25/// limits without forgetting one.
26fn build_bounded_regex(pattern: &str) -> Result<Regex, regex::Error> {
27 RegexBuilder::new(pattern)
28 .size_limit(REGEX_NFA_SIZE_LIMIT)
29 .dfa_size_limit(REGEX_DFA_SIZE_LIMIT)
30 .build()
31}
32
33/// Pattern matcher implementation using the regex crate.
34///
35/// # Compile-cache contract
36///
37/// Each unique `pattern: &str` argument MUST be compiled at most once
38/// per `RegexPatternMatcher` instance. Pre-fix the trait methods
39/// (`find_matches`, `is_match`, `captures_iter`) called `Regex::new`
40/// on every invocation, which on the rule-engine hot path recompiles
41/// the same regex hundreds of times per scan and, with adversarial
42/// rule packs, scales to a viable denial-of-service. The internal
43/// `cache` is a `Mutex<HashMap>` so concurrent scanners observe a
44/// single compiled `Arc<Regex>` per pattern; contention is minimal
45/// because the cache is consulted on the (rare) cold path only.
46#[derive(Debug, Default, Clone)]
47pub struct RegexPatternMatcher {
48 cache: Arc<Mutex<HashMap<String, Arc<Regex>>>>,
49}
50
51impl RegexPatternMatcher {
52 /// Create a new regex-based pattern matcher
53 #[must_use]
54 pub fn new() -> Self {
55 Self::default()
56 }
57
58 /// Look up `pattern` in the compile cache, compiling it on miss.
59 /// Returns `None` (and emits `tracing::warn`) when the pattern is
60 /// invalid OR exceeds [`REGEX_NFA_SIZE_LIMIT`] /
61 /// [`REGEX_DFA_SIZE_LIMIT`]; the trait methods then fall back to
62 /// returning empty results to keep the rest of the rule pass alive.
63 fn cached(&self, pattern: &str) -> Option<Arc<Regex>> {
64 // Recover from poison: a panicked thread holding the lock should
65 // not permanently degrade the cache from O(1) lookup to O(n)
66 // recompilation per call. The data inside is still valid —
67 // poison only signals that a thread panicked *while* holding
68 // the lock, not that the HashMap is corrupt.
69 //
70 // NOTE: We deliberately use `into_inner()` without clearing the
71 // poison flag. Rust's `Mutex` does not expose `clear_poison()`
72 // on the guard, and `std::sync::RecoverableError` is unstable.
73 // After recovery, subsequent lock acquisitions will also recover
74 // via `into_inner()`, which is correct but takes the slow path.
75 // A true poison-clear would require `parking_lot::Mutex` or the
76 // unstable `std::sync::RecoverableError` API.
77 let recover_poison = |e: std::sync::PoisonError<_>| e.into_inner();
78 // Quick read path: most requests hit an already-compiled entry.
79 if let Some(re) = self
80 .cache
81 .lock()
82 .unwrap_or_else(recover_poison)
83 .get(pattern)
84 {
85 return Some(Arc::clone(re));
86 }
87 // Cold path: compile through the bounded builder, then insert
88 // into the cache so subsequent calls hit the warm path. We
89 // re-acquire the lock here rather than holding it across the
90 // potentially-expensive compile so a stuck compile cannot
91 // serialise the entire matcher.
92 let re = match build_bounded_regex(pattern) {
93 Ok(re) => Arc::new(re),
94 Err(e) => {
95 tracing::warn!("Invalid or oversized regex pattern '{pattern}': {e}");
96 return None;
97 }
98 };
99 // If a concurrent compile inserted between our read and write,
100 // the existing entry wins — both compiles produce semantically
101 // identical `Regex` values, but reusing the existing Arc keeps
102 // reference counts predictable.
103 Some(Arc::clone(
104 self.cache
105 .lock()
106 .unwrap_or_else(recover_poison)
107 .entry(pattern.to_string())
108 .or_insert_with(|| Arc::clone(&re)),
109 ))
110 }
111}
112
113fn match_to_pattern(m: regex::Match<'_>) -> PatternMatch {
114 PatternMatch {
115 start: m.start(),
116 end: m.end(),
117 matched_text: m.as_str().to_string(),
118 }
119}
120
121fn captures_to_groups(caps: ®ex::Captures<'_>) -> Captures {
122 let groups = caps
123 .iter()
124 .map(|opt| opt.map(match_to_pattern))
125 .collect::<Vec<_>>();
126 Captures::new(groups)
127}
128
129impl PatternMatcher for RegexPatternMatcher {
130 fn find_matches(&self, pattern: &str, text: &str) -> Vec<PatternMatch> {
131 match self.cached(pattern) {
132 Some(re) => re.find_iter(text).map(match_to_pattern).collect(),
133 None => Vec::new(),
134 }
135 }
136
137 fn is_match(&self, pattern: &str, text: &str) -> bool {
138 match self.cached(pattern) {
139 Some(re) => re.is_match(text),
140 None => false,
141 }
142 }
143
144 fn captures_iter(&self, pattern: &str, text: &str) -> Vec<Captures> {
145 match self.cached(pattern) {
146 Some(re) => re
147 .captures_iter(text)
148 .map(|c| captures_to_groups(&c))
149 .collect(),
150 None => Vec::new(),
151 }
152 }
153
154 fn compile(&self, pattern: &str) -> Result<CompiledPattern, PatternError> {
155 let re = Arc::new(
156 build_bounded_regex(pattern)
157 .map_err(|e| PatternError::InvalidPattern(e.to_string()))?,
158 );
159 let re_find = Arc::clone(&re);
160 let re_is_match = Arc::clone(&re);
161 let re_captures = re;
162
163 Ok(CompiledPattern::new(
164 Box::new(move |text: &str| re_find.find_iter(text).map(match_to_pattern).collect()),
165 Box::new(move |text: &str| re_is_match.is_match(text)),
166 Box::new(move |text: &str| {
167 re_captures
168 .captures_iter(text)
169 .map(|c| captures_to_groups(&c))
170 .collect()
171 }),
172 ))
173 }
174}
175
176#[cfg(test)]
177mod tests {
178 use super::*;
179
180 /// # Contract
181 /// `find_matches` returns every regex hit in source order with byte
182 /// offsets and the matched substring preserved verbatim.
183 #[test]
184 fn find_matches_returns_every_hit_in_source_order() {
185 let matcher = RegexPatternMatcher::new();
186 let matches = matcher.find_matches(r"\d+", "abc 123 def 456");
187
188 assert_eq!(matches.len(), 2);
189 assert_eq!(matches[0].matched_text, "123");
190 assert_eq!(matches[1].matched_text, "456");
191 }
192
193 /// # Contract
194 /// `find_matches` returns an empty vector when the pattern does not
195 /// occur in the text — never a sentinel match.
196 #[test]
197 fn find_matches_returns_empty_when_pattern_absent() {
198 let matcher = RegexPatternMatcher::new();
199 let matches = matcher.find_matches(r"\d+", "no numbers here");
200
201 assert!(matches.is_empty());
202 }
203
204 /// # Contract
205 /// A `CompiledPattern` reuses one compilation across `find_matches`,
206 /// `is_match`, and `captures_iter`; all three answer consistently.
207 #[test]
208 fn compile_shares_state_across_three_operations() {
209 let matcher = RegexPatternMatcher::new();
210 let compiled = matcher.compile(r"hello\s+world").unwrap();
211
212 let matches = compiled.find_matches("say hello world!");
213 assert_eq!(matches.len(), 1);
214 assert_eq!(matches[0].matched_text, "hello world");
215
216 assert!(compiled.is_match("say hello world!"));
217 assert!(!compiled.is_match("say goodbye"));
218
219 let caps = compiled.captures_iter("say hello world!");
220 assert_eq!(caps.len(), 1);
221 assert_eq!(caps[0].get(0).unwrap().matched_text, "hello world");
222 }
223
224 /// # Contract
225 /// `compile` surfaces invalid regex syntax as `PatternError`; it must
226 /// never panic so tests can drive validation via `is_err()`.
227 #[test]
228 fn compile_returns_err_for_invalid_regex_syntax() {
229 let matcher = RegexPatternMatcher::new();
230 let result = matcher.compile(r"[invalid");
231
232 assert!(result.is_err());
233 }
234
235 /// # Contract
236 ///
237 /// Repeated calls to a trait method with the same `pattern` MUST
238 /// reuse the same compiled `Regex` — no re-compilation per call.
239 /// Pre-fix `find_matches` / `is_match` / `captures_iter` invoked
240 /// `Regex::new` on every call, scaling to O(rules × calls × scans)
241 /// regex compilations and producing a viable DoS surface for
242 /// malicious rule packs. The cache stores compiled `Arc<Regex>`
243 /// keyed by the pattern string; this test pins the contract by
244 /// observing that the cache size grows by exactly one entry no
245 /// matter how many times the same pattern is matched.
246 #[test]
247 fn repeated_calls_with_same_pattern_use_cached_regex() {
248 let matcher = RegexPatternMatcher::new();
249 let pattern = r"\b(curl|wget)\b\s+https?://";
250 for _ in 0..16 {
251 matcher.find_matches(pattern, "run curl https://example.com/install.sh");
252 matcher.is_match(pattern, "fetch wget https://example.com/x");
253 matcher.captures_iter(pattern, "exec curl https://attacker/x");
254 }
255 let cache = matcher.cache.lock().expect("cache mutex poisoned");
256 assert_eq!(
257 cache.len(),
258 1,
259 "trait methods must reuse one cached compile per pattern; got {} entries",
260 cache.len()
261 );
262 assert!(cache.contains_key(pattern));
263 }
264
265 /// # Contract
266 ///
267 /// Different patterns produce distinct cache entries. Pins that the
268 /// cache key is the pattern string, not the matcher instance.
269 #[test]
270 fn cache_keys_each_unique_pattern_separately() {
271 let matcher = RegexPatternMatcher::new();
272 matcher.find_matches(r"\d+", "a 1 b");
273 matcher.find_matches(r"[a-z]+", "abc");
274 matcher.find_matches(r"\d+", "c 2 d");
275 let cache = matcher.cache.lock().expect("cache mutex poisoned");
276 assert_eq!(
277 cache.len(),
278 2,
279 "two distinct patterns must produce two cache entries; got {} entries",
280 cache.len()
281 );
282 }
283
284 /// # Contract
285 ///
286 /// `compile` and the trait methods MUST go through
287 /// `build_bounded_regex`, which applies `RegexBuilder::size_limit`.
288 /// Catastrophic patterns (`a{100000}{2}`) that exceed the bound MUST
289 /// surface as `PatternError` from `compile` and as an empty result
290 /// (with a tracing warning) from the trait methods. Pre-fix
291 /// `Regex::new` had no size limit configured, so a malicious rule
292 /// pack could ship a pattern that compiled but blew up the regex
293 /// state machine at match time.
294 #[test]
295 fn compile_rejects_oversized_patterns() {
296 let matcher = RegexPatternMatcher::new();
297 // `a{1000000}{2}` requests an automaton with ~10^12 states; the
298 // size_limit (1 MiB) rejects it well before that.
299 let pathological = "a{1000000}{2}";
300 let result = matcher.compile(pathological);
301 assert!(
302 result.is_err(),
303 "compile MUST reject oversized regexes via size_limit"
304 );
305 // Trait methods fall back to empty / false on the same input.
306 assert!(matcher.find_matches(pathological, "aaa").is_empty());
307 assert!(!matcher.is_match(pathological, "aaa"));
308 }
309
310 /// # Contract
311 /// `is_match` and `captures_iter` on the trait must agree with
312 /// `find_matches` on whether a pattern occurs.
313 #[test]
314 fn is_match_and_captures_agree_with_find_matches() {
315 let matcher = RegexPatternMatcher::new();
316 let text = "user@example.com talks to admin@example.com";
317 let pattern = r"(\w+)@example\.com";
318
319 assert!(matcher.is_match(pattern, text));
320 assert_eq!(matcher.find_matches(pattern, text).len(), 2);
321
322 let caps = matcher.captures_iter(pattern, text);
323 assert_eq!(caps.len(), 2);
324 assert_eq!(caps[0].get(1).unwrap().matched_text, "user");
325 assert_eq!(caps[1].get(1).unwrap().matched_text, "admin");
326 }
327
328 /// # Contract
329 ///
330 /// After a thread panics while holding the cache mutex, `cached` MUST
331 /// recover from the poisoned lock rather than permanently degrading to
332 /// recompiling every pattern on every call. Pre-fix the `if let Ok(map)`
333 /// guards silently skipped both reads and writes on `Err(PoisonError)`,
334 /// causing an O(n×rules) recompilation per scan instead of O(n) total.
335 #[test]
336 fn cached_recovers_from_poisoned_mutex() {
337 let matcher = RegexPatternMatcher::new();
338 // Warm the cache with a pattern.
339 matcher.find_matches(r"\d+", "abc 123");
340 assert_eq!(
341 matcher
342 .cache
343 .lock()
344 .expect("cache should be healthy after warm-up")
345 .len(),
346 1,
347 "one pattern should be cached after first lookup"
348 );
349 // Poison the mutex by causing a panic while holding it.
350 let cache_clone = Arc::clone(&matcher.cache);
351 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
352 let _guard = cache_clone.lock().expect("lock before poison");
353 panic!("intentional test panic to poison mutex");
354 }));
355 assert!(result.is_err(), "inner panic should have propagated");
356 // Verify the mutex is poisoned.
357 assert!(
358 matcher.cache.is_poisoned(),
359 "mutex must be poisoned after a panic while holding it"
360 );
361 // `cached` must still return a valid result despite the poison.
362 let matches = matcher.find_matches(r"\d+", "xyz 456");
363 assert_eq!(matches.len(), 1, "cached must recover from poison");
364 assert_eq!(matches[0].matched_text, "456");
365 // A new pattern must also work post-poison.
366 let alpha_matches = matcher.find_matches(r"[a-z]+", "abc");
367 assert_eq!(alpha_matches.len(), 1);
368 assert_eq!(alpha_matches[0].matched_text, "abc");
369 }
370}