Skip to main content

sqry_core/query/
regex_cache.rs

1//! Regex compilation caching for query performance
2//!
3//! Provides thread-safe LRU cache for compiled regex patterns to avoid
4//! redundant compilation during predicate evaluation (P2-1).
5//!
6//! Supports both standard regexes and lookaround patterns (P2-10):
7//! - Standard patterns use `regex::Regex` for performance
8//! - Lookaround patterns (`(?=`, `(?!`, `(?<=`, `(?<!`) use `fancy_regex::Regex`
9
10use crate::cache::CacheConfig;
11use crate::cache::policy::{
12    CacheAdmission, CachePolicy, CachePolicyConfig, CachePolicyKind, build_cache_policy,
13};
14use log::debug;
15use lru::LruCache;
16use regex::Regex;
17use std::hash::{Hash, Hasher};
18use std::num::NonZeroUsize;
19use std::sync::atomic::{AtomicU64, Ordering};
20use std::sync::{Arc, Mutex, OnceLock};
21
22/// Backtrack limit for `fancy_regex` patterns.
23///
24/// The default `fancy_regex` limit is 1,000,000 which is generous enough to
25/// allow a malicious MCP client to burn significant CPU.  We lower it to
26/// 100,000 which still handles all practical lookaround patterns while
27/// bounding worst-case execution time.
28const FANCY_REGEX_BACKTRACK_LIMIT: usize = 100_000;
29
30/// Compiled regex that supports both standard and lookaround patterns
31#[derive(Clone)]
32pub enum CompiledRegex {
33    /// Standard regex (faster, no lookaround support)
34    Standard(Arc<Regex>),
35    /// Fancy regex with lookaround support
36    Fancy(Arc<fancy_regex::Regex>),
37}
38
39impl CompiledRegex {
40    /// Check if the pattern matches the text.
41    ///
42    /// Returns `false` on standard-regex mismatches.  For `fancy_regex`
43    /// patterns, a [`BacktrackLimitExceeded`] error is **propagated** so
44    /// callers can distinguish "no match" from "aborted evaluation".
45    pub fn is_match(&self, text: &str) -> Result<bool, RegexMatchError> {
46        match self {
47            CompiledRegex::Standard(re) => Ok(re.is_match(text)),
48            CompiledRegex::Fancy(re) => re
49                .is_match(text)
50                .map_err(|e| RegexMatchError::Fancy(Box::new(e))),
51        }
52    }
53}
54
55/// Error returned when a compiled regex fails during matching (not
56/// compilation).  Currently only `fancy_regex` can produce runtime errors
57/// (e.g. backtrack-limit exceeded).
58#[derive(Debug)]
59pub enum RegexMatchError {
60    /// `fancy_regex` runtime error (typically `BacktrackLimitExceeded`).
61    /// Boxed to satisfy `clippy::result_large_err` (`fancy_regex::Error` is 136+ bytes).
62    Fancy(Box<fancy_regex::Error>),
63}
64
65impl std::fmt::Display for RegexMatchError {
66    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
67        match self {
68            RegexMatchError::Fancy(e) => write!(f, "regex match failed: {e}"),
69        }
70    }
71}
72
73impl std::error::Error for RegexMatchError {}
74
75/// Check if a pattern contains lookaround assertions
76fn has_lookaround(pattern: &str) -> bool {
77    pattern.contains("(?=")
78        || pattern.contains("(?!")
79        || pattern.contains("(?<=")
80        || pattern.contains("(?<!")
81}
82
83/// Error type for regex compilation that handles both standard and fancy regex errors
84#[derive(Debug)]
85pub enum RegexCompileError {
86    /// Standard regex compilation error
87    Standard(regex::Error),
88    /// Fancy regex compilation error (lookaround patterns)
89    /// Boxed to reduce `Result` size (`fancy_regex::Error` is 136+ bytes)
90    Fancy(Box<fancy_regex::Error>),
91}
92
93impl std::fmt::Display for RegexCompileError {
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        match self {
96            RegexCompileError::Standard(e) => write!(f, "{e}"),
97            RegexCompileError::Fancy(e) => write!(f, "{e}"),
98        }
99    }
100}
101
102impl std::error::Error for RegexCompileError {
103    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
104        match self {
105            RegexCompileError::Standard(e) => Some(e),
106            RegexCompileError::Fancy(e) => Some(e.as_ref()),
107        }
108    }
109}
110
111impl From<regex::Error> for RegexCompileError {
112    fn from(err: regex::Error) -> Self {
113        RegexCompileError::Standard(err)
114    }
115}
116
117impl From<fancy_regex::Error> for RegexCompileError {
118    fn from(err: fancy_regex::Error) -> Self {
119        RegexCompileError::Fancy(Box::new(err))
120    }
121}
122
123/// Cache key for compiled regexes (pattern + flags)
124#[derive(Clone, Eq, PartialEq)]
125struct RegexCacheKey {
126    pattern: String,
127    case_insensitive: bool,
128    multiline: bool,
129    dot_all: bool,
130}
131
132impl Hash for RegexCacheKey {
133    fn hash<H: Hasher>(&self, state: &mut H) {
134        self.pattern.hash(state);
135        self.case_insensitive.hash(state);
136        self.multiline.hash(state);
137        self.dot_all.hash(state);
138    }
139}
140
141/// Thread-safe LRU cache for compiled regexes
142pub struct RegexCache {
143    cache: Arc<Mutex<LruCache<RegexCacheKey, CompiledRegex>>>,
144    capacity: usize,
145    hits: AtomicU64,
146    misses: AtomicU64,
147    evictions: AtomicU64,
148    policy: Arc<dyn CachePolicy<RegexCacheKey>>,
149    /// Backtrack limit applied to `fancy_regex` patterns compiled through
150    /// this cache.  Defaults to [`FANCY_REGEX_BACKTRACK_LIMIT`].
151    fancy_backtrack_limit: usize,
152}
153
154impl RegexCache {
155    /// Create new cache with specified capacity
156    #[must_use]
157    pub fn new(capacity: usize) -> Self {
158        let (kind, window_ratio) = Self::policy_params_from_env();
159        Self::with_policy(capacity, kind, window_ratio)
160    }
161
162    /// Get or compile a regex (cache hit or compile + insert)
163    ///
164    /// Automatically detects lookaround patterns and uses `fancy_regex` for those,
165    /// falling back to standard `regex` for better performance on simple patterns.
166    ///
167    /// # Errors
168    ///
169    /// Returns a regex compilation error when the pattern or flags are invalid.
170    ///
171    /// # Panics
172    ///
173    /// Panics if the internal cache mutex is poisoned (should not occur in normal operation).
174    pub fn get_or_compile(
175        &self,
176        pattern: &str,
177        case_insensitive: bool,
178        multiline: bool,
179        dot_all: bool,
180    ) -> Result<CompiledRegex, RegexCompileError> {
181        let key = RegexCacheKey {
182            pattern: pattern.to_string(),
183            case_insensitive,
184            multiline,
185            dot_all,
186        };
187
188        self.handle_policy_evictions();
189
190        {
191            let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
192            if let Some(regex) = cache.get(&key) {
193                self.hits.fetch_add(1, Ordering::Relaxed);
194                let _ = self.policy.record_hit(&key);
195                return Ok(regex.clone());
196            }
197        }
198
199        self.misses.fetch_add(1, Ordering::Relaxed);
200
201        // Slow path: compile new regex (outside mutex)
202        // P2-10: Use fancy_regex for lookaround patterns, standard regex otherwise
203        let compiled = if has_lookaround(pattern) {
204            // Build pattern with flags for fancy_regex
205            // fancy_regex uses inline flags: (?i) for case-insensitive, (?m) for multiline, (?s) for dot_all
206            let mut flag_prefix = String::new();
207            if case_insensitive {
208                flag_prefix.push_str("(?i)");
209            }
210            if multiline {
211                flag_prefix.push_str("(?m)");
212            }
213            if dot_all {
214                flag_prefix.push_str("(?s)");
215            }
216            let full_pattern = format!("{flag_prefix}{pattern}");
217            let fancy_re = fancy_regex::RegexBuilder::new(&full_pattern)
218                .backtrack_limit(self.fancy_backtrack_limit)
219                .build()?;
220            CompiledRegex::Fancy(Arc::new(fancy_re))
221        } else {
222            // Standard regex for performance
223            let mut builder = regex::RegexBuilder::new(pattern);
224            builder
225                .case_insensitive(case_insensitive)
226                .multi_line(multiline)
227                .dot_matches_new_line(dot_all);
228            let re = builder.build()?;
229            CompiledRegex::Standard(Arc::new(re))
230        };
231
232        if matches!(self.policy.admit(&key, 1), CacheAdmission::Rejected) {
233            debug!(
234                "regex cache policy {:?} rejected pattern {:?}",
235                self.policy.kind(),
236                key.pattern
237            );
238            return Ok(compiled);
239        }
240
241        // Insert into cache (mutex held briefly)
242        {
243            let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
244            if cache.len() == self.capacity
245                && let Some((evicted_key, _)) = cache.pop_lru()
246            {
247                self.policy.invalidate(&evicted_key);
248                self.evictions.fetch_add(1, Ordering::Relaxed);
249            }
250            cache.put(key, compiled.clone());
251        }
252
253        self.handle_policy_evictions();
254
255        Ok(compiled)
256    }
257
258    /// Get cache statistics (for testing)
259    ///
260    /// # Panics
261    ///
262    /// Panics if the regex cache mutex is poisoned.
263    #[cfg(test)]
264    pub fn len(&self) -> usize {
265        self.cache.lock().expect("regex cache mutex poisoned").len()
266    }
267
268    /// Returns true when the cache holds no compiled regex entries (test-only helper).
269    #[cfg(test)]
270    pub fn is_empty(&self) -> bool {
271        self.len() == 0
272    }
273
274    fn handle_policy_evictions(&self) {
275        let evicted = self.policy.drain_evictions();
276        if evicted.is_empty() {
277            return;
278        }
279        let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
280        for eviction in evicted {
281            if cache.pop(&eviction.key).is_some() {
282                self.evictions.fetch_add(1, Ordering::Relaxed);
283            }
284        }
285    }
286
287    fn with_policy(capacity: usize, kind: CachePolicyKind, window_ratio: f32) -> Self {
288        let normalized_capacity = capacity.max(1);
289        let config = CachePolicyConfig::new(kind, normalized_capacity as u64, window_ratio);
290        Self {
291            cache: Arc::new(Mutex::new(LruCache::new(
292                NonZeroUsize::new(normalized_capacity).expect("capacity must be > 0"),
293            ))),
294            capacity: normalized_capacity,
295            hits: AtomicU64::new(0),
296            misses: AtomicU64::new(0),
297            evictions: AtomicU64::new(0),
298            policy: build_cache_policy(&config),
299            fancy_backtrack_limit: FANCY_REGEX_BACKTRACK_LIMIT,
300        }
301    }
302
303    /// Create a cache with a custom `fancy_regex` backtrack limit (test-only).
304    #[cfg(test)]
305    fn with_backtrack_limit(capacity: usize, limit: usize) -> Self {
306        let mut cache = Self::new(capacity);
307        cache.fancy_backtrack_limit = limit;
308        cache
309    }
310
311    fn policy_params_from_env() -> (CachePolicyKind, f32) {
312        let cfg = CacheConfig::from_env();
313        (cfg.policy_kind(), cfg.policy_window_ratio())
314    }
315
316    #[cfg(test)]
317    fn with_policy_kind(capacity: usize, kind: CachePolicyKind) -> Self {
318        Self::with_policy(capacity, kind, CacheConfig::DEFAULT_POLICY_WINDOW_RATIO)
319    }
320
321    #[cfg(test)]
322    fn policy_metrics(&self) -> crate::cache::policy::CachePolicyMetrics {
323        self.policy.stats()
324    }
325}
326
327/// Global singleton instance (lazy-initialized)
328static REGEX_CACHE: OnceLock<RegexCache> = OnceLock::new();
329
330fn get_global_cache() -> &'static RegexCache {
331    REGEX_CACHE.get_or_init(|| {
332        let size = std::env::var("SQRY_REGEX_CACHE_SIZE")
333            .ok()
334            .and_then(|s| s.parse::<usize>().ok())
335            .filter(|&s| (1..=10_000).contains(&s))
336            .unwrap_or(100);
337
338        RegexCache::new(size)
339    })
340}
341
342/// Public API: get or compile a regex
343///
344/// Automatically detects lookaround patterns and uses `fancy_regex` for those,
345/// falling back to standard `regex` for better performance on simple patterns.
346///
347/// # Errors
348///
349/// Returns a regex compilation error when the pattern or flags are invalid.
350pub fn get_or_compile_regex(
351    pattern: &str,
352    case_insensitive: bool,
353    multiline: bool,
354    dot_all: bool,
355) -> Result<CompiledRegex, RegexCompileError> {
356    // # Panics
357    // Panics if the global cache mutex is poisoned (unexpected).
358    get_global_cache().get_or_compile(pattern, case_insensitive, multiline, dot_all)
359}
360
361#[cfg(test)]
362mod tests {
363    use super::*;
364    use crate::cache::policy::CachePolicyKind;
365
366    #[test]
367    fn test_cache_hit_reuses_compiled_regex() {
368        let cache = RegexCache::new(10);
369
370        // First call: miss, compiles
371        let re1 = cache.get_or_compile("foo.*", false, false, false).unwrap();
372        assert_eq!(cache.len(), 1);
373
374        // Second call: hit, reuses
375        let _re2 = cache.get_or_compile("foo.*", false, false, false).unwrap();
376        assert_eq!(cache.len(), 1);
377        // Both should match
378        assert!(re1.is_match("foobar").unwrap());
379    }
380
381    #[test]
382    fn test_different_flags_create_separate_entries() {
383        let cache = RegexCache::new(10);
384
385        let re1 = cache.get_or_compile("foo", false, false, false).unwrap();
386        let re2 = cache.get_or_compile("foo", true, false, false).unwrap(); // case_insensitive=true
387
388        assert_eq!(cache.len(), 2); // Two different cache entries
389        assert!(re1.is_match("foo").unwrap());
390        assert!(!re1.is_match("FOO").unwrap()); // Case-sensitive
391        assert!(re2.is_match("FOO").unwrap()); // Case-insensitive
392    }
393
394    #[test]
395    fn test_lru_eviction_works() {
396        let cache = RegexCache::new(2);
397
398        cache.get_or_compile("a", false, false, false).unwrap();
399        cache.get_or_compile("b", false, false, false).unwrap();
400        assert_eq!(cache.len(), 2);
401
402        // Third entry evicts LRU (should evict "a")
403        cache.get_or_compile("c", false, false, false).unwrap();
404        assert_eq!(cache.len(), 2);
405    }
406
407    #[test]
408    fn test_compilation_errors_not_cached() {
409        let cache = RegexCache::new(10);
410
411        // Invalid regex pattern
412        assert!(
413            cache
414                .get_or_compile("[invalid", false, false, false)
415                .is_err()
416        );
417        assert_eq!(cache.len(), 0); // Error not cached
418    }
419
420    #[test]
421    fn tiny_lfu_rejects_cold_bursts() {
422        let cache = RegexCache::with_policy_kind(3, CachePolicyKind::TinyLfu);
423
424        let hot = cache
425            .get_or_compile("hot", false, false, false)
426            .expect("compile hot regex");
427        for _ in 0..10 {
428            let _ = cache
429                .get_or_compile("hot", false, false, false)
430                .expect("warm hot regex");
431        }
432
433        for i in 0..30 {
434            let pattern = format!("cold{i}");
435            let _ = cache
436                .get_or_compile(&pattern, false, false, false)
437                .expect("compile cold regex");
438        }
439
440        let warmed = cache
441            .get_or_compile("hot", false, false, false)
442            .expect("retrieve hot regex");
443        // Both should still match the same
444        assert!(hot.is_match("hot").unwrap());
445        assert!(warmed.is_match("hot").unwrap());
446
447        let metrics = cache.policy_metrics();
448        assert!(
449            metrics.lfu_rejects > 0,
450            "expected TinyLFU to reject some cold entries"
451        );
452    }
453
454    // P2-10: Tests for lookaround pattern support
455    #[test]
456    fn test_lookahead_pattern_compiles() {
457        let cache = RegexCache::new(10);
458        let re = cache
459            .get_or_compile("foo(?=bar)", false, false, false)
460            .expect("lookahead should compile");
461        assert!(re.is_match("foobar").unwrap());
462        assert!(!re.is_match("foobaz").unwrap());
463    }
464
465    #[test]
466    fn test_lookbehind_pattern_compiles() {
467        let cache = RegexCache::new(10);
468        let re = cache
469            .get_or_compile("(?<=test_)foo", false, false, false)
470            .expect("lookbehind should compile");
471        assert!(re.is_match("test_foo").unwrap());
472        assert!(!re.is_match("prod_foo").unwrap());
473    }
474
475    #[test]
476    fn test_negative_lookahead_pattern() {
477        let cache = RegexCache::new(10);
478        let re = cache
479            .get_or_compile("foo(?!bar)", false, false, false)
480            .expect("negative lookahead should compile");
481        assert!(re.is_match("foobaz").unwrap());
482        assert!(!re.is_match("foobar").unwrap());
483    }
484
485    #[test]
486    fn test_negative_lookbehind_pattern() {
487        let cache = RegexCache::new(10);
488        let re = cache
489            .get_or_compile("(?<!test_)foo", false, false, false)
490            .expect("negative lookbehind should compile");
491        assert!(re.is_match("prod_foo").unwrap());
492        assert!(!re.is_match("test_foo").unwrap());
493    }
494
495    #[test]
496    fn test_lookaround_with_flags() {
497        let cache = RegexCache::new(10);
498        // Case-insensitive lookahead
499        let re = cache
500            .get_or_compile("(?<=TEST_)foo", true, false, false)
501            .expect("lookaround with flags should compile");
502        assert!(re.is_match("TEST_foo").unwrap());
503        assert!(re.is_match("test_foo").unwrap()); // Case insensitive
504        assert!(re.is_match("TEST_FOO").unwrap()); // Case insensitive applies to whole pattern
505    }
506
507    /// Regression test: verify that our lowered backtrack limit is actually
508    /// wired through.  We construct a `fancy_regex::Regex` directly with an
509    /// intentionally tiny limit and a pattern that forces the backtracker.
510    ///
511    /// The production `FANCY_REGEX_BACKTRACK_LIMIT` (100,000) is hard to hit
512    /// reliably in a unit test without a very long input, so we test the
513    /// mechanism itself with limit=1 and verify the `RegexMatchError` path.
514    #[test]
515    fn test_backtrack_limit_exceeded_returns_error() {
516        // Build a regex with a limit of 1 — any backtracking at all will exceed it
517        let re = fancy_regex::RegexBuilder::new("(?=a*)b")
518            .backtrack_limit(1)
519            .build()
520            .expect("pattern should compile");
521        let compiled = CompiledRegex::Fancy(Arc::new(re));
522
523        // This input forces the lookahead to backtrack at least once
524        let result = compiled.is_match("aaa");
525        assert!(
526            result.is_err(),
527            "expected backtrack-limit error, got {result:?}"
528        );
529    }
530
531    /// Verify that the cache path actually wires through its backtrack limit.
532    ///
533    /// Uses `RegexCache::with_backtrack_limit(_, 1)` to compile a lookaround
534    /// pattern through the full cache code path with limit=1.  If the limit
535    /// were not applied in `get_or_compile`, the `is_match` call would return
536    /// `Ok(false)` instead of `Err`.
537    #[test]
538    fn test_cache_compiled_regex_enforces_backtrack_limit() {
539        // Build a cache that sets backtrack_limit=1 — any backtracking exceeds it
540        let cache = RegexCache::with_backtrack_limit(10, 1);
541
542        let re = cache
543            .get_or_compile("(?=a*)b", false, false, false)
544            .expect("lookahead should compile through cache");
545
546        assert!(
547            matches!(re, CompiledRegex::Fancy(_)),
548            "expected Fancy variant for lookaround pattern"
549        );
550
551        // This MUST return Err — proving the cache path applied limit=1.
552        // If the limit were not wired, this would return Ok(false).
553        let result = re.is_match("aaa");
554        assert!(
555            result.is_err(),
556            "cache-compiled regex must enforce backtrack limit, got {result:?}"
557        );
558    }
559}