Skip to main content

sqry_core/query/
regex_cache.rs

1//! Regex compilation caching for query performance
2//!
3//! Provides thread-safe LRU cache for compiled regex patterns to avoid
4//! redundant compilation during predicate evaluation (P2-1).
5//!
6//! Supports both standard regexes and lookaround patterns (P2-10):
7//! - Standard patterns use `regex::Regex` for performance
8//! - Lookaround patterns (`(?=`, `(?!`, `(?<=`, `(?<!`) use `fancy_regex::Regex`
9
10use crate::cache::CacheConfig;
11use crate::cache::policy::{
12    CacheAdmission, CachePolicy, CachePolicyConfig, CachePolicyKind, build_cache_policy,
13};
14use log::debug;
15use lru::LruCache;
16use regex::Regex;
17use std::hash::{Hash, Hasher};
18use std::num::NonZeroUsize;
19use std::sync::atomic::{AtomicU64, Ordering};
20use std::sync::{Arc, Mutex, OnceLock};
21
22/// Backtrack limit for `fancy_regex` patterns.
23///
24/// The default `fancy_regex` limit is 1,000,000 which is generous enough to
25/// allow a malicious MCP client to burn significant CPU.  We lower it to
26/// 100,000 which still handles all practical lookaround patterns while
27/// bounding worst-case execution time.
28const FANCY_REGEX_BACKTRACK_LIMIT: usize = 100_000;
29
30/// Compiled regex that supports both standard and lookaround patterns
31#[derive(Clone)]
32pub enum CompiledRegex {
33    /// Standard regex (faster, no lookaround support)
34    Standard(Arc<Regex>),
35    /// Fancy regex with lookaround support
36    Fancy(Arc<fancy_regex::Regex>),
37}
38
39impl CompiledRegex {
40    /// Check if the pattern matches the text.
41    ///
42    /// Returns `false` on standard-regex mismatches.  For `fancy_regex`
43    /// patterns, a [`BacktrackLimitExceeded`] error is **propagated** so
44    /// callers can distinguish "no match" from "aborted evaluation".
45    ///
46    /// # Errors
47    ///
48    /// Returns [`RegexMatchError::Fancy`] if a `fancy_regex` pattern exceeds
49    /// the backtrack limit during matching.
50    pub fn is_match(&self, text: &str) -> Result<bool, RegexMatchError> {
51        match self {
52            CompiledRegex::Standard(re) => Ok(re.is_match(text)),
53            CompiledRegex::Fancy(re) => re
54                .is_match(text)
55                .map_err(|e| RegexMatchError::Fancy(Box::new(e))),
56        }
57    }
58}
59
60/// Error returned when a compiled regex fails during matching (not
61/// compilation).  Currently only `fancy_regex` can produce runtime errors
62/// (e.g. backtrack-limit exceeded).
63#[derive(Debug)]
64pub enum RegexMatchError {
65    /// `fancy_regex` runtime error (typically `BacktrackLimitExceeded`).
66    /// Boxed to satisfy `clippy::result_large_err` (`fancy_regex::Error` is 136+ bytes).
67    Fancy(Box<fancy_regex::Error>),
68}
69
70impl std::fmt::Display for RegexMatchError {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        match self {
73            RegexMatchError::Fancy(e) => write!(f, "regex match failed: {e}"),
74        }
75    }
76}
77
78impl std::error::Error for RegexMatchError {}
79
80/// Check if a pattern contains lookaround assertions
81fn has_lookaround(pattern: &str) -> bool {
82    pattern.contains("(?=")
83        || pattern.contains("(?!")
84        || pattern.contains("(?<=")
85        || pattern.contains("(?<!")
86}
87
88/// Error type for regex compilation that handles both standard and fancy regex errors
89#[derive(Debug)]
90pub enum RegexCompileError {
91    /// Standard regex compilation error
92    Standard(regex::Error),
93    /// Fancy regex compilation error (lookaround patterns)
94    /// Boxed to reduce `Result` size (`fancy_regex::Error` is 136+ bytes)
95    Fancy(Box<fancy_regex::Error>),
96}
97
98impl std::fmt::Display for RegexCompileError {
99    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
100        match self {
101            RegexCompileError::Standard(e) => write!(f, "{e}"),
102            RegexCompileError::Fancy(e) => write!(f, "{e}"),
103        }
104    }
105}
106
107impl std::error::Error for RegexCompileError {
108    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
109        match self {
110            RegexCompileError::Standard(e) => Some(e),
111            RegexCompileError::Fancy(e) => Some(e.as_ref()),
112        }
113    }
114}
115
116impl From<regex::Error> for RegexCompileError {
117    fn from(err: regex::Error) -> Self {
118        RegexCompileError::Standard(err)
119    }
120}
121
122impl From<fancy_regex::Error> for RegexCompileError {
123    fn from(err: fancy_regex::Error) -> Self {
124        RegexCompileError::Fancy(Box::new(err))
125    }
126}
127
128/// Cache key for compiled regexes (pattern + flags)
129#[derive(Clone, Eq, PartialEq)]
130struct RegexCacheKey {
131    pattern: String,
132    case_insensitive: bool,
133    multiline: bool,
134    dot_all: bool,
135}
136
137impl Hash for RegexCacheKey {
138    fn hash<H: Hasher>(&self, state: &mut H) {
139        self.pattern.hash(state);
140        self.case_insensitive.hash(state);
141        self.multiline.hash(state);
142        self.dot_all.hash(state);
143    }
144}
145
146/// Thread-safe LRU cache for compiled regexes
147pub struct RegexCache {
148    cache: Arc<Mutex<LruCache<RegexCacheKey, CompiledRegex>>>,
149    capacity: usize,
150    hits: AtomicU64,
151    misses: AtomicU64,
152    evictions: AtomicU64,
153    policy: Arc<dyn CachePolicy<RegexCacheKey>>,
154    /// Backtrack limit applied to `fancy_regex` patterns compiled through
155    /// this cache.  Defaults to [`FANCY_REGEX_BACKTRACK_LIMIT`].
156    fancy_backtrack_limit: usize,
157}
158
159impl RegexCache {
160    /// Create new cache with specified capacity
161    #[must_use]
162    pub fn new(capacity: usize) -> Self {
163        let (kind, window_ratio) = Self::policy_params_from_env();
164        Self::with_policy(capacity, kind, window_ratio)
165    }
166
167    /// Get or compile a regex (cache hit or compile + insert)
168    ///
169    /// Automatically detects lookaround patterns and uses `fancy_regex` for those,
170    /// falling back to standard `regex` for better performance on simple patterns.
171    ///
172    /// # Errors
173    ///
174    /// Returns a regex compilation error when the pattern or flags are invalid.
175    ///
176    /// # Panics
177    ///
178    /// Panics if the internal cache mutex is poisoned (should not occur in normal operation).
179    pub fn get_or_compile(
180        &self,
181        pattern: &str,
182        case_insensitive: bool,
183        multiline: bool,
184        dot_all: bool,
185    ) -> Result<CompiledRegex, RegexCompileError> {
186        let key = RegexCacheKey {
187            pattern: pattern.to_string(),
188            case_insensitive,
189            multiline,
190            dot_all,
191        };
192
193        self.handle_policy_evictions();
194
195        {
196            let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
197            if let Some(regex) = cache.get(&key) {
198                self.hits.fetch_add(1, Ordering::Relaxed);
199                let _ = self.policy.record_hit(&key);
200                return Ok(regex.clone());
201            }
202        }
203
204        self.misses.fetch_add(1, Ordering::Relaxed);
205
206        // Slow path: compile new regex (outside mutex)
207        // P2-10: Use fancy_regex for lookaround patterns, standard regex otherwise
208        let compiled = if has_lookaround(pattern) {
209            // Build pattern with flags for fancy_regex
210            // fancy_regex uses inline flags: (?i) for case-insensitive, (?m) for multiline, (?s) for dot_all
211            let mut flag_prefix = String::new();
212            if case_insensitive {
213                flag_prefix.push_str("(?i)");
214            }
215            if multiline {
216                flag_prefix.push_str("(?m)");
217            }
218            if dot_all {
219                flag_prefix.push_str("(?s)");
220            }
221            let full_pattern = format!("{flag_prefix}{pattern}");
222            let fancy_re = fancy_regex::RegexBuilder::new(&full_pattern)
223                .backtrack_limit(self.fancy_backtrack_limit)
224                .build()?;
225            CompiledRegex::Fancy(Arc::new(fancy_re))
226        } else {
227            // Standard regex for performance
228            let mut builder = regex::RegexBuilder::new(pattern);
229            builder
230                .case_insensitive(case_insensitive)
231                .multi_line(multiline)
232                .dot_matches_new_line(dot_all);
233            let re = builder.build()?;
234            CompiledRegex::Standard(Arc::new(re))
235        };
236
237        if matches!(self.policy.admit(&key, 1), CacheAdmission::Rejected) {
238            debug!(
239                "regex cache policy {:?} rejected pattern {:?}",
240                self.policy.kind(),
241                key.pattern
242            );
243            return Ok(compiled);
244        }
245
246        // Insert into cache (mutex held briefly)
247        {
248            let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
249            if cache.len() == self.capacity
250                && let Some((evicted_key, _)) = cache.pop_lru()
251            {
252                self.policy.invalidate(&evicted_key);
253                self.evictions.fetch_add(1, Ordering::Relaxed);
254            }
255            cache.put(key, compiled.clone());
256        }
257
258        self.handle_policy_evictions();
259
260        Ok(compiled)
261    }
262
263    /// Get cache statistics (for testing)
264    ///
265    /// # Panics
266    ///
267    /// Panics if the regex cache mutex is poisoned.
268    #[cfg(test)]
269    pub fn len(&self) -> usize {
270        self.cache.lock().expect("regex cache mutex poisoned").len()
271    }
272
273    /// Returns true when the cache holds no compiled regex entries (test-only helper).
274    #[cfg(test)]
275    pub fn is_empty(&self) -> bool {
276        self.len() == 0
277    }
278
279    fn handle_policy_evictions(&self) {
280        let evicted = self.policy.drain_evictions();
281        if evicted.is_empty() {
282            return;
283        }
284        let mut cache = self.cache.lock().expect("regex cache mutex poisoned");
285        for eviction in evicted {
286            if cache.pop(&eviction.key).is_some() {
287                self.evictions.fetch_add(1, Ordering::Relaxed);
288            }
289        }
290    }
291
292    fn with_policy(capacity: usize, kind: CachePolicyKind, window_ratio: f32) -> Self {
293        let normalized_capacity = capacity.max(1);
294        let config = CachePolicyConfig::new(kind, normalized_capacity as u64, window_ratio);
295        Self {
296            cache: Arc::new(Mutex::new(LruCache::new(
297                NonZeroUsize::new(normalized_capacity).expect("capacity must be > 0"),
298            ))),
299            capacity: normalized_capacity,
300            hits: AtomicU64::new(0),
301            misses: AtomicU64::new(0),
302            evictions: AtomicU64::new(0),
303            policy: build_cache_policy(&config),
304            fancy_backtrack_limit: FANCY_REGEX_BACKTRACK_LIMIT,
305        }
306    }
307
308    /// Create a cache with a custom `fancy_regex` backtrack limit (test-only).
309    #[cfg(test)]
310    fn with_backtrack_limit(capacity: usize, limit: usize) -> Self {
311        let mut cache = Self::new(capacity);
312        cache.fancy_backtrack_limit = limit;
313        cache
314    }
315
316    fn policy_params_from_env() -> (CachePolicyKind, f32) {
317        let cfg = CacheConfig::from_env();
318        (cfg.policy_kind(), cfg.policy_window_ratio())
319    }
320
321    #[cfg(test)]
322    fn with_policy_kind(capacity: usize, kind: CachePolicyKind) -> Self {
323        Self::with_policy(capacity, kind, CacheConfig::DEFAULT_POLICY_WINDOW_RATIO)
324    }
325
326    #[cfg(test)]
327    fn policy_metrics(&self) -> crate::cache::policy::CachePolicyMetrics {
328        self.policy.stats()
329    }
330}
331
332/// Global singleton instance (lazy-initialized)
333static REGEX_CACHE: OnceLock<RegexCache> = OnceLock::new();
334
335fn get_global_cache() -> &'static RegexCache {
336    REGEX_CACHE.get_or_init(|| {
337        let size = std::env::var("SQRY_REGEX_CACHE_SIZE")
338            .ok()
339            .and_then(|s| s.parse::<usize>().ok())
340            .filter(|&s| (1..=10_000).contains(&s))
341            .unwrap_or(100);
342
343        RegexCache::new(size)
344    })
345}
346
347/// Public API: get or compile a regex
348///
349/// Automatically detects lookaround patterns and uses `fancy_regex` for those,
350/// falling back to standard `regex` for better performance on simple patterns.
351///
352/// # Errors
353///
354/// Returns a regex compilation error when the pattern or flags are invalid.
355pub fn get_or_compile_regex(
356    pattern: &str,
357    case_insensitive: bool,
358    multiline: bool,
359    dot_all: bool,
360) -> Result<CompiledRegex, RegexCompileError> {
361    // # Panics
362    // Panics if the global cache mutex is poisoned (unexpected).
363    get_global_cache().get_or_compile(pattern, case_insensitive, multiline, dot_all)
364}
365
366#[cfg(test)]
367mod tests {
368    use super::*;
369    use crate::cache::policy::CachePolicyKind;
370
371    #[test]
372    fn test_cache_hit_reuses_compiled_regex() {
373        let cache = RegexCache::new(10);
374
375        // First call: miss, compiles
376        let re1 = cache.get_or_compile("foo.*", false, false, false).unwrap();
377        assert_eq!(cache.len(), 1);
378
379        // Second call: hit, reuses
380        let _re2 = cache.get_or_compile("foo.*", false, false, false).unwrap();
381        assert_eq!(cache.len(), 1);
382        // Both should match
383        assert!(re1.is_match("foobar").unwrap());
384    }
385
386    #[test]
387    fn test_different_flags_create_separate_entries() {
388        let cache = RegexCache::new(10);
389
390        let re1 = cache.get_or_compile("foo", false, false, false).unwrap();
391        let re2 = cache.get_or_compile("foo", true, false, false).unwrap(); // case_insensitive=true
392
393        assert_eq!(cache.len(), 2); // Two different cache entries
394        assert!(re1.is_match("foo").unwrap());
395        assert!(!re1.is_match("FOO").unwrap()); // Case-sensitive
396        assert!(re2.is_match("FOO").unwrap()); // Case-insensitive
397    }
398
399    #[test]
400    fn test_lru_eviction_works() {
401        let cache = RegexCache::new(2);
402
403        cache.get_or_compile("a", false, false, false).unwrap();
404        cache.get_or_compile("b", false, false, false).unwrap();
405        assert_eq!(cache.len(), 2);
406
407        // Third entry evicts LRU (should evict "a")
408        cache.get_or_compile("c", false, false, false).unwrap();
409        assert_eq!(cache.len(), 2);
410    }
411
412    #[test]
413    fn test_compilation_errors_not_cached() {
414        let cache = RegexCache::new(10);
415
416        // Invalid regex pattern
417        assert!(
418            cache
419                .get_or_compile("[invalid", false, false, false)
420                .is_err()
421        );
422        assert_eq!(cache.len(), 0); // Error not cached
423    }
424
425    #[test]
426    fn tiny_lfu_rejects_cold_bursts() {
427        let cache = RegexCache::with_policy_kind(3, CachePolicyKind::TinyLfu);
428
429        let hot = cache
430            .get_or_compile("hot", false, false, false)
431            .expect("compile hot regex");
432        for _ in 0..10 {
433            let _ = cache
434                .get_or_compile("hot", false, false, false)
435                .expect("warm hot regex");
436        }
437
438        for i in 0..30 {
439            let pattern = format!("cold{i}");
440            let _ = cache
441                .get_or_compile(&pattern, false, false, false)
442                .expect("compile cold regex");
443        }
444
445        let warmed = cache
446            .get_or_compile("hot", false, false, false)
447            .expect("retrieve hot regex");
448        // Both should still match the same
449        assert!(hot.is_match("hot").unwrap());
450        assert!(warmed.is_match("hot").unwrap());
451
452        let metrics = cache.policy_metrics();
453        assert!(
454            metrics.lfu_rejects > 0,
455            "expected TinyLFU to reject some cold entries"
456        );
457    }
458
459    // P2-10: Tests for lookaround pattern support
460    #[test]
461    fn test_lookahead_pattern_compiles() {
462        let cache = RegexCache::new(10);
463        let re = cache
464            .get_or_compile("foo(?=bar)", false, false, false)
465            .expect("lookahead should compile");
466        assert!(re.is_match("foobar").unwrap());
467        assert!(!re.is_match("foobaz").unwrap());
468    }
469
470    #[test]
471    fn test_lookbehind_pattern_compiles() {
472        let cache = RegexCache::new(10);
473        let re = cache
474            .get_or_compile("(?<=test_)foo", false, false, false)
475            .expect("lookbehind should compile");
476        assert!(re.is_match("test_foo").unwrap());
477        assert!(!re.is_match("prod_foo").unwrap());
478    }
479
480    #[test]
481    fn test_negative_lookahead_pattern() {
482        let cache = RegexCache::new(10);
483        let re = cache
484            .get_or_compile("foo(?!bar)", false, false, false)
485            .expect("negative lookahead should compile");
486        assert!(re.is_match("foobaz").unwrap());
487        assert!(!re.is_match("foobar").unwrap());
488    }
489
490    #[test]
491    fn test_negative_lookbehind_pattern() {
492        let cache = RegexCache::new(10);
493        let re = cache
494            .get_or_compile("(?<!test_)foo", false, false, false)
495            .expect("negative lookbehind should compile");
496        assert!(re.is_match("prod_foo").unwrap());
497        assert!(!re.is_match("test_foo").unwrap());
498    }
499
500    #[test]
501    fn test_lookaround_with_flags() {
502        let cache = RegexCache::new(10);
503        // Case-insensitive lookahead
504        let re = cache
505            .get_or_compile("(?<=TEST_)foo", true, false, false)
506            .expect("lookaround with flags should compile");
507        assert!(re.is_match("TEST_foo").unwrap());
508        assert!(re.is_match("test_foo").unwrap()); // Case insensitive
509        assert!(re.is_match("TEST_FOO").unwrap()); // Case insensitive applies to whole pattern
510    }
511
512    /// Regression test: verify that our lowered backtrack limit is actually
513    /// wired through.  We construct a `fancy_regex::Regex` directly with an
514    /// intentionally tiny limit and a pattern that forces the backtracker.
515    ///
516    /// The production `FANCY_REGEX_BACKTRACK_LIMIT` (100,000) is hard to hit
517    /// reliably in a unit test without a very long input, so we test the
518    /// mechanism itself with limit=1 and verify the `RegexMatchError` path.
519    #[test]
520    fn test_backtrack_limit_exceeded_returns_error() {
521        // Build a regex with a limit of 1 — any backtracking at all will exceed it
522        let re = fancy_regex::RegexBuilder::new("(?=a*)b")
523            .backtrack_limit(1)
524            .build()
525            .expect("pattern should compile");
526        let compiled = CompiledRegex::Fancy(Arc::new(re));
527
528        // This input forces the lookahead to backtrack at least once
529        let result = compiled.is_match("aaa");
530        assert!(
531            result.is_err(),
532            "expected backtrack-limit error, got {result:?}"
533        );
534    }
535
536    /// Verify that the cache path actually wires through its backtrack limit.
537    ///
538    /// Uses `RegexCache::with_backtrack_limit(_, 1)` to compile a lookaround
539    /// pattern through the full cache code path with limit=1.  If the limit
540    /// were not applied in `get_or_compile`, the `is_match` call would return
541    /// `Ok(false)` instead of `Err`.
542    #[test]
543    fn test_cache_compiled_regex_enforces_backtrack_limit() {
544        // Build a cache that sets backtrack_limit=1 — any backtracking exceeds it
545        let cache = RegexCache::with_backtrack_limit(10, 1);
546
547        let re = cache
548            .get_or_compile("(?=a*)b", false, false, false)
549            .expect("lookahead should compile through cache");
550
551        assert!(
552            matches!(re, CompiledRegex::Fancy(_)),
553            "expected Fancy variant for lookaround pattern"
554        );
555
556        // This MUST return Err — proving the cache path applied limit=1.
557        // If the limit were not wired, this would return Ok(false).
558        let result = re.is_match("aaa");
559        assert!(
560            result.is_err(),
561            "cache-compiled regex must enforce backtrack limit, got {result:?}"
562        );
563    }
564}