Skip to main content

sanitize_engine/
allowlist.rs

1//! Allowlist for suppressing specific values from sanitization.
2//!
3//! Values matching an allowlist entry pass through the output unchanged and
4//! are **not** recorded in the [`MappingStore`](crate::store::MappingStore).
5//! This means they also won't propagate to the Phase 2 augmented scanner as
6//! discovered literals — a value that is allowed stays allowed everywhere.
7//!
8//! # Pattern syntax
9//!
10//! Three pattern forms are supported:
11//!
12//! | Pattern                          | Matches                                        |
13//! |----------------------------------|------------------------------------------------|
14//! | `localhost`                      | Exactly `localhost`                            |
15//! | `*.internal`                     | Any value ending with `.internal` (glob)       |
16//! | `192.168.1.*`                    | Any value starting with `192.168.1.` (glob)    |
17//! | `user-*@corp.com`                | Prefix + suffix glob                           |
18//! | `regex:^192\.168\.[0-9]+\.[0-9]+$` | Full regex match                             |
19//!
20//! **Glob patterns** use `*` as the only wildcard (matches any sequence of
21//! characters). Multiple `*` wildcards are supported. Globs are
22//! case-insensitive by default (see [`AllowlistMatcher::new_case_sensitive`]).
23//!
//! **Regex patterns** are prefixed with `regex:`. The prefix is stripped and
//! the remainder is compiled as a [`regex::Regex`], so `regex:^foo$` compiles
//! to `^foo$`. The regex is searched for anywhere in the value (it is not
//! implicitly anchored); add `^` and `$` to require that the whole value
//! match. Regex patterns are always case-sensitive; use the `(?i)` flag
//! inside the pattern for case-insensitive matching.
29//!
30//! If a regex fails to compile, a warning is returned and the pattern is
31//! skipped (the matcher continues without it rather than panicking).
32//!
//! If a pattern without the `regex:` prefix (exact or glob) contains a regex
//! metacharacter (`^`, `$`, `+`, `(`, `)`), a warning suggesting the `regex:`
//! prefix is returned (at most one per pattern). Those characters are still
//! matched literally in exact and glob patterns.
37
38use regex::Regex;
39use std::collections::HashSet;
40use std::sync::atomic::{AtomicU64, Ordering};
41
42/// Compiled allowlist that can be queried concurrently.
43///
44/// Exact patterns are stored in a [`HashSet`] for O(1) lookup. Glob patterns
45/// (those containing `*`) are stored in a [`Vec`] and scanned linearly after
46/// the hash check misses. Regex patterns (`regex:` prefix) are stored in a
47/// separate [`Vec`] and tried last. This means allowlists with many exact
48/// entries — the common case — pay no linear scan cost.
49///
50/// # Case sensitivity
51///
52/// By default the matcher is **case-insensitive**: patterns and query values
53/// are both lowercased before comparison (applies to exact and glob patterns
54/// only). Use [`AllowlistMatcher::new_case_sensitive`] when exact-case
55/// matching is required. Regex patterns (`regex:` prefix) are always
56/// case-sensitive; use the `(?i)` flag inside the pattern for
57/// case-insensitive regex matching.
58pub struct AllowlistMatcher {
59    exact: HashSet<String>,
60    globs: Vec<String>,
61    /// `(original_pattern_string, compiled_regex)` pairs from `regex:` entries.
62    regexes: Vec<(String, Regex)>,
63    /// When `false` (the default), patterns and query values are lowercased
64    /// before comparison (exact and glob only; regex patterns are unaffected).
65    case_sensitive: bool,
66    /// Number of values passed through as allowed across all `is_allowed` calls.
67    seen: AtomicU64,
68}
69
70impl AllowlistMatcher {
71    /// Build a case-insensitive [`AllowlistMatcher`] from a list of pattern strings.
72    ///
73    /// This is the default constructor. Patterns and query values are both
74    /// lowercased before comparison, so `"Localhost"` matches a pattern of
75    /// `"localhost"` and vice-versa.
76    ///
77    /// Each string is treated as a glob if it contains `*`, otherwise as an
78    /// exact match. Patterns that look like regexes (contain `^`, `$`, `+`,
79    /// `(`, or `)`) are accepted but a warning message is returned alongside
80    /// the matcher so the caller can surface it to the user.
81    #[must_use]
82    pub fn new(patterns: Vec<String>) -> (Self, Vec<String>) {
83        Self::build(patterns, false)
84    }
85
86    /// Build a case-sensitive [`AllowlistMatcher`] from a list of pattern strings.
87    ///
88    /// Use this when exact-case matching is required (e.g. allowlisting a
89    /// known token value that must not match differently-cased substrings).
90    #[must_use]
91    pub fn new_case_sensitive(patterns: Vec<String>) -> (Self, Vec<String>) {
92        Self::build(patterns, true)
93    }
94
95    fn build(patterns: Vec<String>, case_sensitive: bool) -> (Self, Vec<String>) {
96        let mut exact = HashSet::new();
97        let mut globs = Vec::new();
98        let mut regexes = Vec::new();
99        let mut warnings = Vec::new();
100
101        for pat in patterns {
102            if let Some(re_src) = pat.strip_prefix("regex:") {
103                match Regex::new(re_src) {
104                    Ok(compiled) => regexes.push((pat, compiled)),
105                    Err(e) => warnings.push(format!(
106                        "allowlist pattern '{pat}' failed to compile: {e} — pattern skipped"
107                    )),
108                }
109                continue;
110            }
111
112            for ch in ['^', '$', '+', '(', ')'] {
113                if pat.contains(ch) {
114                    warnings.push(format!(
115                        "allowlist pattern '{pat}' contains regex character '{ch}'; \
116                         it is matched literally — use the 'regex:' prefix for regex syntax"
117                    ));
118                    break;
119                }
120            }
121            // Normalize to lowercase for case-insensitive matchers so that
122            // both the stored pattern and the query value are in the same case.
123            let stored = if case_sensitive {
124                pat
125            } else {
126                pat.to_lowercase()
127            };
128            if stored.contains('*') {
129                globs.push(stored);
130            } else {
131                exact.insert(stored);
132            }
133        }
134
135        (
136            Self {
137                exact,
138                globs,
139                regexes,
140                case_sensitive,
141                seen: AtomicU64::new(0),
142            },
143            warnings,
144        )
145    }
146
147    /// Returns `true` if `value` matches any allowlist entry.
148    ///
149    /// Thread-safe; increments an internal counter when a match is found.
150    pub fn is_allowed(&self, value: &str) -> bool {
151        self.match_pattern(value).is_some()
152    }
153
154    /// Returns the pattern that matches `value`, or `None`.
155    ///
156    /// Lookup order: exact hash → glob scan → regex scan. Increments the seen
157    /// counter when a match is found.
158    ///
159    /// Exact and glob patterns are case-insensitive by default (the matcher
160    /// built by [`new`](Self::new) lowercases both patterns and query values
161    /// before comparison). Regex patterns (`regex:` prefix) are always matched
162    /// against the original, un-lowercased value regardless of the
163    /// case-sensitivity setting; use `(?i)` inside the pattern for
164    /// case-insensitive regex matching.
165    pub fn match_pattern<'a>(&'a self, value: &str) -> Option<&'a str> {
166        // Exact + glob: apply case normalization.
167        let normalized: std::borrow::Cow<str> = if self.case_sensitive {
168            std::borrow::Cow::Borrowed(value)
169        } else {
170            std::borrow::Cow::Owned(value.to_lowercase())
171        };
172        if let Some(s) = self.exact.get(normalized.as_ref()) {
173            self.seen.fetch_add(1, Ordering::Relaxed);
174            return Some(s.as_str());
175        }
176        for pat in &self.globs {
177            if glob_matches(pat, &normalized) {
178                self.seen.fetch_add(1, Ordering::Relaxed);
179                return Some(pat.as_str());
180            }
181        }
182        // Regex: always match against the original value (regex has (?i) for
183        // case-insensitive matching; we must not pre-lowercase the input).
184        for (pat_str, re) in &self.regexes {
185            if re.is_match(value) {
186                self.seen.fetch_add(1, Ordering::Relaxed);
187                return Some(pat_str.as_str());
188            }
189        }
190        None
191    }
192
193    /// Total number of values that have been allowed through.
194    pub fn seen_count(&self) -> u64 {
195        self.seen.load(Ordering::Relaxed)
196    }
197
198    /// Number of patterns registered (exact + glob + regex).
199    pub fn pattern_count(&self) -> usize {
200        self.exact.len() + self.globs.len() + self.regexes.len()
201    }
202
203    /// `true` if no patterns are registered (allowlist is effectively disabled).
204    pub fn is_empty(&self) -> bool {
205        self.exact.is_empty() && self.globs.is_empty() && self.regexes.is_empty()
206    }
207}
208
/// Match `value` against a `*`-glob `pattern`.
///
/// `*` matches any sequence of characters (including the empty sequence) and
/// may appear any number of times; adjacent `*`s behave like a single one.
/// Every other character is matched literally. A pattern without any `*`
/// matches only a value equal to it. Matching is case-sensitive — callers
/// that want case-insensitive behaviour must fold both arguments first.
///
/// The literal text before the first `*` must be a prefix of `value`, the
/// text after the last `*` must be a suffix, and the two must not overlap
/// inside `value`. Any literal segments between wildcards must then occur, in
/// order and without overlapping, in the part of `value` between that prefix
/// and suffix.
pub(crate) fn glob_matches(pattern: &str, value: &str) -> bool {
    let segments: Vec<&str> = pattern.split('*').collect();

    // `split` always yields at least one segment, so the fallback arms below
    // only cover "pattern has no wildcard".
    let (prefix, rest) = match segments.split_first() {
        Some((first, rest)) => (*first, rest),
        None => return pattern == value,
    };
    let (suffix, inner) = match rest.split_last() {
        Some((last, inner)) => (*last, inner),
        // No `*` in the pattern: plain literal comparison.
        None => return prefix == value,
    };

    // The prefix and suffix must both fit without sharing characters. This
    // has to hold for any number of wildcards: it is what keeps "a*b" from
    // matching "a", and it guarantees the slice below has start <= end.
    if value.len() < prefix.len() + suffix.len() {
        return false;
    }
    if !value.starts_with(prefix) || !value.ends_with(suffix) {
        return false;
    }

    // Both bounds are char boundaries because `prefix` / `suffix` matched there.
    let mut window = &value[prefix.len()..value.len() - suffix.len()];

    // Take the leftmost occurrence of each inner segment and continue after
    // it; leftmost is always safe because it leaves the most room for the
    // remaining segments.
    for part in inner.iter().copied().filter(|part| !part.is_empty()) {
        match window.find(part) {
            Some(at) => window = &window[at + part.len()..],
            None => return false,
        }
    }
    true
}
246
#[cfg(test)]
mod tests {
    use super::*;

    // ── helpers ────────────────────────────────────────────────────────────

    /// Turns a slice of literals into the owned list the constructors take.
    fn owned(pats: &[&str]) -> Vec<String> {
        pats.iter().map(|p| String::from(*p)).collect()
    }

    /// Default (case-insensitive) matcher; construction warnings are dropped.
    fn matcher(pats: &[&str]) -> AllowlistMatcher {
        AllowlistMatcher::new(owned(pats)).0
    }

    /// Case-sensitive matcher; construction warnings are dropped.
    fn matcher_cs(pats: &[&str]) -> AllowlistMatcher {
        AllowlistMatcher::new_case_sensitive(owned(pats)).0
    }

    /// Asserts that every value in `values` is allowed by `m`.
    fn assert_allows(m: &AllowlistMatcher, values: &[&str]) {
        for value in values {
            assert!(m.is_allowed(value), "expected {value:?} to be allowed");
        }
    }

    /// Asserts that no value in `values` is allowed by `m`.
    fn assert_rejects(m: &AllowlistMatcher, values: &[&str]) {
        for value in values {
            assert!(!m.is_allowed(value), "expected {value:?} to be rejected");
        }
    }

    // ── exact entries ──────────────────────────────────────────────────────

    #[test]
    fn exact_match() {
        // The default matcher folds case, so every spelling is allowed.
        let m = matcher(&["localhost", "127.0.0.1"]);
        assert_allows(&m, &["localhost", "127.0.0.1", "Localhost", "LOCALHOST"]);
        // A longer value is a different value.
        assert_rejects(&m, &["localhost2"]);
    }

    #[test]
    fn exact_match_case_sensitive() {
        let m = matcher_cs(&["localhost", "127.0.0.1"]);
        assert_allows(&m, &["localhost"]);
        // Without case folding, other spellings do not match.
        assert_rejects(&m, &["Localhost", "LOCALHOST"]);
    }

    // ── simple globs ───────────────────────────────────────────────────────

    #[test]
    fn glob_suffix() {
        let m = matcher(&["*.internal"]);
        assert_allows(&m, &["db.internal", "staging.db.internal"]);
        assert_rejects(&m, &["db.internal.evil", "internal"]);
    }

    #[test]
    fn glob_prefix() {
        let m = matcher(&["192.168.1.*"]);
        assert_allows(&m, &["192.168.1.1", "192.168.1.255"]);
        assert_rejects(&m, &["192.168.2.1"]);
        // `*` may match nothing at all, so the bare prefix is allowed too.
        assert_allows(&m, &["192.168.1."]);
    }

    #[test]
    fn glob_middle() {
        let m = matcher(&["user-*@corp.com"]);
        assert_allows(&m, &["user-alice@corp.com", "user-bob@corp.com"]);
        assert_rejects(&m, &["admin@corp.com", "user-alice@other.com"]);
    }

    #[test]
    fn glob_star_only() {
        let m = matcher(&["*"]);
        assert_allows(&m, &["anything", ""]);
    }

    // ── counters, warnings, emptiness ──────────────────────────────────────

    #[test]
    fn seen_counter() {
        let m = matcher(&["ok"]);
        assert_eq!(m.seen_count(), 0);
        // Two hits and one miss: only the hits are counted.
        for value in ["ok", "ok", "not-ok"] {
            m.is_allowed(value);
        }
        assert_eq!(m.seen_count(), 2);
    }

    #[test]
    fn regex_char_warning() {
        let (_, warnings) = AllowlistMatcher::new(owned(&["^bad$"]));
        assert!(!warnings.is_empty());
    }

    #[test]
    fn empty_allowlist_is_empty() {
        let m = matcher(&[]);
        assert!(m.is_empty());
        assert_rejects(&m, &["anything"]);
    }

    // ── match_pattern ──────────────────────────────────────────────────────

    #[test]
    fn match_pattern_returns_exact_pattern() {
        let m = matcher(&["localhost"]);
        assert_eq!(m.match_pattern("localhost"), Some("localhost"));
        assert_eq!(m.match_pattern("other"), None);
    }

    #[test]
    fn match_pattern_returns_glob_pattern() {
        let m = matcher(&["*.internal"]);
        assert_eq!(m.match_pattern("db.internal"), Some("*.internal"));
        assert_eq!(m.match_pattern("github.com"), None);
    }

    #[test]
    fn match_pattern_returns_first_matching_pattern() {
        // Both globs match "db.internal"; the one registered first is reported.
        let m = matcher(&["*.internal", "db.*"]);
        assert_eq!(m.match_pattern("db.internal"), Some("*.internal"));
    }

    #[test]
    fn match_pattern_increments_seen_counter() {
        let m = matcher(&["ok"]);
        assert_eq!(m.seen_count(), 0);
        m.match_pattern("ok");
        assert_eq!(m.seen_count(), 1);
        // A miss leaves the counter alone.
        m.match_pattern("not-ok");
        assert_eq!(m.seen_count(), 1);
    }

    #[test]
    fn is_allowed_delegates_to_match_pattern() {
        let m = matcher(&["*.internal"]);
        assert!(m.is_allowed("db.internal"));
        assert!(!m.is_allowed("github.com"));
        // Both entry points feed the same counter.
        assert_eq!(m.seen_count(), 1);
    }

    // ── glob edge cases ────────────────────────────────────────────────────

    #[test]
    fn glob_multiple_wildcards() {
        let m = matcher(&["a*b*c"]);
        assert_allows(&m, &["abc", "aXbYc", "aXXXbYYYc"]);
        assert_rejects(&m, &["abX", "Xbc"]);
    }

    #[test]
    fn glob_adjacent_wildcards_treated_as_one() {
        let m = matcher(&["a**b"]);
        assert_allows(&m, &["ab", "aXb"]);
        assert_rejects(&m, &["ba"]);
    }

    #[test]
    fn glob_empty_value_only_matches_star() {
        let star = matcher(&["*"]);
        assert_allows(&star, &[""]);
        let needs_prefix = matcher(&["a*"]);
        assert_rejects(&needs_prefix, &[""]);
    }

    #[test]
    fn glob_prefix_suffix_overlap_rejected() {
        // "a*b" needs both an "a" and a separate "b": neither alone is enough.
        let m = matcher(&["a*b"]);
        assert_rejects(&m, &["a", "b"]);
        assert_allows(&m, &["ab", "aXb"]);
    }

    #[test]
    fn large_exact_list_all_match() {
        // Exercises the hash-set path with many entries.
        let words: Vec<String> = (0..500).map(|i| format!("word{i}")).collect();
        let m = AllowlistMatcher::new(words.clone()).0;
        for word in &words {
            assert!(m.is_allowed(word), "should allow {word}");
        }
        assert_rejects(&m, &["word500", "notaword"]);
    }

    #[test]
    fn exact_and_glob_coexist() {
        let m = matcher(&["localhost", "127.0.0.1", "*.internal"]);
        assert_allows(&m, &["localhost", "127.0.0.1", "db.internal"]);
        assert_rejects(&m, &["github.com"]);
    }

    // ── regex: prefix ──────────────────────────────────────────────────────

    #[test]
    fn regex_basic_match() {
        let m = matcher(&["regex:^192\\.168\\.[0-9]+\\.[0-9]+$"]);
        assert_allows(&m, &["192.168.1.1", "192.168.100.255"]);
        // A trailing dot leaves the last octet empty.
        assert_rejects(&m, &["192.168.1.", "10.0.0.1"]);
    }

    #[test]
    fn regex_substring_match_without_anchors() {
        // An unanchored regex may match anywhere inside the value.
        let m = matcher(&["regex:internal"]);
        assert_allows(&m, &["db.internal.corp", "internal"]);
        assert_rejects(&m, &["external"]);
    }

    #[test]
    fn regex_anchored_full_match() {
        let m = matcher(&["regex:^token-[A-Z]{3}-[0-9]{4}$"]);
        assert_allows(&m, &["token-ABC-1234"]);
        // One letter short, and one character of extra prefix.
        assert_rejects(&m, &["token-AB-1234", "xtoken-ABC-1234"]);
    }

    #[test]
    fn regex_case_sensitive_by_default() {
        // Regex entries never fold case on their own; `(?i)` is the opt-in.
        let m = matcher(&["regex:^localhost$"]);
        assert_allows(&m, &["localhost"]);
        assert_rejects(&m, &["LOCALHOST", "Localhost"]);
    }

    #[test]
    fn regex_case_insensitive_via_flag() {
        let m = matcher(&["regex:(?i)^localhost$"]);
        assert_allows(&m, &["localhost", "LOCALHOST", "LocalHost"]);
    }

    #[test]
    fn regex_invalid_pattern_produces_warning_and_is_skipped() {
        let (m, warnings) = AllowlistMatcher::new(owned(&["regex:[invalid"]));
        assert!(!warnings.is_empty(), "invalid regex must produce a warning");
        assert!(warnings[0].contains("failed to compile"));
        // The broken entry is not registered, so nothing can match.
        assert_rejects(&m, &["anything"]);
        assert_eq!(m.pattern_count(), 0);
    }

    #[test]
    fn regex_match_pattern_returns_full_prefixed_string() {
        let m = matcher(&["regex:^10\\.0\\."]);
        assert_eq!(m.match_pattern("10.0.1.5"), Some("regex:^10\\.0\\."));
        assert_eq!(m.match_pattern("192.168.1.1"), None);
    }

    #[test]
    fn regex_seen_counter_increments() {
        let m = matcher(&["regex:^test"]);
        assert_eq!(m.seen_count(), 0);
        for value in ["test-value", "test-value", "other"] {
            m.is_allowed(value);
        }
        assert_eq!(m.seen_count(), 2);
    }

    #[test]
    fn regex_coexists_with_exact_and_glob() {
        let m = matcher(&[
            "localhost",
            "*.internal",
            "regex:^10\\.[0-9]+\\.[0-9]+\\.[0-9]+$",
        ]);
        assert_allows(
            &m,
            &["localhost", "db.internal", "10.0.0.1", "10.255.255.255"],
        );
        assert_rejects(&m, &["192.168.1.1", "github.com"]);
        assert_eq!(m.pattern_count(), 3);
    }

    #[test]
    fn regex_not_subject_to_case_insensitive_lowercasing() {
        // Exact/glob lookups lowercase the query in the default matcher, but
        // a regex must be handed the value exactly as given.
        let m = matcher(&["regex:^[A-Z]{3}$"]); // exactly three uppercase letters
        assert_allows(&m, &["ABC"]);
        // No `(?i)`: the lowercase spelling must not slip through.
        assert_rejects(&m, &["abc"]);
    }

    #[test]
    fn metacharacter_warning_updated_to_suggest_regex_prefix() {
        let (_, warnings) = AllowlistMatcher::new(owned(&["^bad$"]));
        assert!(!warnings.is_empty());
        assert!(
            warnings[0].contains("regex:"),
            "warning should suggest regex: prefix, got: {}",
            warnings[0],
        );
    }
}