Skip to main content

coding_tools/
pattern.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Jonathan Shook
3
4//! Substring → glob → regex pattern promotion, shared by every tool option that
5//! accepts a *pattern*.
6//!
7//! The rule lets a caller write the simplest thing that expresses intent and
8//! have the tool infer how literally it was meant:
9//!
10//! * a string with no metacharacters is a [`Literal`](PatternKind::Literal)
11//!   substring (regex-escaped, matched verbatim);
12//! * a string with only glob metacharacters (`*`, `?`, `[ … ]`) that is *not* a
13//!   valid regex is a [`Glob`](PatternKind::Glob), converted to an equivalent
14//!   regex;
15//! * anything else carrying regex metacharacters and forming a valid expression
16//!   is a [`Regex`](PatternKind::Regex), used exactly as written.
17
18use regex::Regex;
19
/// The interpretation [`classify`] assigned to a raw pattern string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PatternKind {
    /// Carries no metacharacters; it is regex-escaped and matched verbatim.
    Literal,
    /// Treated as a shell-style glob and converted to an equivalent regex.
    Glob,
    /// An explicit regular expression that compiles; used exactly as written.
    Regex,
}
30
31/// An explicit `--mode` choice that switches promotion off for every pattern
32/// argument in an invocation: the stated interpretation is used as-is.
33#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
34pub enum Mode {
35    /// Match the pattern text verbatim.
36    Literal,
37    /// Interpret the pattern as a shell-style glob.
38    Glob,
39    /// Interpret the pattern as a regular expression, exactly as written.
40    Regex,
41}
42
43impl Mode {
44    /// The [`PatternKind`] this explicit mode pins a pattern to.
45    pub fn kind(self) -> PatternKind {
46        match self {
47            Mode::Literal => PatternKind::Literal,
48            Mode::Glob => PatternKind::Glob,
49            Mode::Regex => PatternKind::Regex,
50        }
51    }
52}
53
54/// [`classify`] under an optional explicit mode: a stated [`Mode`] pins the
55/// kind; absent, promotion decides.
56pub fn classify_with(pat: &str, mode: Option<Mode>) -> PatternKind {
57    match mode {
58        Some(m) => m.kind(),
59        None => classify(pat),
60    }
61}
62
63/// [`promote`] under an optional explicit mode.
64pub fn promote_with(pat: &str, mode: Option<Mode>) -> String {
65    match classify_with(pat, mode) {
66        PatternKind::Literal => regex::escape(pat),
67        PatternKind::Glob => glob_to_regex(pat),
68        PatternKind::Regex => pat.to_string(),
69    }
70}
71
72/// [`compile`] under an optional explicit mode.
73///
74/// # Examples
75///
76/// ```
77/// use coding_tools::pattern::{compile_with, Mode};
78///
79/// // 'todo!(' promotes to an invalid-paren regex without a mode; pinned
80/// // literal, it matches its own text.
81/// let re = compile_with("todo!(\"x\")", Some(Mode::Literal)).unwrap();
82/// assert!(re.is_match("    todo!(\"x\") , here"));
83/// ```
84pub fn compile_with(pat: &str, mode: Option<Mode>) -> Result<Regex, regex::Error> {
85    Regex::new(&promote_with(pat, mode))
86}
87
88/// [`compile_anchored`] under an optional explicit mode.
89pub fn compile_anchored_with(pat: &str, mode: Option<Mode>) -> Result<Regex, regex::Error> {
90    Regex::new(&format!("^(?:{})$", promote_with(pat, mode)))
91}
92
93/// [`compile_name_set`] under an optional explicit mode. With a stated mode of
94/// `literal`, the `'|'` separator is still a set separator (the alternatives
95/// themselves are matched verbatim).
96pub fn compile_name_set_with(spec: &str, mode: Option<Mode>) -> Result<Vec<Regex>, regex::Error> {
97    spec.split('|')
98        .filter(|s| !s.is_empty())
99        .map(|alt| compile_anchored_with(alt, mode))
100        .collect()
101}
102
/// Characters whose presence alone marks a pattern as glob-like: the two
/// wildcards and the opener of a bracket expression.
const GLOB_META: [char; 3] = ['*', '?', '['];

/// Characters taken as a sign that the caller meant a regular expression.
const REGEX_META: [char; 10] = [
    '^', '$', '(', ')', '|', '+', '{', '}', '\\', '.',
];
108
109/// Classify a raw pattern according to the suite's promotion rule.
110///
111/// A pattern is [`Regex`](PatternKind::Regex) only when it both carries explicit
112/// regex metacharacters *and* compiles, so an invalid-as-regex string such as
113/// `*.java` (leading quantifier) falls back to [`Glob`](PatternKind::Glob).
114///
115/// # Examples
116///
117/// ```
118/// use coding_tools::pattern::{classify, PatternKind};
119///
120/// assert_eq!(classify("ERROR:"), PatternKind::Literal); // no metacharacters
121/// assert_eq!(classify("*.java"), PatternKind::Glob);     // a leading `*` is not a valid regex
122/// assert_eq!(classify(r"\d+"), PatternKind::Regex);      // valid regex with metacharacters
123/// ```
124pub fn classify(pat: &str) -> PatternKind {
125    let has_glob = pat.chars().any(|c| GLOB_META.contains(&c));
126    let has_regex_meta = pat.chars().any(|c| REGEX_META.contains(&c));
127    if !has_glob && !has_regex_meta {
128        PatternKind::Literal
129    } else if has_regex_meta && Regex::new(pat).is_ok() {
130        PatternKind::Regex
131    } else {
132        PatternKind::Glob
133    }
134}
135
/// Convert a glob pattern into an (unanchored) regular-expression *source*.
///
/// * `*` becomes `[^/]*` and `?` becomes `[^/]`, so neither crosses a path
///   separator (`/`), mirroring shell glob semantics.
/// * A `[ … ]` bracket expression is copied through up to its first `]`, with
///   a leading `!` rewritten to the regex negation `^`.
/// * A `[` that has no closing `]` is not a bracket expression: it is emitted
///   as the escaped literal `\[` and the text after it is converted as
///   ordinary glob text, instead of opening a character class that never
///   closes.
/// * The regex metacharacters `. + ( ) | ^ $ { } \` are escaped to literals.
/// * Every other character is copied unchanged.
///
/// # Examples
///
/// ```
/// use coding_tools::pattern::glob_to_regex;
///
/// assert_eq!(glob_to_regex("*.rs"), r"[^/]*\.rs");
/// assert_eq!(glob_to_regex("data_[0-9]"), "data_[0-9]");
/// assert_eq!(glob_to_regex("a[b"), r"a\[b"); // unmatched `[` is a literal
/// ```
pub fn glob_to_regex(glob: &str) -> String {
    let mut out = String::with_capacity(glob.len());
    // `rest` is always the not-yet-converted tail of the glob.
    let mut rest = glob;
    while let Some(c) = rest.chars().next() {
        rest = &rest[c.len_utf8()..];
        match c {
            '*' => out.push_str("[^/]*"),
            '?' => out.push_str("[^/]"),
            '[' => {
                if let Some(close) = rest.find(']') {
                    // Well-formed bracket expression: pass the body through,
                    // translating only the glob negation marker.
                    let body = &rest[..close];
                    out.push('[');
                    if let Some(negated) = body.strip_prefix('!') {
                        out.push('^');
                        out.push_str(negated);
                    } else {
                        out.push_str(body);
                    }
                    out.push(']');
                    // `]` is a single byte, so `close + 1` is a char boundary.
                    rest = &rest[close + 1..];
                } else {
                    // No closing `]`: the bracket stands for itself. The tail
                    // is left in `rest` so it is still converted as glob text.
                    out.push_str("\\[");
                }
            }
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '{' | '}' | '\\' => {
                out.push('\\');
                out.push(c);
            }
            _ => out.push(c),
        }
    }
    out
}
180
181/// Produce the regex *source* a raw pattern promotes to, without anchoring.
182///
183/// # Examples
184///
185/// ```
186/// use coding_tools::pattern::promote;
187///
188/// assert_eq!(promote("a.b"), "a.b"); // valid regex: used as written
189/// assert_eq!(promote("a+b"), "a+b"); // valid regex
190/// assert_eq!(promote("*.rs"), r"[^/]*\.rs"); // glob -> regex
191/// assert_eq!(promote("v1.0"), "v1.0"); // '.' is a regex metachar, kept as-is
192/// ```
193pub fn promote(pat: &str) -> String {
194    match classify(pat) {
195        PatternKind::Literal => regex::escape(pat),
196        PatternKind::Glob => glob_to_regex(pat),
197        PatternKind::Regex => pat.to_string(),
198    }
199}
200
201/// Compile a pattern for *content / unanchored* matching (e.g. `ct-search --grep`
202/// or any `ct-test` matcher): the result matches anywhere in the haystack.
203///
204/// # Examples
205///
206/// ```
207/// use coding_tools::pattern::compile;
208///
209/// let re = compile("ERROR:").unwrap();
210/// assert!(re.is_match("first line\nERROR: bad input\n"));
211/// assert!(!re.is_match("all good here"));
212/// ```
213pub fn compile(pat: &str) -> Result<Regex, regex::Error> {
214    Regex::new(&promote(pat))
215}
216
217/// Compile a pattern for *whole-name* matching: anchored to the full string, so
218/// `*.java` means "the name ends in `.java`", not merely "contains".
219///
220/// # Examples
221///
222/// ```
223/// use coding_tools::pattern::compile_anchored;
224///
225/// let re = compile_anchored("*.rs").unwrap();
226/// assert!(re.is_match("main.rs"));
227/// assert!(!re.is_match("main.rs.bak")); // anchored: must end in .rs
228/// ```
229pub fn compile_anchored(pat: &str) -> Result<Regex, regex::Error> {
230    Regex::new(&format!("^(?:{})$", promote(pat)))
231}
232
233/// Compile a `'|'`-separated set of whole-name alternatives, each promoted and
234/// anchored. An entry name matches the set when it matches *any* alternative,
235/// mirroring `find`'s `-name a -o -name b`.
236///
237/// # Examples
238///
239/// ```
240/// use coding_tools::pattern::compile_name_set;
241///
242/// let set = compile_name_set("*.rs|*.toml").unwrap();
243/// let matches = |name: &str| set.iter().any(|r| r.is_match(name));
244/// assert!(matches("lib.rs"));
245/// assert!(matches("Cargo.toml"));
246/// assert!(!matches("README.md"));
247/// ```
248pub fn compile_name_set(spec: &str) -> Result<Vec<Regex>, regex::Error> {
249    spec.split('|')
250        .filter(|s| !s.is_empty())
251        .map(compile_anchored)
252        .collect()
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `name` is matched by at least one regex in `set`.
    fn any_match(set: &[Regex], name: &str) -> bool {
        set.iter().any(|re| re.is_match(name))
    }

    #[test]
    fn classifies_literal_when_no_metacharacters() {
        for pat in ["ERROR:", "knn_entries"] {
            assert_eq!(classify(pat), PatternKind::Literal, "pattern {:?}", pat);
        }
    }

    #[test]
    fn classifies_glob_when_not_valid_regex() {
        for pat in ["*.java", "data_[0-9]"] {
            assert_eq!(classify(pat), PatternKind::Glob, "pattern {:?}", pat);
        }
    }

    #[test]
    fn classifies_regex_when_explicit_and_valid() {
        // The last entry shows that a bare '.' counts as a regex specifier,
        // so "foo.bar" is a regex rather than a literal.
        for pat in ["^ERROR", "foo|bar", r"\d+", "foo.bar"] {
            assert_eq!(classify(pat), PatternKind::Regex, "pattern {:?}", pat);
        }
    }

    #[test]
    fn literal_matches_as_unanchored_substring() {
        let needle = compile("ERROR:").unwrap();
        assert!(needle.is_match("first line\nERROR: bad input\n"));
        assert!(!needle.is_match("all good here"));
    }

    #[test]
    fn name_set_anchors_each_glob_alternative() {
        let set = compile_name_set("*.java|*.kt").unwrap();
        for accepted in ["Widget.java", "Widget.kt"] {
            assert!(any_match(&set, accepted), "should accept {:?}", accepted);
        }
        for rejected in ["Widget.javax", "Widget.java.bak"] {
            assert!(!any_match(&set, rejected), "should reject {:?}", rejected);
        }
    }

    #[test]
    fn regex_alternation_is_preserved_for_content() {
        let either = compile("SimpleMFD|knn_entries").unwrap();
        assert!(either.is_match("...knn_entries..."));
        assert!(!either.is_match("nothing relevant"));
    }

    #[test]
    fn explicit_mode_overrides_promotion() {
        // Verbatim code anchor: left to promotion, the parentheses would be
        // read as regex groups rather than literal text; pinned literal, the
        // pattern matches its own text.
        let code = r#"WireSource::Port(_) => todo!("x"),"#;
        let haystack = format!("    {code}\n");
        let verbatim = compile_with(code, Some(Mode::Literal)).unwrap();
        assert!(verbatim.is_match(&haystack));

        // Pinned regex: '.' stays a metacharacter even in a plain string.
        let as_regex = compile_with("a.c", Some(Mode::Regex)).unwrap();
        assert!(as_regex.is_match("abc"));

        // Pinned literal: '.' is just a dot.
        let as_literal = compile_with("a.c", Some(Mode::Literal)).unwrap();
        assert!(!as_literal.is_match("abc"));
        assert!(as_literal.is_match("a.c"));
    }

    #[test]
    fn absent_mode_keeps_promotion() {
        let unpinned = classify_with("*.rs", None);
        let pinned = classify_with("*.rs", Some(Mode::Literal));
        assert_eq!(unpinned, PatternKind::Glob);
        assert_eq!(pinned, PatternKind::Literal);
    }
}
326}