Skip to main content

coding_tools/
pattern.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Jonathan Shook
3
4//! Substring → glob → regex pattern promotion, shared by every tool option that
5//! accepts a *pattern*.
6//!
7//! The rule lets a caller write the simplest thing that expresses intent and
8//! have the tool infer how literally it was meant:
9//!
10//! * a string with no metacharacters is a [`Literal`](PatternKind::Literal)
11//!   substring (regex-escaped, matched verbatim);
//! * a string whose only metacharacters are glob metacharacters (`*`, `?`,
//!   `[ … ]`), or one that carries regex metacharacters but does *not* compile
//!   as a regex, is a [`Glob`](PatternKind::Glob), converted to an equivalent
//!   regex;
15//! * anything else carrying regex metacharacters and forming a valid expression
16//!   is a [`Regex`](PatternKind::Regex), used exactly as written.
17
18use regex::Regex;
19
/// The interpretation [`classify`] assigned to a raw pattern string.
///
/// The type is fieldless and `Copy`. It also implements `Hash`, so a kind can
/// be used directly as a `HashMap` / `HashSet` key (for example to tally
/// patterns per kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PatternKind {
    /// No metacharacters: matched verbatim (regex-escaped).
    Literal,
    /// Glob metacharacters, or text that does not compile as a regex:
    /// converted to an equivalent regex.
    Glob,
    /// Explicit, valid regex: used as written.
    Regex,
}
30
/// Characters whose presence, absent any regex metacharacter, marks a pattern
/// as a glob.
const GLOB_META: [char; 3] = [
    '*',
    '?',
    '[',
];
33
/// Characters that express explicit regular-expression intent.
///
/// The glob metacharacters (`*`, `?`, `[`) are deliberately absent: they are
/// listed separately in `GLOB_META`.
const REGEX_META: [char; 10] = [
    '^', '$',
    '(', ')', '|',
    '+', '{', '}',
    '\\', '.',
];
36
37/// Classify a raw pattern according to the suite's promotion rule.
38///
39/// A pattern is [`Regex`](PatternKind::Regex) only when it both carries explicit
40/// regex metacharacters *and* compiles, so an invalid-as-regex string such as
41/// `*.java` (leading quantifier) falls back to [`Glob`](PatternKind::Glob).
42///
43/// # Examples
44///
45/// ```
46/// use coding_tools::pattern::{classify, PatternKind};
47///
48/// assert_eq!(classify("ERROR:"), PatternKind::Literal); // no metacharacters
49/// assert_eq!(classify("*.java"), PatternKind::Glob);     // a leading `*` is not a valid regex
50/// assert_eq!(classify(r"\d+"), PatternKind::Regex);      // valid regex with metacharacters
51/// ```
52pub fn classify(pat: &str) -> PatternKind {
53    let has_glob = pat.chars().any(|c| GLOB_META.contains(&c));
54    let has_regex_meta = pat.chars().any(|c| REGEX_META.contains(&c));
55    if !has_glob && !has_regex_meta {
56        PatternKind::Literal
57    } else if has_regex_meta && Regex::new(pat).is_ok() {
58        PatternKind::Regex
59    } else {
60        PatternKind::Glob
61    }
62}
63
/// Convert a glob pattern into an (unanchored) regular-expression *source*.
///
/// Translation rules:
///
/// * `*` becomes `[^/]*` and `?` becomes `[^/]`, so neither crosses a path
///   separator (`/`), mirroring shell glob semantics;
/// * a bracket class `[ … ]` is passed through unchanged, except that a
///   leading `!` is rewritten to the regex negation `^`. As in shell globs, a
///   `]` that directly follows `[` or `[!` is a literal member of the class
///   rather than its terminator;
/// * a `[` that never finds its closing `]` is not a class at all and is
///   emitted as the literal `\[`, so the result is still a compilable regex;
/// * every other regex metacharacter is escaped to a literal.
///
/// # Examples
///
/// ```
/// use coding_tools::pattern::glob_to_regex;
///
/// assert_eq!(glob_to_regex("*.rs"), r"[^/]*\.rs");
/// assert_eq!(glob_to_regex("data_[0-9]"), "data_[0-9]");
/// assert_eq!(glob_to_regex("a[b"), r"a\[b"); // unterminated class: literal `[`
/// ```
pub fn glob_to_regex(glob: &str) -> String {
    let mut out = String::with_capacity(glob.len() + 8);
    let mut chars = glob.chars();
    while let Some(c) = chars.next() {
        match c {
            '*' => out.push_str("[^/]*"),
            '?' => out.push_str("[^/]"),
            '[' => match split_bracket_class(chars.as_str()) {
                Some((negated, members, tail)) => {
                    out.push('[');
                    if negated {
                        out.push('^');
                    }
                    out.push_str(members);
                    out.push(']');
                    // Resume scanning after the closing bracket.
                    chars = tail.chars();
                }
                // No closing bracket: treat the `[` as ordinary text.
                None => out.push_str("\\["),
            },
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '{' | '}' | '\\' => {
                out.push('\\');
                out.push(c);
            }
            _ => out.push(c),
        }
    }
    out
}

/// Split a bracket class off the front of `rest`, the text following a `[`.
///
/// Returns `(negated, members, tail)` where `members` is the text between the
/// optional `!` and the closing `]`, and `tail` is everything after that `]`.
/// Returns `None` when the class is never closed.
fn split_bracket_class(rest: &str) -> Option<(bool, &str, &str)> {
    let (negated, body) = match rest.strip_prefix('!') {
        Some(after_bang) => (true, after_bang),
        None => (false, rest),
    };
    // The first member can never close the class (a leading `]` is literal),
    // so the search for the terminator starts after it.
    let first_len = body.chars().next()?.len_utf8();
    let close = first_len + body[first_len..].find(']')?;
    Some((negated, &body[..close], &body[close + 1..]))
}
108
109/// Produce the regex *source* a raw pattern promotes to, without anchoring.
110///
111/// # Examples
112///
113/// ```
114/// use coding_tools::pattern::promote;
115///
116/// assert_eq!(promote("a.b"), "a.b"); // valid regex: used as written
117/// assert_eq!(promote("a+b"), "a+b"); // valid regex
118/// assert_eq!(promote("*.rs"), r"[^/]*\.rs"); // glob -> regex
119/// assert_eq!(promote("v1.0"), "v1.0"); // '.' is a regex metachar, kept as-is
120/// ```
121pub fn promote(pat: &str) -> String {
122    match classify(pat) {
123        PatternKind::Literal => regex::escape(pat),
124        PatternKind::Glob => glob_to_regex(pat),
125        PatternKind::Regex => pat.to_string(),
126    }
127}
128
129/// Compile a pattern for *content / unanchored* matching (e.g. `ct-search --grep`
130/// or any `ct-test` matcher): the result matches anywhere in the haystack.
131///
132/// # Examples
133///
134/// ```
135/// use coding_tools::pattern::compile;
136///
137/// let re = compile("ERROR:").unwrap();
138/// assert!(re.is_match("first line\nERROR: bad input\n"));
139/// assert!(!re.is_match("all good here"));
140/// ```
141pub fn compile(pat: &str) -> Result<Regex, regex::Error> {
142    Regex::new(&promote(pat))
143}
144
145/// Compile a pattern for *whole-name* matching: anchored to the full string, so
146/// `*.java` means "the name ends in `.java`", not merely "contains".
147///
148/// # Examples
149///
150/// ```
151/// use coding_tools::pattern::compile_anchored;
152///
153/// let re = compile_anchored("*.rs").unwrap();
154/// assert!(re.is_match("main.rs"));
155/// assert!(!re.is_match("main.rs.bak")); // anchored: must end in .rs
156/// ```
157pub fn compile_anchored(pat: &str) -> Result<Regex, regex::Error> {
158    Regex::new(&format!("^(?:{})$", promote(pat)))
159}
160
161/// Compile a `'|'`-separated set of whole-name alternatives, each promoted and
162/// anchored. An entry name matches the set when it matches *any* alternative,
163/// mirroring `find`'s `-name a -o -name b`.
164///
165/// # Examples
166///
167/// ```
168/// use coding_tools::pattern::compile_name_set;
169///
170/// let set = compile_name_set("*.rs|*.toml").unwrap();
171/// let matches = |name: &str| set.iter().any(|r| r.is_match(name));
172/// assert!(matches("lib.rs"));
173/// assert!(matches("Cargo.toml"));
174/// assert!(!matches("README.md"));
175/// ```
176pub fn compile_name_set(spec: &str) -> Result<Vec<Regex>, regex::Error> {
177    spec.split('|')
178        .filter(|s| !s.is_empty())
179        .map(compile_anchored)
180        .collect()
181}
182
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every pattern in `patterns` classifies as `expected`.
    fn assert_kind(expected: PatternKind, patterns: &[&str]) {
        for pat in patterns {
            assert_eq!(classify(pat), expected);
        }
    }

    /// Build a predicate reporting whether a name matches any regex in `set`.
    fn any_match(set: &[Regex]) -> impl Fn(&str) -> bool + '_ {
        move |name| set.iter().any(|r| r.is_match(name))
    }

    #[test]
    fn classifies_literal_when_no_metacharacters() {
        assert_kind(PatternKind::Literal, &["ERROR:", "knn_entries"]);
    }

    #[test]
    fn classifies_glob_when_not_valid_regex() {
        // `*.java` fails to compile as a regex; `data_[0-9]` carries glob
        // metacharacters only.
        assert_kind(PatternKind::Glob, &["*.java", "data_[0-9]"]);
    }

    #[test]
    fn classifies_regex_when_explicit_and_valid() {
        // A bare '.' is a regex specifier, so "foo.bar" is a regex, not a literal.
        assert_kind(
            PatternKind::Regex,
            &["^ERROR", "foo|bar", r"\d+", "foo.bar"],
        );
    }

    #[test]
    fn literal_matches_as_unanchored_substring() {
        let re = compile("ERROR:").unwrap();
        assert!(re.is_match("first line\nERROR: bad input\n"));
        assert!(!re.is_match("all good here"));
    }

    #[test]
    fn name_set_anchors_each_glob_alternative() {
        let set = compile_name_set("*.java|*.kt").unwrap();
        let matches = any_match(&set);
        for accepted in ["Widget.java", "Widget.kt"].iter() {
            assert!(matches(accepted));
        }
        for rejected in ["Widget.javax", "Widget.java.bak"].iter() {
            assert!(!matches(rejected));
        }
    }

    #[test]
    fn regex_alternation_is_preserved_for_content() {
        let re = compile("SimpleMFD|knn_entries").unwrap();
        assert!(re.is_match("...knn_entries..."));
        assert!(!re.is_match("nothing relevant"));
    }
}