coding_tools/pattern.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Jonathan Shook
3
4//! Substring → glob → regex pattern promotion, shared by every tool option that
5//! accepts a *pattern*.
6//!
7//! The rule lets a caller write the simplest thing that expresses intent and
8//! have the tool infer how literally it was meant:
9//!
10//! * a string with no metacharacters is a [`Literal`](PatternKind::Literal)
11//! substring (regex-escaped, matched verbatim);
//! * a string with glob metacharacters (`*`, `?`, `[ … ]`) but no regex
//!   metacharacters — or any metacharacter-bearing string that is *not* a valid
//!   regex — is a [`Glob`](PatternKind::Glob), converted to an equivalent regex;
15//! * anything else carrying regex metacharacters and forming a valid expression
16//! is a [`Regex`](PatternKind::Regex), used exactly as written.
17
18use regex::Regex;
19
/// The interpretation that [`classify`] assigned to a raw pattern string.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PatternKind {
    /// Plain text without any metacharacters; it is regex-escaped and matched
    /// verbatim.
    Literal,
    /// A glob pattern (this also covers metacharacter-bearing strings that do
    /// not compile as a regex); it is translated into an equivalent regex.
    Glob,
    /// An explicit regular expression that compiles; it is used as written.
    Regex,
}
30
/// Glob metacharacters that, on their own, signal a glob pattern.
///
/// [`classify`] tests each character of a pattern for membership in this list.
/// Only the opening `[` of a character class is listed; the closing `]` is not
/// treated as a signal by itself.
const GLOB_META: [char; 3] = ['*', '?', '['];

/// Regex metacharacters that signal explicit regular-expression intent.
///
/// [`classify`] tests each character of a pattern for membership in this list.
/// `.` is included, so a dotted string such as `foo.bar` counts as carrying
/// regex metacharacters. `*`, `?` and `[` are deliberately absent here because
/// they are accounted for by `GLOB_META`.
const REGEX_META: [char; 10] = ['^', '$', '(', ')', '|', '+', '{', '}', '\\', '.'];
36
37/// Classify a raw pattern according to the suite's promotion rule.
38///
39/// A pattern is [`Regex`](PatternKind::Regex) only when it both carries explicit
40/// regex metacharacters *and* compiles, so an invalid-as-regex string such as
41/// `*.java` (leading quantifier) falls back to [`Glob`](PatternKind::Glob).
42///
43/// # Examples
44///
45/// ```
46/// use coding_tools::pattern::{classify, PatternKind};
47///
48/// assert_eq!(classify("ERROR:"), PatternKind::Literal); // no metacharacters
49/// assert_eq!(classify("*.java"), PatternKind::Glob); // a leading `*` is not a valid regex
50/// assert_eq!(classify(r"\d+"), PatternKind::Regex); // valid regex with metacharacters
51/// ```
52pub fn classify(pat: &str) -> PatternKind {
53 let has_glob = pat.chars().any(|c| GLOB_META.contains(&c));
54 let has_regex_meta = pat.chars().any(|c| REGEX_META.contains(&c));
55 if !has_glob && !has_regex_meta {
56 PatternKind::Literal
57 } else if has_regex_meta && Regex::new(pat).is_ok() {
58 PatternKind::Regex
59 } else {
60 PatternKind::Glob
61 }
62}
63
/// Convert a glob pattern into an (unanchored) regular-expression *source*.
///
/// Translation rules:
///
/// * `*` becomes `[^/]*` and `?` becomes `[^/]`, so neither crosses a path
///   separator (`/`), mirroring shell glob semantics;
/// * a closed `[ … ]` character class is passed through verbatim, except that
///   a leading `!` is rewritten to the regex negation `^`. A `]` directly
///   after the opener (or after the `!`) is a member of the class rather than
///   its terminator, as in shell globs;
/// * a `[` that has no closing `]` is not a class: it is escaped to a literal
///   `\[` instead of being emitted as an unterminated class, and the
///   characters after it are translated normally;
/// * every other regex metacharacter is escaped to a literal.
///
/// The result is regex source text only; nothing is compiled here.
///
/// # Examples
///
/// ```
/// use coding_tools::pattern::glob_to_regex;
///
/// assert_eq!(glob_to_regex("*.rs"), r"[^/]*\.rs");
/// assert_eq!(glob_to_regex("data_[0-9]"), "data_[0-9]");
/// assert_eq!(glob_to_regex("[abc"), r"\[abc"); // unterminated `[` is literal
/// ```
pub fn glob_to_regex(glob: &str) -> String {
    // Most characters map to themselves, so the input length is a sensible
    // starting capacity.
    let mut out = String::with_capacity(glob.len());
    let mut chars = glob.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '*' => out.push_str("[^/]*"),
            '?' => out.push_str("[^/]"),
            '[' => {
                // Scan ahead on a copy of the iterator so that nothing is
                // consumed unless the class turns out to be properly closed.
                let mut ahead = chars.clone();
                let negated = ahead.next_if_eq(&'!').is_some();
                let mut class = String::new();
                // A `]` in first position is a class member, not the end.
                if ahead.next_if_eq(&']').is_some() {
                    class.push(']');
                }
                let mut closed = false;
                for cc in ahead.by_ref() {
                    class.push(cc);
                    if cc == ']' {
                        closed = true;
                        break;
                    }
                }
                if closed {
                    out.push('[');
                    if negated {
                        out.push('^');
                    }
                    out.push_str(&class);
                    // Commit: resume translation right after the closing `]`.
                    chars = ahead;
                } else {
                    // No closing bracket anywhere: treat `[` as plain text and
                    // let the main loop translate whatever follows it.
                    out.push_str(r"\[");
                }
            }
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '{' | '}' | '\\' => {
                out.push('\\');
                out.push(c);
            }
            _ => out.push(c),
        }
    }
    out
}
108
109/// Produce the regex *source* a raw pattern promotes to, without anchoring.
110///
111/// # Examples
112///
113/// ```
114/// use coding_tools::pattern::promote;
115///
116/// assert_eq!(promote("a.b"), "a.b"); // valid regex: used as written
117/// assert_eq!(promote("a+b"), "a+b"); // valid regex
118/// assert_eq!(promote("*.rs"), r"[^/]*\.rs"); // glob -> regex
119/// assert_eq!(promote("v1.0"), "v1.0"); // '.' is a regex metachar, kept as-is
120/// ```
121pub fn promote(pat: &str) -> String {
122 match classify(pat) {
123 PatternKind::Literal => regex::escape(pat),
124 PatternKind::Glob => glob_to_regex(pat),
125 PatternKind::Regex => pat.to_string(),
126 }
127}
128
129/// Compile a pattern for *content / unanchored* matching (e.g. `ct-search --grep`
130/// or any `ct-test` matcher): the result matches anywhere in the haystack.
131///
132/// # Examples
133///
134/// ```
135/// use coding_tools::pattern::compile;
136///
137/// let re = compile("ERROR:").unwrap();
138/// assert!(re.is_match("first line\nERROR: bad input\n"));
139/// assert!(!re.is_match("all good here"));
140/// ```
141pub fn compile(pat: &str) -> Result<Regex, regex::Error> {
142 Regex::new(&promote(pat))
143}
144
145/// Compile a pattern for *whole-name* matching: anchored to the full string, so
146/// `*.java` means "the name ends in `.java`", not merely "contains".
147///
148/// # Examples
149///
150/// ```
151/// use coding_tools::pattern::compile_anchored;
152///
153/// let re = compile_anchored("*.rs").unwrap();
154/// assert!(re.is_match("main.rs"));
155/// assert!(!re.is_match("main.rs.bak")); // anchored: must end in .rs
156/// ```
157pub fn compile_anchored(pat: &str) -> Result<Regex, regex::Error> {
158 Regex::new(&format!("^(?:{})$", promote(pat)))
159}
160
161/// Compile a `'|'`-separated set of whole-name alternatives, each promoted and
162/// anchored. An entry name matches the set when it matches *any* alternative,
163/// mirroring `find`'s `-name a -o -name b`.
164///
165/// # Examples
166///
167/// ```
168/// use coding_tools::pattern::compile_name_set;
169///
170/// let set = compile_name_set("*.rs|*.toml").unwrap();
171/// let matches = |name: &str| set.iter().any(|r| r.is_match(name));
172/// assert!(matches("lib.rs"));
173/// assert!(matches("Cargo.toml"));
174/// assert!(!matches("README.md"));
175/// ```
176pub fn compile_name_set(spec: &str) -> Result<Vec<Regex>, regex::Error> {
177 spec.split('|')
178 .filter(|s| !s.is_empty())
179 .map(compile_anchored)
180 .collect()
181}
182
#[cfg(test)]
mod tests {
    use super::*;

    /// Check that every pattern in `pats` classifies as `want`.
    fn assert_kind(want: PatternKind, pats: &[&str]) {
        for pat in pats {
            assert_eq!(classify(pat), want);
        }
    }

    /// Check a list of `(name, expected)` pairs against a compiled name set.
    fn assert_set(set: &[Regex], cases: &[(&str, bool)]) {
        for (name, expected) in cases {
            let hit = set.iter().any(|r| r.is_match(name));
            assert_eq!(hit, *expected);
        }
    }

    #[test]
    fn classifies_literal_when_no_metacharacters() {
        assert_kind(PatternKind::Literal, &["ERROR:", "knn_entries"]);
    }

    #[test]
    fn classifies_glob_when_not_valid_regex() {
        assert_kind(PatternKind::Glob, &["*.java", "data_[0-9]"]);
    }

    #[test]
    fn classifies_regex_when_explicit_and_valid() {
        // The last entry has a bare '.', which is a regex specifier, so it is
        // a regex rather than a literal.
        assert_kind(
            PatternKind::Regex,
            &["^ERROR", "foo|bar", r"\d+", "foo.bar"],
        );
    }

    #[test]
    fn literal_matches_as_unanchored_substring() {
        let re = compile("ERROR:").unwrap();
        let found_in = |haystack: &str| re.is_match(haystack);
        assert!(found_in("first line\nERROR: bad input\n"));
        assert!(!found_in("all good here"));
    }

    #[test]
    fn name_set_anchors_each_glob_alternative() {
        let set = compile_name_set("*.java|*.kt").unwrap();
        assert_set(
            &set,
            &[
                ("Widget.java", true),
                ("Widget.kt", true),
                ("Widget.javax", false),
                ("Widget.java.bak", false),
            ],
        );
    }

    #[test]
    fn regex_alternation_is_preserved_for_content() {
        let re = compile("SimpleMFD|knn_entries").unwrap();
        let found_in = |haystack: &str| re.is_match(haystack);
        assert!(found_in("...knn_entries..."));
        assert!(!found_in("nothing relevant"));
    }
}
231}