coding_tools/pattern.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Jonathan Shook
3
4//! Substring → glob → regex pattern promotion, shared by every tool option that
5//! accepts a *pattern*.
6//!
7//! The rule lets a caller write the simplest thing that expresses intent and
8//! have the tool infer how literally it was meant:
9//!
10//! * a string with no metacharacters is a [`Literal`](PatternKind::Literal)
11//! substring (regex-escaped, matched verbatim);
//! * a string whose only metacharacters are glob ones (`*`, `?`, `[ … ]`), or
//!   one that carries regex metacharacters but does *not* compile as a regex,
//!   is a [`Glob`](PatternKind::Glob), converted to an equivalent regex;
15//! * anything else carrying regex metacharacters and forming a valid expression
16//! is a [`Regex`](PatternKind::Regex), used exactly as written.
17
18use regex::Regex;
19
/// The interpretation [`classify`] assigns to a raw pattern string.
///
/// The kind decides how the pattern text is turned into regex source: escaped
/// verbatim, translated from glob syntax, or passed through untouched.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PatternKind {
    /// Contains no metacharacters; matched verbatim after regex-escaping.
    Literal,
    /// Treated as a shell-style glob and translated to an equivalent regex.
    Glob,
    /// An explicit regular expression that compiles; used exactly as written.
    Regex,
}
30
/// An explicit `--mode` choice that switches promotion off for every pattern
/// argument in an invocation: the stated interpretation is used as-is.
//
// NOTE(maintainers): per clap's derive reference, `ValueEnum` uses each
// variant's doc comment as the help text of the corresponding possible value,
// so the `///` lines on the variants below are user-facing strings — edit them
// with the CLI help output in mind.
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum Mode {
    /// Match the pattern text verbatim.
    Literal,
    /// Interpret the pattern as a shell-style glob.
    Glob,
    /// Interpret the pattern as a regular expression, exactly as written.
    Regex,
}
42
43impl Mode {
44 /// The [`PatternKind`] this explicit mode pins a pattern to.
45 pub fn kind(self) -> PatternKind {
46 match self {
47 Mode::Literal => PatternKind::Literal,
48 Mode::Glob => PatternKind::Glob,
49 Mode::Regex => PatternKind::Regex,
50 }
51 }
52}
53
54/// [`classify`] under an optional explicit mode: a stated [`Mode`] pins the
55/// kind; absent, promotion decides.
56pub fn classify_with(pat: &str, mode: Option<Mode>) -> PatternKind {
57 match mode {
58 Some(m) => m.kind(),
59 None => classify(pat),
60 }
61}
62
63/// [`promote`] under an optional explicit mode.
64pub fn promote_with(pat: &str, mode: Option<Mode>) -> String {
65 match classify_with(pat, mode) {
66 PatternKind::Literal => regex::escape(pat),
67 PatternKind::Glob => glob_to_regex(pat),
68 PatternKind::Regex => pat.to_string(),
69 }
70}
71
72/// [`compile`] under an optional explicit mode.
73///
74/// # Examples
75///
76/// ```
77/// use coding_tools::pattern::{compile_with, Mode};
78///
79/// // 'todo!(' promotes to an invalid-paren regex without a mode; pinned
80/// // literal, it matches its own text.
81/// let re = compile_with("todo!(\"x\")", Some(Mode::Literal)).unwrap();
82/// assert!(re.is_match(" todo!(\"x\") , here"));
83/// ```
84pub fn compile_with(pat: &str, mode: Option<Mode>) -> Result<Regex, regex::Error> {
85 Regex::new(&promote_with(pat, mode))
86}
87
88/// [`compile_anchored`] under an optional explicit mode.
89pub fn compile_anchored_with(pat: &str, mode: Option<Mode>) -> Result<Regex, regex::Error> {
90 Regex::new(&format!("^(?:{})$", promote_with(pat, mode)))
91}
92
93/// [`compile_name_set`] under an optional explicit mode. With a stated mode of
94/// `literal`, the `'|'` separator is still a set separator (the alternatives
95/// themselves are matched verbatim).
96pub fn compile_name_set_with(spec: &str, mode: Option<Mode>) -> Result<Vec<Regex>, regex::Error> {
97 spec.split('|')
98 .filter(|s| !s.is_empty())
99 .map(|alt| compile_anchored_with(alt, mode))
100 .collect()
101}
102
/// Glob metacharacters that, on their own, signal a glob pattern.
///
/// [`classify`] returns [`PatternKind::Glob`] for a pattern that contains any
/// of these and none of [`REGEX_META`]. Only the opening `[` is listed; a
/// closing `]` by itself does not signal a glob.
const GLOB_META: [char; 3] = ['*', '?', '['];

/// Regex metacharacters that signal explicit regular-expression intent.
///
/// [`classify`] only attempts a regex compile when one of these is present.
/// `.` is part of the set, so a dotted string such as `foo.bar` counts as
/// carrying regex intent. `*`, `?` and `[` are not listed here; they are
/// covered by [`GLOB_META`].
const REGEX_META: [char; 10] = ['^', '$', '(', ')', '|', '+', '{', '}', '\\', '.'];
108
109/// Classify a raw pattern according to the suite's promotion rule.
110///
111/// A pattern is [`Regex`](PatternKind::Regex) only when it both carries explicit
112/// regex metacharacters *and* compiles, so an invalid-as-regex string such as
113/// `*.java` (leading quantifier) falls back to [`Glob`](PatternKind::Glob).
114///
115/// # Examples
116///
117/// ```
118/// use coding_tools::pattern::{classify, PatternKind};
119///
120/// assert_eq!(classify("ERROR:"), PatternKind::Literal); // no metacharacters
121/// assert_eq!(classify("*.java"), PatternKind::Glob); // a leading `*` is not a valid regex
122/// assert_eq!(classify(r"\d+"), PatternKind::Regex); // valid regex with metacharacters
123/// ```
124pub fn classify(pat: &str) -> PatternKind {
125 let has_glob = pat.chars().any(|c| GLOB_META.contains(&c));
126 let has_regex_meta = pat.chars().any(|c| REGEX_META.contains(&c));
127 if !has_glob && !has_regex_meta {
128 PatternKind::Literal
129 } else if has_regex_meta && Regex::new(pat).is_ok() {
130 PatternKind::Regex
131 } else {
132 PatternKind::Glob
133 }
134}
135
/// Convert a glob pattern into an (unanchored) regular-expression *source*.
///
/// Translation rules:
///
/// * `*` becomes `[^/]*` and `?` becomes `[^/]`, so neither crosses a path
///   separator (`/`), mirroring shell glob semantics;
/// * a bracket expression `[ … ]` is passed through as a regex character
///   class, with a leading `!` rewritten to the regex negation `^`. As in the
///   shell, a `]` directly after the opening `[` (or `[!`) is a member of the
///   class rather than its terminator;
/// * a `[` with no closing `]` is not a bracket expression and is emitted as
///   an escaped literal `\[`, so the result stays a compilable regex;
/// * every other regex metacharacter (`. + ( ) | ^ $ { } \`) is escaped to a
///   literal; all remaining characters are copied unchanged.
///
/// # Examples
///
/// ```
/// use coding_tools::pattern::glob_to_regex;
///
/// assert_eq!(glob_to_regex("*.rs"), r"[^/]*\.rs");
/// assert_eq!(glob_to_regex("data_[0-9]"), "data_[0-9]");
/// assert_eq!(glob_to_regex("a[b"), r"a\[b"); // unterminated `[` is literal
/// ```
pub fn glob_to_regex(glob: &str) -> String {
    // Most characters map 1:1; leave a little headroom for escapes/expansions.
    let mut out = String::with_capacity(glob.len() + 8);
    // `rest` is the not-yet-translated tail of the glob; a string cursor (rather
    // than a char iterator) lets the bracket case look ahead without consuming.
    let mut rest = glob;
    while let Some(c) = rest.chars().next() {
        rest = &rest[c.len_utf8()..];
        match c {
            '*' => out.push_str("[^/]*"),
            '?' => out.push_str("[^/]"),
            '[' => match split_bracket_class(rest) {
                Some((negated, members, tail)) => {
                    out.push('[');
                    if negated {
                        out.push('^');
                    }
                    out.push_str(members);
                    out.push(']');
                    rest = tail;
                }
                // No closing bracket: the `[` stands for itself. The characters
                // after it are translated normally by the following iterations.
                None => out.push_str(r"\["),
            },
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '{' | '}' | '\\' => {
                out.push('\\');
                out.push(c);
            }
            _ => out.push(c),
        }
    }
    out
}

/// Split the text following an opening `[` into `(negated, members, tail)`.
///
/// `members` is the class content without the negation marker or the closing
/// `]`; `tail` is everything after the closing `]`. Returns `None` when the
/// bracket expression is never closed.
fn split_bracket_class(after_open: &str) -> Option<(bool, &str, &str)> {
    let (negated, body) = match after_open.strip_prefix('!') {
        Some(stripped) => (true, stripped),
        None => (false, after_open),
    };
    // A `]` in first position is a class member, so the search for the
    // terminator starts after it (`]` is one byte, so the offset is safe).
    let first_member = usize::from(body.starts_with(']'));
    let close = first_member + body[first_member..].find(']')?;
    Some((negated, &body[..close], &body[close + 1..]))
}
180
181/// Produce the regex *source* a raw pattern promotes to, without anchoring.
182///
183/// # Examples
184///
185/// ```
186/// use coding_tools::pattern::promote;
187///
188/// assert_eq!(promote("a.b"), "a.b"); // valid regex: used as written
189/// assert_eq!(promote("a+b"), "a+b"); // valid regex
190/// assert_eq!(promote("*.rs"), r"[^/]*\.rs"); // glob -> regex
191/// assert_eq!(promote("v1.0"), "v1.0"); // '.' is a regex metachar, kept as-is
192/// ```
193pub fn promote(pat: &str) -> String {
194 match classify(pat) {
195 PatternKind::Literal => regex::escape(pat),
196 PatternKind::Glob => glob_to_regex(pat),
197 PatternKind::Regex => pat.to_string(),
198 }
199}
200
201/// Compile a pattern for *content / unanchored* matching (e.g. `ct-search --grep`
202/// or any `ct-test` matcher): the result matches anywhere in the haystack.
203///
204/// # Examples
205///
206/// ```
207/// use coding_tools::pattern::compile;
208///
209/// let re = compile("ERROR:").unwrap();
210/// assert!(re.is_match("first line\nERROR: bad input\n"));
211/// assert!(!re.is_match("all good here"));
212/// ```
213pub fn compile(pat: &str) -> Result<Regex, regex::Error> {
214 Regex::new(&promote(pat))
215}
216
217/// Compile a pattern for *whole-name* matching: anchored to the full string, so
218/// `*.java` means "the name ends in `.java`", not merely "contains".
219///
220/// # Examples
221///
222/// ```
223/// use coding_tools::pattern::compile_anchored;
224///
225/// let re = compile_anchored("*.rs").unwrap();
226/// assert!(re.is_match("main.rs"));
227/// assert!(!re.is_match("main.rs.bak")); // anchored: must end in .rs
228/// ```
229pub fn compile_anchored(pat: &str) -> Result<Regex, regex::Error> {
230 Regex::new(&format!("^(?:{})$", promote(pat)))
231}
232
233/// Compile a `'|'`-separated set of whole-name alternatives, each promoted and
234/// anchored. An entry name matches the set when it matches *any* alternative,
235/// mirroring `find`'s `-name a -o -name b`.
236///
237/// # Examples
238///
239/// ```
240/// use coding_tools::pattern::compile_name_set;
241///
242/// let set = compile_name_set("*.rs|*.toml").unwrap();
243/// let matches = |name: &str| set.iter().any(|r| r.is_match(name));
244/// assert!(matches("lib.rs"));
245/// assert!(matches("Cargo.toml"));
246/// assert!(!matches("README.md"));
247/// ```
248pub fn compile_name_set(spec: &str) -> Result<Vec<Regex>, regex::Error> {
249 spec.split('|')
250 .filter(|s| !s.is_empty())
251 .map(compile_anchored)
252 .collect()
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    // Strings without glob or regex metacharacters stay literal.
    #[test]
    fn classifies_literal_when_no_metacharacters() {
        for &pat in &["ERROR:", "knn_entries"] {
            assert_eq!(classify(pat), PatternKind::Literal, "pattern {:?}", pat);
        }
    }

    // Glob-only strings are globs, whether or not they would compile as regex.
    #[test]
    fn classifies_glob_when_not_valid_regex() {
        for &pat in &["*.java", "data_[0-9]"] {
            assert_eq!(classify(pat), PatternKind::Glob, "pattern {:?}", pat);
        }
    }

    // Regex metacharacters plus a successful compile mean regex. The last case
    // shows that a bare '.' is a regex specifier, so "foo.bar" is not a literal.
    #[test]
    fn classifies_regex_when_explicit_and_valid() {
        for &pat in &["^ERROR", "foo|bar", r"\d+", "foo.bar"] {
            assert_eq!(classify(pat), PatternKind::Regex, "pattern {:?}", pat);
        }
    }

    #[test]
    fn literal_matches_as_unanchored_substring() {
        let needle = compile("ERROR:").unwrap();
        let hit = "first line\nERROR: bad input\n";
        let miss = "all good here";
        assert!(needle.is_match(hit));
        assert!(!needle.is_match(miss));
    }

    #[test]
    fn name_set_anchors_each_glob_alternative() {
        let set = compile_name_set("*.java|*.kt").unwrap();
        let in_set = |name: &str| set.iter().any(|re| re.is_match(name));
        for &name in &["Widget.java", "Widget.kt"] {
            assert!(in_set(name), "expected a match for {:?}", name);
        }
        // Anchoring rejects names that merely contain a matching extension.
        for &name in &["Widget.javax", "Widget.java.bak"] {
            assert!(!in_set(name), "expected no match for {:?}", name);
        }
    }

    #[test]
    fn regex_alternation_is_preserved_for_content() {
        let either = compile("SimpleMFD|knn_entries").unwrap();
        assert!(either.is_match("...knn_entries..."));
        assert!(!either.is_match("nothing relevant"));
    }

    #[test]
    fn explicit_mode_overrides_promotion() {
        // A verbatim line of code full of regex metacharacters: pinned to
        // literal mode, it must match its own text.
        let code = r#"WireSource::Port(_) => todo!("x"),"#;
        let verbatim = compile_with(code, Some(Mode::Literal)).unwrap();
        let haystack = format!(" {}\n", code);
        assert!(verbatim.is_match(&haystack));

        // Pinned regex: '.' stays a metacharacter even in a plain string.
        let as_regex = compile_with("a.c", Some(Mode::Regex)).unwrap();
        assert!(as_regex.is_match("abc"));

        // Pinned literal: '.' is just a dot.
        let as_literal = compile_with("a.c", Some(Mode::Literal)).unwrap();
        assert!(!as_literal.is_match("abc"));
        assert!(as_literal.is_match("a.c"));
    }

    #[test]
    fn absent_mode_keeps_promotion() {
        let pat = "*.rs";
        assert_eq!(classify_with(pat, None), PatternKind::Glob);
        assert_eq!(classify_with(pat, Some(Mode::Literal)), PatternKind::Literal);
    }
}
326}