Skip to main content

harn_glob/
lib.rs

1//! The single glob-matching implementation for the Harn workspace.
2//!
3//! Before this crate existed, seven near-identical `glob_match` functions
4//! lived in `harn-ir`, `harn-vm` (metadata scan, llm config, capabilities,
5//! merge-captain audit, runtime hooks, llm mock), and `harn-cli` (skills) —
6//! each with subtly different wildcard semantics. A pattern that matched in
7//! one subsystem silently behaved differently in another (hook routing
8//! honored `?`/`[...]`, model-override matching did not; the metadata scanner
9//! matched the `*` in `**/*.rs` literally). This crate is the one place those
10//! semantics are defined.
11//!
12//! Three contracts, chosen per call site:
13//!
14//! - [`match_path`] — slash-aware file-path globs: `*`/`?` never cross `/`,
15//!   `**` crosses directories. Use for path-shaped inputs (invariant globs,
16//!   skill manifests, workspace paths).
//! - [`match_name`] — full glob syntax (`*`, `?`, `[...]`, `{a,b}`) over flat
//!   identifiers where `/` has no special meaning (`*` crosses it). Use for
//!   tool names, model ids, hook patterns, event names. Only compiled when
//!   the `name` feature is enabled.
20//! - [`match_prose`] — `*`-only ordered-segment matching where every other
21//!   character is literal. Use when patterns target free text that routinely
22//!   contains `?`, `[`, or `{` as ordinary prose (e.g. llm-mock prompt
23//!   matchers).
24
/// Slash-aware glob matching for file paths.
///
/// Returns `true` only when `pattern` matches the whole of `path`.
///
/// Semantics:
/// - `*` matches any run of characters except `/`
/// - `?` matches exactly one character except `/` (one full character, however
///   many UTF-8 bytes it occupies)
/// - `**` matches any run of characters including `/` (a `**/` also matches
///   zero directories, so `src/**/*.rs` matches `src/main.rs`)
/// - every other character matches itself; the whole path must be consumed
#[must_use]
pub fn match_path(pattern: &str, path: &str) -> bool {
    match_path_bytes(pattern.as_bytes(), 0, path.as_bytes(), 0)
}

/// Recursive backtracking matcher behind [`match_path`].
///
/// Matches `pat[pi..]` against `path[si..]`. Literal bytes and `?` advance both
/// cursors in the loop; each `*`/`**` tries every admissible split point by
/// recursing on the remainder of the pattern. Both slices originate from
/// `&str` values in [`match_path`], so they are valid UTF-8.
fn match_path_bytes(pat: &[u8], mut pi: usize, path: &[u8], mut si: usize) -> bool {
    while pi < pat.len() {
        match pat[pi] {
            b'*' => {
                let double = pat.get(pi + 1) == Some(&b'*');
                let mut next_pi = if double { pi + 2 } else { pi + 1 };
                if double && pat.get(next_pi) == Some(&b'/') {
                    next_pi += 1;
                    // `**/` matches zero or more complete directory segments.
                    // It must not start the next pattern in the middle of a
                    // segment (`**/bar` should not match `foobar`).
                    if match_path_bytes(pat, next_pi, path, si) {
                        return true;
                    }
                    // Otherwise resume only immediately after a `/`.
                    for try_si in si..path.len() {
                        if path[try_si] == b'/' && match_path_bytes(pat, next_pi, path, try_si + 1)
                        {
                            return true;
                        }
                    }
                    return false;
                }
                if next_pi >= pat.len() {
                    // Trailing wildcard: `**` swallows everything that is
                    // left, `*` only a remainder without a separator.
                    return double || !path[si..].contains(&b'/');
                }
                for try_si in si..=path.len() {
                    // A single `*` may not extend past a `/`.
                    if !double && path[si..try_si].contains(&b'/') {
                        break;
                    }
                    if match_path_bytes(pat, next_pi, path, try_si) {
                        return true;
                    }
                }
                return false;
            }
            b'?' => {
                if si >= path.len() || path[si] == b'/' {
                    return false;
                }
                pi += 1;
                si += 1;
                // `?` stands for one character, not one byte: also consume the
                // UTF-8 continuation bytes (`0b10xx_xxxx`) of a multi-byte
                // character so `?` can match non-ASCII text.
                while si < path.len() && (path[si] & 0xC0) == 0x80 {
                    si += 1;
                }
            }
            expected => {
                if si >= path.len() || path[si] != expected {
                    return false;
                }
                pi += 1;
                si += 1;
            }
        }
    }
    si == path.len()
}
94
/// Glob matching for flat identifiers such as tool names, model ids, and hook
/// patterns. Only compiled when the `name` feature is enabled.
///
/// The separator `/` is an ordinary character here, so `*` and `?` match
/// across it. Patterns using only `*` at the ends are answered with plain
/// string operations; anything richer (`?`, `[abc]`, `{a,b}`, interior `*`)
/// is handed to the [`globset`]-backed matcher. When a pattern cannot be
/// parsed as a glob, that matcher falls back to the historical
/// prefix/suffix/equality behavior, so an unclosed `[` never panics or
/// silently rejects.
///
/// Compiled matchers are cached per thread; patterns come from configuration
/// and hook registration, so the cache stays small.
#[cfg(feature = "name")]
#[must_use]
pub fn match_name(pattern: &str, name: &str) -> bool {
    if pattern == "*" {
        return true;
    }
    if !has_name_meta(pattern) {
        // No wildcard syntax at all: plain equality.
        return name == pattern;
    }

    // Cheap answers for the dominant shapes: `prefix*`, `*infix*`, `*suffix`.
    match pattern.strip_suffix('*') {
        Some(prefix) if !has_name_meta(prefix) => return name.starts_with(prefix),
        Some(prefix) => {
            let infix = prefix
                .strip_prefix('*')
                .filter(|inner| !has_name_meta(inner));
            if let Some(infix) = infix {
                return name.contains(infix);
            }
        }
        None => {}
    }

    match pattern.strip_prefix('*') {
        Some(suffix) if !has_name_meta(suffix) => name.ends_with(suffix),
        // Everything else needs the full glob engine.
        _ => compiled_name_match(pattern, name),
    }
}
132
/// Reports whether `pattern` contains any byte that starts glob syntax for
/// name matching: `*`, `?`, `[`, or `{`.
#[cfg(feature = "name")]
fn has_name_meta(pattern: &str) -> bool {
    for byte in pattern.bytes() {
        if let b'*' | b'?' | b'[' | b'{' = byte {
            return true;
        }
    }
    false
}
139
/// Matches `name` against `pattern` using a [`globset`] matcher that is
/// compiled once per thread and cached by pattern text.
///
/// A pattern that fails to compile is cached as `None` and matched with the
/// legacy rules instead: trailing `*` means literal prefix, leading `*` means
/// literal suffix, otherwise exact equality.
///
/// The cache is consulted by `&str` first, so a cache hit performs no heap
/// allocation; the owned key is only built when a new pattern is inserted.
#[cfg(feature = "name")]
fn compiled_name_match(pattern: &str, name: &str) -> bool {
    use std::cell::RefCell;
    use std::collections::HashMap;

    thread_local! {
        static COMPILED: RefCell<HashMap<Box<str>, Option<globset::GlobMatcher>>> =
            RefCell::new(HashMap::new());
    }

    COMPILED.with(|cache| {
        let mut cache = cache.borrow_mut();
        if !cache.contains_key(pattern) {
            // Patterns are config/registration-driven and bounded in practice;
            // the cap only guards against pathological dynamic pattern churn.
            // Checking it on a miss keeps a hot entry from being evicted by
            // its own lookup.
            if cache.len() > 512 {
                cache.clear();
            }
            let compiled = globset::Glob::new(pattern)
                .ok()
                .map(|glob| glob.compile_matcher());
            cache.insert(Box::from(pattern), compiled);
        }
        match cache.get(pattern).and_then(Option::as_ref) {
            Some(matcher) => matcher.is_match(name),
            // Unparsable glob: match the way pre-consolidation call sites did.
            None => {
                if let Some(prefix) = pattern.strip_suffix('*') {
                    return name.starts_with(prefix);
                }
                if let Some(suffix) = pattern.strip_prefix('*') {
                    return name.ends_with(suffix);
                }
                pattern == name
            }
        }
    })
}
177
/// Ordered-segment matching over free text where `*` is the only wildcard.
///
/// The pattern is cut at its `*` characters. The piece before the first `*`
/// must open the text, the piece after the last `*` must close it, and the
/// pieces in between must occur in order in whatever lies between. A pattern
/// without any `*` must equal the text exactly.
///
/// No other character is special — `?`, `[`, and `{` are plain literals —
/// because prose targets (prompts, transcript text) routinely contain them.
#[must_use]
pub fn match_prose(pattern: &str, text: &str) -> bool {
    // No wildcard at all: exact comparison.
    let (head, after_head) = match pattern.split_once('*') {
        Some(parts) => parts,
        None => return pattern == text,
    };
    // With a single `*` there is nothing between head and tail.
    let (middle, tail) = after_head.rsplit_once('*').unwrap_or(("", after_head));

    // Anchor the leading literal (an empty head anchors trivially).
    let mut rest = match text.strip_prefix(head) {
        Some(rest) => rest,
        None => return false,
    };

    // Interior literals: take the leftmost occurrence of each, in order.
    for piece in middle.split('*').filter(|piece| !piece.is_empty()) {
        match rest.find(piece) {
            Some(at) => rest = &rest[at + piece.len()..],
            None => return false,
        }
    }

    // Anchor the trailing literal in what is left (empty tail always fits).
    rest.ends_with(tail)
}
215
#[cfg(test)]
mod tests {
    use super::*;

    // --- match_path: ported from harn-ir and harn-cli (skills) test suites ---

    #[test]
    fn path_single_star_stays_within_a_directory() {
        assert!(match_path("src/*.rs", "src/main.rs"));
        assert!(!match_path("src/*.rs", "src/nested/main.rs"));
        assert!(!match_path("src/*.rs", "other/main.rs"));
    }

    #[test]
    fn path_double_star_crosses_directories() {
        assert!(match_path("src/**/*.rs", "src/nested/main.rs"));
        assert!(match_path("src/**/*.rs", "src/main.rs"));
        assert!(match_path("infra/**", "infra/terraform/main.tf"));
        assert!(match_path("**", "anything/at/all"));
        assert!(match_path("**/*.rs", "main.rs"));
        assert!(match_path("**/*.rs", "deep/tree/main.rs"));
    }

    #[test]
    fn path_double_star_slash_stays_on_directory_boundaries() {
        assert!(match_path("**/bar", "bar"));
        assert!(match_path("**/bar", "foo/bar"));
        assert!(!match_path("**/bar", "foobar"));
        assert!(match_path("src/**/main.rs", "src/main.rs"));
        assert!(match_path("src/**/main.rs", "src/bin/main.rs"));
        assert!(!match_path("src/**/main.rs", "src/binmain.rs"));
    }

    #[test]
    fn path_question_mark_matches_one_non_separator() {
        assert!(match_path("src/ma?n.rs", "src/main.rs"));
        assert!(!match_path("src/ma?n.rs", "src/man.rs"));
        assert!(!match_path("a?b", "a/b"));
    }

    #[test]
    fn path_literal_and_edge_cases() {
        assert!(match_path("exact.rs", "exact.rs"));
        assert!(!match_path("exact.rs", "exact.rs.bak"));
        assert!(!match_path("src/**", "src"));
        assert!(match_path("src/**", "src/anything"));
        assert!(match_path("", ""));
        assert!(!match_path("", "x"));
    }

    // --- match_name: ported from hooks, llm_config, capabilities,
    //     merge_captain_audit test suites ---
    //
    // `match_name` only exists when the `name` feature is enabled, so every
    // test that calls it carries the same gate; otherwise this module would
    // fail to compile in a build without that feature.

    #[cfg(feature = "name")]
    #[test]
    fn name_star_matches_everything() {
        assert!(match_name("*", "anything"));
        assert!(match_name("*", ""));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_prefix_suffix_and_exact() {
        assert!(match_name("claude-*", "claude-sonnet-4-20250514"));
        assert!(match_name("gpt-*", "gpt-4o"));
        assert!(!match_name("claude-*", "gpt-4o"));
        assert!(match_name("*-latest", "llama3.2-latest"));
        assert!(!match_name("*-latest", "llama3.2"));
        assert!(match_name("gpt-4o", "gpt-4o"));
        assert!(!match_name("gpt-4o", "gpt-4o-mini"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_substring_and_middle_star() {
        assert!(match_name("*gpt*", "openai/gpt-5.4"));
        assert!(match_name("*claude*", "anthropic/claude-opus-4-7"));
        assert!(!match_name("*xyz*", "openai/gpt-5.4"));
        assert!(match_name("claude-*-latest", "claude-sonnet-latest"));
        assert!(!match_name("claude-*-latest", "claude-sonnet-beta"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_star_crosses_separators() {
        assert!(match_name("tool/*", "tool/a/b"));
        assert!(match_name("*svc", "a/b/svc"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_multi_star_segments_in_order() {
        assert!(match_name("a*b*c", "a-x-b-y-c"));
        assert!(!match_name("a*b*c", "a-x-c-y-b"));
        assert!(match_name("pre*mid*", "pre-anything-mid-tail"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_question_mark_and_classes() {
        assert!(match_name("gpt-?o", "gpt-4o"));
        assert!(!match_name("gpt-?o", "gpt-44o"));
        assert!(match_name("file[12]", "file1"));
        assert!(!match_name("file[12]", "file3"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_brace_alternates_use_glob_syntax() {
        assert!(match_name("gpt-{4o,5}", "gpt-4o"));
        assert!(match_name("gpt-{4o,5}", "gpt-5"));
        assert!(!match_name("gpt-{4o,5}", "gpt-4.1"));
    }

    #[cfg(feature = "name")]
    #[test]
    fn name_unparsable_glob_falls_back_to_legacy_affix_matching() {
        // Unclosed `[` fails glob compilation; legacy behavior treated the
        // pattern as a literal prefix when it ends in `*`.
        assert!(match_name("f[oo*", "f[oo-bar"));
        assert!(!match_name("f[oo*", "g[oo-bar"));
        assert!(match_name("f[oo", "f[oo"));
    }

    // --- match_prose: ported from the llm mock matcher tests ---

    #[test]
    fn prose_segments_in_order_with_literal_punctuation() {
        assert!(match_prose("*", "anything"));
        assert!(match_prose("hello", "hello"));
        assert!(!match_prose("hello", "hello world"));
        assert!(match_prose("hello*", "hello world"));
        assert!(match_prose("*world", "hello world"));
        assert!(match_prose("*llo wo*", "hello world"));
        assert!(match_prose("he*wo*ld", "hello world"));
        assert!(!match_prose("he*xx*ld", "hello world"));
    }

    #[test]
    fn prose_treats_glob_metacharacters_as_literals() {
        assert!(match_prose("what is [x]?*", "what is [x]? tell me"));
        assert!(!match_prose("what is [x]?*", "what is x? tell me"));
        assert!(match_prose("*{json}*", "respond with {json} only"));
    }
}