uv_globfilter/
glob_dir_filter.rs1use globset::{Glob, GlobSet, GlobSetBuilder};
2use regex_automata::dfa;
3use regex_automata::dfa::Automaton;
4use std::path::{MAIN_SEPARATOR, MAIN_SEPARATOR_STR, Path};
5use tracing::warn;
6
7const DFA_SIZE_LIMIT: usize = 1_000_000;
9
10pub struct GlobDirFilter {
16 glob_set: GlobSet,
17 dfa: Option<dfa::dense::DFA<Vec<u32>>>,
18}
19
20impl GlobDirFilter {
21 pub fn from_globs(globs: &[Glob]) -> Result<Self, globset::Error> {
25 let mut glob_set_builder = GlobSetBuilder::new();
26 for glob in globs {
27 glob_set_builder.add(glob.clone());
28 }
29 let glob_set = glob_set_builder.build()?;
30
31 let regexes: Vec<_> = globs
32 .iter()
33 .map(|glob| {
34 let main_separator = regex::escape(MAIN_SEPARATOR_STR);
35
36 glob.regex()
37 .strip_prefix("(?-u)")
39 .expect("a glob is a non-unicode byte regex")
40 .replace('/', &main_separator)
42 })
43 .collect();
44
45 let dfa_builder = dfa::dense::Builder::new()
46 .syntax(
47 regex_automata::util::syntax::Config::new()
49 .unicode(false)
50 .utf8(false),
51 )
52 .configure(
53 dfa::dense::Config::new()
54 .start_kind(dfa::StartKind::Anchored)
55 .dfa_size_limit(Some(DFA_SIZE_LIMIT))
57 .determinize_size_limit(Some(DFA_SIZE_LIMIT)),
58 )
59 .build_many(®exes);
60 let dfa = if let Ok(dfa) = dfa_builder {
61 Some(dfa)
62 } else {
63 warn!(
66 "Glob expressions regex is larger than {DFA_SIZE_LIMIT} bytes, \
67 falling back to full directory traversal!"
68 );
69 None
70 };
71
72 Ok(Self { glob_set, dfa })
73 }
74
75 pub fn match_path(&self, path: &Path) -> bool {
79 self.match_directory(path) || self.glob_set.is_match(path)
80 }
81
82 pub fn match_directory(&self, path: &Path) -> bool {
87 let Some(dfa) = &self.dfa else {
88 return true;
89 };
90
91 if path == Path::new("") {
93 return true;
94 }
95
96 let config_anchored =
97 regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
98 let mut state = dfa.start_state(&config_anchored).unwrap();
99
100 let byte_path = path.as_os_str().as_encoded_bytes();
103 for b in byte_path {
104 state = dfa.next_state(state, *b);
105 }
106 let eoi_state = dfa.next_eoi_state(state);
109 let slash_state = dfa.next_state(state, u8::try_from(MAIN_SEPARATOR).unwrap());
113
114 debug_assert!(
115 !dfa.is_quit_state(eoi_state) && !dfa.is_quit_state(slash_state),
116 "matcher is in quit state"
117 );
118
119 dfa.is_match_state(eoi_state) || !dfa.is_dead_state(slash_state)
120 }
121}
122
123#[cfg(test)]
124mod tests {
125 use crate::PortableGlobParser;
126 use crate::glob_dir_filter::GlobDirFilter;
127 use std::path::{MAIN_SEPARATOR, Path};
128 use tempfile::tempdir;
129 use walkdir::WalkDir;
130
131 const FILES: [&str; 5] = [
132 "path1/dir1/subdir/a.txt",
133 "path2/dir2/subdir/a.txt",
134 "path3/dir3/subdir/a.txt",
135 "path4/dir4/subdir/a.txt",
136 "path5/dir5/subdir/a.txt",
137 ];
138
139 const PATTERNS: [&str; 5] = [
140 "path1/*",
142 "path2/dir2",
144 "path3/dir3/subdir/a.txt",
146 "path4/**/*",
148 "path5",
150 ];
151
152 #[test]
153 fn match_directory() {
154 let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
155 let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
156 assert!(matcher.match_directory(&Path::new("path1").join("dir1")));
157 assert!(matcher.match_directory(&Path::new("path2").join("dir2")));
158 assert!(matcher.match_directory(&Path::new("path3").join("dir3")));
159 assert!(matcher.match_directory(&Path::new("path4").join("dir4")));
160 assert!(!matcher.match_directory(&Path::new("path5").join("dir5")));
161 }
162
163 #[test]
165 fn prefilter() {
166 let dir = tempdir().unwrap();
167 for file in FILES {
168 let file = dir.path().join(file);
169 fs_err::create_dir_all(file.parent().unwrap()).unwrap();
170 fs_err::File::create(file).unwrap();
171 }
172 let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
173 let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
174
175 let visited: Vec<_> = WalkDir::new(dir.path())
177 .sort_by_file_name()
178 .into_iter()
179 .filter_entry(|entry| {
180 let relative = entry
181 .path()
182 .strip_prefix(dir.path())
183 .expect("walkdir starts with root");
184 matcher.match_directory(relative)
185 })
186 .map(|entry| {
187 let entry = entry.unwrap();
188 let relative = entry
189 .path()
190 .strip_prefix(dir.path())
191 .expect("walkdir starts with root")
192 .to_str()
193 .unwrap()
194 .to_string();
195 relative.replace(MAIN_SEPARATOR, "/")
197 })
198 .collect();
199 assert_eq!(
200 visited,
201 [
202 "",
203 "path1",
204 "path1/dir1",
205 "path2",
206 "path2/dir2",
207 "path3",
208 "path3/dir3",
209 "path3/dir3/subdir",
210 "path3/dir3/subdir/a.txt",
211 "path4",
212 "path4/dir4",
213 "path4/dir4/subdir",
214 "path4/dir4/subdir/a.txt",
215 "path5"
216 ]
217 );
218 }
219
220 #[test]
222 fn walk_dir() {
223 let dir = tempdir().unwrap();
224
225 for file in FILES {
226 let file = dir.path().join(file);
227 fs_err::create_dir_all(file.parent().unwrap()).unwrap();
228 fs_err::File::create(file).unwrap();
229 }
230 let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
231
232 let include_matcher = GlobDirFilter::from_globs(&patterns).unwrap();
233
234 let walkdir_root = dir.path();
235 let mut matches: Vec<_> = WalkDir::new(walkdir_root)
236 .sort_by_file_name()
237 .into_iter()
238 .filter_entry(|entry| {
239 let relative = entry
241 .path()
242 .strip_prefix(walkdir_root)
243 .expect("walkdir starts with root");
244
245 include_matcher.match_directory(relative)
246 })
247 .filter_map(|entry| {
248 let entry = entry.as_ref().unwrap();
249 let relative = entry
251 .path()
252 .strip_prefix(walkdir_root)
253 .expect("walkdir starts with root");
254 if include_matcher.match_path(relative) {
255 Some(relative.to_str().unwrap().replace(MAIN_SEPARATOR, "/"))
257 } else {
258 None
259 }
260 })
261 .collect();
262 matches.sort();
263 assert_eq!(
264 matches,
265 [
266 "",
267 "path1",
268 "path1/dir1",
269 "path2",
270 "path2/dir2",
271 "path3",
272 "path3/dir3",
273 "path3/dir3/subdir",
274 "path3/dir3/subdir/a.txt",
275 "path4",
276 "path4/dir4",
277 "path4/dir4/subdir",
278 "path4/dir4/subdir/a.txt",
279 "path5"
280 ]
281 );
282 }
283}