Skip to main content

uv_globfilter/
glob_dir_filter.rs

1use globset::{Glob, GlobSet, GlobSetBuilder};
2use regex_automata::dfa;
3use regex_automata::dfa::Automaton;
4use std::path::{MAIN_SEPARATOR, MAIN_SEPARATOR_STR, Path};
5use tracing::warn;
6
7/// Chosen at a whim -Konsti
8const DFA_SIZE_LIMIT: usize = 1_000_000;
9
10/// Filter a directory tree traversal (walkdir) by whether any paths of a directory can be included
11/// at all.
12///
13/// Internally, the globs are converted to a regex and then to a DFA, which unlike the globs and the
14/// regex allows to check for prefix matches.
15pub struct GlobDirFilter {
16    glob_set: GlobSet,
17    dfa: Option<dfa::dense::DFA<Vec<u32>>>,
18}
19
20impl GlobDirFilter {
21    /// The filter matches if any of the globs matches.
22    ///
23    /// See <https://github.com/BurntSushi/ripgrep/discussions/2927> for the error returned.
24    pub fn from_globs(globs: &[Glob]) -> Result<Self, globset::Error> {
25        let mut glob_set_builder = GlobSetBuilder::new();
26        for glob in globs {
27            glob_set_builder.add(glob.clone());
28        }
29        let glob_set = glob_set_builder.build()?;
30
31        let regexes: Vec<_> = globs
32            .iter()
33            .map(|glob| {
34                let main_separator = regex::escape(MAIN_SEPARATOR_STR);
35
36                glob.regex()
37                    // We are using a custom DFA builder
38                    .strip_prefix("(?-u)")
39                    .expect("a glob is a non-unicode byte regex")
40                    // Match windows paths if applicable
41                    .replace('/', &main_separator)
42            })
43            .collect();
44
45        let dfa_builder = dfa::dense::Builder::new()
46            .syntax(
47                // The glob regex is a byte matcher
48                regex_automata::util::syntax::Config::new()
49                    .unicode(false)
50                    .utf8(false),
51            )
52            .configure(
53                dfa::dense::Config::new()
54                    .start_kind(dfa::StartKind::Anchored)
55                    // DFA can grow exponentially, in which case we bail out
56                    .dfa_size_limit(Some(DFA_SIZE_LIMIT))
57                    .determinize_size_limit(Some(DFA_SIZE_LIMIT)),
58            )
59            .build_many(&regexes);
60        let dfa = if let Ok(dfa) = dfa_builder {
61            Some(dfa)
62        } else {
63            // TODO(konsti): `regex_automata::dfa::dense::BuildError` should allow asking whether
64            // is a size error
65            warn!(
66                "Glob expressions regex is larger than {DFA_SIZE_LIMIT} bytes, \
67                    falling back to full directory traversal!"
68            );
69            None
70        };
71
72        Ok(Self { glob_set, dfa })
73    }
74
75    /// Whether the path (file or directory) matches any of the globs.
76    ///
77    /// We include a directory if we are potentially including files it contains.
78    pub fn match_path(&self, path: &Path) -> bool {
79        self.match_directory(path) || self.glob_set.is_match(path)
80    }
81
82    /// Check whether a directory or any of its children can be matched by any of the globs.
83    ///
84    /// This option never returns false if any child matches, but it may return true even if we
85    /// don't end up including any child.
86    pub fn match_directory(&self, path: &Path) -> bool {
87        let Some(dfa) = &self.dfa else {
88            return true;
89        };
90
91        // Allow the root path
92        if path == Path::new("") {
93            return true;
94        }
95
96        let config_anchored =
97            regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
98        let mut state = dfa.start_state(&config_anchored).unwrap();
99
100        // Paths aren't necessarily UTF-8, which we can gloss over since the globs match bytes only
101        // anyway.
102        let byte_path = path.as_os_str().as_encoded_bytes();
103        for b in byte_path {
104            state = dfa.next_state(state, *b);
105        }
106        // Say we're looking at a directory `foo/bar`. We want to continue if either `foo/bar` is
107        // a match, e.g., from `foo/*`, or a path below it can match, e.g., from `foo/bar/*`.
108        let eoi_state = dfa.next_eoi_state(state);
109        // We must not call `next_eoi_state` on the slash state, we want to only check if more
110        // characters (path components) are allowed, not if we're matching the `$` anchor at the
111        // end.
112        let slash_state = dfa.next_state(state, u8::try_from(MAIN_SEPARATOR).unwrap());
113
114        debug_assert!(
115            !dfa.is_quit_state(eoi_state) && !dfa.is_quit_state(slash_state),
116            "matcher is in quit state"
117        );
118
119        dfa.is_match_state(eoi_state) || !dfa.is_dead_state(slash_state)
120    }
121}
122
123#[cfg(test)]
124mod tests {
125    use crate::PortableGlobParser;
126    use crate::glob_dir_filter::GlobDirFilter;
127    use std::path::{MAIN_SEPARATOR, Path};
128    use tempfile::tempdir;
129    use walkdir::WalkDir;
130
131    const FILES: [&str; 5] = [
132        "path1/dir1/subdir/a.txt",
133        "path2/dir2/subdir/a.txt",
134        "path3/dir3/subdir/a.txt",
135        "path4/dir4/subdir/a.txt",
136        "path5/dir5/subdir/a.txt",
137    ];
138
139    const PATTERNS: [&str; 5] = [
140        // Only sufficient for descending one level
141        "path1/*",
142        // Only sufficient for descending one level
143        "path2/dir2",
144        // Sufficient for descending
145        "path3/dir3/subdir/a.txt",
146        // Sufficient for descending
147        "path4/**/*",
148        // Not sufficient for descending
149        "path5",
150    ];
151
152    #[test]
153    fn match_directory() {
154        let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
155        let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
156        assert!(matcher.match_directory(&Path::new("path1").join("dir1")));
157        assert!(matcher.match_directory(&Path::new("path2").join("dir2")));
158        assert!(matcher.match_directory(&Path::new("path3").join("dir3")));
159        assert!(matcher.match_directory(&Path::new("path4").join("dir4")));
160        assert!(!matcher.match_directory(&Path::new("path5").join("dir5")));
161    }
162
163    /// Check that we skip directories that can never match.
164    #[test]
165    fn prefilter() {
166        let dir = tempdir().unwrap();
167        for file in FILES {
168            let file = dir.path().join(file);
169            fs_err::create_dir_all(file.parent().unwrap()).unwrap();
170            fs_err::File::create(file).unwrap();
171        }
172        let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
173        let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
174
175        // Test the prefix filtering
176        let visited: Vec<_> = WalkDir::new(dir.path())
177            .sort_by_file_name()
178            .into_iter()
179            .filter_entry(|entry| {
180                let relative = entry
181                    .path()
182                    .strip_prefix(dir.path())
183                    .expect("walkdir starts with root");
184                matcher.match_directory(relative)
185            })
186            .map(|entry| {
187                let entry = entry.unwrap();
188                let relative = entry
189                    .path()
190                    .strip_prefix(dir.path())
191                    .expect("walkdir starts with root")
192                    .to_str()
193                    .unwrap()
194                    .to_string();
195                // Translate windows paths back to the unix fixture
196                relative.replace(MAIN_SEPARATOR, "/")
197            })
198            .collect();
199        assert_eq!(
200            visited,
201            [
202                "",
203                "path1",
204                "path1/dir1",
205                "path2",
206                "path2/dir2",
207                "path3",
208                "path3/dir3",
209                "path3/dir3/subdir",
210                "path3/dir3/subdir/a.txt",
211                "path4",
212                "path4/dir4",
213                "path4/dir4/subdir",
214                "path4/dir4/subdir/a.txt",
215                "path5"
216            ]
217        );
218    }
219
220    /// Check that the walkdir yield the correct set of files.
221    #[test]
222    fn walk_dir() {
223        let dir = tempdir().unwrap();
224
225        for file in FILES {
226            let file = dir.path().join(file);
227            fs_err::create_dir_all(file.parent().unwrap()).unwrap();
228            fs_err::File::create(file).unwrap();
229        }
230        let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap());
231
232        let include_matcher = GlobDirFilter::from_globs(&patterns).unwrap();
233
234        let walkdir_root = dir.path();
235        let mut matches: Vec<_> = WalkDir::new(walkdir_root)
236            .sort_by_file_name()
237            .into_iter()
238            .filter_entry(|entry| {
239                // TODO(konsti): This should be prettier.
240                let relative = entry
241                    .path()
242                    .strip_prefix(walkdir_root)
243                    .expect("walkdir starts with root");
244
245                include_matcher.match_directory(relative)
246            })
247            .filter_map(|entry| {
248                let entry = entry.as_ref().unwrap();
249                // TODO(konsti): This should be prettier.
250                let relative = entry
251                    .path()
252                    .strip_prefix(walkdir_root)
253                    .expect("walkdir starts with root");
254                if include_matcher.match_path(relative) {
255                    // Translate windows paths back to the unix fixture
256                    Some(relative.to_str().unwrap().replace(MAIN_SEPARATOR, "/"))
257                } else {
258                    None
259                }
260            })
261            .collect();
262        matches.sort();
263        assert_eq!(
264            matches,
265            [
266                "",
267                "path1",
268                "path1/dir1",
269                "path2",
270                "path2/dir2",
271                "path3",
272                "path3/dir3",
273                "path3/dir3/subdir",
274                "path3/dir3/subdir/a.txt",
275                "path4",
276                "path4/dir4",
277                "path4/dir4/subdir",
278                "path4/dir4/subdir/a.txt",
279                "path5"
280            ]
281        );
282    }
283}