Skip to main content

vortex_io/filesystem/
glob.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use futures::StreamExt;
5use futures::TryStreamExt;
6use futures::stream::BoxStream;
7use vortex_error::VortexResult;
8use vortex_error::vortex_bail;
9use vortex_error::vortex_err;
10
11use crate::filesystem::FileListing;
12use crate::filesystem::FileSystem;
13
14impl dyn FileSystem + '_ {
15    /// Expand a glob pattern, returning matching files as a stream.
16    ///
17    /// Extracts the directory prefix before the first glob character and uses it
18    /// to narrow the [`list`](FileSystem::list) call. The full glob pattern is
19    /// then applied as a filter over the listed entries.
20    ///
21    /// Escaped glob characters (`\*`, `\?`, `\[`) are not supported.
22    pub fn glob(&self, pattern: &str) -> VortexResult<BoxStream<'_, VortexResult<FileListing>>> {
23        validate_glob(pattern)?;
24
25        // If there are no glob characters, the pattern is an exact file path.
26        // Return it directly without listing the filesystem.
27        if !pattern.contains(['*', '?', '[']) {
28            let listing = FileListing {
29                path: pattern.to_string(),
30                size: None,
31            };
32            return Ok(futures::stream::once(async { Ok(listing) }).boxed());
33        }
34
35        let glob_pattern = glob::Pattern::new(pattern)
36            .map_err(|e| vortex_err!("Invalid glob pattern '{}': {}", pattern, e))?;
37
38        let listing_prefix = glob_list_prefix(pattern).trim_end_matches('/');
39
40        tracing::debug!(
41            "Performing glob with pattern '{}' and listing prefix '{}'",
42            pattern,
43            listing_prefix
44        );
45        let stream = self
46            .list(listing_prefix)
47            .try_filter(move |listing| {
48                let matches = glob_pattern.matches(&listing.path);
49                async move { matches }
50            })
51            .into_stream()
52            .boxed();
53
54        Ok(stream)
55    }
56}
57
/// Computes the literal directory prefix of a glob pattern.
///
/// The pattern is cut at its first glob character (`*`, `?` or `[`), or left whole when it
/// has none. The result is that literal part up to and including its last `/`, or the empty
/// string when it contains no `/`. For example, `data/2023/*/logs/*.log` yields `data/2023/`.
fn glob_list_prefix(pattern: &str) -> &str {
    // Literal part of the pattern: everything ahead of the first wildcard.
    let literal = pattern
        .find(['*', '?', '['])
        .map_or(pattern, |wildcard_at| &pattern[..wildcard_at]);

    // Keep through the final separator; without one there is no directory prefix.
    literal
        .rfind('/')
        .map_or("", |slash_at| &literal[..=slash_at])
}
69
70/// Validates that a glob pattern does not contain escaped glob characters.
71fn validate_glob(pattern: &str) -> VortexResult<()> {
72    for escape_pattern in ["\\*", "\\?", "\\["] {
73        if pattern.contains(escape_pattern) {
74            vortex_bail!(
75                "Escaped glob characters are not allowed in patterns. Found '{}' in: {}",
76                escape_pattern,
77                pattern
78            );
79        }
80    }
81    Ok(())
82}
83
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use async_trait::async_trait;
    use futures::TryStreamExt;
    use vortex_error::vortex_panic;

    use super::*;
    use crate::VortexReadAt;
    use crate::filesystem::FileSystem;

    /// A mock filesystem that panics if `list` is called.
    #[derive(Debug)]
    struct NoListFileSystem;

    #[async_trait]
    impl FileSystem for NoListFileSystem {
        fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
            vortex_panic!("list() should not be called for exact paths")
        }

        async fn open_read(&self, _path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
            vortex_panic!("open_read() should not be called")
        }
    }

    /// A mock filesystem backed by a fixed set of paths.
    ///
    /// `list` yields, in declaration order, every path that starts with the requested prefix.
    #[derive(Debug)]
    struct StaticFileSystem {
        paths: Vec<&'static str>,
    }

    #[async_trait]
    impl FileSystem for StaticFileSystem {
        fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
            // Collect eagerly so the returned stream owns its data and does not borrow `prefix`.
            let listings: Vec<VortexResult<FileListing>> = self
                .paths
                .iter()
                .copied()
                .filter(|path| path.starts_with(prefix))
                .map(|path| {
                    Ok(FileListing {
                        path: path.to_owned(),
                        size: None,
                    })
                })
                .collect();
            futures::stream::iter(listings).boxed()
        }

        async fn open_read(&self, _path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
            vortex_panic!("open_read() should not be called")
        }
    }

    #[tokio::test]
    async fn test_glob_exact_path_skips_list() -> VortexResult<()> {
        let fs: &dyn FileSystem = &NoListFileSystem;
        let results: Vec<FileListing> = fs.glob("data/file.vortex")?.try_collect().await?;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].path, "data/file.vortex");
        assert_eq!(results[0].size, None);
        Ok(())
    }

    /// A wildcard pattern lists under the literal prefix and keeps only matching entries.
    #[tokio::test]
    async fn test_glob_wildcard_filters_listing() -> VortexResult<()> {
        let static_fs = StaticFileSystem {
            paths: vec![
                "data/a.vortex",
                "data/b.txt",
                "data/c.vortex",
                "other/d.vortex",
            ],
        };
        let fs: &dyn FileSystem = &static_fs;
        let results: Vec<FileListing> = fs.glob("data/*.vortex")?.try_collect().await?;
        let paths: Vec<&str> = results.iter().map(|listing| listing.path.as_str()).collect();
        assert_eq!(paths, ["data/a.vortex", "data/c.vortex"]);
        Ok(())
    }

    /// Escaped glob characters are rejected before the filesystem is touched.
    #[test]
    fn test_glob_rejects_escaped_pattern() {
        let fs: &dyn FileSystem = &NoListFileSystem;
        assert!(fs.glob("data/\\*.vortex").is_err());
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_filename() {
        assert_eq!(glob_list_prefix("folder/file*.txt"), "folder/");
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_directory() {
        assert_eq!(glob_list_prefix("folder/*/file.txt"), "folder/");
    }

    #[test]
    fn test_glob_list_prefix_nested_directories() {
        assert_eq!(glob_list_prefix("data/2023/*/logs/*.log"), "data/2023/");
    }

    #[test]
    fn test_glob_list_prefix_wildcard_at_root() {
        assert_eq!(glob_list_prefix("*.txt"), "");
    }

    #[test]
    fn test_glob_list_prefix_no_wildcards() {
        assert_eq!(
            glob_list_prefix("folder/subfolder/file.txt"),
            "folder/subfolder/"
        );
    }

    #[test]
    fn test_glob_list_prefix_question_mark() {
        assert_eq!(glob_list_prefix("folder/file?.txt"), "folder/");
    }

    #[test]
    fn test_glob_list_prefix_bracket() {
        assert_eq!(glob_list_prefix("folder/file[abc].txt"), "folder/");
    }

    #[test]
    fn test_glob_list_prefix_empty() {
        assert_eq!(glob_list_prefix(""), "");
    }

    #[test]
    fn test_validate_glob_valid() -> VortexResult<()> {
        validate_glob("path/*.txt")?;
        validate_glob("path/to/**/*.vortex")?;
        Ok(())
    }

    #[test]
    fn test_validate_glob_escaped_asterisk() {
        assert!(validate_glob("path\\*.txt").is_err());
    }

    #[test]
    fn test_validate_glob_escaped_question() {
        assert!(validate_glob("path\\?.txt").is_err());
    }

    #[test]
    fn test_validate_glob_escaped_bracket() {
        assert!(validate_glob("path\\[test].txt").is_err());
    }
}