Skip to main content

vortex_io/filesystem/
glob.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use futures::StreamExt;
5use futures::TryStreamExt;
6use futures::stream::BoxStream;
7use vortex_error::VortexResult;
8use vortex_error::vortex_bail;
9use vortex_error::vortex_err;
10
11use crate::filesystem::FileListing;
12use crate::filesystem::FileSystem;
13
14impl dyn FileSystem + '_ {
15    /// Expand a glob pattern, returning matching files as a stream.
16    ///
17    /// Extracts the directory prefix before the first glob character and uses it
18    /// to narrow the [`list`](FileSystem::list) call. The full glob pattern is
19    /// then applied as a filter over the listed entries.
20    ///
21    /// Escaped glob characters (`\*`, `\?`, `\[`) are not supported.
22    pub fn glob(&self, pattern: &str) -> VortexResult<BoxStream<'_, VortexResult<FileListing>>> {
23        validate_glob(pattern)?;
24
25        // If there are no glob characters, the pattern is an exact file path.
26        // Return it directly without listing the filesystem.
27        if !pattern.contains(['*', '?', '[']) {
28            let listing = FileListing {
29                path: pattern.to_string(),
30                size: None,
31            };
32            return Ok(futures::stream::once(async { Ok(listing) }).boxed());
33        }
34
35        let glob_pattern = glob::Pattern::new(pattern)
36            .map_err(|e| vortex_err!("Invalid glob pattern '{}': {}", pattern, e))?;
37
38        let listing_prefix = glob_list_prefix(pattern).trim_end_matches('/');
39
40        tracing::debug!(
41            "Performing glob with pattern '{}' and listing prefix '{}'",
42            pattern,
43            listing_prefix
44        );
45        let stream = self
46            .list(listing_prefix)
47            .try_filter(move |listing| {
48                let matches = glob_pattern.matches(&listing.path);
49                async move { matches }
50            })
51            .into_stream()
52            .boxed();
53
54        Ok(stream)
55    }
56}
57
/// Computes the literal directory portion of a glob pattern.
///
/// Takes the text in front of the first glob character (`*`, `?` or `[`), or the
/// whole pattern when there is none, and cuts it right after its last `/`. The
/// trailing `/` is kept. When that text contains no `/`, the result is the empty
/// string. For example, `data/2023/*/logs/*.log` yields `data/2023/`.
fn glob_list_prefix(pattern: &str) -> &str {
    // `split` always yields at least one piece, so the fallback is never taken.
    let literal_head = pattern.split(['*', '?', '[']).next().unwrap_or(pattern);
    literal_head
        .rfind('/')
        .map_or("", |last_slash| &literal_head[..=last_slash])
}
69
70/// Validates that a glob pattern does not contain escaped glob characters.
71fn validate_glob(pattern: &str) -> VortexResult<()> {
72    for escape_pattern in ["\\*", "\\?", "\\["] {
73        if pattern.contains(escape_pattern) {
74            vortex_bail!(
75                "Escaped glob characters are not allowed in patterns. Found '{}' in: {}",
76                escape_pattern,
77                pattern
78            );
79        }
80    }
81    Ok(())
82}
83
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use async_trait::async_trait;
    use futures::TryStreamExt;
    use vortex_error::vortex_panic;

    use super::*;
    use crate::VortexReadAt;
    use crate::filesystem::FileSystem;

    /// Filesystem stub in which every operation panics, so a test using it
    /// proves that the code under test never reaches the filesystem.
    #[derive(Debug)]
    struct NoListFileSystem;

    #[async_trait]
    impl FileSystem for NoListFileSystem {
        fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
            vortex_panic!("list() should not be called for exact paths")
        }

        async fn open_read(&self, _path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
            vortex_panic!("open_read() should not be called")
        }

        async fn delete(&self, _path: &str) -> VortexResult<()> {
            vortex_panic!("delete() should not be called")
        }
    }

    // A pattern without glob characters must be returned verbatim, without listing.
    #[tokio::test]
    async fn test_glob_exact_path_skips_list() -> VortexResult<()> {
        let fs: &dyn FileSystem = &NoListFileSystem;
        let listings = fs
            .glob("data/file.vortex")?
            .try_collect::<Vec<FileListing>>()
            .await?;

        assert_eq!(listings.len(), 1);
        let only = &listings[0];
        assert_eq!(only.path, "data/file.vortex");
        assert_eq!(only.size, None);
        Ok(())
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_filename() {
        let prefix = glob_list_prefix("folder/file*.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_directory() {
        let prefix = glob_list_prefix("folder/*/file.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_nested_directories() {
        let prefix = glob_list_prefix("data/2023/*/logs/*.log");
        assert_eq!(prefix, "data/2023/");
    }

    #[test]
    fn test_glob_list_prefix_wildcard_at_root() {
        let prefix = glob_list_prefix("*.txt");
        assert_eq!(prefix, "");
    }

    #[test]
    fn test_glob_list_prefix_no_wildcards() {
        let prefix = glob_list_prefix("folder/subfolder/file.txt");
        assert_eq!(prefix, "folder/subfolder/");
    }

    #[test]
    fn test_glob_list_prefix_question_mark() {
        let prefix = glob_list_prefix("folder/file?.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_bracket() {
        let prefix = glob_list_prefix("folder/file[abc].txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_empty() {
        let prefix = glob_list_prefix("");
        assert_eq!(prefix, "");
    }

    // Unescaped wildcards, including `**`, pass validation.
    #[test]
    fn test_validate_glob_valid() -> VortexResult<()> {
        for accepted in ["path/*.txt", "path/to/**/*.vortex"] {
            validate_glob(accepted)?;
        }
        Ok(())
    }

    #[test]
    fn test_validate_glob_escaped_asterisk() {
        let outcome = validate_glob("path\\*.txt");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_validate_glob_escaped_question() {
        let outcome = validate_glob("path\\?.txt");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_validate_glob_escaped_bracket() {
        let outcome = validate_glob("path\\[test].txt");
        assert!(outcome.is_err());
    }
}