Skip to main content

vortex_io/filesystem/
glob.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use futures::StreamExt;
5use futures::TryStreamExt;
6use futures::stream;
7use futures::stream::BoxStream;
8use vortex_error::VortexResult;
9use vortex_error::vortex_bail;
10use vortex_error::vortex_err;
11
12use crate::filesystem::FileListing;
13use crate::filesystem::FileSystem;
14
15impl dyn FileSystem + '_ {
16    /// Expand a glob pattern, returning matching files as a stream.
17    ///
18    /// Extracts the directory prefix before the first glob character and uses it
19    /// to narrow the [`list`](FileSystem::list) call. The full glob pattern is
20    /// then applied as a filter over the listed entries.
21    ///
22    /// Escaped glob characters (`\*`, `\?`, `\[`) are not supported.
23    pub fn glob(&self, pattern: &str) -> VortexResult<BoxStream<'_, VortexResult<FileListing>>> {
24        validate_glob(pattern)?;
25
26        // If there are no glob characters, the pattern is an exact file path. `list` enumerates
27        // entries *under* a prefix on a path-segment basis and never yields the prefix itself, so
28        // listing an exact path would report an existing file as missing (and could surface prefix
29        // collisions such as `foo.vortex.backup` when the caller asked for `foo.vortex`). Use
30        // `head` to confirm the file exists and capture its size, yielding a single-element stream
31        // when it does and an empty stream when it does not.
32        if !pattern.contains(['*', '?', '[']) {
33            let pattern = pattern.to_string();
34            let stream = stream::once(async move { self.head(&pattern).await })
35                .try_filter_map(|listing| async move { Ok(listing) })
36                .boxed();
37            return Ok(stream);
38        }
39
40        let glob_pattern = glob::Pattern::new(pattern)
41            .map_err(|e| vortex_err!("Invalid glob pattern '{}': {}", pattern, e))?;
42
43        let listing_prefix = glob_list_prefix(pattern).trim_end_matches('/');
44
45        tracing::debug!(
46            "Performing glob with pattern '{}' and listing prefix '{}'",
47            pattern,
48            listing_prefix
49        );
50        let stream = self
51            .list(listing_prefix)
52            .try_filter(move |listing| {
53                let matches = glob_pattern.matches(&listing.path);
54                async move { matches }
55            })
56            .into_stream()
57            .boxed();
58
59        Ok(stream)
60    }
61}
62
/// Extracts the literal directory portion of a glob pattern.
///
/// The pattern is cut at its first glob character (`*`, `?` or `[`); the result is the part of
/// that literal head ending at its last `/`, with the slash included. When the head contains no
/// `/` at all the result is the empty string. For example, `data/2023/*/logs/*.log` yields
/// `data/2023/` and `*.txt` yields `""`.
fn glob_list_prefix(pattern: &str) -> &str {
    // `split` always produces at least one piece, so the first piece is the text in front of
    // the first glob character (or the whole pattern when there is none).
    let literal_head = pattern.split(['*', '?', '[']).next().unwrap_or(pattern);

    literal_head
        .rfind('/')
        .map_or("", |last_slash| &pattern[..=last_slash])
}
74
75/// Validates that a glob pattern does not contain escaped glob characters.
76fn validate_glob(pattern: &str) -> VortexResult<()> {
77    for escape_pattern in ["\\*", "\\?", "\\["] {
78        if pattern.contains(escape_pattern) {
79            vortex_bail!(
80                "Escaped glob characters are not allowed in patterns. Found '{}' in: {}",
81                escape_pattern,
82                pattern
83            );
84        }
85    }
86    Ok(())
87}
88
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use async_trait::async_trait;
    use futures::TryStreamExt;
    use vortex_error::vortex_panic;

    use super::*;
    use crate::VortexReadAt;
    use crate::filesystem::FileSystem;

    /// Test double whose [`head`](FileSystem::head) answers from a fixed set of files and whose
    /// [`list`](FileSystem::list) panics. The panic pins down the invariant behind the
    /// exact-path branch of `glob`: it must resolve the file through `head` and never list,
    /// because an object store's `list` does not return the exact path of a file.
    #[derive(Debug)]
    struct HeadFileSystem {
        files: Vec<FileListing>,
    }

    impl HeadFileSystem {
        /// Builds the mock from `(path, size)` pairs; every size is stored as `Some(size)`.
        fn new(files: &[(&str, u64)]) -> Self {
            let mut listings = Vec::with_capacity(files.len());
            for &(path, size) in files {
                listings.push(FileListing {
                    path: path.to_owned(),
                    size: Some(size),
                });
            }
            Self { files: listings }
        }
    }

    #[async_trait]
    impl FileSystem for HeadFileSystem {
        fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
            vortex_panic!("list() must not be called for an exact path; glob should use head()")
        }

        async fn head(&self, path: &str) -> VortexResult<Option<FileListing>> {
            // Only an exact path match counts; `foo.vortex.backup` is not `foo.vortex`.
            let found = self.files.iter().find(|candidate| candidate.path == path);
            Ok(found.cloned())
        }

        async fn open_read(&self, _path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
            vortex_panic!("open_read() should not be called")
        }

        async fn delete(&self, _path: &str) -> VortexResult<()> {
            vortex_panic!("delete() should not be called")
        }
    }

    #[tokio::test]
    async fn test_glob_exact_path_existing_returns_listing_with_size() -> VortexResult<()> {
        let mock = HeadFileSystem::new(&[("data/file.vortex", 1024)]);
        let file_system: &dyn FileSystem = &mock;

        let stream = file_system.glob("data/file.vortex")?;
        let matched: Vec<FileListing> = stream.try_collect().await?;

        assert_eq!(matched.len(), 1);
        let only = &matched[0];
        assert_eq!(only.path, "data/file.vortex");
        assert_eq!(
            only.size,
            Some(1024),
            "exact-path glob should propagate the size reported by head"
        );
        Ok(())
    }

    #[tokio::test]
    async fn test_glob_exact_path_missing_returns_empty_stream() -> VortexResult<()> {
        let mock = HeadFileSystem::new(&[]);
        let file_system: &dyn FileSystem = &mock;

        let stream = file_system.glob("data/missing.vortex")?;
        let matched: Vec<FileListing> = stream.try_collect().await?;

        assert!(
            matched.is_empty(),
            "missing exact path should yield an empty stream"
        );
        Ok(())
    }

    #[tokio::test]
    async fn test_glob_exact_path_ignores_prefix_siblings() -> VortexResult<()> {
        // Listing by prefix on a real object store would also surface `foo.vortex.backup` when
        // asked for `foo.vortex`. Going through `head` returns only the requested key, and the
        // panicking `list` on the mock proves that no enumeration took place.
        let mock = HeadFileSystem::new(&[("foo.vortex", 10), ("foo.vortex.backup", 20)]);
        let file_system: &dyn FileSystem = &mock;

        let stream = file_system.glob("foo.vortex")?;
        let matched: Vec<FileListing> = stream.try_collect().await?;

        assert_eq!(matched.len(), 1);
        let only = &matched[0];
        assert_eq!(only.path, "foo.vortex");
        assert_eq!(only.size, Some(10));
        Ok(())
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_filename() {
        let prefix = glob_list_prefix("folder/file*.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_with_wildcard_in_directory() {
        let prefix = glob_list_prefix("folder/*/file.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_nested_directories() {
        let prefix = glob_list_prefix("data/2023/*/logs/*.log");
        assert_eq!(prefix, "data/2023/");
    }

    #[test]
    fn test_glob_list_prefix_wildcard_at_root() {
        let prefix = glob_list_prefix("*.txt");
        assert_eq!(prefix, "");
    }

    #[test]
    fn test_glob_list_prefix_no_wildcards() {
        let prefix = glob_list_prefix("folder/subfolder/file.txt");
        assert_eq!(prefix, "folder/subfolder/");
    }

    #[test]
    fn test_glob_list_prefix_question_mark() {
        let prefix = glob_list_prefix("folder/file?.txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_bracket() {
        let prefix = glob_list_prefix("folder/file[abc].txt");
        assert_eq!(prefix, "folder/");
    }

    #[test]
    fn test_glob_list_prefix_empty() {
        let prefix = glob_list_prefix("");
        assert_eq!(prefix, "");
    }

    #[test]
    fn test_validate_glob_valid() -> VortexResult<()> {
        // Plain wildcards, including the recursive `**` form, contain no escapes.
        for accepted in ["path/*.txt", "path/to/**/*.vortex"] {
            validate_glob(accepted)?;
        }
        Ok(())
    }

    #[test]
    fn test_validate_glob_escaped_asterisk() {
        let outcome = validate_glob("path\\*.txt");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_validate_glob_escaped_question() {
        let outcome = validate_glob("path\\?.txt");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_validate_glob_escaped_bracket() {
        let outcome = validate_glob("path\\[test].txt");
        assert!(outcome.is_err());
    }
}