object_store/path/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Path abstraction for Object Storage
19
20use itertools::Itertools;
21use percent_encoding::percent_decode;
22use std::fmt::Formatter;
23#[cfg(not(target_arch = "wasm32"))]
24use url::Url;
25
/// Separator placed between the segments of an object path; segments joined
/// this way form a directory-like hierarchy.
pub const DELIMITER: &str = "/";

/// The first (and only) byte of [`DELIMITER`]
pub const DELIMITER_BYTE: u8 = {
    let bytes = DELIMITER.as_bytes();
    bytes[0]
};
31
32mod parts;
33
34pub use parts::{InvalidPart, PathPart};
35
36/// Error returned by [`Path::parse`]
37#[derive(Debug, thiserror::Error)]
38#[non_exhaustive]
39pub enum Error {
40    /// Error when there's an empty segment between two slashes `/` in the path
41    #[error("Path \"{}\" contained empty path segment", path)]
42    EmptySegment {
43        /// The source path
44        path: String,
45    },
46
47    /// Error when an invalid segment is encountered in the given path
48    #[error("Error parsing Path \"{}\": {}", path, source)]
49    BadSegment {
50        /// The source path
51        path: String,
52        /// The part containing the error
53        source: InvalidPart,
54    },
55
56    /// Error when path cannot be canonicalized
57    #[error("Failed to canonicalize path \"{}\": {}", path.display(), source)]
58    Canonicalize {
59        /// The source path
60        path: std::path::PathBuf,
61        /// The underlying error
62        source: std::io::Error,
63    },
64
65    /// Error when the path is not a valid URL
66    #[error("Unable to convert path \"{}\" to URL", path.display())]
67    InvalidPath {
68        /// The source path
69        path: std::path::PathBuf,
70    },
71
72    /// Error when a path contains non-unicode characters
73    #[error("Path \"{}\" contained non-unicode characters: {}", path, source)]
74    NonUnicode {
75        /// The source path
76        path: String,
77        /// The underlying `UTF8Error`
78        source: std::str::Utf8Error,
79    },
80
81    /// Error when the a path doesn't start with given prefix
82    #[error("Path {} does not start with prefix {}", path, prefix)]
83    PrefixMismatch {
84        /// The source path
85        path: String,
86        /// The mismatched prefix
87        prefix: String,
88    },
89}
90
/// A validated object location that can be safely written to object storage
///
/// Every [`Path`] upholds the following invariants:
///
/// * Segments are separated by `/`
/// * There is no leading or trailing `/`
/// * No segment is a relative path segment, i.e. `.` or `..`
/// * No segment is empty
/// * No ASCII control characters are present
///
/// Path length is not restricted, however, be aware that most object stores
/// reject paths longer than 1024 bytes, and many filesystems do not support
/// path segments longer than 255 bytes.
///
/// # Encode
///
/// Object stores nominally accept any UTF-8 character sequence, but some
/// sequences cause compatibility problems with certain applications and
/// protocols, and some filesystems impose character restrictions of their own,
/// see [`LocalFileSystem`]. For this reason the naming guidelines for [S3],
/// [GCS] and [Azure Blob Storage] all recommend sticking to a limited
/// character subset.
///
/// [S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// [GCS]: https://cloud.google.com/storage/docs/naming-objects
/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
///
/// A string that may contain problematic segments can be encoded into a
/// [`Path`] with [`Path::from`] or [`Path::from_iter`], which percent encode
/// any problematic segments according to [RFC 1738].
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
/// assert_eq!(Path::from("/").as_ref(), "");
/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
/// ```
///
/// Note: a string that is already percent encoded will be encoded a second time
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/foo%2Fbar").as_ref(), "foo/foo%252Fbar");
/// ```
///
/// # Parse
///
/// A [`Path`] can instead be parsed from an existing string, which fails with
/// an error if the string is invalid. In contrast to the encoding methods
/// above, parsing accepts arbitrary unicode, including percent encoded
/// sequences.
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar");
/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed
/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed
/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed
/// ```
///
/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
/// [`LocalFileSystem`]: crate::local::LocalFileSystem
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Path {
    /// The delimiter-separated path, without leading or trailing delimiters
    raw: String,
}
158
159impl Path {
160    /// Parse a string as a [`Path`], returning a [`Error`] if invalid,
161    /// as defined on the docstring for [`Path`]
162    ///
163    /// Note: this will strip any leading `/` or trailing `/`
164    pub fn parse(path: impl AsRef<str>) -> Result<Self, Error> {
165        let path = path.as_ref();
166
167        let stripped = path.strip_prefix(DELIMITER).unwrap_or(path);
168        if stripped.is_empty() {
169            return Ok(Default::default());
170        }
171
172        let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped);
173
174        for segment in stripped.split(DELIMITER) {
175            if segment.is_empty() {
176                return Err(Error::EmptySegment { path: path.into() });
177            }
178
179            PathPart::parse(segment).map_err(|source| {
180                let path = path.into();
181                Error::BadSegment { source, path }
182            })?;
183        }
184
185        Ok(Self {
186            raw: stripped.to_string(),
187        })
188    }
189
190    #[cfg(not(target_arch = "wasm32"))]
191    /// Convert a filesystem path to a [`Path`] relative to the filesystem root
192    ///
193    /// This will return an error if the path contains illegal character sequences
194    /// as defined on the docstring for [`Path`] or does not exist
195    ///
196    /// Note: this will canonicalize the provided path, resolving any symlinks
197    pub fn from_filesystem_path(path: impl AsRef<std::path::Path>) -> Result<Self, Error> {
198        let absolute = std::fs::canonicalize(&path).map_err(|source| {
199            let path = path.as_ref().into();
200            Error::Canonicalize { source, path }
201        })?;
202
203        Self::from_absolute_path(absolute)
204    }
205
206    #[cfg(not(target_arch = "wasm32"))]
207    /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root
208    ///
209    /// This will return an error if the path contains illegal character sequences,
210    /// as defined on the docstring for [`Path`], or `base` is not an absolute path
211    pub fn from_absolute_path(path: impl AsRef<std::path::Path>) -> Result<Self, Error> {
212        Self::from_absolute_path_with_base(path, None)
213    }
214
215    #[cfg(not(target_arch = "wasm32"))]
216    /// Convert a filesystem path to a [`Path`] relative to the provided base
217    ///
218    /// This will return an error if the path contains illegal character sequences,
219    /// as defined on the docstring for [`Path`], or `base` does not refer to a parent
220    /// path of `path`, or `base` is not an absolute path
221    pub(crate) fn from_absolute_path_with_base(
222        path: impl AsRef<std::path::Path>,
223        base: Option<&Url>,
224    ) -> Result<Self, Error> {
225        let url = absolute_path_to_url(path)?;
226        let path = match base {
227            Some(prefix) => {
228                url.path()
229                    .strip_prefix(prefix.path())
230                    .ok_or_else(|| Error::PrefixMismatch {
231                        path: url.path().to_string(),
232                        prefix: prefix.to_string(),
233                    })?
234            }
235            None => url.path(),
236        };
237
238        // Reverse any percent encoding performed by conversion to URL
239        Self::from_url_path(path)
240    }
241
242    /// Parse a url encoded string as a [`Path`], returning a [`Error`] if invalid
243    ///
244    /// This will return an error if the path contains illegal character sequences
245    /// as defined on the docstring for [`Path`]
246    pub fn from_url_path(path: impl AsRef<str>) -> Result<Self, Error> {
247        let path = path.as_ref();
248        let decoded = percent_decode(path.as_bytes())
249            .decode_utf8()
250            .map_err(|source| {
251                let path = path.into();
252                Error::NonUnicode { source, path }
253            })?;
254
255        Self::parse(decoded)
256    }
257
258    /// Returns the [`PathPart`] of this [`Path`]
259    pub fn parts(&self) -> impl Iterator<Item = PathPart<'_>> {
260        self.raw
261            .split_terminator(DELIMITER)
262            .map(|s| PathPart { raw: s.into() })
263    }
264
265    /// Returns the last path segment containing the filename stored in this [`Path`]
266    pub fn filename(&self) -> Option<&str> {
267        match self.raw.is_empty() {
268            true => None,
269            false => self.raw.rsplit(DELIMITER).next(),
270        }
271    }
272
273    /// Returns the extension of the file stored in this [`Path`], if any
274    pub fn extension(&self) -> Option<&str> {
275        self.filename()
276            .and_then(|f| f.rsplit_once('.'))
277            .and_then(|(_, extension)| {
278                if extension.is_empty() {
279                    None
280                } else {
281                    Some(extension)
282                }
283            })
284    }
285
286    /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix`
287    ///
288    /// Returns `None` if the prefix does not match
289    pub fn prefix_match(&self, prefix: &Self) -> Option<impl Iterator<Item = PathPart<'_>> + '_> {
290        let mut stripped = self.raw.strip_prefix(&prefix.raw)?;
291        if !stripped.is_empty() && !prefix.raw.is_empty() {
292            stripped = stripped.strip_prefix(DELIMITER)?;
293        }
294        let iter = stripped
295            .split_terminator(DELIMITER)
296            .map(|x| PathPart { raw: x.into() });
297        Some(iter)
298    }
299
300    /// Returns true if this [`Path`] starts with `prefix`
301    pub fn prefix_matches(&self, prefix: &Self) -> bool {
302        self.prefix_match(prefix).is_some()
303    }
304
305    /// Creates a new child of this [`Path`]
306    pub fn child<'a>(&self, child: impl Into<PathPart<'a>>) -> Self {
307        let raw = match self.raw.is_empty() {
308            true => format!("{}", child.into().raw),
309            false => format!("{}{}{}", self.raw, DELIMITER, child.into().raw),
310        };
311
312        Self { raw }
313    }
314}
315
316impl AsRef<str> for Path {
317    fn as_ref(&self) -> &str {
318        &self.raw
319    }
320}
321
322impl From<&str> for Path {
323    fn from(path: &str) -> Self {
324        Self::from_iter(path.split(DELIMITER))
325    }
326}
327
328impl From<String> for Path {
329    fn from(path: String) -> Self {
330        Self::from_iter(path.split(DELIMITER))
331    }
332}
333
334impl From<Path> for String {
335    fn from(path: Path) -> Self {
336        path.raw
337    }
338}
339
340impl std::fmt::Display for Path {
341    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
342        self.raw.fmt(f)
343    }
344}
345
346impl<'a, I> FromIterator<I> for Path
347where
348    I: Into<PathPart<'a>>,
349{
350    fn from_iter<T: IntoIterator<Item = I>>(iter: T) -> Self {
351        let raw = T::into_iter(iter)
352            .map(|s| s.into())
353            .filter(|s| !s.raw.is_empty())
354            .map(|s| s.raw)
355            .join(DELIMITER);
356
357        Self { raw }
358    }
359}
360
361#[cfg(not(target_arch = "wasm32"))]
362/// Given an absolute filesystem path convert it to a URL representation without canonicalization
363pub(crate) fn absolute_path_to_url(path: impl AsRef<std::path::Path>) -> Result<Url, Error> {
364    Url::from_file_path(&path).map_err(|_| Error::InvalidPath {
365        path: path.as_ref().into(),
366    })
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    /// A path built from a single segment keeps that segment verbatim
    #[test]
    fn cloud_prefix_with_trailing_delimiter() {
        // Use case: files exist in object storage named `foo/bar.json` and
        // `foo_test.json`. A search for the prefix `foo/` should return
        // `foo/bar.json` but not `foo_test.json`.
        let prefix: Path = ["test"].into_iter().collect();
        assert_eq!(prefix.as_ref(), "test");
    }

    /// Building from segments percent encodes delimiters and `%` inside a segment
    #[test]
    fn push_encodes() {
        let segments = ["foo/bar", "baz%2Ftest"];
        let location = Path::from_iter(segments);
        assert_eq!(location.as_ref(), "foo%2Fbar/baz%252Ftest");
    }

    /// `Path::parse` strips one leading/trailing delimiter and rejects empty segments
    #[test]
    fn test_parse() {
        let accepted = [
            ("/", ""),
            ("", ""),
            ("/foo/bar/", "foo/bar"),
            ("foo/bar/", "foo/bar"),
            ("foo/bar", "foo/bar"),
        ];
        for (input, expected) in accepted {
            assert_eq!(Path::parse(input).unwrap().as_ref(), expected);
        }

        for input in ["//", "foo///bar"] {
            let err = Path::parse(input).unwrap_err();
            assert!(matches!(err, Error::EmptySegment { .. }));
        }
    }

    /// `Path::from` on a joined string agrees with `Path::from_iter` on its segments
    #[test]
    fn convert_raw_before_partial_eq() {
        let cases = [
            // dir and file_name
            ("test_dir/test_file.json", vec!["test_dir", "test_file.json"]),
            // dir and file_name w/o dot
            ("test_dir/test_file", vec!["test_dir", "test_file"]),
            // dir, no file
            ("test_dir/", vec!["test_dir"]),
            // file_name, no dir
            ("test_file.json", vec!["test_file.json"]),
            // empty
            ("", vec!["", ""]),
        ];

        for (joined, segments) in cases {
            let cloud = Path::from(joined);
            let built = Path::from_iter(segments);
            assert_eq!(built, cloud);
        }
    }

    /// `prefix_match` yields the parts following a whole-segment prefix
    #[test]
    fn parts_after_prefix_behavior() {
        fn to_parts<'a>(names: &[&'a str]) -> Vec<PathPart<'a>> {
            names.iter().map(|name| PathPart::from(*name)).collect()
        }

        let existing_path = Path::from("apple/bear/cow/dog/egg.json");

        // Prefix with one directory
        let one_dir = Path::from("apple");
        let remainder: Vec<_> = existing_path.prefix_match(&one_dir).unwrap().collect();
        assert_eq!(remainder, to_parts(&["bear", "cow", "dog", "egg.json"]));

        // Prefix with two directories
        let two_dirs = Path::from("apple/bear");
        let remainder: Vec<_> = existing_path.prefix_match(&two_dirs).unwrap().collect();
        assert_eq!(remainder, to_parts(&["cow", "dog", "egg.json"]));

        // Not a prefix
        let unrelated = Path::from("cow");
        assert!(existing_path.prefix_match(&unrelated).is_none());

        // Prefix with a partial directory
        let partial = Path::from("ap");
        assert!(existing_path.prefix_match(&partial).is_none());

        // Prefix matches but there aren't any parts after it
        let existing = Path::from("apple/bear/cow/dog");

        assert_eq!(existing.prefix_match(&existing).unwrap().count(), 0);
        assert_eq!(Path::default().parts().count(), 0);
    }

    /// `prefix_matches` accepts whole-segment prefixes only
    #[test]
    fn prefix_matches() {
        let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]);

        // self starts with self
        let starts_with_self = haystack.prefix_matches(&haystack);
        assert!(
            starts_with_self,
            "{haystack:?} should have started with {haystack:?}"
        );

        // a longer prefix doesn't match
        let needle = haystack.child("longer now");
        let matched = haystack.prefix_matches(&needle);
        assert!(
            !matched,
            "{haystack:?} shouldn't have started with {needle:?}"
        );

        // one dir prefix matches
        let needle = Path::from_iter(["foo/bar"]);
        let matched = haystack.prefix_matches(&needle);
        assert!(
            matched,
            "{haystack:?} should have started with {needle:?}"
        );

        // two dir prefix matches
        let needle = needle.child("baz%2Ftest");
        let matched = haystack.prefix_matches(&needle);
        assert!(
            matched,
            "{haystack:?} should have started with {needle:?}"
        );

        // partial dir prefix doesn't match
        let needle = Path::from_iter(["f"]);
        let matched = haystack.prefix_matches(&needle);
        assert!(
            !matched,
            "{haystack:?} should not have started with {needle:?}"
        );

        // one dir and one partial dir doesn't match
        let needle = Path::from_iter(["foo/bar", "baz"]);
        let matched = haystack.prefix_matches(&needle);
        assert!(
            !matched,
            "{haystack:?} should not have started with {needle:?}"
        );

        // empty prefix matches
        let needle = Path::from("");
        let matched = haystack.prefix_matches(&needle);
        assert!(
            matched,
            "{haystack:?} should have started with {needle:?}"
        );
    }

    /// A prefix ending in a partial segment never matches
    #[test]
    fn prefix_matches_with_file_name() {
        let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]);

        let rejected = vec![
            // All directories match and file name is a prefix
            vec!["foo/bar", "baz%2Ftest", "something", "foo"],
            // All directories match but file name is not a prefix
            vec!["foo/bar", "baz%2Ftest", "something", "e"],
            // Not all directories match; file name is a prefix of the next directory; this
            // does not match
            vec!["foo/bar", "baz%2Ftest", "s"],
            // Not all directories match; file name is NOT a prefix of the next directory;
            // no match
            vec!["foo/bar", "baz%2Ftest", "p"],
        ];

        for segments in rejected {
            let needle = Path::from_iter(segments);

            assert!(
                !haystack.prefix_matches(&needle),
                "{haystack:?} should not have started with {needle:?}"
            );
        }
    }

    /// Spaces are left untouched by every construction route
    #[test]
    fn path_containing_spaces() {
        let from_segments = Path::from_iter(["foo bar", "baz"]);
        let from_str = Path::from("foo bar/baz");
        let parsed = Path::parse("foo bar/baz").unwrap();

        assert_eq!(from_segments.raw, "foo bar/baz");
        assert_eq!(from_segments.raw, from_str.raw);
        assert_eq!(from_str.raw, parsed.raw);
    }

    /// `from_url_path` percent decodes once, then validates like `parse`
    #[test]
    fn from_url_path() {
        let decodable = [
            ("foo%20bar", "foo bar"),
            ("foo%2F%252E%252E%2Fbar", "foo/%2E%2E/bar"),
            ("foo/%252E%252E/bar", "foo/%2E%2E/bar"),
            ("%48%45%4C%4C%4F", "HELLO"),
        ];
        for (encoded, expected) in decodable {
            let decoded = Path::from_url_path(encoded).unwrap();
            assert_eq!(decoded.raw, expected);
        }

        // Decodes to a relative segment, which is rejected
        let relative = Path::from_url_path("foo/%2E%2E/bar").unwrap_err();
        assert!(matches!(relative, Error::BadSegment { .. }));

        // Decodes to bytes that are not valid UTF-8
        let not_utf8 = Path::from_url_path("foo/%FF/as").unwrap_err();
        assert!(matches!(not_utf8, Error::NonUnicode { .. }));
    }

    /// `filename` is the last segment of the path
    #[test]
    fn filename_from_path() {
        let cases = [
            ("foo/bar", "bar"),
            ("foo/bar.baz", "bar.baz"),
            ("foo.bar/baz", "baz"),
        ];
        for (input, expected) in cases {
            assert_eq!(Path::from(input).filename(), Some(expected));
        }
    }

    /// `extension` only considers the last segment of the path
    #[test]
    fn file_extension() {
        let cases = [
            ("foo/bar", None),
            ("foo/bar.baz", Some("baz")),
            ("foo.bar/baz", None),
            ("foo.bar/baz.qux", Some("qux")),
        ];
        for (input, expected) in cases {
            assert_eq!(Path::from(input).extension(), expected);
        }
    }
}