asciidoc_parser/parser/
path_resolver.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4
/// Resolves, cleans, and joins paths without relying on the host platform's
/// path utilities. It is meant to serve both web paths (request URIs) and
/// system paths.
///
/// The emphasis is on producing paths that are clean and secure. A clean path
/// carries no redundant parent (`..`) or current directory (`.`) references.
/// A secure path cannot reach directories outside of a jail path, when one is
/// specified.
///
/// Because joining two paths can produce an insecure path, joining a parent
/// (start) path with a child (target) path is also this struct's job.
///
/// Like its counterpart in the Ruby Asciidoctor implementation, this struct
/// does all path manipulation itself rather than delegating to path utilities
/// from the underlying Rust libraries. That allows it to handle Posix and
/// Windows paths independently of the operating system it runs on, which
/// makes the struct both deterministic and easier to test.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PathResolver {
    /// Separator character used for path operations. The [`Default`]
    /// implementation picks the platform-appropriate separator.
    pub file_separator: char,
    // TO DO: Port this from Ruby?
    // attr_accessor :working_dir
}

impl Default for PathResolver {
    /// Build a resolver that uses the platform's main path separator.
    fn default() -> Self {
        use std::path::MAIN_SEPARATOR;

        PathResolver {
            file_separator: MAIN_SEPARATOR,
        }
    }
}
39
40impl PathResolver {
41    /// Normalize path by converting any backslashes to forward slashes.
42    pub fn posixify(&self, path: &str) -> String {
43        if self.file_separator == '\\' && path.contains('\\') {
44            path.replace('\\', "/")
45        } else {
46            path.to_string()
47        }
48    }
49
50    /// Resolve a web path from the target and start paths.
51    ///
52    /// The main function of this operation is to resolve any parent references
53    /// and remove any self references.
54    ///
55    /// The target is assumed to be a path, not a qualified URI. That check
56    /// should happen before this method is invoked.
57    ///
58    /// Returns a path that joins the target path with the start path with any
59    /// parent references resolved and self references removed.
60    pub fn web_path(&self, target: &str, start: Option<&str>) -> String {
61        let mut target = self.posixify(target);
62        let start = start.map(|start| self.posixify(start));
63
64        let mut uri_prefix: Option<String> = None;
65
66        if !(start.is_none() || self.is_web_root(&target)) {
67            (target, uri_prefix) = extract_uri_prefix(&format!(
68                "{start}{maybe_add_slash}{target}",
69                start = start.as_deref().unwrap_or_default(),
70                maybe_add_slash = start
71                    .as_ref()
72                    .map(|s| if s.ends_with("/") { "" } else { "/" })
73                    .unwrap_or_default()
74            ));
75        }
76
77        let (target_segments, target_root) = self.partition_path(&target, WebPath(true));
78
79        let mut resolved_segments: Vec<String> = vec![];
80
81        for segment in target_segments {
82            if segment == ".." {
83                if resolved_segments.is_empty() {
84                    if let Some(target_root) = target_root.as_ref()
85                        && target_root != "./"
86                    {
87                        // Do nothing.
88                    } else {
89                        resolved_segments.push(segment);
90                    }
91                } else if let Some(last_segment) = resolved_segments.last()
92                    && last_segment == ".."
93                {
94                    resolved_segments.push(segment);
95                } else {
96                    resolved_segments.pop();
97                }
98            } else {
99                resolved_segments.push(segment);
100            }
101        }
102
103        let resolved_path = self
104            .join_path(&resolved_segments, target_root.as_deref())
105            .replace(" ", "%20");
106
107        format!(
108            "{uri_prefix}{resolved_path}",
109            uri_prefix = uri_prefix.unwrap_or_default()
110        )
111    }
112
113    /// Partition the path into path segments and remove self references (`.`)
114    /// and the trailing slash, if present. Prior to being partitioned, the path
115    /// is converted to a Posix path.
116    ///
117    /// Parent references are not resolved by this method since the caller often
118    /// needs to handle this resolution in a certain context (checking for the
119    /// breach of a jail, for instance).
120    ///
121    /// Returns a 2-item tuple containing a `Vec<String>` of path segments and
122    /// an optional path root (e.g., `/`, `./`, `c:/`, or `//`), which is only
123    /// present if the path is absolute.
124    fn partition_path(&self, path: &str, web: WebPath) -> (Vec<String>, Option<String>) {
125        // TO DO: Add cache implementation?
126
127        let posix_path = self.posixify(path);
128
129        let root: Option<String> = if web.0 {
130            if self.is_web_root(&posix_path) {
131                Some("/".to_owned())
132            } else if posix_path.starts_with("./") {
133                Some("./".to_owned())
134            } else {
135                None
136            }
137        } else {
138            todo!(
139                "Port this: {}",
140                r#"
141				elsif root? posix_path
142				  # ex. //sample/path
143				  if unc? posix_path
144					root = DOUBLE_SLASH
145				  # ex. /sample/path
146				  elsif posix_path.start_with? SLASH
147					root = SLASH
148				  # ex. uri:classloader:sample/path (or uri:classloader:/sample/path)
149				  elsif posix_path.start_with? URI_CLASSLOADER
150					root = posix_path.slice 0, URI_CLASSLOADER.length
151				  # ex. C:/sample/path (or file:///sample/path in browser environment)
152				  else
153					root = posix_path.slice 0, (posix_path.index SLASH) + 1
154				  end
155				# ex. ./sample/path
156				elsif posix_path.start_with? DOT_SLASH
157				  root = DOT_SLASH
158				end
159				# otherwise ex. sample/path
160                "#
161            );
162        };
163
164        let path_after_root = if let Some(root) = &root {
165            &posix_path[root.len()..]
166        } else {
167            &posix_path
168        };
169
170        let path_segments: Vec<String> = path_after_root
171            .split('/')
172            .filter(|s| *s != ".")
173            .map(|s| s.to_owned())
174            .collect();
175
176        // TO DO: Add cache write?
177
178        (path_segments, root)
179    }
180
181    /// Join the segments using the Posix file separator (since this crate knows
182    /// how to work with paths specified this way, regardless of OS). Use the
183    /// `root`, if specified, to construct an absolute path. Otherwise join the
184    /// segments as a relative path.
185    fn join_path(&self, segments: &[String], root: Option<&str>) -> String {
186        format!(
187            "{root}{segments}",
188            root = root.unwrap_or_default(),
189            segments = segments.join("/"),
190        )
191    }
192
193    /// Return `true` if the path is an absolute (root) web path (i.e. starts
194    /// with a `'/'`.
195    pub fn is_web_root(&self, path: &str) -> bool {
196        path.starts_with('/')
197    }
198}
199
200/// Efficiently extracts the URI prefix from the specified string if the string
201/// is a URI.
202///
203/// Attempts to match the URI prefix in the specified string (e.g., `http://`). If present, the prefix is removed.
204///
205/// Returns a tuple containing the specified string without the URI prefix, if
206/// present, and the extracted URI prefix if found.
207fn extract_uri_prefix(s: &str) -> (String, Option<String>) {
208    if s.contains(':')
209        && let Some(prefix) = URI_SNIFF.find(s)
210    {
211        (
212            s[prefix.len()..].to_string(),
213            Some(prefix.as_str().to_owned()),
214        )
215    } else {
216        (s.to_string(), None)
217    }
218}
219
/// Pattern used by `extract_uri_prefix` to detect a URI prefix at the start of
/// a string: a letter, one or more further scheme characters, a colon, and up
/// to two slashes (e.g., `http://` or `mailto:`).
///
/// The scheme must be at least two characters long, so a single letter
/// followed by a colon (such as the `C:` of a Windows drive) does not match.
///
/// The pattern is compiled lazily, on first use, and shared afterwards.
static URI_SNIFF: LazyLock<Regex> = LazyLock::new(|| {
    // The pattern is a fixed literal written in verbose (`(?x)`) mode, which
    // is what lets it carry the inline `#` comments below.
    #[allow(clippy::unwrap_used)]
    Regex::new(
        r#"(?x)
        ^                   # Anchor: start of string

        \p{Alphabetic}      # First character: a Unicode letter

        [\p{Alphabetic}     # Followed by one or more of:
        \p{Number}         #   - Unicode letters or numbers
        .                  #   - Period
        \+                 #   - Plus sign
        \-                 #   - Hyphen
        ]+                  # One or more of the above

        :                   # Followed by a literal colon

        /{0,2}              # Followed by zero, one, or two literal slashes
    "#,
    )
    .unwrap()
});
243
/// Flag passed to `PathResolver::partition_path` to select how a path is
/// interpreted: `WebPath(true)` requests web-path handling, while
/// `WebPath(false)` selects the system-path branch, which has not been ported
/// yet and currently hits `todo!`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct WebPath(pub(crate) bool);
246
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]

    use crate::parser::PathResolver;

    mod posixify {
        use pretty_assertions_sorted::assert_eq;

        use crate::parser::PathResolver;

        /// Build a resolver with an explicit separator so these tests do not
        /// depend on the platform they run on.
        fn resolver_with(file_separator: char) -> PathResolver {
            PathResolver { file_separator }
        }

        #[test]
        fn replaces_backslashes_if_windowsish() {
            let converted = resolver_with('\\').posixify("abc/def\\ghi");
            assert_eq!(converted, "abc/def/ghi");
        }

        #[test]
        fn doesnt_replace_backslashes_if_posixish() {
            let converted = resolver_with('/').posixify("abc/def\\ghi");
            assert_eq!(converted, "abc/def\\ghi");
        }

        #[test]
        fn doesnt_replace_backslashes_if_none_exist() {
            let converted = resolver_with('\\').posixify("abc/def");
            assert_eq!(converted, "abc/def");
        }
    }

    mod web_path {
        use pretty_assertions_sorted::assert_eq;

        use crate::parser::PathResolver;

        /// One row per scenario: `(target, start, expected)`.
        const CASES: &[(&str, Option<&str>, &str)] = &[
            ("images", None, "images"),
            ("./images", None, "./images"),
            ("/images", None, "/images"),
            ("./images/../assets/images", None, "./assets/images"),
            ("/../images", None, "/images"),
            ("/../images", Some("assets"), "/images"),
            ("../images", Some("./"), "./../images"),
            ("../../images", Some("./"), "./../../images"),
            (
                "tiger.png",
                Some("../assets/images"),
                "../assets/images/tiger.png",
            ),
            // Basic relative path resolution.
            (
                "images/photo.jpg",
                Some("docs/guide"),
                "docs/guide/images/photo.jpg",
            ),
            ("photo.jpg", Some("images"), "images/photo.jpg"),
            ("../photo.jpg", Some("images/folder"), "images/photo.jpg"),
            (
                "../../photo.jpg",
                Some("docs/images/folder"),
                "docs/photo.jpg",
            ),
            // URI-based scenarios (triggers `extract_uri_prefix`).
            (
                "images/photo.jpg",
                Some("http://example.com/base"),
                "http://example.com/base/images/photo.jpg",
            ),
            (
                "../images/logo.png",
                Some("https://cdn.example.com/assets"),
                "https://cdn.example.com/images/logo.png",
            ),
            (
                "docs/guide.pdf",
                Some("file:///Users/docs"),
                "file:///Users/docs/docs/guide.pdf",
            ),
            (
                "assets/style.css",
                Some("ftp://files.example.com/web"),
                "ftp://files.example.com/web/assets/style.css",
            ),
            // Web root scenarios (start parameter ignored).
            (
                "/absolute/path.jpg",
                Some("http://example.com/base"),
                "/absolute/path.jpg",
            ),
            ("/images/photo.jpg", Some("docs/guide"), "/images/photo.jpg"),
            ("/", Some("any/path"), "/"),
            // No start path scenarios.
            ("images/photo.jpg", None, "images/photo.jpg"),
            ("../photo.jpg", None, "../photo.jpg"),
            // Path normalization with dots.
            ("./photo.jpg", Some("images"), "images/photo.jpg"),
            (
                "folder/./photo.jpg",
                Some("images"),
                "images/folder/photo.jpg",
            ),
            ("folder/../photo.jpg", Some("images"), "images/photo.jpg"),
            // Complex path resolution.
            (
                "../../../photo.jpg",
                Some("docs/images/folder/sub"),
                "docs/photo.jpg",
            ),
            (
                "folder/../../photo.jpg",
                Some("docs/images"),
                "docs/photo.jpg",
            ),
            ("./folder/../photo.jpg", Some("images"), "images/photo.jpg"),
            // Edge cases with trailing slashes.
            ("photo.jpg", Some("images/"), "images/photo.jpg"),
            ("photo.jpg", Some("images"), "images/photo.jpg"),
            // URLs with paths and parent references.
            (
                "../styles/main.css",
                Some("https://example.com/assets/css"),
                "https://example.com/assets/styles/main.css",
            ),
            (
                "../../images/logo.png",
                Some("http://site.com/docs/guide/examples"),
                "http://site.com/docs/images/logo.png",
            ),
            // Space handling (gets URL encoded).
            ("my file.jpg", Some("images"), "images/my%20file.jpg"),
            (
                "folder with spaces/file.jpg",
                Some("docs"),
                "docs/folder%20with%20spaces/file.jpg",
            ),
            // Protocol-less absolute paths.
            (
                "//cdn.example.com/assets/image.jpg",
                Some("http://example.com"),
                "//cdn.example.com/assets/image.jpg",
            ),
            // Mixed scenarios.
            ("", Some("docs/images"), "docs/images/"),
            ("", Some(""), "/"),
            ("", None, ""),
            // Complex URI scenarios.
            (
                "api/v1/data",
                Some("https://api.example.com:8080/base"),
                "https://api.example.com:8080/base/api/v1/data",
            ),
            (
                "../v2/data",
                Some("https://api.example.com/api/v1"),
                "https://api.example.com/api/v2/data",
            ),
            // File protocol variations.
            (
                "document.pdf",
                Some("file:///C:/Users/docs"),
                "file:///C:/Users/docs/document.pdf",
            ),
            (
                "../shared/doc.pdf",
                Some("file:///home/user/documents"),
                "file:///home/user/shared/doc.pdf",
            ),
        ];

        #[test]
        fn test_cases_from_asciidoctor_rb() {
            let pr = PathResolver::default();

            for &(target, start, expected) in CASES {
                let actual = pr.web_path(target, start);

                // The inputs ride along in the compared tuples so that a
                // failure report identifies the scenario that broke.
                assert_eq!(
                    (target, start, actual.as_str()),
                    (target, start, expected)
                );
            }
        }
    }

    #[test]
    fn is_web_root() {
        let pr = PathResolver::default();

        for (path, expected) in [("/blah", true), ("", false), ("./blah", false)] {
            assert_eq!(pr.is_web_root(path), expected, "is_web_root({path:?})");
        }
    }
}
464}