// asciidoc_parser/parser/path_resolver.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4
/// Resolves, cleans, and joins paths without relying on the platform's path
/// utilities.
///
/// A `PathResolver` is meant to cover both web paths (request URIs) and system
/// paths. Its emphasis is on producing paths that are:
///
/// * **clean** – void of duplicate parent and current directory references in
///   the path name, and
/// * **secure** – restricted from accessing directories outside of a jail
///   path, if one is specified.
///
/// Because joining two paths can result in an insecure path, joining a parent
/// (start) path with a child (target) path is also handled here.
///
/// Like its counterpart in the Ruby Asciidoctor implementation, this struct
/// handles every aspect of path manipulation itself instead of delegating to
/// the path utilities of the underlying Rust libraries. That allows it to
/// process both Posix and Windows paths independent of the operating system
/// on which it runs, which keeps the struct deterministic and easier to test.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PathResolver {
    /// File separator used for path operations. The `Default` implementation
    /// picks the separator appropriate for the current platform.
    pub file_separator: char,
    // TO DO: Port this from Ruby?
    // attr_accessor :working_dir
}
31
32impl Default for PathResolver {
33    fn default() -> Self {
34        Self {
35            file_separator: std::path::MAIN_SEPARATOR,
36        }
37    }
38}
39
40impl PathResolver {
41    /// Normalize path by converting any backslashes to forward slashes.
42    pub fn posixify(&self, path: &str) -> String {
43        if self.file_separator == '\\' && path.contains('\\') {
44            path.replace('\\', "/")
45        } else {
46            path.to_string()
47        }
48    }
49
50    /// Resolve a web path from the target and start paths.
51    ///
52    /// The main function of this operation is to resolve any parent references
53    /// and remove any self references.
54    ///
55    /// The target is assumed to be a path, not a qualified URI. That check
56    /// should happen before this method is invoked.
57    ///
58    /// Returns a path that joins the target path with the start path with any
59    /// parent references resolved and self references removed.
60    pub fn web_path(&self, target: &str, start: Option<&str>) -> String {
61        let mut target = self.posixify(target);
62        let start = start.map(|start| self.posixify(start));
63
64        let mut uri_prefix: Option<String> = None;
65
66        if !(start.is_none() || self.is_web_root(&target)) {
67            (target, uri_prefix) = extract_uri_prefix(&format!(
68                "{start}{maybe_add_slash}{target}",
69                start = start.as_deref().unwrap_or_default(),
70                maybe_add_slash = start
71                    .as_ref()
72                    .map(|s| if s.ends_with("/") { "" } else { "/" })
73                    .unwrap_or_default()
74            ));
75        }
76
77        let (target_segments, target_root) = self.partition_path(&target, WebPath(true));
78
79        let mut resolved_segments: Vec<String> = vec![];
80
81        for segment in target_segments {
82            if segment == ".." {
83                if resolved_segments.is_empty() {
84                    if let Some(target_root) = target_root.as_ref()
85                        && target_root != "./"
86                    {
87                        // Do nothing.
88                    } else {
89                        resolved_segments.push(segment);
90                    }
91                } else if let Some(last_segment) = resolved_segments.last()
92                    && last_segment == ".."
93                {
94                    resolved_segments.push(segment);
95                } else {
96                    resolved_segments.pop();
97                }
98            } else {
99                resolved_segments.push(segment);
100            }
101        }
102
103        let resolved_path = self
104            .join_path(&resolved_segments, target_root.as_deref())
105            .replace(" ", "%20");
106
107        format!(
108            "{uri_prefix}{resolved_path}",
109            uri_prefix = uri_prefix.unwrap_or_default()
110        )
111    }
112
113    /// Partition the path into path segments and remove self references (`.`)
114    /// and the trailing slash, if present. Prior to being partitioned, the path
115    /// is converted to a Posix path.
116    ///
117    /// Parent references are not resolved by this method since the caller often
118    /// needs to handle this resolution in a certain context (checking for the
119    /// breach of a jail, for instance).
120    ///
121    /// Returns a 2-item tuple containing a `Vec<String>` of path segments and
122    /// an optional path root (e.g., `/`, `./`, `c:/`, or `//`), which is only
123    /// present if the path is absolute.
124    fn partition_path(&self, path: &str, web: WebPath) -> (Vec<String>, Option<String>) {
125        // TO DO: Add cache implementation?
126
127        let posix_path = self.posixify(path);
128
129        let root: Option<String> = if web.0 {
130            if self.is_web_root(&posix_path) {
131                Some("/".to_owned())
132            } else if posix_path.starts_with("./") {
133                Some("./".to_owned())
134            } else {
135                None
136            }
137        } else {
138            todo!(
139                "Port this: {}",
140                r#"
141				elsif root? posix_path
142				  # ex. //sample/path
143				  if unc? posix_path
144					root = DOUBLE_SLASH
145				  # ex. /sample/path
146				  elsif posix_path.start_with? SLASH
147					root = SLASH
148				  # ex. uri:classloader:sample/path (or uri:classloader:/sample/path)
149				  elsif posix_path.start_with? URI_CLASSLOADER
150					root = posix_path.slice 0, URI_CLASSLOADER.length
151				  # ex. C:/sample/path (or file:///sample/path in browser environment)
152				  else
153					root = posix_path.slice 0, (posix_path.index SLASH) + 1
154				  end
155				# ex. ./sample/path
156				elsif posix_path.start_with? DOT_SLASH
157				  root = DOT_SLASH
158				end
159				# otherwise ex. sample/path
160                "#
161            );
162        };
163
164        let path_after_root = if let Some(root) = &root {
165            &posix_path[root.len()..]
166        } else {
167            &posix_path
168        };
169
170        let path_segments: Vec<String> = path_after_root
171            .split('/')
172            .filter(|s| *s != ".")
173            .map(|s| s.to_owned())
174            .collect();
175
176        // TO DO: Add cache write?
177
178        (path_segments, root)
179    }
180
181    /// Join the segments using the Posix file separator (since this crate knows
182    /// how to work with paths specified this way, regardless of OS). Use the
183    /// `root`, if specified, to construct an absolute path. Otherwise join the
184    /// segments as a relative path.
185    fn join_path(&self, segments: &[String], root: Option<&str>) -> String {
186        format!(
187            "{root}{segments}",
188            root = root.unwrap_or_default(),
189            segments = segments.join("/"),
190        )
191    }
192
193    /// Return `true` if the path is an absolute (root) web path (i.e. starts
194    /// with a `'/'`.
195    pub fn is_web_root(&self, path: &str) -> bool {
196        path.starts_with('/')
197    }
198}
199
200/// Efficiently extracts the URI prefix from the specified string if the string
201/// is a URI.
202///
203/// Attempts to match the URI prefix in the specified string (e.g., `http://`). If present, the prefix is removed.
204///
205/// Returns a tuple containing the specified string without the URI prefix, if
206/// present, and the extracted URI prefix if found.
207fn extract_uri_prefix(s: &str) -> (String, Option<String>) {
208    if s.contains(':')
209        && let Some(prefix) = URI_SNIFF.find(s)
210    {
211        (
212            s[prefix.len()..].to_string(),
213            Some(prefix.as_str().to_owned()),
214        )
215    } else {
216        (s.to_string(), None)
217    }
218}
219
220// Also: Place this at module scope:
221static URI_SNIFF: LazyLock<Regex> = LazyLock::new(|| {
222    #[allow(clippy::unwrap_used)]
223    Regex::new(
224        r#"(?x)
225        ^                   # Anchor: start of string
226
227        \p{Alphabetic}      # First character: a Unicode letter
228
229        [\p{Alphabetic}     # Followed by one or more of:
230        \p{Number}         #   - Unicode letters or numbers
231        .                  #   - Period
232        \+                 #   - Plus sign
233        \-                 #   - Hyphen
234        ]+                  # One or more of the above
235
236        :                   # Followed by a literal colon
237
238        /{0,2}              # Followed by zero, one, or two literal slashes
239    "#,
240    )
241    .unwrap()
242});
243
/// Flag passed to `PathResolver::partition_path` to select how a path is
/// partitioned: `WebPath(true)` requests web-path handling, where the root is
/// `/` or `./`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct WebPath(pub(crate) bool);
// asciidoc_parser/parser/path_resolver.rs