asciidoc_parser/parser/path_resolver.rs
1use std::sync::LazyLock;
2
3use regex::Regex;
4
/// A `PathResolver` performs the resolving, cleaning, and joining of paths.
/// It is designed to cover both web paths (request URIs) and system paths.
///
/// The emphasis is on producing paths that are clean and secure. A clean path
/// carries no redundant parent (`..`) or current-directory (`.`) references.
/// A secure path is one that cannot reach directories outside of a jail path,
/// when a jail is specified.
///
/// Because joining two paths may yield an insecure path, joining a parent
/// (start) path with a child (target) path is also this struct's job.
///
/// As with its counterpart in the Ruby Asciidoctor implementation, no path
/// utilities from the underlying Rust libraries are used; every aspect of
/// path manipulation is handled here. Internalizing these operations lets the
/// struct treat Posix and Windows paths the same way regardless of the
/// operating system it runs on, which makes the struct both deterministic and
/// easier to test.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PathResolver {
    /// Separator character used for path operations. (The `Default`
    /// implementation picks the platform-appropriate separator.)
    pub file_separator: char,
    // TO DO: Port this from Ruby?
    // attr_accessor :working_dir
}
31
32impl Default for PathResolver {
33 fn default() -> Self {
34 Self {
35 file_separator: std::path::MAIN_SEPARATOR,
36 }
37 }
38}
39
40impl PathResolver {
41 /// Normalize path by converting any backslashes to forward slashes.
42 pub fn posixify(&self, path: &str) -> String {
43 if self.file_separator == '\\' && path.contains('\\') {
44 path.replace('\\', "/")
45 } else {
46 path.to_string()
47 }
48 }
49
50 /// Resolve a web path from the target and start paths.
51 ///
52 /// The main function of this operation is to resolve any parent references
53 /// and remove any self references.
54 ///
55 /// The target is assumed to be a path, not a qualified URI. That check
56 /// should happen before this method is invoked.
57 ///
58 /// Returns a path that joins the target path with the start path with any
59 /// parent references resolved and self references removed.
60 pub fn web_path(&self, target: &str, start: Option<&str>) -> String {
61 let mut target = self.posixify(target);
62 let start = start.map(|start| self.posixify(start));
63
64 let mut uri_prefix: Option<String> = None;
65
66 if !(start.is_none() || self.is_web_root(&target)) {
67 (target, uri_prefix) = extract_uri_prefix(&format!(
68 "{start}{maybe_add_slash}{target}",
69 start = start.as_deref().unwrap_or_default(),
70 maybe_add_slash = start
71 .as_ref()
72 .map(|s| if s.ends_with("/") { "" } else { "/" })
73 .unwrap_or_default()
74 ));
75 }
76
77 let (target_segments, target_root) = self.partition_path(&target, WebPath(true));
78
79 let mut resolved_segments: Vec<String> = vec![];
80
81 for segment in target_segments {
82 if segment == ".." {
83 if resolved_segments.is_empty() {
84 if let Some(target_root) = target_root.as_ref()
85 && target_root != "./"
86 {
87 // Do nothing.
88 } else {
89 resolved_segments.push(segment);
90 }
91 } else if let Some(last_segment) = resolved_segments.last()
92 && last_segment == ".."
93 {
94 resolved_segments.push(segment);
95 } else {
96 resolved_segments.pop();
97 }
98 } else {
99 resolved_segments.push(segment);
100 }
101 }
102
103 let resolved_path = self
104 .join_path(&resolved_segments, target_root.as_deref())
105 .replace(" ", "%20");
106
107 format!(
108 "{uri_prefix}{resolved_path}",
109 uri_prefix = uri_prefix.unwrap_or_default()
110 )
111 }
112
113 /// Partition the path into path segments and remove self references (`.`)
114 /// and the trailing slash, if present. Prior to being partitioned, the path
115 /// is converted to a Posix path.
116 ///
117 /// Parent references are not resolved by this method since the caller often
118 /// needs to handle this resolution in a certain context (checking for the
119 /// breach of a jail, for instance).
120 ///
121 /// Returns a 2-item tuple containing a `Vec<String>` of path segments and
122 /// an optional path root (e.g., `/`, `./`, `c:/`, or `//`), which is only
123 /// present if the path is absolute.
124 fn partition_path(&self, path: &str, web: WebPath) -> (Vec<String>, Option<String>) {
125 // TO DO: Add cache implementation?
126
127 let posix_path = self.posixify(path);
128
129 let root: Option<String> = if web.0 {
130 if self.is_web_root(&posix_path) {
131 Some("/".to_owned())
132 } else if posix_path.starts_with("./") {
133 Some("./".to_owned())
134 } else {
135 None
136 }
137 } else {
138 todo!(
139 "Port this: {}",
140 r#"
141 elsif root? posix_path
142 # ex. //sample/path
143 if unc? posix_path
144 root = DOUBLE_SLASH
145 # ex. /sample/path
146 elsif posix_path.start_with? SLASH
147 root = SLASH
148 # ex. uri:classloader:sample/path (or uri:classloader:/sample/path)
149 elsif posix_path.start_with? URI_CLASSLOADER
150 root = posix_path.slice 0, URI_CLASSLOADER.length
151 # ex. C:/sample/path (or file:///sample/path in browser environment)
152 else
153 root = posix_path.slice 0, (posix_path.index SLASH) + 1
154 end
155 # ex. ./sample/path
156 elsif posix_path.start_with? DOT_SLASH
157 root = DOT_SLASH
158 end
159 # otherwise ex. sample/path
160 "#
161 );
162 };
163
164 let path_after_root = if let Some(root) = &root {
165 &posix_path[root.len()..]
166 } else {
167 &posix_path
168 };
169
170 let path_segments: Vec<String> = path_after_root
171 .split('/')
172 .filter(|s| *s != ".")
173 .map(|s| s.to_owned())
174 .collect();
175
176 // TO DO: Add cache write?
177
178 (path_segments, root)
179 }
180
181 /// Join the segments using the Posix file separator (since this crate knows
182 /// how to work with paths specified this way, regardless of OS). Use the
183 /// `root`, if specified, to construct an absolute path. Otherwise join the
184 /// segments as a relative path.
185 fn join_path(&self, segments: &[String], root: Option<&str>) -> String {
186 format!(
187 "{root}{segments}",
188 root = root.unwrap_or_default(),
189 segments = segments.join("/"),
190 )
191 }
192
193 /// Return `true` if the path is an absolute (root) web path (i.e. starts
194 /// with a `'/'`.
195 pub fn is_web_root(&self, path: &str) -> bool {
196 path.starts_with('/')
197 }
198}
199
200/// Efficiently extracts the URI prefix from the specified string if the string
201/// is a URI.
202///
203/// Attempts to match the URI prefix in the specified string (e.g., `http://`). If present, the prefix is removed.
204///
205/// Returns a tuple containing the specified string without the URI prefix, if
206/// present, and the extracted URI prefix if found.
207fn extract_uri_prefix(s: &str) -> (String, Option<String>) {
208 if s.contains(':')
209 && let Some(prefix) = URI_SNIFF.find(s)
210 {
211 (
212 s[prefix.len()..].to_string(),
213 Some(prefix.as_str().to_owned()),
214 )
215 } else {
216 (s.to_string(), None)
217 }
218}
219
220// Also: Place this at module scope:
221static URI_SNIFF: LazyLock<Regex> = LazyLock::new(|| {
222 #[allow(clippy::unwrap_used)]
223 Regex::new(
224 r#"(?x)
225 ^ # Anchor: start of string
226
227 \p{Alphabetic} # First character: a Unicode letter
228
229 [\p{Alphabetic} # Followed by one or more of:
230 \p{Number} # - Unicode letters or numbers
231 . # - Period
232 \+ # - Plus sign
233 \- # - Hyphen
234 ]+ # One or more of the above
235
236 : # Followed by a literal colon
237
238 /{0,2} # Followed by zero, one, or two literal slashes
239 "#,
240 )
241 .unwrap()
242});
243
/// Newtype around a `bool` handed to path-partitioning code to say whether
/// the path should be treated as a web path (`true`) or not (`false`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct WebPath(pub(crate) bool);