lychee_lib/
remap.rs

//! Remapping rules which allow mapping URLs that match a pattern to a
//! different URL.
3//!
4//! # Notes
5//! Use in moderation as there are no sanity or performance guarantees.
6//!
//! - There is no constraint on remapping rules upon instantiation or during
//!   remapping. In particular, rules are checked sequentially and the first
//!   matching rule wins, so a later rule that contradicts an earlier one is
//!   silently ignored if they both match a URL.
10//! - A large rule set has a performance impact because the client needs to
11//!   match every link against all rules.
12
13// Notes on terminology:
14// The major difference between URI (Uniform Resource Identifier) and
15// URL (Uniform Resource Locator) is that the former is an identifier for
16// resources and the latter is a locator.
17// We are not interested in differentiating resources by names and the purpose of
18// remapping is to provide an alternative **location** in certain
// circumstances. Thus the documentation should be about remapping URLs
20// (locations), not remapping URIs (identities).
21
22use std::ops::Index;
23
24use regex::Regex;
25use url::Url;
26
27use crate::{ErrorKind, Result};
28
/// Rules that remap matching URL patterns.
///
/// Each rule is a pair of a compiled regular expression and a replacement
/// string. The replacement may refer to capture groups of the pattern
/// (for example `$1` or `$name`). Rules are tried in the order in which they
/// are stored and only the first matching rule is applied.
///
/// Some use-cases are:
/// - Testing URLs prior to production deployment.
/// - Testing URLs behind a proxy.
///
/// # Notes
/// See the module level documentation for usage notes.
#[derive(Debug, Clone)]
pub struct Remaps(Vec<(Regex, String)>);
39
40impl Remaps {
41    /// Create a new remapper
42    #[must_use]
43    pub const fn new(patterns: Vec<(Regex, String)>) -> Self {
44        Self(patterns)
45    }
46
47    /// Returns an iterator over the rules.
48    // `iter_mut` is deliberately avoided.
49    pub fn iter(&self) -> std::slice::Iter<'_, (Regex, String)> {
50        self.0.iter()
51    }
52
53    /// Remap URL against remapping rules.
54    ///
55    /// If there is no matching rule, the original URL is returned.
56    ///
57    /// # Errors
58    ///
59    /// Returns an `Err` if the remapping rule produces an invalid URL.
60    #[must_use = "Remapped URLs must be used"]
61    pub fn remap(&self, original: &Url) -> Result<Url> {
62        for (pattern, replacement) in self {
63            if pattern.is_match(original.as_str()) {
64                let after = pattern.replace_all(original.as_str(), replacement);
65                let after_url = Url::parse(after.as_ref()).map_err(|_| {
66                    ErrorKind::InvalidUrlRemap(format!(
67                        "The remapping pattern must produce a valid URL, but it is not: {after}"
68                    ))
69                })?;
70                return Ok(after_url);
71            }
72        }
73        Ok(original.clone())
74    }
75
76    /// Returns `true` if there is no remapping rule defined.
77    #[must_use]
78    pub const fn is_empty(&self) -> bool {
79        self.0.is_empty()
80    }
81
82    /// Get the number of remapping rules.
83    #[must_use]
84    pub const fn len(&self) -> usize {
85        self.0.len()
86    }
87}
88
89impl Index<usize> for Remaps {
90    type Output = (Regex, String);
91
92    fn index(&self, index: usize) -> &(regex::Regex, String) {
93        &self.0[index]
94    }
95}
96
97impl TryFrom<&[String]> for Remaps {
98    type Error = ErrorKind;
99
100    /// Try to convert a slice of `String`s to remapping rules.
101    ///
102    /// Each string should contain a Regex pattern and a URL, separated by
103    /// whitespaces.
104    ///
105    /// # Errors
106    ///
107    /// Returns an `Err` if:
108    /// - Any string in the slice is not of the form `REGEX URL`.
109    /// - REGEX is not a valid regular expression.
110    /// - URL is not a valid URL.
111    fn try_from(remaps: &[String]) -> std::result::Result<Self, Self::Error> {
112        let mut parsed = Vec::new();
113
114        for remap in remaps {
115            let params: Vec<_> = remap.split_whitespace().collect();
116            if params.len() != 2 {
117                return Err(ErrorKind::InvalidUrlRemap(format!(
118                    "Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: {remap}"
119                )));
120            }
121
122            let pattern = Regex::new(params[0])?;
123            let replacement = params[1].to_string();
124            parsed.push((pattern, replacement));
125        }
126
127        Ok(Remaps::new(parsed))
128    }
129}
130
131// Implementation for mutable iterator and moving iterator are deliberately
132// avoided
133impl<'a> IntoIterator for &'a Remaps {
134    type Item = &'a (Regex, String);
135
136    type IntoIter = std::slice::Iter<'a, (Regex, String)>;
137
138    fn into_iter(self) -> Self::IntoIter {
139        self.0.iter()
140    }
141}
142
#[cfg(test)]
mod tests {
    use url::Url;

    use super::*;

    /// Parses a URL fixture, panicking if the literal is malformed.
    fn parse_url(raw: &str) -> Url {
        Url::try_from(raw).unwrap()
    }

    /// Builds a rule set that holds exactly one `pattern` -> `replacement` rule.
    fn single_rule(pattern: Regex, replacement: &str) -> Remaps {
        Remaps::new(vec![(pattern, replacement.to_string())])
    }

    /// Shared body of the capture group tests: remaps
    /// `https://example.com/1/2/3` with a single rule and expects the second
    /// path segment to be carried over into `https://example.com/foo/2/bar`.
    fn assert_partial_remap(pattern: Regex, replacement: &str) {
        let original = parse_url("https://example.com/1/2/3");
        // The replacement goes through `Url` first, as a real URL fixture would.
        let replacement = parse_url(replacement);
        let remaps = single_rule(pattern, replacement.as_str());

        let remapped = remaps.remap(&original).unwrap();

        assert_eq!(remapped, parse_url("https://example.com/foo/2/bar"));
    }

    #[test]
    fn test_remap() {
        let source = "https://example.com";
        let target = "http://127.0.0.1:8080";
        let original = parse_url(source);
        let remaps = single_rule(Regex::new(source).unwrap(), target);

        let remapped = remaps.remap(&original).unwrap();

        assert_eq!(remapped, parse_url(target));
    }

    #[test]
    fn test_remap_path() {
        let original = parse_url("file://../../issues");
        let pattern = Regex::new(".*?../../issues").unwrap();
        let target = parse_url("https://example.com");
        let remaps = single_rule(pattern, target.as_str());

        let remapped = remaps.remap(&original).unwrap();

        assert_eq!(remapped, target);
    }

    #[test]
    fn test_remap_skip() {
        let original = parse_url("https://unrelated.example.com");
        let pattern = Regex::new("https://example.com").unwrap();
        let target = parse_url("http://127.0.0.1:8080");
        let remaps = single_rule(pattern, target.as_str());

        let remapped = remaps.remap(&original).unwrap();

        // No rule matched, so the URL comes back untouched
        assert_eq!(original, remapped);
    }

    #[test]
    fn test_remap_url_to_file() {
        let remaps = single_rule(
            Regex::new("https://docs.example.org").unwrap(),
            "file:///Users/user/code/repo/docs/_site",
        );

        // (URL to remap, expected result): only the matched prefix is replaced,
        // the remainder of the URL (path and fragment) is kept.
        let cases = [
            (
                "https://docs.example.org/integrations/distcp.html",
                "file:///Users/user/code/repo/docs/_site/integrations/distcp.html",
            ),
            (
                "https://docs.example.org/howto/import.html#working-with-imported-data",
                "file:///Users/user/code/repo/docs/_site/howto/import.html#working-with-imported-data",
            ),
            (
                "https://docs.example.org/howto/garbage-collection-committed.html",
                "file:///Users/user/code/repo/docs/_site/howto/garbage-collection-committed.html",
            ),
        ];

        for (source, expected) in cases {
            let original = Url::parse(source).unwrap();
            let remapped = remaps.remap(&original).unwrap();
            assert_eq!(remapped, Url::parse(expected).unwrap());
        }
    }

    /// This is a partial remap, i.e. the URL is not fully replaced but only
    /// part of it. The parts to be replaced are defined by the regex pattern
    /// using capture groups.
    #[test]
    fn test_remap_capture_group() {
        let pattern = Regex::new("https://example.com/.*?/(.*?)/.*").unwrap();

        assert_partial_remap(pattern, "https://example.com/foo/$1/bar");
    }

    /// Same partial remap, but the capture group is named with `(?P<name>...)`.
    #[test]
    fn test_remap_named_capture() {
        let pattern = Regex::new("https://example.com/.*?/(?P<foo>.*?)/.*").unwrap();

        assert_partial_remap(pattern, "https://example.com/foo/$foo/bar");
    }

    /// Same partial remap, using the shorthand `(?<name>...)` group syntax.
    #[test]
    fn test_remap_named_capture_shorthand() {
        #[allow(clippy::invalid_regex)]
        // Clippy acts up here, but this syntax is actually valid
        // See https://docs.rs/regex/latest/regex/index.html#grouping-and-flags
        let pattern = Regex::new(r"https://example.com/.*?/(?<foo>.*?)/.*").unwrap();

        assert_partial_remap(pattern, "https://example.com/foo/$foo/bar");
    }
}