// lychee-lib 0.24.1

// A fast, async link checker
// Documentation
//! Remap rules which allow to map URLs matching a pattern to a different
//! URL.
//!
//! # Notes
//! Use in moderation as there are no sanity or performance guarantees.
//!
//! - There is no constraint on remap rules upon instantiation or during
//!   remapping. In particular, rules are checked sequentially and the first
//!   matching rule wins, so a later rule that conflicts with an earlier one is
//!   silently ignored whenever both match a URL.
//! - A large rule set has a performance impact because the client needs to
//!   match every link against all rules.

// Notes on terminology:
// The major difference between URI (Uniform Resource Identifier) and
// URL (Uniform Resource Locator) is that the former is an identifier for
// resources and the latter is a locator.
// We are not interested in differentiating resources by names and the purpose of
// remapping is to provide an alternative **location** in certain
// circumstances. Thus the documentation should be about remapping URLs
// (locations), not remapping URIs (identities).

use std::{fmt::Display, ops::Index};

use regex::Regex;
use serde::Serialize;
use url::Url;

use crate::{ErrorKind, Result, Uri};

/// Records a single [`Uri`] remapping, i.e. a [`Uri`] together with the
/// [`Uri`] it was rewritten to.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize)]
pub struct Remap {
    /// The original [`Uri`] before remapping
    pub original: Uri,
    /// The new [`Uri`] after applying [`Remaps`]
    pub new: Uri,
}

impl Display for Remap {
    /// Renders the remapping as `<original> --> <new>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { original, new } = self;
        write!(f, "{original} --> {new}")
    }
}

/// Rules that remap matching URL patterns.
///
/// Some use-cases are:
/// - Testing URLs prior to production deployment.
/// - Testing URLs behind a proxy.
///
/// # Notes
/// See the module level documentation for usage notes.
#[derive(Debug, Clone)]
pub struct Remaps {
    /// The rules, in the order they are tried. Each rule pairs a pattern
    /// with the replacement string substituted for its matches.
    patterns: Vec<(Regex, String)>,
}

impl Remaps {
    /// Build a rule set from `(pattern, replacement)` pairs.
    ///
    /// The order of `patterns` is kept as given; no validation is performed.
    #[must_use]
    pub const fn new(patterns: Vec<(Regex, String)>) -> Self {
        Self { patterns }
    }

    /// Borrowing iterator over the `(pattern, replacement)` rules, in order.
    // A mutable counterpart (`iter_mut`) is intentionally not provided.
    pub fn iter(&self) -> std::slice::Iter<'_, (Regex, String)> {
        self.patterns.iter()
    }

    /// Rewrite `original` with the first rule whose pattern matches it.
    ///
    /// Rules are tried in order. For the first matching rule, every match of
    /// its pattern in `original` is substituted with the rule's replacement
    /// string and the result is parsed as a URL. `original` itself is left
    /// untouched; the outcome is reported through the returned [`Remap`].
    ///
    /// Returns `Ok(None)` if no rule matches and `Ok(Some(_))` if a rule was
    /// applied.
    ///
    /// # Errors
    ///
    /// Returns an `Err` if the remap rule produces an invalid URL.
    #[must_use = "Remapped URLs must be used"]
    pub fn remap(&self, original: &Uri) -> Result<Option<Remap>> {
        let haystack = original.as_str();

        // Later rules are never consulted once one pattern matches.
        let first_match = self
            .patterns
            .iter()
            .find(|(pattern, _)| pattern.is_match(haystack));

        match first_match {
            None => Ok(None),
            Some((pattern, replacement)) => {
                let new = pattern.replace_all(haystack, replacement);
                let url = Url::parse(&new).map_err(|_| {
                    ErrorKind::InvalidUrlRemap(format!("the result `{new}` is not a valid URL"))
                })?;

                Ok(Some(Remap {
                    original: original.clone(),
                    new: Uri { url },
                }))
            }
        }
    }

    /// Whether the rule set contains no rules at all.
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.patterns.is_empty()
    }

    /// How many rules the rule set contains.
    #[must_use]
    pub const fn len(&self) -> usize {
        self.patterns.len()
    }
}

impl Index<usize> for Remaps {
    type Output = (Regex, String);

    fn index(&self, index: usize) -> &(regex::Regex, String) {
        &self.patterns[index]
    }
}

impl TryFrom<&[String]> for Remaps {
    type Error = ErrorKind;

    /// Try to convert a slice of `String`s to remap rules.
    ///
    /// Each string should contain a Regex pattern and a URL, separated by
    /// whitespaces.
    ///
    /// # Errors
    ///
    /// Returns an `Err` if:
    /// - Any string in the slice is not of the form `REGEX URL`.
    /// - REGEX is not a valid regular expression.
    /// - URL is not a valid URL.
    fn try_from(remaps: &[String]) -> std::result::Result<Self, Self::Error> {
        let mut parsed = Vec::new();

        for remap in remaps {
            let params: Vec<_> = remap.split_whitespace().collect();
            if params.len() != 2 {
                return Err(ErrorKind::InvalidUrlRemap(format!(
                    "Cannot parse into URI remap rule, must be a Regex pattern and a URL separated by whitespaces: {remap}"
                )));
            }

            let pattern = Regex::new(params[0])?;
            let replacement = params[1].to_string();
            parsed.push((pattern, replacement));
        }

        Ok(Remaps::new(parsed))
    }
}

// Only iteration by shared reference is offered; implementations of the
// mutable iterator and the moving iterator are deliberately avoided.
impl<'a> IntoIterator for &'a Remaps {
    type Item = &'a (Regex, String);
    type IntoIter = std::slice::Iter<'a, (Regex, String)>;

    /// Iterate over the rules by reference, in order.
    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

#[cfg(test)]
mod tests {
    use url::Url;

    use super::*;

    /// Parse `raw` into a [`Uri`], panicking if it is rejected.
    fn uri(raw: &str) -> Uri {
        Uri::try_from(raw).unwrap()
    }

    /// Wrap one `pattern` / `replacement` pair into a rule set.
    fn single_rule(pattern: Regex, replacement: String) -> Remaps {
        Remaps::new(vec![(pattern, replacement)])
    }

    /// A URL matched in full by the pattern is swapped for the replacement.
    #[test]
    fn test_remap() {
        let input = "https://example.com";
        let replacement = "http://127.0.0.1:8080";
        let original = uri(input);
        let remaps = single_rule(Regex::new(input).unwrap(), replacement.to_string());

        let output = remaps.remap(&original).unwrap();

        let expected = Remap {
            original,
            new: uri(replacement),
        };
        assert_eq!(output, Some(expected));
    }

    /// A `file://` input can be remapped to a web URL.
    #[test]
    fn test_remap_path() {
        let original = uri("file://../../issues");
        let replacement = uri("https://example.com");
        let remaps = single_rule(
            Regex::new(".*?../../issues").unwrap(),
            replacement.to_string(),
        );

        let output = remaps.remap(&original).unwrap();

        let expected = Remap {
            original,
            new: replacement,
        };
        assert_eq!(output, Some(expected));
    }

    /// A URL that no pattern matches yields `None`.
    #[test]
    fn test_remap_skip() {
        let original = uri("https://unrelated.example.com");
        let replacement = uri("http://127.0.0.1:8080");
        let remaps = single_rule(
            Regex::new("https://example.com").unwrap(),
            replacement.to_string(),
        );

        // URL was not remapped
        assert_eq!(remaps.remap(&original).unwrap(), None);
    }

    /// Only the matched prefix is replaced; path and fragment are kept.
    #[test]
    fn test_remap_url_to_file() {
        let remaps = single_rule(
            Regex::new("https://docs.example.org").unwrap(),
            "file:///Users/user/code/repo/docs/_site".to_string(),
        );

        let cases = [
            (
                "https://docs.example.org/integrations/distcp.html",
                "file:///Users/user/code/repo/docs/_site/integrations/distcp.html",
            ),
            (
                "https://docs.example.org/howto/import.html#working-with-imported-data",
                "file:///Users/user/code/repo/docs/_site/howto/import.html#working-with-imported-data",
            ),
            (
                "https://docs.example.org/howto/garbage-collection-committed.html",
                "file:///Users/user/code/repo/docs/_site/howto/garbage-collection-committed.html",
            ),
        ];

        for (before, after) in cases {
            let remap = remaps.remap(&uri(before)).unwrap().unwrap();
            assert_eq!(remap.new, uri(after));
        }
    }

    /// This is a partial remap, i.e. the URL is not fully replaced but only
    /// part of it. The parts to be replaced are defined by the regex pattern
    /// using capture groups.
    #[test]
    fn test_remap_capture_group() {
        let original = uri("https://example.com/1/2/3");
        let replacement = uri("https://example.com/foo/$1/bar");
        let remaps = single_rule(
            Regex::new("https://example.com/.*?/(.*?)/.*").unwrap(),
            replacement.to_string(),
        );

        let output = remaps.remap(&original).unwrap();

        let expected = Remap {
            original,
            new: uri("https://example.com/foo/2/bar"),
        };
        assert_eq!(output, Some(expected));
    }

    /// Same as the capture group test, but the group is referenced by name
    /// (`(?P<name>...)` syntax).
    #[test]
    fn test_remap_named_capture() {
        let original = uri("https://example.com/1/2/3");
        let replacement = uri("https://example.com/foo/$foo/bar");
        let remaps = single_rule(
            Regex::new("https://example.com/.*?/(?P<foo>.*?)/.*").unwrap(),
            replacement.to_string(),
        );

        let output = remaps.remap(&original).unwrap();

        let expected = Remap {
            original,
            new: uri("https://example.com/foo/2/bar"),
        };
        assert_eq!(output, Some(expected));
    }

    /// Same as the named capture test, using the shorthand `(?<name>...)`
    /// syntax for the group.
    #[test]
    fn test_remap_named_capture_shorthand() {
        let original = uri("https://example.com/1/2/3");
        #[allow(clippy::invalid_regex)]
        // Clippy acts up here, but this syntax is actually valid
        // See https://docs.rs/regex/latest/regex/index.html#grouping-and-flags
        let pattern = Regex::new(r"https://example.com/.*?/(?<foo>.*?)/.*").unwrap();
        let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap();
        let remaps = single_rule(pattern, replacement.to_string());

        let output = remaps.remap(&original).unwrap();

        let expected = Remap {
            original,
            new: uri("https://example.com/foo/2/bar"),
        };
        assert_eq!(output, Some(expected));
    }
}