urlable 0.2.0

A comprehensive URL manipulation library for Rust, providing utilities for parsing, encoding, and manipulating URLs, with support for query strings, path manipulation, Punycode domains, and more.
Documentation
use idna::domain_to_ascii;
use lazy_static::lazy_static;
use regex::Regex;

lazy_static! {
    // Shared, compile-once patterns used by the encode/decode helpers in this
    // file. Every pattern is a fixed literal written below.
    //
    // The first group matches literal (unencoded) characters; the second group
    // matches percent-encoded sequences, accepting both upper- and lower-case
    // hex digits wherever the sequence contains a letter.

    // Matches the literal hash symbol (#).
    // Example: in "http://example.com#section" this matches the "#" character.
    static ref HASH_RE: Regex = Regex::new(r"#").unwrap();

    // Matches the literal ampersand (&).
    // Example: in "foo=bar&baz=qux" this matches the "&" character.
    static ref AMPERSAND_RE: Regex = Regex::new(r"&").unwrap();

    // Matches the literal forward slash (/).
    // Example: in "http://example.com/path" this matches every "/" character.
    static ref SLASH_RE: Regex = Regex::new(r"/").unwrap();

    // Matches the literal equal sign (=).
    // Example: in "foo=bar" this matches the "=" character.
    static ref EQUAL_RE: Regex = Regex::new(r"=").unwrap();

    // Matches the literal question mark (?); the backslash escapes the regex
    // metacharacter. NOTE(review): "IM" is presumably short for
    // "interrogation mark" — confirm before renaming.
    // Example: in "http://example.com?foo=bar" this matches the "?" character.
    static ref IM_RE: Regex = Regex::new(r"\?").unwrap();

    // Matches the literal plus sign (+); the backslash escapes the regex
    // metacharacter.
    // Example: in "foo+bar" this matches the "+" character.
    static ref PLUS_RE: Regex = Regex::new(r"\+").unwrap();

    // Matches a percent-encoded caret (%5E or %5e).
    // Example: in "foo%5Ebar" this matches the "%5E" sequence.
    static ref ENC_CARET_RE: Regex = Regex::new(r"%5[eE]").unwrap();

    // Matches a percent-encoded backtick (%60).
    // Example: in "foo%60bar" this matches the "%60" sequence.
    static ref ENC_BACKTICK_RE: Regex = Regex::new(r"%60").unwrap();

    // Matches a percent-encoded opening curly brace (%7B or %7b).
    // Example: in "foo%7Bbar" this matches the "%7B" sequence.
    static ref ENC_CURLY_OPEN_RE: Regex = Regex::new(r"%7[bB]").unwrap();

    // Matches a percent-encoded pipe (%7C or %7c).
    // Example: in "foo%7Cbar" this matches the "%7C" sequence.
    static ref ENC_PIPE_RE: Regex = Regex::new(r"%7[cC]").unwrap();

    // Matches a percent-encoded closing curly brace (%7D or %7d).
    // Example: in "foo%7Dbar" this matches the "%7D" sequence.
    static ref ENC_CURLY_CLOSE_RE: Regex = Regex::new(r"%7[dD]").unwrap();

    // Matches a percent-encoded space (%20).
    // Example: in "foo%20bar" this matches the "%20" sequence.
    static ref ENC_SPACE_RE: Regex = Regex::new(r"%20").unwrap();

    // Matches a percent-encoded forward slash (%2F or %2f).
    // Example: in "foo%2Fbar" this matches the "%2F" sequence.
    static ref ENC_SLASH_RE: Regex = Regex::new(r"%2[fF]").unwrap();

    // Matches a double percent-encoded forward slash (%252F or %252f), i.e. a
    // "%2F" whose "%" has itself been encoded as "%25".
    // Example: in "foo%252Fbar" this matches the "%252F" sequence.
    static ref ENC_ENC_SLASH_RE: Regex = Regex::new(r"%252[fF]").unwrap();
}

/// Percent-encodes `text` for use in the path, search and hash sections of a
/// URL.
///
/// The input is run through `urlencoding::encode`, after which every
/// percent-encoded pipe (`%7C` / `%7c`) is turned back into a literal `|`.
pub fn encode(text: impl AsRef<str>) -> String {
    let percent_encoded = urlencoding::encode(text.as_ref());
    let pipe_restored = ENC_PIPE_RE.replace_all(&percent_encoded, "|");
    pipe_restored.into_owned()
}

/// Percent-encodes `text` for use in the hash (fragment) section of a URL.
///
/// Builds on [`encode`] and then restores the characters `{`, `}` and `^`
/// from their percent-encoded forms.
pub fn encode_hash(text: impl AsRef<str>) -> String {
    let base = encode(text.as_ref());
    let open_restored = ENC_CURLY_OPEN_RE.replace_all(&base, "{");
    let close_restored = ENC_CURLY_CLOSE_RE.replace_all(&open_restored, "}");
    let caret_restored = ENC_CARET_RE.replace_all(&close_restored, "^");
    caret_restored.into_owned()
}

/// Percent-encodes `input` for use as a value in the query section of a URL.
///
/// Starts from [`encode`] and then applies the query-specific substitutions,
/// in this order:
///
/// 1. literal `+` becomes `%2B`
/// 2. `%20` (encoded space) becomes `+`
/// 3. literal `#` becomes `%23`
/// 4. literal `&` becomes `%26`
/// 5. `%60` becomes a literal backtick
/// 6. `%5E` / `%5e` becomes a literal `^`
/// 7. literal `/` becomes `%2F`
///
/// Returns the resulting string; the input is only borrowed.
pub fn encode_query_value(input: impl AsRef<str>) -> String {
    // Borrow the input directly: `encode` only needs a `&str`, so no owned
    // copy of the input is required.
    let encoded = encode(input.as_ref());

    // Literal pluses must be escaped *before* `%20` is rewritten to `+`;
    // otherwise the pluses that stand for spaces would be escaped as well.
    let encoded = PLUS_RE.replace_all(&encoded, "%2B");
    let encoded = ENC_SPACE_RE.replace_all(&encoded, "+");

    // Characters that would otherwise delimit the fragment or the next pair.
    let encoded = HASH_RE.replace_all(&encoded, "%23");
    let encoded = AMPERSAND_RE.replace_all(&encoded, "%26");

    // Backtick and caret are emitted in their literal form in query values.
    let encoded = ENC_BACKTICK_RE.replace_all(&encoded, "`");
    let encoded = ENC_CARET_RE.replace_all(&encoded, "^");

    SLASH_RE.replace_all(&encoded, "%2F").into_owned()
}

/// Percent-encodes `text` for use as a key in the query section of a URL.
///
/// Applies everything [`encode_query_value`] does and additionally replaces
/// each literal `=` with `%3D`.
pub fn encode_query_key(text: impl AsRef<str>) -> String {
    let value_encoded = encode_query_value(text);
    let equals_escaped = EQUAL_RE.replace_all(&value_encoded, "%3D");
    String::from(equals_escaped)
}

/// Percent-encodes `text` for use in the path section of a URL.
///
/// Builds on [`encode`] and then applies, in order: `#` to `%23`, `?` to
/// `%3F`, `%252F` / `%252f` to `%2F`, `&` to `%26` and `+` to `%2B`.
pub fn encode_path(text: impl AsRef<str>) -> String {
    // Ordered (pattern, replacement) pairs; each step sees the output of the
    // previous one.
    let substitutions = [
        (&*HASH_RE, "%23"),
        (&*IM_RE, "%3F"),
        (&*ENC_ENC_SLASH_RE, "%2F"),
        (&*AMPERSAND_RE, "%26"),
        (&*PLUS_RE, "%2B"),
    ];

    let mut result = encode(text.as_ref());
    for &(pattern, replacement) in substitutions.iter() {
        // Skip patterns that do not occur so the string is only rebuilt when
        // a substitution actually happens.
        if pattern.is_match(&result) {
            result = pattern.replace_all(&result, replacement).into_owned();
        }
    }
    result
}

/// Percent-encodes `text` for use as a single parameter inside the path
/// section of a URL.
///
/// Applies everything [`encode_path`] does and additionally replaces each
/// literal slash (`/`) with `%2F`.
pub fn encode_param(text: impl AsRef<str>) -> String {
    let path_encoded = encode_path(text);
    let slash_escaped = SLASH_RE.replace_all(&path_encoded, "%2F");
    String::from(slash_escaped)
}

/// Percent-decodes `text` using `urlencoding::decode`.
///
/// If decoding reports an error, the original text is returned unchanged.
pub fn decode(text: impl AsRef<str>) -> String {
    let raw = text.as_ref();
    match urlencoding::decode(raw) {
        Ok(decoded) => decoded.into_owned(),
        Err(_) => raw.to_string(),
    }
}

/// Decodes the path section of a URL (the counterpart of `encode_path` with
/// respect to slash handling).
///
/// Every `%2F` / `%2f` is first rewritten to `%252F`, so that after decoding
/// it comes out as the text `%2F` rather than as a literal `/`.
pub fn decode_path(text: impl AsRef<str>) -> String {
    let slash_protected = ENC_SLASH_RE.replace_all(text.as_ref(), "%252F");
    decode(&slash_protected)
}

/// Decodes a query key (the counterpart of `encode_query_key` with respect to
/// plus handling).
///
/// Each literal `+` is turned into a space before the text is percent-decoded.
pub fn decode_query_key(text: impl AsRef<str>) -> String {
    let plus_as_space = PLUS_RE.replace_all(text.as_ref(), " ");
    decode(&plus_as_space)
}

/// Decodes a query value (the counterpart of `encode_query_value` with respect
/// to plus handling).
///
/// Each literal `+` is turned into a space before the text is percent-decoded.
pub fn decode_query_value(text: impl AsRef<str>) -> String {
    let raw = text.as_ref();
    let spaced = PLUS_RE.replace_all(raw, " ");
    decode(&spaced)
}

/// Encodes a hostname with punycode encoding.
///
/// Conversion is delegated to the `idna` crate's `domain_to_ascii`; if it
/// fails, the hostname is returned unchanged.
pub fn encode_host(text: impl AsRef<str>) -> String {
    let host = text.as_ref();
    domain_to_ascii(host).unwrap_or_else(|_| host.to_string())
}

#[cfg(test)]
mod tests {
    use super::*;

    // Every test below is table-driven: each entry is an
    // `(input, expected output)` pair for the function under test.

    #[test]
    fn test_encode() {
        let cases = [
            ("hello world", "hello%20world"),
            ("test|test", "test|test"),
            ("!@#$%^&*()", "%21%40%23%24%25%5E%26%2A%28%29"),
            ("中文测试", "%E4%B8%AD%E6%96%87%E6%B5%8B%E8%AF%95"),
            ("", ""),
            (" ", "%20"),
            ("~!@#$%^&*()_+", "~%21%40%23%24%25%5E%26%2A%28%29_%2B"),
            ("tab\tspace", "tab%09space"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_hash() {
        let cases = [
            ("{test}^", "{test}^"),
            ("test#test", "test%23test"),
            ("a#b#c", "a%23b%23c"),
            ("test##test", "test%23%23test"),
            ("#", "%23"),
            ("###", "%23%23%23"),
            ("before#after#end", "before%23after%23end"),
            ("no hash here", "no%20hash%20here"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_hash(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_query_value() {
        let cases = [
            ("hello world", "hello+world"),
            ("test+test", "test%2Btest"),
            ("a + b + c", "a+%2B+b+%2B+c"),
            ("test=value", "test%3Dvalue"),
            ("", ""),
            (" ", "+"),
            ("+++", "%2B%2B%2B"),
            ("special!@#$%", "special%21%40%23%24%25"),
            ("multiple    spaces", "multiple++++spaces"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_query_value(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_query_key() {
        let cases = [
            ("test=test", "test%3Dtest"),
            ("key with space", "key+with+space"),
            ("key+plus", "key%2Bplus"),
            ("key=value=test", "key%3Dvalue%3Dtest"),
            ("", ""),
            ("===", "%3D%3D%3D"),
            ("key?value", "key%3Fvalue"),
            ("special&chars", "special%26chars"),
            ("multiple++plus", "multiple%2B%2Bplus"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_query_key(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_path() {
        let cases = [
            ("/test/path", "%2Ftest%2Fpath"),
            ("test#hash", "test%23hash"),
            ("path?query", "path%3Fquery"),
            ("a&b", "a%26b"),
            ("test+plus", "test%2Bplus"),
            ("", ""),
            ("///", "%2F%2F%2F"),
            ("path with spaces", "path%20with%20spaces"),
            ("?#&+/", "%3F%23%26%2B%2F"),
            ("中文路径", "%E4%B8%AD%E6%96%87%E8%B7%AF%E5%BE%84"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_path(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_param() {
        let cases = [
            ("/test/path", "%2Ftest%2Fpath"),
            ("param#hash", "param%23hash"),
            ("test/test/test", "test%2Ftest%2Ftest"),
            ("", ""),
            ("/", "%2F"),
            ("///", "%2F%2F%2F"),
            (
                "param/with/特殊字符",
                "param%2Fwith%2F%E7%89%B9%E6%AE%8A%E5%AD%97%E7%AC%A6",
            ),
            ("?#&+/", "%3F%23%26%2B%2F"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_param(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_decode() {
        let cases = [
            ("hello%20world", "hello world"),
            ("invalid%", "invalid%"),
            ("%E4%B8%AD%E6%96%87", "中文"),
            ("%3F%23%2F%26", "?#/&"),
            ("", ""),
            ("%20", " "),
            ("%20%20%20", "   "),
            ("invalid%xx", "invalid%xx"),
            ("%E4%B8%AD%E6%96%87%E6%B5%8B%E8%AF%95", "中文测试"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(decode(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_decode_path() {
        let cases = [
            ("/test%2Ftest", "/test%2Ftest"),
            ("%252Ftest%252Fpath", "%2Ftest%2Fpath"),
            ("test%23hash", "test#hash"),
            ("", ""),
            ("%252F", "%2F"),
            ("%252F%252F%252F", "%2F%2F%2F"),
            (
                "path%2Fwith%2F%E7%89%B9%E6%AE%8A%E5%AD%97%E7%AC%A6",
                "path%2Fwith%2F特殊字符",
            ),
            ("invalid%2", "invalid%2"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(decode_path(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_decode_query_key() {
        let cases = [
            ("hello+world", "hello world"),
            ("test%2Btest", "test+test"),
            ("key%3Dvalue", "key=value"),
            ("", ""),
            ("+", " "),
            ("+++", "   "),
            ("key%2B%2B%2Bvalue", "key+++value"),
            ("special%21%40%23", "special!@#"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(decode_query_key(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_decode_query_value() {
        let cases = [
            ("hello+world", "hello world"),
            ("test%2Btest", "test+test"),
            ("value%3Dtest", "value=test"),
            ("", ""),
            ("+", " "),
            ("+++", "   "),
            ("%E4%B8%AD%E6%96%87", "中文"),
            ("test++%2B++test", "test  +  test"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(decode_query_value(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_encode_host() {
        let cases = [
            ("example.com", "example.com"),
            ("测试.com", "xn--0zwm56d.com"),
            ("bücher.de", "xn--bcher-kva.de"),
            ("", ""),
            ("localhost", "localhost"),
            ("测试。测试.com", "xn--0zwm56d.xn--0zwm56d.com"),
            ("münich.de", "xn--mnich-kva.de"),
            ("ドメイン.jp", "xn--eckwd4c7c.jp"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(encode_host(input), expected, "input: {:?}", input);
        }
    }
}