#![doc = include_str!("../README.md")]
#[cfg(test)]
use std::str::Chars;
use regex::Regex;
use url::Url;
/// Query-parameter names dropped during normalization. Each entry is a regex
/// alternative; the list is joined with `|` and wrapped in `^(...)$` when
/// compiled, so every pattern must match the entire parameter name. Covers
/// common analytics/tracking parameters (UTM, Google/Facebook/Microsoft click
/// ids, Mailchimp, and `__`-prefixed params such as HubSpot's `__hstc`).
const DEFAULT_IGNORED_QUERY_PARAMS: [&str; 15] = [
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "utm_expid",
    "gclid",
    "_ga",
    "_gl",
    "msclkid",
    "fbclid",
    "mc_cid",
    "mc_eid",
    // Webtrends campaign parameters, e.g. `WT.mc_id`.
    "[Ww][Tt]\\.mc_(id|ev)",
    // Any double-underscore-prefixed lowercase parameter.
    "__[a-z]+",
];
/// Host prefixes trimmed before comparison. Written in the regex crate's
/// extended (`(?x)`) syntax, so the literal whitespace/newlines inside the
/// raw string are ignored. Matches variants such as `www.`, `www2.`, `m.`,
/// `mobile.`, `old…`, and short dashed suffixes like `www-03.`, each ending
/// in a literal dot. Compiled with a `\A` anchor (see
/// `Options::compile_trimmed_host_prefixes_regex`) and applied repeatedly,
/// so stacked prefixes (`m.www.`) are all removed.
const DEFAULT_WWW_PREFIX: &str = r#"(?x)
([0-9]-?)?
(old)?
(www?[0-9]*|m|mobile)
(-[a-z0-9]{1,3})?
\.
"#;
/// Path extensions eligible for trimming from the final path segment:
/// letters with an optional trailing digit (e.g. `html`, `php4`), anchored
/// at the end of the extension text.
const DEFAULT_EXTENSION_SUFFIX: &str = "[a-zA-Z]+[0-9]?$";
/// Configuration for building a [`UrlNormalizer`]. All pattern lists hold
/// regex alternatives that are joined with `|` when compiled.
pub struct Options {
    /// Query-parameter names to drop (each must match the whole name).
    pub ignored_query_params: Vec<String>,
    /// Host prefixes to trim (anchored at the start of the host).
    pub trimmed_host_prefixes: Vec<String>,
    /// Path-extension patterns to trim from the last path segment.
    pub trimmed_path_extension_suffixes: Vec<String>,
    /// Maximum length of a path extension eligible for trimming.
    pub path_extension_length: usize,
}
impl Default for Options {
    /// Builds the recommended configuration: common tracking query
    /// parameters ignored, `www`-style host prefixes trimmed, and short
    /// (≤ 6 character) alphabetic path extensions removed.
    fn default() -> Self {
        Self::new()
            .with_ignored_query_params(DEFAULT_IGNORED_QUERY_PARAMS)
            .with_trimmed_host_prefixes([DEFAULT_WWW_PREFIX])
            .with_trimmed_path_extension_suffixes([DEFAULT_EXTENSION_SUFFIX])
            .with_path_extension_length(6)
    }
}
impl Options {
    /// Creates an empty configuration: no ignored query parameters, no
    /// trimmed host prefixes or extensions, and a zero extension length.
    pub fn new() -> Self {
        Self {
            ignored_query_params: vec![],
            trimmed_host_prefixes: vec![],
            trimmed_path_extension_suffixes: vec![],
            path_extension_length: 0,
        }
    }

    /// Joins the ignored-parameter alternatives into one regex that must
    /// match an entire query-parameter name.
    fn compile_ignored_query_params_regex(
        ignored_query_params: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        let alternatives = ignored_query_params.join("|");
        Regex::new(&format!("^({})$", alternatives))
    }

    /// Joins the host-prefix alternatives into one start-anchored regex.
    /// An empty list compiles to a pattern that only matches a NUL byte,
    /// which never appears in a hostname — i.e. it never matches.
    fn compile_trimmed_host_prefixes_regex(
        trimmed_host_prefixes: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        if trimmed_host_prefixes.is_empty() {
            return Regex::new("\\A[\0]");
        }
        let alternatives = trimmed_host_prefixes.join("|");
        Regex::new(&format!("\\A({})", alternatives))
    }

    /// Joins the extension alternatives into one end-anchored regex.
    fn compile_trimmed_path_extension_suffixes_regex(
        trimmed_path_extension_suffixes: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        let alternatives = trimmed_path_extension_suffixes.join("|");
        Regex::new(&format!("({})$", alternatives))
    }

    /// Consumes the options, compiling every pattern list into a ready
    /// [`UrlNormalizer`]. Fails if any supplied pattern is not valid regex.
    pub fn compile(self) -> Result<UrlNormalizer, regex::Error> {
        let ignored_query_params =
            Self::compile_ignored_query_params_regex(self.ignored_query_params)?;
        let trimmed_host_prefixes =
            Self::compile_trimmed_host_prefixes_regex(self.trimmed_host_prefixes)?;
        let trimmed_path_extension_suffixes =
            Self::compile_trimmed_path_extension_suffixes_regex(
                self.trimmed_path_extension_suffixes,
            )?;
        Ok(UrlNormalizer {
            ignored_query_params,
            trimmed_host_prefixes,
            trimmed_path_extension_suffixes,
            path_extension_length: self.path_extension_length,
        })
    }

    /// Replaces the set of ignored query-parameter patterns.
    pub fn with_ignored_query_params<S, I>(mut self, iter: I) -> Self
    where
        S: AsRef<str>,
        I: IntoIterator<Item = S>,
    {
        self.ignored_query_params = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Replaces the set of trimmed host-prefix patterns.
    pub fn with_trimmed_host_prefixes<S, I>(mut self, iter: I) -> Self
    where
        S: AsRef<str>,
        I: IntoIterator<Item = S>,
    {
        self.trimmed_host_prefixes = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Replaces the set of trimmed path-extension patterns.
    pub fn with_trimmed_path_extension_suffixes<S, I>(mut self, iter: I) -> Self
    where
        S: AsRef<str>,
        I: IntoIterator<Item = S>,
    {
        self.trimmed_path_extension_suffixes =
            iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Sets the maximum extension length eligible for trimming.
    pub fn with_path_extension_length(mut self, path_extension_length: usize) -> Self {
        self.path_extension_length = path_extension_length;
        self
    }
}
/// Compares and normalizes URLs using the compiled patterns from
/// [`Options::compile`].
pub struct UrlNormalizer {
    // Whole-name match for query parameters to drop.
    ignored_query_params: Regex,
    // Start-anchored match for host prefixes to strip.
    trimmed_host_prefixes: Regex,
    // End-anchored match for path extensions to strip.
    trimmed_path_extension_suffixes: Regex,
    // Maximum extension length eligible for trimming.
    path_extension_length: usize,
}
/// One normalized component of a URL (host, path segment, query key/value,
/// or fragment); two URLs are considered the same when their token streams
/// compare equal.
#[derive(Debug, PartialEq, Eq)]
struct CompareToken<'a>(&'a str);
/// Test-only token that compares strings with URL escaping applied: `+`
/// decodes to a space and `%XY` decodes the two following hex digits, so
/// `ab%2Ec` compares equal to `ab.c`.
#[derive(Debug)]
#[cfg(test)]
struct EscapedCompareToken<'a>(&'a str);

#[cfg(test)]
impl<'a> PartialEq for EscapedCompareToken<'a> {
    fn eq(&self, other: &Self) -> bool {
        // Decodes one logical character: `+` becomes a space; `%` consumes
        // the next two chars from `ci` as hex digits (missing or invalid
        // digits decode as 0, matching the old fallback behavior).
        fn consume_with_escape(c: char, ci: &mut Chars) -> char {
            if c == '+' {
                return ' ';
            }
            if c != '%' {
                return c;
            }
            // BUG FIX: the previous lookup table returned raw indices
            // 16..=31 for uppercase hex digits, so e.g. `%2E` decoded to
            // '>' instead of '.'. `to_digit(16)` handles both cases.
            let hi = ci.next().and_then(|d| d.to_digit(16)).unwrap_or_default() as u8;
            let lo = ci.next().and_then(|d| d.to_digit(16)).unwrap_or_default() as u8;
            ((hi << 4) | lo) as char
        }
        if self.0 == other.0 {
            return true;
        }
        let mut it1 = self.0.chars();
        let mut it2 = other.0.chars();
        while let Some(c) = it1.next() {
            let c = consume_with_escape(c, &mut it1);
            let c2 = it2.next().unwrap_or_default();
            let c2 = consume_with_escape(c2, &mut it2);
            if c != c2 {
                return false;
            }
        }
        // Equal only if the right-hand side is exhausted too.
        it2.next().is_none()
    }
}
impl UrlNormalizer {
    /// Produces the comparison-token stream for `url`: the normalized host,
    /// the non-empty path segments (with a short, trimmable extension
    /// removed from the final segment), the surviving query pairs in sorted
    /// order, and the fragment when it starts with `!` or follows a
    /// trailing-slash path and starts with `/`. Empty tokens are dropped.
    fn token_stream<'b>(&self, url: &'b Url) -> impl Iterator<Item = CompareToken<'b>> {
        let mut out = Vec::with_capacity(10);
        let host = self.normalize_host(url).unwrap_or_default();
        out.push(CompareToken(host));
        if let Some(path) = url.path_segments() {
            let mut iter = path.filter(|path| !path.is_empty());
            if let Some(mut curr) = iter.next() {
                // Walk the segments one ahead so the final segment can be
                // treated specially (extension trimming).
                loop {
                    if let Some(next) = iter.next() {
                        out.push(CompareToken(curr));
                        curr = next;
                    } else {
                        if let Some((a, b)) = curr.rsplit_once('.') {
                            // Trim the extension only when it is short
                            // enough and matches the suffix patterns.
                            if b.len() <= self.path_extension_length
                                && self.trimmed_path_extension_suffixes.is_match_at(b, 0)
                            {
                                out.push(CompareToken(a));
                            } else {
                                out.push(CompareToken(curr));
                            }
                        } else {
                            out.push(CompareToken(curr));
                        }
                        break;
                    }
                }
            }
        }
        if let Some(query) = url.query() {
            let mut query_pairs = Vec::with_capacity(10);
            for bit in query.split('&') {
                // A parameter without `=` is treated as key with empty value.
                let (a, b) = if let Some((a, b)) = bit.split_once('=') {
                    (a, b)
                } else {
                    (bit, "")
                };
                if !self.ignored_query_params.is_match(a) {
                    query_pairs.push((a, b));
                }
            }
            // Sort so parameter order does not affect comparison.
            query_pairs.sort();
            for (key, value) in query_pairs {
                out.push(CompareToken(key));
                out.push(CompareToken(value));
            }
        }
        let fragment = url.fragment().unwrap_or_default();
        let hash_bang = fragment.starts_with('!');
        let slash_hash_slash = url.path().ends_with('/') && fragment.starts_with('/');
        if hash_bang || slash_hash_slash {
            // Leading `!` or `/` is punctuation, not content; skip it.
            out.push(CompareToken(&fragment[1..]));
        }
        out.into_iter().filter(|s| !s.0.is_empty())
    }

    /// Returns true when the two URLs normalize to the same token stream.
    pub fn are_same(&self, a: &Url, b: &Url) -> bool {
        self.token_stream(a).eq(self.token_stream(b))
    }

    /// Renders the token stream as a single `:`-terminated-per-token string,
    /// suitable for storing and comparing normalizations.
    pub fn compute_normalization_string(&self, url: &Url) -> String {
        let mut s = String::with_capacity(url.as_str().len());
        for bit in self.token_stream(url) {
            s += bit.0;
            s.push(':');
        }
        s
    }

    /// Returns the host with recognized prefixes (e.g. `www.`, `m.`)
    /// repeatedly stripped and leading/trailing dots removed, or `None` for
    /// host-less URLs. The prefix regex is `\A`-anchored (see
    /// `Options::compile_trimmed_host_prefixes_regex`), so matches always
    /// start at the beginning of the remaining host.
    pub fn normalize_host<'a>(&self, url: &'a Url) -> Option<&'a str> {
        let mut host = url.host_str()?;
        while let Some(stripped) = self.trimmed_host_prefixes.find_at(host, 0) {
            // BUG FIX: a user-supplied prefix pattern that matches the empty
            // string used to make this loop spin forever; bail out instead.
            if stripped.end() == 0 {
                break;
            }
            host = &host[stripped.end()..];
        }
        let host = host.trim_start_matches('.');
        let host = host.trim_end_matches('.');
        Some(host)
    }
}
impl Default for UrlNormalizer {
    /// Compiles [`Options::default`] into a normalizer. The built-in
    /// patterns are known-valid regexes, so compilation cannot fail.
    fn default() -> Self {
        let options = Options::default();
        options
            .compile()
            .expect("Default options will always safely compile")
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use rstest::*;

    /// Shared fixture: a normalizer built from the default options.
    #[fixture]
    fn norm() -> UrlNormalizer {
        UrlNormalizer::default()
    }

    /// With empty options nothing is ignored or trimmed, so the `www.`
    /// prefix survives and tracking parameters still differentiate URLs.
    #[test]
    fn test_with_empty_options() {
        let options = Options::new();
        let norm = options.compile().unwrap();
        let url = Url::parse("http://www.google.com").unwrap();
        assert!(norm.are_same(&url, &Url::parse("https://www.google.com").unwrap()));
        assert_eq!(norm.compute_normalization_string(&url), "www.google.com:");
        assert!(!norm.are_same(
            &Url::parse("https://www.google.com?fbclid=1").unwrap(),
            &Url::parse("https://www.google.com?fbclid=2").unwrap()
        ));
    }

    /// Regression test: normalization strings must stay stable against the
    /// recorded corpus in `testdata.txt`.
    #[test]
    fn test_existing_data() {
        let testdata = include_str!("testdata.txt").trim_end_matches('\n');
        let norm = norm();
        for line in testdata.split('\n') {
            // Each line is `"<url>","<norm>"`; split on the middle `","`
            // and strip the remaining outer quotes.
            let (url, existing_norm) = line.split_once("\",\"").expect("Expected one comma");
            let url = &url[1..url.len()];
            let existing_norm = &existing_norm[0..existing_norm.len() - 1];
            let url = Url::parse(url).expect("Failed to parse URL");
            let expected_norm = norm.compute_normalization_string(&url);
            assert_eq!(existing_norm, expected_norm);
        }
    }

    /// Hosts with recognized prefixes are trimmed; unrecognized ones pass
    /// through untouched.
    #[rstest]
    #[case("http://www.example.com", "example.com")]
    #[case("http://m.www.example.com", "example.com")]
    #[case("http://www1.example.com", "example.com")]
    #[case("http://ww1.example.com", "example.com")]
    #[case("http://test.www.example.com", "test.www.example.com")]
    #[case("http://www-03.example.com", "example.com")]
    #[case("http://m.example.com", "example.com")]
    #[case("http://m.m.m.m.m.example.com", "example.com")]
    #[case("http://mobile.example.com", "example.com")]
    #[case("http://bwwwww.example.com", "bwwwww.example.com")]
    fn test_host_normalization(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        assert_eq!(norm.normalize_host(&Url::parse(a).expect("url")), Some(b));
    }

    /// Escaped tokens that must compare equal.
    #[rstest]
    #[case("abc", "abc")]
    #[case("abc.", "abc.")]
    #[case("ab+c", "ab c")]
    #[case("ab%2ec", "ab.c")]
    fn test_compare_token(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_eq!(a, b);
    }

    /// Escaped tokens that must compare unequal.
    #[rstest]
    #[case("abc", "abc.")]
    #[case("abc.", "abc")]
    #[case("abc", "abc%")]
    #[case("abc", "abc%xx")]
    // BUG FIX: this case was ("ab+c", "ab c"), which duplicates the *equal*
    // pair in test_compare_token above — `+` decodes to a space on both
    // sides, so that assert_ne could never pass. Use a genuinely unequal
    // pair instead.
    #[case("ab+c", "ab_c")]
    #[case("ab%2ec", "ab/c")]
    fn test_compare_token_ne(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_ne!(a, b);
    }

    /// Every URL must be considered the same as itself.
    #[rstest]
    #[case("http://x.com")]
    #[case("http://1.2.3.4")]
    #[case("http://google.com/path/?query")]
    #[case("http://google.com/path/?query=bar")]
    #[case("http://facebook.com/path/?fbclid=bar&somequery=ok")]
    fn test_url_normalization_identical(norm: UrlNormalizer, #[case] a: &str) {
        assert!(
            norm.are_same(&Url::parse(a).unwrap(), &Url::parse(a).unwrap()),
            "{} != {}",
            a,
            a
        );
    }

    /// Pairs of distinct URLs that must normalize identically (scheme,
    /// `www.`, trailing slash, escaping, tracking params, fragments, dots).
    #[rstest]
    #[case("http://google.com", "https://google.com")]
    #[case("http://google%2ecom", "https://google.com")]
    #[case("https://www.google.com", "https://google.com")]
    #[case("https://www.google.com/foo.html", "https://www.google.com/foo")]
    #[case("https://www.google.com/?#", "https://www.google.com")]
    #[case("https://www.google.com/", "https://www.google.com")]
    #[case("https://www.google.com/foo", "https://www.google.com/foo/")]
    #[case("https://www.google.com//foo", "https://www.google.com/foo")]
    #[case("http://x.com?utm_source=foo", "http://x.com")]
    #[case("http://x.com?fbclid=foo&gclid=bar", "http://x.com")]
    #[case("http://x.com?fbclid=foo", "http://x.com?fbclid=basdf")]
    #[case("http://archinte.jamanetwork.com/article.aspx?articleid=1898878&__hstc=9292970.6d480b0896ec071bae4c3d40c40ec7d5.1407456000124.1407456000125.1407456000126.1&__hssc=9292970.1.1407456000127&__hsfp=1314462730", "http://archinte.jamanetwork.com/article.aspx?articleid=1898878")]
    #[case("http://x.com", "http://x.com#something")]
    #[case("http://x.com", "http://x.com.")]
    #[case("http://x.com", "http://x.com..")]
    #[case("http://x.com", "http://.x.com")]
    fn test_url_normalization_same(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_eq!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(norm.are_same(&a, &b), "{} != {}", a, b);
    }

    /// Pairs of URLs that must remain distinct after normalization.
    #[rstest]
    #[case("http://1.2.3.4", "http://1.2.3.5")]
    #[case("https://test.www.google.com", "https://test.www1.google.com")]
    #[case("https://google.com", "https://facebook.com")]
    #[case("https://google.com/abc", "https://google.com/def")]
    #[case("https://google.com/?page=1", "https://google.com/?page=2")]
    #[case("https://google.com/?page=%31", "https://google.com/?page=%32")]
    #[case("https://amazon.com/product/ref=a", "https://amazon.com/product/ref=b")]
    #[case("http://x.com?xfbclid=foo", "http://x.com?xfbclid=basdf")]
    #[case("http://x.com/file.html12345", "http://x.com/file.html12346")]
    #[case("http://arxiv.org/abs/1405.0126", "http://arxiv.org/abs/1405.0351")]
    #[case(
        "http://www.bmj.com/content/360/bmj.j5855",
        "http://www.bmj.com/content/360/bmj.k322"
    )]
    #[case(
        "https://www.google.com/contributor/welcome/#/intro",
        "https://www.google.com/contributor/welcome/#/about"
    )]
    #[case(
        "https://groups.google.com/forum/#!topic/mailing.postfix.users/6Kkel3J_nv4",
        "https://groups.google.com/forum/#!topic/erlang-programming/nFWfmwK64RU"
    )]
    fn test_url_normalization_different(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_ne!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(!norm.are_same(&a, &b), "{} != {}", a, b);
    }
}