urlnorm/lib.rs

#![doc = include_str!("../README.md")]
#[cfg(test)]
use std::str::Chars;

use regex::Regex;
use url::Url;

/// Default query parameters that are ignored.
const DEFAULT_IGNORED_QUERY_PARAMS: [&str; 15] = [
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "utm_expid",
    "gclid",
    "_ga",
    "_gl",
    "msclkid",
    "fbclid",
    "mc_cid",
    "mc_eid",
    "[Ww][Tt]\\.mc_(id|ev)",
    "__[a-z]+",
];

/// Regular expression that trims common www- and mobile-style prefixes. From an analysis of the existing scrape dump, we have
/// patterns like: www, www1, www-03, www-psych, www-refresh, m, mobile, etc.
const DEFAULT_WWW_PREFIX: &str = r#"(?x)
    ([0-9]-?)?
    (old)?
    (www?[0-9]*|m|mobile)
    (-[a-z0-9]{1,3})?
    \.
"#;

/// By default, trim extensions that look like .html, .html5, etc.
const DEFAULT_EXTENSION_SUFFIX: &str = "[a-zA-Z]+[0-9]?$";

/// Defines how URL normalization will work. This struct offers reasonable defaults, as well as a fluent interface for configuring normalization.
///
/// Construct an empty [`Options`] object and provide a query parameter:
///
/// ```
/// # use urlnorm::*;
/// let options = Options::new().with_ignored_query_params(["fbclid"]);
/// ```
///
/// Construct a default [`Options`] object and modify the query parameters:
///
/// ```
/// # use urlnorm::*;
/// let options = Options::default().with_ignored_query_params(["fbclid"]);
/// ```
///
/// And once you've constructed the [`Options`] object, you can [`Options::compile`] it
/// to a [`UrlNormalizer`]. This may fail if the regular expressions fail to compile.
///
/// ```
/// # use urlnorm::*;
/// let options: Options = Options::default().with_ignored_query_params(["fbclid"]);
/// let normalizer: UrlNormalizer = options.compile().expect("Failed to compile");
/// ```
///
/// In most cases, however, you'll want to just use [`UrlNormalizer::default()`] and can skip [`Options`] entirely. The
/// default [`UrlNormalizer`] is also infallible:
///
/// ```
/// # use url::Url;
/// # use urlnorm::*;
/// let normalizer = UrlNormalizer::default();
/// let s = normalizer.compute_normalization_string(&Url::parse("http://google.com").unwrap());
/// ```
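///
/// You can also supply your own prefix and extension patterns. The example below is a
/// minimal illustrative sketch: the `amp\.` host prefix and `html?` extension suffix are
/// arbitrary patterns chosen for this doc, not the library defaults.
///
/// ```
/// # use url::Url;
/// # use urlnorm::*;
/// let normalizer = Options::new()
///     .with_trimmed_host_prefixes([r"amp\."])
///     .with_trimmed_path_extension_suffixes(["html?"])
///     .with_path_extension_length(5)
///     .compile()
///     .expect("Failed to compile");
/// assert!(normalizer.are_same(
///     &Url::parse("https://amp.example.com/page.htm").unwrap(),
///     &Url::parse("https://example.com/page").unwrap()
/// ));
/// ```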
pub struct Options {
    /// Query parameters to ignore. These are wrapped in the regular expression beginning and end-of-string markers (i.e. `^...$`).
    pub ignored_query_params: Vec<String>,
    /// Host prefixes to trim. These match only at the start of the URL's host, and repeated matches will be removed.
    pub trimmed_host_prefixes: Vec<String>,
    /// Path extensions to trim. These match only at the end of the path, and an end-of-string marker (`$`) is added to the patterns
    /// automatically.
    pub trimmed_path_extension_suffixes: Vec<String>,
    /// Specifies the maximum length of a path extension to remove. Some paths may contain periods that are part of an identifier or
    /// carry some meaning other than marking a file extension.
    pub path_extension_length: usize,
}

impl Default for Options {
    fn default() -> Self {
        let new = Self::new();
        new.with_ignored_query_params(DEFAULT_IGNORED_QUERY_PARAMS)
            .with_trimmed_host_prefixes([DEFAULT_WWW_PREFIX])
            .with_trimmed_path_extension_suffixes([DEFAULT_EXTENSION_SUFFIX])
            .with_path_extension_length(6)
    }
}

impl Options {
    /// Creates a blank [`Options`] object, useful only as a starting point for further configuration.
    pub fn new() -> Self {
        Self {
            ignored_query_params: vec![],
            trimmed_host_prefixes: vec![],
            trimmed_path_extension_suffixes: vec![],
            path_extension_length: 0,
        }
    }

    fn compile_ignored_query_params_regex(
        ignored_query_params: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        Regex::new(&format!("^({})$", ignored_query_params.join("|")))
    }

    fn compile_trimmed_host_prefixes_regex(
        trimmed_host_prefixes: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        if trimmed_host_prefixes.is_empty() {
            // A regular expression prefix that matches nothing (NUL byte)
            Regex::new("\\A[\0]")
        } else {
            Regex::new(&format!("\\A({})", trimmed_host_prefixes.join("|")))
        }
    }

    fn compile_trimmed_path_extension_suffixes_regex(
        trimmed_path_extension_suffixes: Vec<String>,
    ) -> Result<Regex, regex::Error> {
        Regex::new(&format!("({})$", trimmed_path_extension_suffixes.join("|")))
    }

    /// Compile this [`Options`] object to a [`UrlNormalizer`].
    pub fn compile(self) -> Result<UrlNormalizer, regex::Error> {
        // Per benchmark, Regex is faster than RegexSet
        Ok(UrlNormalizer {
            ignored_query_params: Self::compile_ignored_query_params_regex(
                self.ignored_query_params,
            )?,
            trimmed_host_prefixes: Self::compile_trimmed_host_prefixes_regex(
                self.trimmed_host_prefixes,
            )?,
            trimmed_path_extension_suffixes: Self::compile_trimmed_path_extension_suffixes_regex(
                self.trimmed_path_extension_suffixes,
            )?,
            path_extension_length: self.path_extension_length,
        })
    }

    /// Replaces the ignored query parameters.
    pub fn with_ignored_query_params<S: AsRef<str>, I: IntoIterator<Item = S>>(
        mut self,
        iter: I,
    ) -> Self {
        self.ignored_query_params = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Replaces the trimmed host prefixes.
    pub fn with_trimmed_host_prefixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
        mut self,
        iter: I,
    ) -> Self {
        self.trimmed_host_prefixes = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Replaces the trimmed path extensions.
    pub fn with_trimmed_path_extension_suffixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
        mut self,
        iter: I,
    ) -> Self {
        self.trimmed_path_extension_suffixes =
            iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
        self
    }

    /// Replaces the path extension length.
    pub fn with_path_extension_length(mut self, path_extension_length: usize) -> Self {
        self.path_extension_length = path_extension_length;
        self
    }
}

/// A fully-constructed normalizer instance.
pub struct UrlNormalizer {
    ignored_query_params: Regex,
    trimmed_host_prefixes: Regex,
    trimmed_path_extension_suffixes: Regex,
    path_extension_length: usize,
}

#[derive(Debug, PartialEq, Eq)]
struct CompareToken<'a>(&'a str);

/// We will need to use this if we end up with a non-unescaping URL parser. Not currently used, but tested at a basic level.
#[derive(Debug)]
#[cfg(test)]
struct EscapedCompareToken<'a>(&'a str);

#[cfg(test)]
impl<'a> PartialEq for EscapedCompareToken<'a> {
    fn eq(&self, other: &Self) -> bool {
        fn consume_with_escape(c: char, ci: &mut Chars) -> char {
            const HEX_DIGIT: &str = "0123456789abcdef0123456789ABCDEF";
            if c == '+' {
                return ' ';
            }
            if c != '%' {
                return c;
            }
            let a = ci.next().unwrap_or_default();
            // Mask to the low nibble so uppercase hex digits (found at indices 26..=31)
            // map to 10..=15 instead of overflowing the shift below.
            let a = (HEX_DIGIT.find(a).unwrap_or_default() as u8) & 0xf;
            let b = ci.next().unwrap_or_default();
            let b = (HEX_DIGIT.find(b).unwrap_or_default() as u8) & 0xf;
            ((a << 4) | b) as char
        }

        if self.0 == other.0 {
            return true;
        }
        let mut it1 = self.0.chars();
        let mut it2 = other.0.chars();
        while let Some(c) = it1.next() {
            let c = consume_with_escape(c, &mut it1);
            let c2 = it2.next().unwrap_or_default();
            let c2 = consume_with_escape(c2, &mut it2);
            if c != c2 {
                return false;
            }
        }
        it2.next().is_none()
    }
}

impl UrlNormalizer {
    /// Generates a stream of token bits that can be used to compare whether URLs are "normalized-equal", that is: whether two URLs normalize to the same stream of tokens.
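    ///
    /// As an illustrative sketch (described in prose rather than a doctest, since this method
    /// is private): with the default options, a URL such as
    /// `https://www.example.com/story.html?b=2&utm_source=x&a=1` yields the tokens
    /// `example.com`, `story`, `a`, `1`, `b`, `2`; the `www.` prefix and `.html` extension
    /// are trimmed, the `utm_source` parameter is dropped, and the surviving query pairs
    /// are sorted.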
    fn token_stream<'b>(&self, url: &'b Url) -> impl Iterator<Item = CompareToken<'b>> {
        let mut out = Vec::with_capacity(10);
        let host = self.normalize_host(url).unwrap_or_default();
        out.push(CompareToken(host));
        let path = url.path_segments();
        if let Some(path) = path {
            let mut iter = path.filter(|path| !path.is_empty());
            if let Some(mut curr) = iter.next() {
                loop {
                    if let Some(next) = iter.next() {
                        out.push(CompareToken(curr));
                        curr = next;
                    } else {
                        // Remove anything that looks like a trailing file type (.html, etc)
                        // We allow at most one numeric char
                        if let Some((a, b)) = curr.rsplit_once('.') {
                            if b.len() <= self.path_extension_length
                                && self.trimmed_path_extension_suffixes.is_match_at(b, 0)
                            {
                                out.push(CompareToken(a));
                            } else {
                                out.push(CompareToken(curr));
                            }
                        } else {
                            out.push(CompareToken(curr));
                        }
                        break;
                    }
                }
            }
        }

        if let Some(query) = url.query() {
            let mut query_pairs = Vec::with_capacity(10);
            for bit in query.split('&') {
                let (a, b) = if let Some((a, b)) = bit.split_once('=') {
                    (a, b)
                } else {
                    (bit, "")
                };
                if !self.ignored_query_params.is_match(a) {
                    query_pairs.push((a, b));
                }
            }
            query_pairs.sort();
            for (key, value) in query_pairs {
                out.push(CompareToken(key));
                out.push(CompareToken(value));
            }
        }

        // Keep the fragment iff it looks significant
        let fragment = url.fragment().unwrap_or_default();
        // #!-style fragment paths
        let hash_bang = fragment.starts_with('!');
        // /#/-style fragment paths
        let slash_hash_slash = url.path().ends_with('/') && fragment.starts_with('/');

        if hash_bang || slash_hash_slash {
            out.push(CompareToken(&fragment[1..fragment.len()]));
        }

        // Trim any empty tokens
        out.into_iter().filter(|s| !s.0.is_empty())
    }

    /// Are these two URLs considered the same?
    ///
    /// ```
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// assert!(UrlNormalizer::default().are_same(&Url::parse("http://google.com").unwrap(), &Url::parse("https://google.com").unwrap()));
    /// ```
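    ///
    /// URLs that differ in a significant component are not considered the same. A small
    /// illustrative check, mirroring the cases in the test suite below:
    ///
    /// ```
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// assert!(!UrlNormalizer::default().are_same(&Url::parse("https://google.com/abc").unwrap(), &Url::parse("https://google.com/def").unwrap()));
    /// ```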
    pub fn are_same(&self, a: &Url, b: &Url) -> bool {
        self.token_stream(a).eq(self.token_stream(b))
    }

    /// Compute a normalization string that can be persisted for later comparison. If two normalization strings are identical, the URLs are
    /// considered to be the same.
    ///
    /// ```
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// assert_eq!(UrlNormalizer::default().compute_normalization_string(&Url::parse("http://www.google.com").unwrap()), "google.com:");
    /// ```
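    ///
    /// Because the string can be persisted, it also works as a deduplication key. A minimal
    /// sketch (the two URLs below normalize identically under the default options):
    ///
    /// ```
    /// # use std::collections::HashSet;
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// let norm = UrlNormalizer::default();
    /// let mut seen = HashSet::new();
    /// for url in ["http://www.example.com/story.html", "https://example.com/story"] {
    ///     seen.insert(norm.compute_normalization_string(&Url::parse(url).unwrap()));
    /// }
    /// assert_eq!(seen.len(), 1);
    /// ```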
    pub fn compute_normalization_string(&self, url: &Url) -> String {
        let mut s = String::with_capacity(url.as_str().len());
        for bit in self.token_stream(url) {
            s += bit.0;
            s.push(':');
        }
        s
    }

    /// Normalize the host portion of a `Url`.
    ///
    /// ```
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// assert_eq!(UrlNormalizer::default().normalize_host(&Url::parse("http://www.google.com/?q=search").unwrap()), Some("google.com"));
    /// ```
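    ///
    /// Repeated prefixes such as `m.www.` are stripped as well; this mirrors the host
    /// normalization cases in the test suite below:
    ///
    /// ```
    /// # use url::Url;
    /// # use urlnorm::UrlNormalizer;
    /// assert_eq!(UrlNormalizer::default().normalize_host(&Url::parse("http://m.www.example.com/").unwrap()), Some("example.com"));
    /// ```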
    pub fn normalize_host<'a>(&self, url: &'a Url) -> Option<&'a str> {
        if let Some(mut host) = url.host_str() {
            while let Some(stripped) = self.trimmed_host_prefixes.find_at(host, 0) {
                host = &host[stripped.end()..host.len()];
            }
            let host = host.trim_start_matches('.');
            let host = host.trim_end_matches('.');
            Some(host)
        } else {
            None
        }
    }
}

impl Default for UrlNormalizer {
    fn default() -> Self {
        Options::default()
            .compile()
            .expect("Default options will always safely compile")
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use rstest::*;

    #[fixture]
    fn norm() -> UrlNormalizer {
        UrlNormalizer::default()
    }

    #[test]
    fn test_with_empty_options() {
        let options = Options::new();
        let norm = options.compile().unwrap();
        let url = Url::parse("http://www.google.com").unwrap();
        assert!(norm.are_same(&url, &Url::parse("https://www.google.com").unwrap()));
        assert_eq!(norm.compute_normalization_string(&url), "www.google.com:");
        assert!(!norm.are_same(
            &Url::parse("https://www.google.com?fbclid=1").unwrap(),
            &Url::parse("https://www.google.com?fbclid=2").unwrap()
        ));
    }

    /// Ensure that we don't accidentally break the normalization strings between versions.
    #[test]
    fn test_existing_data() {
        let testdata = include_str!("testdata.txt").trim_end_matches('\n');
        let norm = norm();
        // Note that we can update the test data as needed between versions
        // let mut expected = "".to_owned();
        for line in testdata.split('\n') {
            let (url, existing_norm) = line.split_once("\",\"").expect("Expected one comma");
            let url = &url[1..url.len()];
            let existing_norm = &existing_norm[0..existing_norm.len() - 1];
            let url = Url::parse(url).expect("Failed to parse URL");
            let expected_norm = norm.compute_normalization_string(&url);
            assert_eq!(existing_norm, expected_norm);
            // expected += &format!("\"{}\",\"{}\"\n", url, expected_norm);
        }
        // File::create("testdata2.txt").unwrap().write_all(expected.as_bytes()).unwrap();
    }

    #[rstest]
    #[case("http://www.example.com", "example.com")]
    #[case("http://m.www.example.com", "example.com")]
    #[case("http://www1.example.com", "example.com")]
    #[case("http://ww1.example.com", "example.com")]
    #[case("http://test.www.example.com", "test.www.example.com")]
    #[case("http://www-03.example.com", "example.com")]
    #[case("http://m.example.com", "example.com")]
    #[case("http://m.m.m.m.m.example.com", "example.com")]
    #[case("http://mobile.example.com", "example.com")]
    // Negative cases
    #[case("http://bwwwww.example.com", "bwwwww.example.com")]
    fn test_host_normalization(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        assert_eq!(norm.normalize_host(&Url::parse(a).expect("url")), Some(b));
    }

    #[rstest]
    #[case("abc", "abc")]
    #[case("abc.", "abc.")]
    #[case("ab+c", "ab c")]
    #[case("ab%2ec", "ab.c")]
    fn test_compare_token(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_eq!(a, b);
    }

    #[rstest]
    #[case("abc", "abc.")]
    #[case("abc.", "abc")]
    #[case("abc", "abc%")]
    #[case("abc", "abc%xx")]
    #[case("ab+c", "ab  c")]
    #[case("ab%2ec", "ab/c")]
    fn test_compare_token_ne(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_ne!(a, b);
    }

    /// Test identical URLs on both sides.
    #[rstest]
    #[case("http://x.com")]
    #[case("http://1.2.3.4")]
    #[case("http://google.com/path/?query")]
    #[case("http://google.com/path/?query=bar")]
    #[case("http://facebook.com/path/?fbclid=bar&somequery=ok")]
    fn test_url_normalization_identical(norm: UrlNormalizer, #[case] a: &str) {
        assert!(
            norm.are_same(&Url::parse(a).unwrap(), &Url::parse(a).unwrap()),
            "{} != {}",
            a,
            a
        );
    }

    #[rstest]
    // http/https
    #[case("http://google.com", "https://google.com")]
    // Escaped period
    #[case("http://google%2ecom", "https://google.com")]
    // www.
    #[case("https://www.google.com", "https://google.com")]
    // .html
    #[case("https://www.google.com/foo.html", "https://www.google.com/foo")]
    // Empty query/fragment/path
    #[case("https://www.google.com/?#", "https://www.google.com")]
    // Trailing/multiple slashes
    #[case("https://www.google.com/", "https://www.google.com")]
    #[case("https://www.google.com/foo", "https://www.google.com/foo/")]
    #[case("https://www.google.com//foo", "https://www.google.com/foo")]
    // Ignored query params
    #[case("http://x.com?utm_source=foo", "http://x.com")]
    #[case("http://x.com?fbclid=foo&gclid=bar", "http://x.com")]
    #[case("http://x.com?fbclid=foo", "http://x.com?fbclid=basdf")]
    #[case("http://archinte.jamanetwork.com/article.aspx?articleid=1898878&__hstc=9292970.6d480b0896ec071bae4c3d40c40ec7d5.1407456000124.1407456000125.1407456000126.1&__hssc=9292970.1.1407456000127&__hsfp=1314462730", "http://archinte.jamanetwork.com/article.aspx?articleid=1898878")]
    // Ignored fragments
    #[case("http://x.com", "http://x.com#something")]
    // Ignored empty domain segments
    #[case("http://x.com", "http://x.com.")]
    #[case("http://x.com", "http://x.com..")]
    #[case("http://x.com", "http://.x.com")]
    // TODO: We need to normalize these too
    // #[case("http://x.com", "http://x..com")]
    fn test_url_normalization_same(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_eq!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(norm.are_same(&a, &b), "{} != {}", a, b);
    }
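
    /// Illustrative check: query parameters are compared order-insensitively, since
    /// `token_stream` sorts the surviving key/value pairs before comparison.
    #[rstest]
    fn test_query_param_order_insensitive(norm: UrlNormalizer) {
        let a = Url::parse("http://x.com/?b=2&a=1").unwrap();
        let b = Url::parse("http://x.com/?a=1&b=2").unwrap();
        assert_eq!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(norm.are_same(&a, &b));
    }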

    #[rstest]
    #[case("http://1.2.3.4", "http://1.2.3.5")]
    #[case("https://test.www.google.com", "https://test.www1.google.com")]
    #[case("https://google.com", "https://facebook.com")]
    #[case("https://google.com/abc", "https://google.com/def")]
    #[case("https://google.com/?page=1", "https://google.com/?page=2")]
    #[case("https://google.com/?page=%31", "https://google.com/?page=%32")]
    #[case("https://amazon.com/product/ref=a", "https://amazon.com/product/ref=b")]
    // Negative case: slightly modified query string param
    #[case("http://x.com?xfbclid=foo", "http://x.com?xfbclid=basdf")]
    // Negative case: long extension
    #[case("http://x.com/file.html12345", "http://x.com/file.html12346")]
    // Examples of real URLs that should not be normalized together
    #[case("http://arxiv.org/abs/1405.0126", "http://arxiv.org/abs/1405.0351")]
    #[case(
        "http://www.bmj.com/content/360/bmj.j5855",
        "http://www.bmj.com/content/360/bmj.k322"
    )]
    #[case(
        "https://www.google.com/contributor/welcome/#/intro",
        "https://www.google.com/contributor/welcome/#/about"
    )]
    #[case(
        "https://groups.google.com/forum/#!topic/mailing.postfix.users/6Kkel3J_nv4",
        "https://groups.google.com/forum/#!topic/erlang-programming/nFWfmwK64RU"
    )]
    fn test_url_normalization_different(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_ne!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(!norm.are_same(&a, &b), "{} != {}", a, b);
    }

    // TODO: Known failures
    // http://apenwarr.ca/log/?m=201407#01 http://apenwarr.ca/log/?m=201407#14
    // https://www.google.com/trends/explore#q=golang https://www.google.com/trends/explore#q=rustlang
    // fn test_known_failures() {

    // }
}