1#![doc = include_str!("../README.md")]
2#[cfg(test)]
3use std::str::Chars;
4
5use regex::Regex;
6use url::Url;
7
/// Query parameters that carry tracking/analytics state rather than content
/// and are therefore ignored when comparing URLs. Each entry is a regex
/// alternative that must match the entire parameter name (the compiled
/// pattern is anchored with `^(...)$`).
const DEFAULT_IGNORED_QUERY_PARAMS: [&str; 15] = [
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "utm_expid",
    "gclid",
    "_ga",
    "_gl",
    "msclkid",
    "fbclid",
    "mc_cid",
    "mc_eid",
    // WebTrends tracking ids, e.g. `WT.mc_id` / `wt.mc_ev`.
    "[Ww][Tt]\\.mc_(id|ev)",
    // Catch-all for double-underscore-prefixed params such as `__hstc`.
    "__[a-z]+",
];
26
/// Host prefixes trimmed before hosts are compared: `www.`, `www2.`, `m.`,
/// `mobile.`, `old...`, and data-center-style variants such as `www-03.`.
/// Written in extended (`(?x)`) regex syntax, so whitespace is insignificant.
const DEFAULT_WWW_PREFIX: &str = r#"(?x)
    ([0-9]-?)?
    (old)?
    (www?[0-9]*|m|mobile)
    (-[a-z0-9]{1,3})?
    \.
"#;
36
/// Matches file-like path extensions (e.g. `html`, `aspx`, `php3`) that may
/// be trimmed from the final path segment during normalization.
const DEFAULT_EXTENSION_SUFFIX: &str = "[a-zA-Z]+[0-9]?$";
39
/// Configuration for building a [`UrlNormalizer`] via [`Options::compile`].
pub struct Options {
    /// Regex alternatives naming query parameters to drop when comparing.
    pub ignored_query_params: Vec<String>,
    /// Regex alternatives for host prefixes trimmed before comparison.
    pub trimmed_host_prefixes: Vec<String>,
    /// Regex alternatives for extensions trimmable from the last path segment.
    pub trimmed_path_extension_suffixes: Vec<String>,
    /// Maximum length of a path extension eligible for trimming.
    pub path_extension_length: usize,
}
86
87impl Default for Options {
88 fn default() -> Self {
89 let new = Self::new();
90 new.with_ignored_query_params(DEFAULT_IGNORED_QUERY_PARAMS)
91 .with_trimmed_host_prefixes([DEFAULT_WWW_PREFIX])
92 .with_trimmed_path_extension_suffixes([DEFAULT_EXTENSION_SUFFIX])
93 .with_path_extension_length(6)
94 }
95}
96
97impl Options {
98 pub fn new() -> Self {
100 Self {
101 ignored_query_params: vec![],
102 trimmed_host_prefixes: vec![],
103 trimmed_path_extension_suffixes: vec![],
104 path_extension_length: 0,
105 }
106 }
107
108 fn compile_ignored_query_params_regex(
109 ignored_query_params: Vec<String>,
110 ) -> Result<Regex, regex::Error> {
111 Regex::new(&format!("^({})$", ignored_query_params.join("|")))
112 }
113
114 fn compile_trimmed_host_prefixes_regex(
115 trimmed_host_prefixes: Vec<String>,
116 ) -> Result<Regex, regex::Error> {
117 if trimmed_host_prefixes.is_empty() {
118 Regex::new("\\A[\0]")
120 } else {
121 Regex::new(&format!("\\A({})", trimmed_host_prefixes.join("|")))
122 }
123 }
124
125 fn compile_trimmed_path_extension_suffixes_regex(
126 trimmed_path_extension_suffixes: Vec<String>,
127 ) -> Result<Regex, regex::Error> {
128 Regex::new(&format!("({})$", trimmed_path_extension_suffixes.join("|")))
129 }
130
131 pub fn compile(self) -> Result<UrlNormalizer, regex::Error> {
133 Ok(UrlNormalizer {
135 ignored_query_params: Self::compile_ignored_query_params_regex(
136 self.ignored_query_params,
137 )?,
138 trimmed_host_prefixes: Self::compile_trimmed_host_prefixes_regex(
139 self.trimmed_host_prefixes,
140 )?,
141 trimmed_path_extension_suffixes: Self::compile_trimmed_path_extension_suffixes_regex(
142 self.trimmed_path_extension_suffixes,
143 )?,
144 path_extension_length: self.path_extension_length,
145 })
146 }
147
148 pub fn with_ignored_query_params<S: AsRef<str>, I: IntoIterator<Item = S>>(
150 mut self,
151 iter: I,
152 ) -> Self {
153 self.ignored_query_params = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
154 self
155 }
156
157 pub fn with_trimmed_host_prefixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
159 mut self,
160 iter: I,
161 ) -> Self {
162 self.trimmed_host_prefixes = iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
163 self
164 }
165
166 pub fn with_trimmed_path_extension_suffixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
168 mut self,
169 iter: I,
170 ) -> Self {
171 self.trimmed_path_extension_suffixes =
172 iter.into_iter().map(|s| s.as_ref().to_owned()).collect();
173 self
174 }
175
176 pub fn with_path_extension_length(mut self, path_extension_length: usize) -> Self {
178 self.path_extension_length = path_extension_length;
179 self
180 }
181}
182
/// A compiled URL normalizer; build one via [`Options::compile`] or
/// [`UrlNormalizer::default`]. Holds the pre-compiled regexes so comparisons
/// don't recompile patterns.
pub struct UrlNormalizer {
    // Whole-name match for query parameter keys to drop.
    ignored_query_params: Regex,
    // Start-anchored match for host prefixes to strip (applied repeatedly).
    trimmed_host_prefixes: Regex,
    // End-anchored match for extensions trimmable from the last path segment.
    trimmed_path_extension_suffixes: Regex,
    // Maximum extension length eligible for trimming.
    path_extension_length: usize,
}
190
/// A single comparable unit of a URL (host, path segment, query key/value,
/// or fragment), compared by exact string equality.
#[derive(Debug, PartialEq, Eq)]
struct CompareToken<'a>(&'a str);

/// Test-only token that compares strings as if percent-escapes (`%2e`,
/// `%2E`) and `+`-for-space were decoded first.
#[cfg(test)]
#[derive(Debug)]
struct EscapedCompareToken<'a>(&'a str);

#[cfg(test)]
impl<'a> PartialEq for EscapedCompareToken<'a> {
    fn eq(&self, other: &Self) -> bool {
        /// Decodes one logical character: `+` becomes a space, `%xy` is
        /// hex-decoded (consuming two more chars from `ci`), anything else
        /// passes through unchanged.
        fn consume_with_escape(c: char, ci: &mut Chars) -> char {
            // Each hex digit appears twice (lowercase then uppercase), so a
            // plain `find` locates either case. The index must be reduced
            // modulo 16 to recover the digit's value: uppercase digits sit at
            // indices 16..=31. Previously the low digit was OR'd in unmasked,
            // so e.g. `%2E` decoded to '>' instead of '.'.
            const HEX_DIGIT: &str = "0123456789abcdef0123456789ABCDEF";
            if c == '+' {
                return ' ';
            }
            if c != '%' {
                return c;
            }
            // Invalid/missing hex digits fall back to 0 ('\0' never equals a
            // real character, so malformed escapes simply compare unequal).
            let a = ci.next().unwrap_or_default();
            let a = (HEX_DIGIT.find(a).unwrap_or_default() & 0xf) as u8;
            let b = ci.next().unwrap_or_default();
            let b = (HEX_DIGIT.find(b).unwrap_or_default() & 0xf) as u8;
            ((a << 4) | b) as char
        }

        // Fast path: byte-identical strings are trivially equal.
        if self.0 == other.0 {
            return true;
        }
        let mut it1 = self.0.chars();
        let mut it2 = other.0.chars();
        while let Some(c) = it1.next() {
            let c = consume_with_escape(c, &mut it1);
            let c2 = it2.next().unwrap_or_default();
            let c2 = consume_with_escape(c2, &mut it2);
            if c != c2 {
                return false;
            }
        }
        // Equal only if the right side is exhausted too.
        it2.next().is_none()
    }
}
233
234impl UrlNormalizer {
235 fn token_stream<'b>(&self, url: &'b Url) -> impl Iterator<Item = CompareToken<'b>> {
237 let mut out = Vec::with_capacity(10);
238 let host = self.normalize_host(url).unwrap_or_default();
239 out.push(CompareToken(host));
240 let path = url.path_segments();
241 if let Some(path) = path {
242 let mut iter = path.filter(|path| !path.is_empty());
243 if let Some(mut curr) = iter.next() {
244 loop {
245 if let Some(next) = iter.next() {
246 out.push(CompareToken(curr));
247 curr = next;
248 } else {
249 if let Some((a, b)) = curr.rsplit_once('.') {
252 if b.len() <= self.path_extension_length
253 && self.trimmed_path_extension_suffixes.is_match_at(b, 0)
254 {
255 out.push(CompareToken(a));
256 } else {
257 out.push(CompareToken(curr));
258 }
259 } else {
260 out.push(CompareToken(curr));
261 }
262 break;
263 }
264 }
265 }
266 }
267
268 if let Some(query) = url.query() {
269 let mut query_pairs = Vec::with_capacity(10);
270 for bit in query.split('&') {
271 let (a, b) = if let Some((a, b)) = bit.split_once('=') {
272 (a, b)
273 } else {
274 (bit, "")
275 };
276 if !self.ignored_query_params.is_match(a) {
277 query_pairs.push((a, b));
278 }
279 }
280 query_pairs.sort();
281 for (key, value) in query_pairs {
282 out.push(CompareToken(key));
283 out.push(CompareToken(value));
284 }
285 }
286
287 let fragment = url.fragment().unwrap_or_default();
289 let hash_bang = fragment.starts_with('!');
291 let slash_hash_slash = url.path().ends_with('/') && fragment.starts_with('/');
293
294 if hash_bang || slash_hash_slash {
295 out.push(CompareToken(&fragment[1..fragment.len()]));
296 }
297
298 out.into_iter().filter(|s| !s.0.is_empty())
300 }
301
302 pub fn are_same(&self, a: &Url, b: &Url) -> bool {
310 self.token_stream(a).eq(self.token_stream(b))
311 }
312
313 pub fn compute_normalization_string(&self, url: &Url) -> String {
322 let mut s = String::with_capacity(url.as_str().len());
323 for bit in self.token_stream(url) {
324 s += bit.0;
325 s.push(':');
326 }
327 s
328 }
329
330 pub fn normalize_host<'a>(&self, url: &'a Url) -> Option<&'a str> {
338 if let Some(mut host) = url.host_str() {
339 while let Some(stripped) = self.trimmed_host_prefixes.find_at(host, 0) {
340 host = &host[stripped.end()..host.len()];
341 }
342 let host = host.trim_start_matches('.');
343 let host = host.trim_end_matches('.');
344 Some(host)
345 } else {
346 None
347 }
348 }
349}
350
351impl Default for UrlNormalizer {
352 fn default() -> Self {
353 Options::default()
354 .compile()
355 .expect("Default options will always safely compile")
356 }
357}
358
#[cfg(test)]
mod test {
    use super::*;
    use rstest::*;

    /// Shared fixture: a normalizer built with the default options.
    #[fixture]
    fn norm() -> UrlNormalizer {
        UrlNormalizer::default()
    }

    #[test]
    fn test_with_empty_options() {
        // With no patterns configured, only the scheme difference is erased.
        let options = Options::new();
        let norm = options.compile().unwrap();
        let url = Url::parse("http://www.google.com").unwrap();
        assert!(norm.are_same(&url, &Url::parse("https://www.google.com").unwrap()));
        assert_eq!(norm.compute_normalization_string(&url), "www.google.com:");
        assert!(!norm.are_same(
            &Url::parse("https://www.google.com?fbclid=1").unwrap(),
            &Url::parse("https://www.google.com?fbclid=2").unwrap()
        ));
    }

    /// Golden-file test: normalization output must stay stable for the
    /// recorded `"url","normalization"` pairs in `testdata.txt`.
    #[test]
    fn test_existing_data() {
        let testdata = include_str!("testdata.txt").trim_end_matches('\n');
        let norm = norm();
        for line in testdata.split('\n') {
            let (url, existing_norm) = line.split_once("\",\"").expect("Expected one comma");
            let url = &url[1..]; // drop the leading quote
            let existing_norm = &existing_norm[0..existing_norm.len() - 1]; // drop the trailing quote
            let url = Url::parse(url).expect("Failed to parse URL");
            let expected_norm = norm.compute_normalization_string(&url);
            assert_eq!(existing_norm, expected_norm);
        }
    }

    #[rstest]
    #[case("http://www.example.com", "example.com")]
    #[case("http://m.www.example.com", "example.com")]
    #[case("http://www1.example.com", "example.com")]
    #[case("http://ww1.example.com", "example.com")]
    #[case("http://test.www.example.com", "test.www.example.com")]
    #[case("http://www-03.example.com", "example.com")]
    #[case("http://m.example.com", "example.com")]
    #[case("http://m.m.m.m.m.example.com", "example.com")]
    #[case("http://mobile.example.com", "example.com")]
    #[case("http://bwwwww.example.com", "bwwwww.example.com")]
    fn test_host_normalization(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        assert_eq!(norm.normalize_host(&Url::parse(a).expect("url")), Some(b));
    }

    #[rstest]
    #[case("abc", "abc")]
    #[case("abc.", "abc.")]
    #[case("ab+c", "ab c")]
    #[case("ab%2ec", "ab.c")]
    fn test_compare_token(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_eq!(a, b);
    }

    #[rstest]
    #[case("abc", "abc.")]
    #[case("abc.", "abc")]
    #[case("abc", "abc%")]
    #[case("abc", "abc%xx")]
    // NOTE: this case was `("ab+c", "ab c")`, but `+` decodes to a space, so
    // that pair is *equal* (asserted so in `test_compare_token` above); use a
    // pair that genuinely differs.
    #[case("ab+c", "ab-c")]
    #[case("ab%2ec", "ab/c")]
    fn test_compare_token_ne(#[case] a: &str, #[case] b: &str) {
        let a = EscapedCompareToken(a);
        let b = EscapedCompareToken(b);
        assert_ne!(a, b);
    }

    #[rstest]
    #[case("http://x.com")]
    #[case("http://1.2.3.4")]
    #[case("http://google.com/path/?query")]
    #[case("http://google.com/path/?query=bar")]
    #[case("http://facebook.com/path/?fbclid=bar&somequery=ok")]
    fn test_url_normalization_identical(norm: UrlNormalizer, #[case] a: &str) {
        assert!(
            norm.are_same(&Url::parse(a).unwrap(), &Url::parse(a).unwrap()),
            "{} != {}",
            a,
            a
        );
    }

    #[rstest]
    #[case("http://google.com", "https://google.com")]
    #[case("http://google%2ecom", "https://google.com")]
    #[case("https://www.google.com", "https://google.com")]
    #[case("https://www.google.com/foo.html", "https://www.google.com/foo")]
    #[case("https://www.google.com/?#", "https://www.google.com")]
    #[case("https://www.google.com/", "https://www.google.com")]
    #[case("https://www.google.com/foo", "https://www.google.com/foo/")]
    #[case("https://www.google.com//foo", "https://www.google.com/foo")]
    #[case("http://x.com?utm_source=foo", "http://x.com")]
    #[case("http://x.com?fbclid=foo&gclid=bar", "http://x.com")]
    #[case("http://x.com?fbclid=foo", "http://x.com?fbclid=basdf")]
    #[case("http://archinte.jamanetwork.com/article.aspx?articleid=1898878&__hstc=9292970.6d480b0896ec071bae4c3d40c40ec7d5.1407456000124.1407456000125.1407456000126.1&__hssc=9292970.1.1407456000127&__hsfp=1314462730", "http://archinte.jamanetwork.com/article.aspx?articleid=1898878")]
    #[case("http://x.com", "http://x.com#something")]
    #[case("http://x.com", "http://x.com.")]
    #[case("http://x.com", "http://x.com..")]
    #[case("http://x.com", "http://.x.com")]
    fn test_url_normalization_same(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_eq!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(norm.are_same(&a, &b), "{} != {}", a, b);
    }

    #[rstest]
    #[case("http://1.2.3.4", "http://1.2.3.5")]
    #[case("https://test.www.google.com", "https://test.www1.google.com")]
    #[case("https://google.com", "https://facebook.com")]
    #[case("https://google.com/abc", "https://google.com/def")]
    #[case("https://google.com/?page=1", "https://google.com/?page=2")]
    #[case("https://google.com/?page=%31", "https://google.com/?page=%32")]
    #[case("https://amazon.com/product/ref=a", "https://amazon.com/product/ref=b")]
    #[case("http://x.com?xfbclid=foo", "http://x.com?xfbclid=basdf")]
    #[case("http://x.com/file.html12345", "http://x.com/file.html12346")]
    #[case("http://arxiv.org/abs/1405.0126", "http://arxiv.org/abs/1405.0351")]
    #[case(
        "http://www.bmj.com/content/360/bmj.j5855",
        "http://www.bmj.com/content/360/bmj.k322"
    )]
    #[case(
        "https://www.google.com/contributor/welcome/#/intro",
        "https://www.google.com/contributor/welcome/#/about"
    )]
    #[case(
        "https://groups.google.com/forum/#!topic/mailing.postfix.users/6Kkel3J_nv4",
        "https://groups.google.com/forum/#!topic/erlang-programming/nFWfmwK64RU"
    )]
    fn test_url_normalization_different(norm: UrlNormalizer, #[case] a: &str, #[case] b: &str) {
        let a = Url::parse(a).unwrap();
        let b = Url::parse(b).unwrap();
        assert_ne!(
            norm.compute_normalization_string(&a),
            norm.compute_normalization_string(&b)
        );
        assert!(!norm.are_same(&a, &b), "{} != {}", a, b);
    }
}