halldyll_core/crawl/
normalize.rs1use url::Url;
4use std::collections::BTreeMap;
5
/// Configuration for URL normalization.
///
/// Each flag switches one normalization step on or off; see
/// `UrlNormalizer::normalize` for how the steps are applied.
#[derive(Debug, Clone)]
pub struct UrlNormalizer {
    /// Strip the `#fragment` component.
    remove_fragments: bool,
    /// Emit query parameters ordered by key.
    sort_query_params: bool,
    /// Drop query parameters whose key is listed in `tracking_params`.
    remove_tracking_params: bool,
    /// Query keys treated as tracking parameters (compared ASCII case-insensitively).
    tracking_params: Vec<String>,
    /// Rewrite the `http` scheme to `https`.
    force_https: bool,
    /// Strip a leading `www.` from the host.
    remove_www: bool,
    /// Strip a single trailing `/` from paths longer than the root path.
    remove_trailing_slash: bool,
    /// Drop an explicit port that equals the scheme's default (80 for http, 443 for https).
    remove_default_port: bool,
}
25
26impl Default for UrlNormalizer {
27 fn default() -> Self {
28 Self {
29 remove_fragments: true,
30 sort_query_params: true,
31 remove_tracking_params: true,
32 tracking_params: vec![
33 "utm_source".to_string(),
34 "utm_medium".to_string(),
35 "utm_campaign".to_string(),
36 "utm_term".to_string(),
37 "utm_content".to_string(),
38 "fbclid".to_string(),
39 "gclid".to_string(),
40 "ref".to_string(),
41 "_ga".to_string(),
42 ],
43 force_https: false,
44 remove_www: false,
45 remove_trailing_slash: false,
46 remove_default_port: true,
47 }
48 }
49}
50
51impl UrlNormalizer {
52 pub fn new() -> Self {
54 Self::default()
55 }
56
57 pub fn normalize(&self, url: &Url) -> Url {
59 let mut url = url.clone();
60
61 if self.remove_fragments {
63 url.set_fragment(None);
64 }
65
66 if self.remove_default_port {
68 if let Some(port) = url.port() {
69 let default_port = match url.scheme() {
70 "http" => 80,
71 "https" => 443,
72 _ => 0,
73 };
74 if port == default_port {
75 let _ = url.set_port(None);
76 }
77 }
78 }
79
80 if self.force_https && url.scheme() == "http" {
82 let _ = url.set_scheme("https");
83 }
84
85 if self.remove_www {
87 if let Some(host) = url.host_str() {
88 if host.starts_with("www.") {
89 let new_host = host[4..].to_string();
90 let _ = url.set_host(Some(&new_host));
91 }
92 }
93 }
94
95 if self.remove_trailing_slash {
97 let path = url.path().to_string();
98 if path.len() > 1 && path.ends_with('/') {
99 url.set_path(&path[..path.len() - 1]);
100 }
101 }
102
103 if self.sort_query_params || self.remove_tracking_params {
105 let query_pairs: Vec<(String, String)> = url
106 .query_pairs()
107 .map(|(k, v)| (k.to_string(), v.to_string()))
108 .collect();
109
110 if !query_pairs.is_empty() {
111 let mut filtered: BTreeMap<String, String> = BTreeMap::new();
112
113 for (key, value) in query_pairs {
114 if self.remove_tracking_params
116 && self.tracking_params.iter().any(|t| t.eq_ignore_ascii_case(&key))
117 {
118 continue;
119 }
120 filtered.insert(key, value);
121 }
122
123 if filtered.is_empty() {
124 url.set_query(None);
125 } else {
126 let query: String = filtered
127 .iter()
128 .map(|(k, v)| format!("{}={}", k, v))
129 .collect::<Vec<_>>()
130 .join("&");
131 url.set_query(Some(&query));
132 }
133 }
134 }
135
136 url
137 }
138
139 pub fn resolve(&self, base: &Url, relative: &str) -> Option<Url> {
141 base.join(relative).ok().map(|u| self.normalize(&u))
142 }
143
144 pub fn are_equal(&self, a: &Url, b: &Url) -> bool {
146 self.normalize(a) == self.normalize(b)
147 }
148}
149
/// Returns the host of `url` as an owned string, or `None` when the URL has
/// no host component.
pub fn extract_domain(url: &Url) -> Option<String> {
    let host = url.host_str()?;
    Some(host.to_owned())
}
154
155pub fn extract_base_domain(url: &Url) -> Option<String> {
157 url.host_str().map(|h| {
158 let parts: Vec<&str> = h.split('.').collect();
159 if parts.len() > 2 {
160 parts[parts.len() - 2..].join(".")
162 } else {
163 h.to_string()
164 }
165 })
166}
167
168pub fn is_same_domain(a: &Url, b: &Url) -> bool {
170 a.host_str() == b.host_str()
171}
172
173pub fn is_same_base_domain(a: &Url, b: &Url) -> bool {
175 extract_base_domain(a) == extract_base_domain(b)
176}