1use derivative::Derivative;
17use url::Url;
18
19mod rules;
20
21#[derive(Derivative)]
22#[derivative(Debug)]
23pub(crate) struct Rule {
24 host_path: Vec<M>,
26 params: Vec<M>,
28 #[derivative(Debug = "ignore")]
49 handler: Option<Box<dyn Fn(Url) -> Url + Sync + Send>>,
50}
51
/// A byte-wise string matcher used by rules to select hosts/paths and
/// parameter keys.
///
/// Every variant except [`M::Any`] requires an input to be present: matching
/// against `None` only succeeds for `M::Any`.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum M {
    /// Matches everything, including a missing (`None`) input.
    Any,
    /// Matches every present input except the exact given string.
    AllBut(&'static str),
    /// Matches when every listed needle occurs somewhere in the input.
    /// An empty list therefore matches any present input.
    ContainsAll(Vec<&'static str>),
    /// Matches when the input equals the given string.
    Exact(&'static str),
    /// Matches when the input begins with the given prefix.
    StartsWith(&'static str),
    /// Matches when the given needle occurs somewhere in the input.
    Contains(&'static str),
}

impl M {
    /// Convenience wrapper around [`M::matches`] for string input.
    fn matches_str(&self, input: Option<&str>) -> bool {
        self.matches(input.map(str::as_bytes))
    }

    /// Returns `true` when `input` satisfies this matcher.
    ///
    /// Comparison is done on raw bytes, so it is case-sensitive and performs
    /// no normalisation.
    fn matches(&self, input: Option<&[u8]>) -> bool {
        match input {
            Some(input) => match self {
                M::Any => true,
                M::Exact(expected) => input == expected.as_bytes(),
                M::StartsWith(prefix) => input.starts_with(prefix.as_bytes()),
                M::Contains(needle) => Self::contains_bytes(input, needle.as_bytes()),
                M::ContainsAll(needles) => needles
                    .iter()
                    .all(|needle| Self::contains_bytes(input, needle.as_bytes())),
                M::AllBut(excluded) => input != excluded.as_bytes(),
            },
            // Kept exhaustive on purpose: adding a variant must force a
            // decision about how it treats a missing input.
            None => match self {
                M::Any => true,
                M::Exact(_)
                | M::StartsWith(_)
                | M::Contains(_)
                | M::ContainsAll(_)
                | M::AllBut(_) => false,
            },
        }
    }

    /// Sub-slice search: does `needle` occur anywhere in `haystack`?
    ///
    /// An empty needle is contained in every haystack (same convention as
    /// `str::contains("")`). It is handled up front because
    /// `slice::windows(0)` panics.
    fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
        needle.is_empty() || haystack.windows(needle.len()).any(|window| window == needle)
    }
}
96
97#[derive(Debug, Clone)]
109pub struct Cleaned {
110 result: Url,
111 handlers_used: i32,
112}
113
114impl std::ops::Deref for Cleaned {
115 type Target = Url;
116
117 fn deref(&self) -> &Self::Target {
118 &self.result
119 }
120}
121
122impl Cleaned {
123 pub fn number_of_handlers_used(&self) -> i32 {
124 self.handlers_used
125 }
126}
127
128impl ToString for Cleaned {
129 fn to_string(&self) -> String {
130 self.result.as_ref().trim_end_matches('=').to_string()
131 }
132}
133
134pub fn clean(url: Url) -> Cleaned {
138 let mut handlers_used = 0;
139 let host_path = format!(
141 "{}/{}",
142 url.host_str().unwrap_or_default().trim_end_matches('/'),
143 url.path()
144 );
145 let matched_rules = rules::GLOBAL_PARAMS
146 .iter()
147 .filter(|r| r.host_path.iter().any(|d| d.matches_str(Some(&host_path))))
148 .collect::<Vec<_>>();
149
150 let rules_with_handles = matched_rules.iter().filter(|r| r.handler.is_some());
152
153 let mut url = url;
154 for rule in rules_with_handles {
155 if let Some(handler) = &rule.handler {
156 url = handler(url);
157 handlers_used += 1;
158 }
159 }
160
161 Cleaned {
162 result: clean_hash_params(clean_query_string(url, &matched_rules), &matched_rules),
163 handlers_used,
164 }
165}
166
167pub fn clean_str(url: &str) -> Result<String, url::ParseError> {
172 let url = Url::parse(url)?;
173 let url = clean(url);
174
175 Ok(url.to_string())
176}
177
178pub fn clean_str_raw(url: &str) -> Result<Cleaned, url::ParseError> {
180 let url = Url::parse(url)?;
181 let cleaned = clean(url);
182
183 Ok(cleaned)
184}
185
186fn clean_query_string(url: Url, rules: &[&Rule]) -> Url {
187 let mut url = url;
188 if url.query().is_none() {
189 return url;
190 }
191
192 let queries = url
193 .query_pairs()
194 .into_iter()
195 .filter(|(k, _)| {
196 !rules
197 .iter()
198 .any(|r| r.params.iter().any(|p| p.matches_str(Some(k.as_ref()))))
199 })
200 .map(|(k, v)| (k.to_string(), v.to_string()))
201 .collect::<Vec<_>>();
202
203 url.set_query(None); if queries.is_empty() {
205 return url; }
207
208 let mut params = url.query_pairs_mut();
209
210 for (k, v) in queries {
211 params.append_pair(k.as_ref(), v.as_ref());
212 }
213
214 params.finish().to_owned()
215}
216
217fn clean_hash_params(url: Url, rules: &[&Rule]) -> Url {
218 let mut url = url;
219
220 if let Some(f) = url.fragment() {
221 let mut fr = String::with_capacity(f.len());
222
223 for item in f.split('&') {
224 if let Some(key) = item.split('=').take(1).collect::<Vec<_>>().first() {
225 if !rules
226 .iter()
227 .any(|r| r.params.iter().any(|p| p.matches_str(Some(*key))))
228 {
229 fr.push_str(item);
230 fr.push('&');
231 }
232 }
233 }
234 if fr.ends_with('&') {
235 fr.remove(fr.len() - 1);
236 }
237
238 if fr.is_empty() {
239 url.set_fragment(None); } else {
241 url.set_fragment(Some(fr.as_str()));
242 }
243 }
244
245 url
246}
247
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    // Query-string cleaning on a twitter status URL: `ref_src` is expected to
    // be stripped, `from` is expected to survive.
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: single good query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: good & bad query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home&ref_src",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: bad query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from",
        "https://twitter.com/elonmusk/status/1608273870901096454?from"; "twitter: single good query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from&ref_src=abc",
        "https://twitter.com/elonmusk/status/1608273870901096454?from"; "twitter: bad query with value good query without value"
    )]
    fn query(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Same expectations as `query`, but with the parameters in the fragment.
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad hash param"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home"; "twitter: single good hash param"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home"; "twitter: good & bad hash param"
    )]
    fn hash(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Query string and fragment cleaned in the same URL.
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw&from=home#ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home#from=home"; "twitter: good & bad hash param and query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw#ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: all bad hash param and query"
    )]
    fn both(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Host-independent behaviour: `utm_*` removal and value-less parameters.
    #[test_case(
        "https://example.com/my-post?utm_xyx=abc&id=12456",
        "https://example.com/my-post?id=12456"; "misc: all utm_ query"
    )]
    #[test_case(
        "https://example.com/my-post?utm_xyx=abc&id=12456&utm_life=asssc",
        "https://example.com/my-post?id=12456"; "misc: all utm_ query (two)"
    )]
    #[test_case(
        "https://whatsmyreferer.com/?json",
        "https://whatsmyreferer.com/?json"; "misc: no trailing eq ="
    )]
    fn misc(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Redirect / tracker URLs that are expected to be replaced by the target
    // URL carried in one of their query parameters.
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&url=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: parses the url query string"
    )]
    #[test_case(
        "https://www.google.com/url?q=http://www.capitalfm.com/news/tv-film/netflix/kaleidoscope-episode-order/&sa=D&source=calendar&usd=2&usg=AOvVaw0DUKL0RoiXBhCFMYU_U2jY",
        "http://www.capitalfm.com/news/tv-film/netflix/kaleidoscope-episode-order/"; "google result: no url query string"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has q query string"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=invalid_url&q=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has two q query strings"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=invalid_url&q=https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has two q query strings + unencoded value"
    )]
    #[test_case(
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=https%3A%2F%2Fwww.britishairways.com",
        "https://www.britishairways.com/"; "youtube /redirect: parses q"
    )]
    #[test_case(
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=invalid_url",
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=invalid_url"; "youtube /redirect: ignores invalid q"
    )]
    #[test_case(
        "https://www.amazon.co.uk/gp/r.html?C=HEX&K=SOMEHEX&M=urn:rtn:msg:NUMBERS&R=SOMETHING&T=C&U=https%3A%2F%2Fwww.amazon.co.uk%2Fgp%2Fyour-account%2Forder-details%3ForderID%3DOREDER_ID%26ref_%3Dpreference&H=TEXT&ref_=pe_ref_with_underscore",
        "https://www.amazon.co.uk/gp/your-account/order-details?orderID=OREDER_ID&ref_=preference"; "amazon: extract from U"
    )]
    #[test_case(
        "https://email.clearscore.com/uni/track?uid=UUID&txnid=UUID&bsft_aaid=UUID&eid=UUID&mid=UUID&bsft_ek=RANDOM&bsft_mime_type=html&bsft_tv=27&bsft_lx=9&a=click&redir=https%3A%2F%2Fapp.clearscore.com%2Freport%3Futm_campaign%3Deml_lc_ca_alerts_2021_02_09%26utm_source%3Dblueshift%26utm_medium%3Demail%26utm_content%3Deml_lc_alerts_new_template_2022_04_01",
        "https://app.clearscore.com/report"; "generic email tracker: with track in path"
    )]
    fn site_specific(input: &str, expected: &str) {
        test_common(input, expected)
    }

    /// Parses `input`, cleans it and compares the string form with `expected`.
    fn test_common(input: &str, expected: &str) {
        let result = clean(Url::parse(input).unwrap()).to_string();

        assert_eq!(
            result, expected,
            "\nExpected: `{}`\n Found: `{}`",
            expected, result
        );
    }

    /// Exercises every `M` variant with a hit, a miss and a missing input.
    #[test]
    fn matcher() {
        assert!(M::Any.matches_str(Some("yoyo")), "any");
        assert!(M::Any.matches_str(None), "any: missing input");

        assert!(
            M::Contains("utm_").matches_str(Some("abc_utm_")),
            "contains"
        );
        assert!(
            !M::Contains("utm_").matches_str(Some("abc")),
            "contains: input shorter than needle"
        );

        assert!(M::Exact("utm_").matches_str(Some("utm_")), "exact");
        assert!(
            !M::Exact("utm_").matches_str(Some("utm_source")),
            "exact: longer input"
        );

        assert!(
            M::StartsWith("utm_").matches_str(Some("utm_abc")),
            "starts_with"
        );
        assert!(
            !M::StartsWith("utm_").matches_str(Some("x_utm_")),
            "starts_with: needle not at start"
        );

        assert!(
            M::ContainsAll(vec!["ref", "src"]).matches_str(Some("ref_src")),
            "contains_all"
        );
        assert!(
            !M::ContainsAll(vec!["ref", "src"]).matches_str(Some("ref_only")),
            "contains_all: one needle missing"
        );

        assert!(M::AllBut("keep").matches_str(Some("drop")), "all_but");
        assert!(
            !M::AllBut("keep").matches_str(Some("keep")),
            "all_but: excluded value"
        );

        // Only `Any` accepts a missing input.
        assert!(!M::Exact("a").matches_str(None), "exact: missing input");
        assert!(
            !M::StartsWith("a").matches_str(None),
            "starts_with: missing input"
        );
        assert!(
            !M::Contains("a").matches_str(None),
            "contains: missing input"
        );
        assert!(
            !M::ContainsAll(vec!["a"]).matches_str(None),
            "contains_all: missing input"
        );
        assert!(!M::AllBut("a").matches_str(None), "all_but: missing input");
    }
}