tracking_params/
lib.rs

1//! # tracking-params
2//!
//! Removes unwanted tracking parameters from a given URL.
4//!
5//! ```rust
6//! let dirty_url = url::Url::parse("https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5EdUmBgUY")?;
7//! let clean_url = tracking_params::clean(dirty_url); // returns `Cleaned` which derefs to `url::Url`
8//!
9//! assert_eq!(
10//!     clean_url.to_string(),
11//!     "https://twitter.com/elonmusk/status/1608273870901096454".to_string() // No `ref_src` tracking params
12//! );
13//!
14//! # Ok::<_, url::ParseError>(())
15//! ```
16use derivative::Derivative;
17use url::Url;
18
19mod rules;
20
/// A single cleaning rule: which URLs it applies to, which parameters it
/// removes, and an optional handler that may rewrite the URL first.
#[derive(Derivative)]
#[derivative(Debug)]
pub(crate) struct Rule {
    /// Matchers tested against the `host/path` string built in [`clean`].
    /// The rule applies when any one of them matches.
    host_path: Vec<M>,
    /// Matchers for the query string and fragment param names to remove.
    params: Vec<M>,
    /// Handler to run any specific code for this rule.
    ///
    /// When defined, the handler runs before the matching params
    /// (defined in the `params` field) are removed from the input url.
    /// The handler can change or return a completely different Url.
    /// Note: the handler is expected (although not validated, for perf reasons)
    /// to return a url that belongs to the same origin, and the result is treated as such because
    /// * any matching params will still be removed later even if it's a different origin.
    /// * any defined rule for that new origin won't be applied
    ///
    /// A common use for a handler function is to extract the destination url from a query string
    /// of the input url. Consider the following link when clicking on a google search result:
    ///
    /// `https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=invalid_url&q=https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer`
    ///
    /// We can extract the destination url from the `q` or `url` query string (whichever is present)
    /// and skip sending traffic to the `/url` endpoint. For such cases you can use
    /// the defined [`rules::extract_link_from_query_string`] function to extract a valid url
    /// from one or many query strings.
    ///
    // Excluded from the derived `Debug` output (boxed closures do not implement `Debug`).
    #[derivative(Debug = "ignore")]
    handler: Option<Box<dyn Fn(Url) -> Url + Sync + Send>>,
}
51
/// Matcher used by rules to test a `host/path` string or a parameter name.
///
/// Comparison is done on raw bytes, so it is case-sensitive.
// NOTE(review): `dead_code` is presumably allowed because the rule set does not
// use every variant at all times — confirm against the `rules` module.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum M {
    /// Matches everything, including a missing (`None`) input.
    Any,
    /// Matches every present input except the exact given string.
    AllBut(&'static str),
    /// Matches when the input contains every one of the given substrings.
    ContainsAll(Vec<&'static str>),
    /// Matches when the input is exactly the given string.
    Exact(&'static str),
    /// Matches when the input starts with the given prefix.
    StartsWith(&'static str),
    /// Matches when the input contains the given substring.
    Contains(&'static str),
}

impl M {
    /// String convenience wrapper around [`M::matches`].
    fn matches_str(&self, input: Option<&str>) -> bool {
        self.matches(input.map(str::as_bytes))
    }

    /// Tests `input` against this matcher.
    ///
    /// A missing input (`None`) is only matched by [`M::Any`].
    fn matches(&self, input: Option<&[u8]>) -> bool {
        match (self, input) {
            (M::Any, _) => true,
            // Every remaining variant needs an actual value to compare with.
            (_, None) => false,
            (M::Exact(expected), Some(input)) => input == expected.as_bytes(),
            (M::StartsWith(prefix), Some(input)) => input.starts_with(prefix.as_bytes()),
            (M::Contains(needle), Some(input)) => Self::contains(input, needle.as_bytes()),
            (M::ContainsAll(needles), Some(input)) => needles
                .iter()
                .all(|needle| Self::contains(input, needle.as_bytes())),
            (M::AllBut(excluded), Some(input)) => input != excluded.as_bytes(),
        }
    }

    /// Byte-wise substring search.
    ///
    /// An empty `needle` is contained in every haystack (same as `str::contains`);
    /// it is handled up front because `slice::windows(0)` panics.
    fn contains(haystack: &[u8], needle: &[u8]) -> bool {
        needle.is_empty() || haystack.windows(needle.len()).any(|window| window == needle)
    }
}
96
97/// A cleaned URL.
98///
99///
100/// This is a wrapper around and `Deref` into [`url::Url`] that also overriedes the `ToString`
101/// to prevent extra `=` at the end of the URL when the query string does
102/// not have any value.
103///
104/// eg.`https://example.com/?json` turns to `https://example.com/?json=` when
105/// `ToString` is called on the Url type.
106///
107
108#[derive(Debug, Clone)]
109pub struct Cleaned {
110    result: Url,
111    handlers_used: i32,
112}
113
114impl std::ops::Deref for Cleaned {
115    type Target = Url;
116
117    fn deref(&self) -> &Self::Target {
118        &self.result
119    }
120}
121
122impl Cleaned {
123    pub fn number_of_handlers_used(&self) -> i32 {
124        self.handlers_used
125    }
126}
127
128impl ToString for Cleaned {
129    fn to_string(&self) -> String {
130        self.result.as_ref().trim_end_matches('=').to_string()
131    }
132}
133
134/// Removes tracking parameters from a given [`Url`] type.
135///
136/// This owns the input and returns a [`Cleaned`] type.
137pub fn clean(url: Url) -> Cleaned {
138    let mut handlers_used = 0;
139    // Find applicable rules for this hostname
140    let host_path = format!(
141        "{}/{}",
142        url.host_str().unwrap_or_default().trim_end_matches('/'),
143        url.path()
144    );
145    let matched_rules = rules::GLOBAL_PARAMS
146        .iter()
147        .filter(|r| r.host_path.iter().any(|d| d.matches_str(Some(&host_path))))
148        .collect::<Vec<_>>();
149
150    // Run ths url through any rules that has a handler defined
151    let rules_with_handles = matched_rules.iter().filter(|r| r.handler.is_some());
152
153    let mut url = url;
154    for rule in rules_with_handles {
155        if let Some(handler) = &rule.handler {
156            url = handler(url);
157            handlers_used += 1;
158        }
159    }
160
161    Cleaned {
162        result: clean_hash_params(clean_query_string(url, &matched_rules), &matched_rules),
163        handlers_used,
164    }
165}
166
167/// Removes tracking parameters from a given string reference that is expected to be a valid URL.
168///
169/// This returns the cleaned URL as String.
170/// This returns error when the given input is not a valid URL.
171pub fn clean_str(url: &str) -> Result<String, url::ParseError> {
172    let url = Url::parse(url)?;
173    let url = clean(url);
174
175    Ok(url.to_string())
176}
177
178/// Same as [`clean_str`] but returns the [`Cleaned`] type
179pub fn clean_str_raw(url: &str) -> Result<Cleaned, url::ParseError> {
180    let url = Url::parse(url)?;
181    let cleaned = clean(url);
182
183    Ok(cleaned)
184}
185
186fn clean_query_string(url: Url, rules: &[&Rule]) -> Url {
187    let mut url = url;
188    if url.query().is_none() {
189        return url;
190    }
191
192    let queries = url
193        .query_pairs()
194        .into_iter()
195        .filter(|(k, _)| {
196            !rules
197                .iter()
198                .any(|r| r.params.iter().any(|p| p.matches_str(Some(k.as_ref()))))
199        })
200        .map(|(k, v)| (k.to_string(), v.to_string()))
201        .collect::<Vec<_>>();
202
203    url.set_query(None); // clear all queries
204    if queries.is_empty() {
205        return url; // prevents dangling `?` at the end (as a result of `query_pairs_mut` call below)
206    }
207
208    let mut params = url.query_pairs_mut();
209
210    for (k, v) in queries {
211        params.append_pair(k.as_ref(), v.as_ref());
212    }
213
214    params.finish().to_owned()
215}
216
217fn clean_hash_params(url: Url, rules: &[&Rule]) -> Url {
218    let mut url = url;
219
220    if let Some(f) = url.fragment() {
221        let mut fr = String::with_capacity(f.len());
222
223        for item in f.split('&') {
224            if let Some(key) = item.split('=').take(1).collect::<Vec<_>>().first() {
225                if !rules
226                    .iter()
227                    .any(|r| r.params.iter().any(|p| p.matches_str(Some(*key))))
228                {
229                    fr.push_str(item);
230                    fr.push('&');
231                }
232            }
233        }
234        if fr.ends_with('&') {
235            fr.remove(fr.len() - 1);
236        }
237
238        if fr.is_empty() {
239            url.set_fragment(None); // prevents dangling `#` at the end
240        } else {
241            url.set_fragment(Some(fr.as_str()));
242        }
243    }
244
245    url
246}
247
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    //
    // Query
    //
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: single good query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: good & bad query"
    )]
    //
    // Query without value
    //
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home&ref_src",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home"; "twitter: bad query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from",
        "https://twitter.com/elonmusk/status/1608273870901096454?from"; "twitter: single good query without value"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?from&ref_src=abc",
        "https://twitter.com/elonmusk/status/1608273870901096454?from"; "twitter: bad query with value good query without value"
    )]
    fn query(input: &str, expected: &str) {
        test_common(input, expected)
    }

    //
    // Hash Params
    //
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: single bad hash param"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home"; "twitter: single good hash param"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454#ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454#from=home"; "twitter: good & bad hash param"
    )]
    fn hash(input: &str, expected: &str) {
        test_common(input, expected)
    }

    //
    // Query and hash params together
    //
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw&from=home#ref_src=twsrc%5Etfw&from=home",
        "https://twitter.com/elonmusk/status/1608273870901096454?from=home#from=home"; "twitter: good & bad hash param and query"
    )]
    #[test_case(
        "https://twitter.com/elonmusk/status/1608273870901096454?ref_src=twsrc%5Etfw#ref_src=twsrc%5Etfw",
        "https://twitter.com/elonmusk/status/1608273870901096454"; "twitter: all bad hash param and query"
    )]
    fn both(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Misc
    #[test_case(
        "https://example.com/my-post?utm_xyx=abc&id=12456",
        "https://example.com/my-post?id=12456"; "misc: all utm_ query"
    )]
    #[test_case(
        "https://example.com/my-post?utm_xyx=abc&id=12456&utm_life=asssc",
        "https://example.com/my-post?id=12456"; "misc: all utm_ query (two)"
    )]
    #[test_case(
        "https://whatsmyreferer.com/?json",
        "https://whatsmyreferer.com/?json"; "misc: no trailing eq ="
    )]
    fn misc(input: &str, expected: &str) {
        test_common(input, expected)
    }

    // Site specific rules (handlers that extract the destination url)
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&url=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: parses the url query string"
    )]
    #[test_case(
        "https://www.google.com/url?q=http://www.capitalfm.com/news/tv-film/netflix/kaleidoscope-episode-order/&sa=D&source=calendar&usd=2&usg=AOvVaw0DUKL0RoiXBhCFMYU_U2jY",
        "http://www.capitalfm.com/news/tv-film/netflix/kaleidoscope-episode-order/"; "google result: no url query string"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has q query string"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=invalid_url&q=https%3A%2F%2Fdeveloper.mozilla.org%2Fen-US%2Fdocs%2FWeb%2FHTTP%2FHeaders%2FReferer&usg=AOvVaw0W8-mEp9kfFnE9c5S1DUp0",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has two q query strings"
    )]
    #[test_case(
        "https://www.google.com/url?sa=t&rct=j&esrc=s&source=web&cd=&ved=2ahUKEwi8hMv_nKP8AhWXhFwKHSetARUQFnoECBgQAQ&q=invalid_url&q=https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer",
        "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referer"; "google result: has two q query strings + unencoded value"
    )]
    #[test_case(
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=https%3A%2F%2Fwww.britishairways.com",
        "https://www.britishairways.com/"; "youtube /redirect: parses q"
    )]
    #[test_case(
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=invalid_url",
        "https://www.youtube.com/redirect?event=channel_description&redir_token=JWT_TOKEN&q=invalid_url"; "youtube /redirect: ingnores invalid q"
    )]
    #[test_case(
        "https://www.amazon.co.uk/gp/r.html?C=HEX&K=SOMEHEX&M=urn:rtn:msg:NUMBERS&R=SOMETHING&T=C&U=https%3A%2F%2Fwww.amazon.co.uk%2Fgp%2Fyour-account%2Forder-details%3ForderID%3DOREDER_ID%26ref_%3Dpreference&H=TEXT&ref_=pe_ref_with_underscore",
        "https://www.amazon.co.uk/gp/your-account/order-details?orderID=OREDER_ID&ref_=preference"; "amazon: extract from U"
    )]
    #[test_case(
        "https://email.clearscore.com/uni/track?uid=UUID&txnid=UUID&bsft_aaid=UUID&eid=UUID&mid=UUID&bsft_ek=RANDOM&bsft_mime_type=html&bsft_tv=27&bsft_lx=9&a=click&redir=https%3A%2F%2Fapp.clearscore.com%2Freport%3Futm_campaign%3Deml_lc_ca_alerts_2021_02_09%26utm_source%3Dblueshift%26utm_medium%3Demail%26utm_content%3Deml_lc_alerts_new_template_2022_04_01",
        "https://app.clearscore.com/report"; "generic email tracker: with track in path"
    )]
    fn site_specific(input: &str, expected: &str) {
        test_common(input, expected)
    }

    /// Cleans `input` and compares the string form of the result with `expected`.
    fn test_common(input: &str, expected: &str) {
        let parsed = Url::parse(input).expect("test input must be a valid URL");
        let result = clean(parsed).to_string();

        assert_eq!(
            result, expected,
            "\nExpected: `{}`\n   Found: `{}`",
            expected, result
        );
    }

    /// Exercises every matcher variant, with matching, non-matching and missing input.
    #[test]
    fn matcher() {
        assert!(M::Any.matches_str(Some("yoyo")), "any");
        assert!(M::Any.matches_str(None), "any: missing input");

        assert!(
            M::Contains("utm_").matches_str(Some("abc_utm_")),
            "contains"
        );
        assert!(
            !M::Contains("utm_").matches_str(Some("abc")),
            "contains: no match"
        );

        assert!(M::Exact("utm_").matches_str(Some("utm_")), "exact");
        assert!(
            !M::Exact("utm_").matches_str(Some("utm_abc")),
            "exact: longer input"
        );
        assert!(!M::Exact("utm_").matches_str(None), "exact: missing input");

        assert!(
            M::StartsWith("utm_").matches_str(Some("utm_abc")),
            "starts_with"
        );
        assert!(
            !M::StartsWith("utm_").matches_str(Some("abc_utm_")),
            "starts_with: not a prefix"
        );

        assert!(
            M::ContainsAll(vec!["google.", "/url"]).matches_str(Some("www.google.com//url")),
            "contains_all"
        );
        assert!(
            !M::ContainsAll(vec!["google.", "/url"]).matches_str(Some("www.google.com//search")),
            "contains_all: one part missing"
        );

        assert!(M::AllBut("q").matches_str(Some("sa")), "all_but");
        assert!(
            !M::AllBut("q").matches_str(Some("q")),
            "all_but: excluded value"
        );
        assert!(
            !M::AllBut("q").matches_str(None),
            "all_but: missing input"
        );
    }
}