clearurls/
lib.rs

1// Rustc lints
2#![forbid(unsafe_code)]
3#![warn(future_incompatible)]
4#![warn(keyword_idents)]
5#![warn(let_underscore)]
6#![warn(missing_copy_implementations)]
7#![warn(missing_debug_implementations)]
8#![warn(missing_docs)]
9#![warn(non_ascii_idents)]
10#![warn(nonstandard_style)]
11#![warn(noop_method_call)]
12#![warn(refining_impl_trait)]
13#![warn(rust_2018_compatibility)]
14#![warn(rust_2018_idioms)]
15#![warn(rust_2021_compatibility)]
16#![warn(rust_2024_compatibility)]
17#![warn(unused)]
18#![warn(unused_extern_crates)]
19#![warn(unused_import_braces)]
20// Clippy categories
21#![warn(clippy::cargo)]
22#![warn(clippy::complexity)]
23#![warn(clippy::correctness)]
24#![warn(clippy::nursery)]
25#![warn(clippy::pedantic)]
26#![warn(clippy::perf)]
27#![warn(clippy::style)]
28#![warn(clippy::suspicious)]
29// selected clippy lints from nursery and restriction
30#![allow(clippy::redundant_pub_crate)] // I like it my way
31#![warn(clippy::cognitive_complexity)]
32#![warn(clippy::dbg_macro)]
33#![warn(clippy::debug_assert_with_mut_call)]
34#![warn(clippy::empty_line_after_outer_attr)]
35#![warn(clippy::empty_structs_with_brackets)]
36#![warn(clippy::float_cmp_const)]
37#![warn(clippy::float_equality_without_abs)]
38#![warn(clippy::missing_const_for_fn)]
39#![warn(clippy::option_if_let_else)]
40#![warn(clippy::print_stderr)]
41#![warn(clippy::print_stdout)]
42#![warn(clippy::suspicious_operation_groupings)]
43#![warn(clippy::unseparated_literal_suffix)]
44#![warn(clippy::use_debug)]
45#![warn(clippy::useless_let_if_seq)]
46#![warn(clippy::wildcard_dependencies)]
47// Rustdoc lints
48#![warn(rustdoc::broken_intra_doc_links)]
49#![warn(rustdoc::missing_crate_level_docs)]
50
51#![no_std]
52#![cfg_attr(docsrs, feature(doc_cfg, doc_auto_cfg))]
53
54//! This crate provides a solution to remove tracking parameters and other nuisance from URLs.
55//!
//! In order to detect such parameters, this crate uses crowdsourced *Rules* from the
57//! [ClearURLs browser extension](https://clearurls.xyz/).
58//!
59//! A set of rules is included in this library, but you can supply your own. Refer to the
60//! [ClearURLs documentation](https://docs.clearurls.xyz/1.26.1/specs/rules/) for specific syntax and semantics.
61//!
62//! # Example
63//! ```
64//! # use clearurls::UrlCleaner;
65//! # fn main() -> Result<(), clearurls::Error> {
66//! let cleaner = UrlCleaner::from_embedded_rules()?;
67//! let res = cleaner.clear_single_url_str("https://example.com/test?utm_source=abc")?;
68//! assert_eq!(res, "https://example.com/test");
69//! # Ok(())
70//! # }
71//! ```
72
// Compile the code samples in README.md as doctests. The empty extern block
// exists only to carry the `#[doc]` attribute; it produces no code and is
// built solely under `cfg(doctest)`.
#[cfg(doctest)]
#[doc = include_str!("../README.md")]
extern "C" {}
76
77extern crate alloc;
78#[cfg(feature = "std")]
79extern crate std;
80
81use alloc::borrow::Cow;
82use core::fmt::{Display, Formatter};
83use core::str::{FromStr, Utf8Error};
84use regex::Regex;
85use url::{ParseError, Url};
86
87use rules::Rules;
88
89mod deserialize_utils;
90mod rules;
91#[cfg(test)]
92#[allow(clippy::mod_module_files)]
93mod tests;
94
/// A [`UrlCleaner`] can remove tracking parameters from URLs.
///
/// This struct is relatively expensive to construct because it needs to parse the rules from JSON.
/// It's recommended to create one per application and reuse it.
#[derive(Debug)]
pub struct UrlCleaner {
    // Parsed provider rules; each provider is matched against a URL and, on a
    // match, removes its configured fields.
    rules: Rules,
    // Whether referral-marketing parameters are stripped in addition to plain
    // tracking parameters. Defaults to `false`; see `strip_referral_marketing`.
    strip_referral_marketing: bool,
}
104
105impl UrlCleaner {
106    /// Construct a [`UrlCleaner`] with rules from a path, which will be opened and read.
107    /// # Errors
108    /// See [`Error`]
109    #[cfg(feature = "std")]
110    pub fn from_rules_path(path: &std::path::Path) -> Result<Self, Error> {
111        Self::from_rules_file(std::fs::File::open(path)?)
112    }
113
114    /// Construct a [`UrlCleaner`] with rules from a [reader][std::io::Read], most often a [`File`][std::fs::File]
115    /// # Errors
116    /// See [`Error`]
117    #[cfg(feature = "std")]
118    pub fn from_rules_file<R: std::io::Read>(reader: R) -> Result<Self, Error> {
119        let buf = std::io::BufReader::new(reader);
120        Ok(Self {
121            rules: serde_json::from_reader(buf)?,
122            strip_referral_marketing: false,
123        })
124    }
125
126    /// # Errors
127    /// See [`Error`]
128    pub fn from_rules_str(rules: &str) -> Result<Self, Error> {
129        Ok(Self {
130            rules: serde_json::from_str(rules)?,
131            strip_referral_marketing: false,
132        })
133    }
134
135    /// Construct using the JSON embedded in this library.
136    /// This may be outdated, but should provide a good baseline.
137    ///
138    /// # Errors
139    /// See [`Error`]
140    pub fn from_embedded_rules() -> Result<Self, Error> {
141        Self::from_rules_str(include_str!("../data.minify.json"))
142    }
143
144    /// Configure whether you want to strip referral codes and similar parameters.
145    ///
146    /// While they can be considered to be tracking, they are useful on occasion.
147    /// The default is `false`, meaning these are kept.
148    #[must_use]
149    #[allow(clippy::missing_const_for_fn)]
150    pub fn strip_referral_marketing(mut self, value: bool) -> Self {
151        self.strip_referral_marketing = value;
152        self
153    }
154
155    /// Clean a single URL.
156    ///
157    /// The argument is a string that is *just* a URL, with no text around.
158    ///
159    /// The Cleaning may involve
160    /// - 1. removing tracking parameters
161    ///      and/or,
162    /// - 2. detecting redirections with the target url in a query parameters
163    ///
164    /// # Returns
165    /// a cleaned URL
166    ///
167    /// # Errors
168    /// If an error occurred. See the [`Error`] enum for possible reasons.
169    pub fn clear_single_url_str<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
170        if url.starts_with("data:") {
171            return Ok(Cow::Borrowed(url));
172        }
173        let mut result = Url::from_str(url)?;
174        for p in &self.rules.providers {
175            if p.match_url(result.as_str()) {
176                result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
177            }
178        }
179
180        Ok(Cow::Owned(result.into()))
181    }
182
183    /// Clean a single URL.
184    ///
185    /// The Cleaning may involve
186    /// - 1. removing tracking parameters
187    ///      and/or,
188    /// - 2. detecting redirections with the target url in a query parameters
189    ///
190    /// # Returns
191    /// a cleaned URL
192    ///
193    /// # Errors
194    /// If an error occurred. See the [`Error`] enum for possible reasons.
195    pub fn clear_single_url<'a>(&self, url: &'a Url) -> Result<Cow<'a, Url>, Error> {
196        if url.scheme().starts_with("data") {
197            return Ok(Cow::Borrowed(url));
198        }
199        let mut url = Cow::Borrowed(url);
200        for p in &self.rules.providers {
201            if p.match_url(url.as_str()) {
202                url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?);
203            }
204        }
205
206        Ok(url)
207    }
208
209    /// Clean all URLs in a text.
210    ///
211    /// This may involve
212    /// - 1. removing tracking parameters
213    ///      and/or,
214    /// - 2. detecting redirections with the target url in a query parameters
215    ///
216    /// # Returns
217    /// The string with all URLs inside cleaned.
218    /// Text outside of URLs is left unchanged.
219    ///
220    /// # Errors
221    /// Alls errors encountered are returned in a [`Vec`][alloc::vec::Vec].
222    #[cfg(feature = "linkify")]
223    pub fn clear_text<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
224        self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new())
225    }
226
227
228    /// Clean all URLs in a text.
229    ///
230    /// This may involve
231    /// - 1. removing tracking parameters
232    ///      and/or,
233    /// - 2. detecting redirections with the target url in a query parameters
234    ///
235    /// # Returns
236    /// The string with all URLs inside cleaned.
237    /// Text outside of URLs is left unchanged.
238    ///
239    /// # Errors
240    /// Alls errors encountered are returned in a [`Vec`][alloc::vec::Vec].
241    #[cfg(feature = "linkify")]
242    pub fn clear_text_with_linkfinder<'a>(
243        &self,
244        s: &'a str,
245        finder: &linkify::LinkFinder,
246    ) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
247        use alloc::vec::Vec;
248        use alloc::string::String;
249
250        let mut spans = Vec::new();
251        let mut errors = Vec::new();
252
253        for res in finder.spans(s) {
254            match res.kind() {
255                Some(linkify::LinkKind::Url) => match self.clear_single_url_str(res.as_str()) {
256                    Ok(cow) => spans.push(cow),
257                    Err(e) => errors.push(e),
258                },
259                _ => spans.push(Cow::Borrowed(res.as_str())),
260            }
261        }
262
263        if errors.is_empty() {
264            if spans.iter().all(|s| matches!(s, Cow::Borrowed(_))) {
265                Ok(Cow::Borrowed(s))
266            } else {
267                Ok(Cow::Owned(spans.into_iter().collect::<String>()))
268            }
269        } else {
270            Err(errors)
271        }
272    }
273
274    /// Clean all URLs in a Markdown document. This affects all kinds of URLs, like
275    /// - proper Markdown Links
276    /// - auto links (links inside angle brackets)
277    /// - links to images
278    /// - bare links with no extra markup.
279    ///
280    /// The document will be modified in-place.
281    ///
282    /// # Errors
283    /// The algorithm continues with the rest of the document if an error occurs.
284    /// The return value is `Ok(())` if there were no errors.
285    /// Otherwise, the list of errors is returned as the `Err` value.
286    #[cfg(feature = "markdown-it")]
287    pub fn clear_markdown(&self, doc: &mut markdown_it::Node) -> Result<(), alloc::vec::Vec<Error>> {
288        use markdown_it::parser::inline::Text;
289        use markdown_it::plugins::cmark::inline::autolink::Autolink;
290        use markdown_it::plugins::cmark::inline::image::Image;
291        use markdown_it::plugins::cmark::inline::link::Link;
292        use markdown_it::plugins::extra::linkify::Linkified;
293        use markdown_it::Node;
294        use alloc::string::String;
295
296        fn replace_url(cleaner: &UrlCleaner, url: &mut String) -> Result<(), Error> {
297            match cleaner.clear_single_url_str(url)? {
298                Cow::Borrowed(_) => {}
299                Cow::Owned(new_url) => {
300                    *url = new_url;
301                }
302            }
303            Ok(())
304        }
305
306        fn callback(cleaner: &UrlCleaner, node: &mut Node) -> Result<(), Error> {
307            if let Some(link) = node.cast_mut::<Autolink>() {
308                replace_url(cleaner, &mut link.url)?;
309                node.children = alloc::vec![Node::new(Text {
310                    content: link.url.clone()
311                })];
312            }
313            if let Some(link) = node.cast_mut::<Linkified>() {
314                replace_url(cleaner, &mut link.url)?;
315                node.children = alloc::vec![Node::new(Text {
316                    content: link.url.clone()
317                })];
318            }
319            if let Some(link) = node.cast_mut::<Link>() {
320                replace_url(cleaner, &mut link.url)?;
321            }
322            if let Some(link) = node.cast_mut::<Image>() {
323                replace_url(cleaner, &mut link.url)?;
324            }
325            Ok(())
326        }
327
328        let mut result = alloc::vec![];
329        doc.walk_mut(|node, _| {
330            if let Err(e) = callback(self, node) {
331                result.push(e);
332            };
333        });
334
335        if result.is_empty() {
336            Ok(())
337        } else {
338            Err(result)
339        }
340    }
341}
342
/// Various errors that can happen while cleaning a URL
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// An Error occurred while opening or reading a file
    #[cfg(feature = "std")]
    FileRead(std::io::Error),
    /// The provided rules are invalid JSON or don't have the expected format
    RuleSyntax(serde_json::Error),
    /// A URL could not be parsed from the input.
    UrlSyntax(ParseError),
    /// The rules contained a redirection regex that doesn't specify the target
    /// (i.e. the regex has no capturing group for the redirect destination)
    RedirectionHasNoCapturingGroup(Regex),
    /// Percent-decoding produced bytes that are invalid UTF-8
    PercentDecodeUtf8Error(Utf8Error),
}
359
impl Display for Error {
    /// Writes a human-readable message for each error kind; wrapped errors are
    /// rendered through their own `Display` implementations.
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        match self {
            #[cfg(feature = "std")]
            Self::FileRead(x) => write!(f, "error reading rules: {x}"),
            Self::RuleSyntax(x) => write!(f, "error parsing rules: {x}"),
            Self::UrlSyntax(x) => write!(f, "error parsing url: {x}"),
            Self::RedirectionHasNoCapturingGroup(x) => {
                write!(f, "redirection regex {x} has no capture group")
            }
            Self::PercentDecodeUtf8Error(x) => {
                write!(f, "percent decoding resulted in non-UTF-8 bytes: {x}")
            }
        }
    }
}
376
// The `From` impls below wrap each underlying error into the matching
// [`Error`] variant, enabling `?` throughout this crate.

#[cfg(feature = "std")]
impl From<std::io::Error> for Error {
    fn from(value: std::io::Error) -> Self {
        Self::FileRead(value)
    }
}

impl From<serde_json::Error> for Error {
    fn from(value: serde_json::Error) -> Self {
        Self::RuleSyntax(value)
    }
}

impl From<ParseError> for Error {
    fn from(value: ParseError) -> Self {
        Self::UrlSyntax(value)
    }
}

impl From<Utf8Error> for Error {
    fn from(value: Utf8Error) -> Self {
        Self::PercentDecodeUtf8Error(value)
    }
}
401
// Only available with `std`: the `Error` trait lives in `std` (this crate is
// otherwise `no_std`).
#[cfg(feature = "std")]
impl std::error::Error for Error {
    /// Returns the wrapped underlying error, if any, for error-chain reporting.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::FileRead(e) => Some(e),
            Self::RuleSyntax(e) => Some(e),
            Self::UrlSyntax(e) => Some(e),
            // `Regex` is not itself an error type, so there is no source here.
            Self::RedirectionHasNoCapturingGroup(_) => None,
            Self::PercentDecodeUtf8Error(e) => Some(e),
        }
    }
}