1#![forbid(unsafe_code)]
3#![warn(future_incompatible)]
4#![warn(keyword_idents)]
5#![warn(let_underscore)]
6#![warn(missing_copy_implementations)]
7#![warn(missing_debug_implementations)]
8#![warn(missing_docs)]
9#![warn(non_ascii_idents)]
10#![warn(nonstandard_style)]
11#![warn(noop_method_call)]
12#![warn(refining_impl_trait)]
13#![warn(rust_2018_compatibility)]
14#![warn(rust_2018_idioms)]
15#![warn(rust_2021_compatibility)]
16#![warn(rust_2024_compatibility)]
17#![warn(unused)]
18#![warn(unused_extern_crates)]
19#![warn(unused_import_braces)]
20#![warn(clippy::cargo)]
22#![warn(clippy::complexity)]
23#![warn(clippy::correctness)]
24#![warn(clippy::nursery)]
25#![warn(clippy::pedantic)]
26#![warn(clippy::perf)]
27#![warn(clippy::style)]
28#![warn(clippy::suspicious)]
29#![allow(clippy::redundant_pub_crate)] #![warn(clippy::cognitive_complexity)]
32#![warn(clippy::dbg_macro)]
33#![warn(clippy::debug_assert_with_mut_call)]
34#![warn(clippy::empty_line_after_outer_attr)]
35#![warn(clippy::empty_structs_with_brackets)]
36#![warn(clippy::float_cmp_const)]
37#![warn(clippy::float_equality_without_abs)]
38#![warn(clippy::missing_const_for_fn)]
39#![warn(clippy::option_if_let_else)]
40#![warn(clippy::print_stderr)]
41#![warn(clippy::print_stdout)]
42#![warn(clippy::suspicious_operation_groupings)]
43#![warn(clippy::unseparated_literal_suffix)]
44#![warn(clippy::use_debug)]
45#![warn(clippy::useless_let_if_seq)]
46#![warn(clippy::wildcard_dependencies)]
47#![warn(rustdoc::broken_intra_doc_links)]
49#![warn(rustdoc::missing_crate_level_docs)]
50
51#![no_std]
52#![cfg_attr(docsrs, feature(doc_cfg, doc_auto_cfg))]
53
// Only compiled when doctests are being built: the README is attached as the
// documentation of an otherwise empty item, so the code examples inside the
// README are picked up and run as doctests.
#[cfg(doctest)]
#[doc = include_str!("../README.md")]
extern "C" {}
76
77extern crate alloc;
78#[cfg(feature = "std")]
79extern crate std;
80
81use alloc::borrow::Cow;
82use core::fmt::{Display, Formatter};
83use core::str::{FromStr, Utf8Error};
84use regex::Regex;
85use url::{ParseError, Url};
86
87use rules::Rules;
88
89mod deserialize_utils;
90mod rules;
91#[cfg(test)]
92#[allow(clippy::mod_module_files)]
93mod tests;
94
/// Cleans URLs by running them through the providers of a loaded rule set.
///
/// Instances are created with one of the `from_*` constructors and can be
/// configured further with the `strip_referral_marketing` builder method.
#[derive(Debug)]
pub struct UrlCleaner {
    /// Rule set whose providers are tried, in order, against every URL.
    rules: Rules,
    /// Flag passed on to each matching provider when it removes fields from a URL.
    strip_referral_marketing: bool,
}
104
105impl UrlCleaner {
106 #[cfg(feature = "std")]
110 pub fn from_rules_path(path: &std::path::Path) -> Result<Self, Error> {
111 Self::from_rules_file(std::fs::File::open(path)?)
112 }
113
114 #[cfg(feature = "std")]
118 pub fn from_rules_file<R: std::io::Read>(reader: R) -> Result<Self, Error> {
119 let buf = std::io::BufReader::new(reader);
120 Ok(Self {
121 rules: serde_json::from_reader(buf)?,
122 strip_referral_marketing: false,
123 })
124 }
125
126 pub fn from_rules_str(rules: &str) -> Result<Self, Error> {
129 Ok(Self {
130 rules: serde_json::from_str(rules)?,
131 strip_referral_marketing: false,
132 })
133 }
134
135 pub fn from_embedded_rules() -> Result<Self, Error> {
141 Self::from_rules_str(include_str!("../data.minify.json"))
142 }
143
144 #[must_use]
149 #[allow(clippy::missing_const_for_fn)]
150 pub fn strip_referral_marketing(mut self, value: bool) -> Self {
151 self.strip_referral_marketing = value;
152 self
153 }
154
155 pub fn clear_single_url_str<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
170 if url.starts_with("data:") {
171 return Ok(Cow::Borrowed(url));
172 }
173 let mut result = Url::from_str(url)?;
174 for p in &self.rules.providers {
175 if p.match_url(result.as_str()) {
176 result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
177 }
178 }
179
180 Ok(Cow::Owned(result.into()))
181 }
182
183 pub fn clear_single_url<'a>(&self, url: &'a Url) -> Result<Cow<'a, Url>, Error> {
196 if url.scheme().starts_with("data") {
197 return Ok(Cow::Borrowed(url));
198 }
199 let mut url = Cow::Borrowed(url);
200 for p in &self.rules.providers {
201 if p.match_url(url.as_str()) {
202 url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?);
203 }
204 }
205
206 Ok(url)
207 }
208
209 #[cfg(feature = "linkify")]
223 pub fn clear_text<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
224 self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new())
225 }
226
227
228 #[cfg(feature = "linkify")]
242 pub fn clear_text_with_linkfinder<'a>(
243 &self,
244 s: &'a str,
245 finder: &linkify::LinkFinder,
246 ) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
247 use alloc::vec::Vec;
248 use alloc::string::String;
249
250 let mut spans = Vec::new();
251 let mut errors = Vec::new();
252
253 for res in finder.spans(s) {
254 match res.kind() {
255 Some(linkify::LinkKind::Url) => match self.clear_single_url_str(res.as_str()) {
256 Ok(cow) => spans.push(cow),
257 Err(e) => errors.push(e),
258 },
259 _ => spans.push(Cow::Borrowed(res.as_str())),
260 }
261 }
262
263 if errors.is_empty() {
264 if spans.iter().all(|s| matches!(s, Cow::Borrowed(_))) {
265 Ok(Cow::Borrowed(s))
266 } else {
267 Ok(Cow::Owned(spans.into_iter().collect::<String>()))
268 }
269 } else {
270 Err(errors)
271 }
272 }
273
274 #[cfg(feature = "markdown-it")]
287 pub fn clear_markdown(&self, doc: &mut markdown_it::Node) -> Result<(), alloc::vec::Vec<Error>> {
288 use markdown_it::parser::inline::Text;
289 use markdown_it::plugins::cmark::inline::autolink::Autolink;
290 use markdown_it::plugins::cmark::inline::image::Image;
291 use markdown_it::plugins::cmark::inline::link::Link;
292 use markdown_it::plugins::extra::linkify::Linkified;
293 use markdown_it::Node;
294 use alloc::string::String;
295
296 fn replace_url(cleaner: &UrlCleaner, url: &mut String) -> Result<(), Error> {
297 match cleaner.clear_single_url_str(url)? {
298 Cow::Borrowed(_) => {}
299 Cow::Owned(new_url) => {
300 *url = new_url;
301 }
302 }
303 Ok(())
304 }
305
306 fn callback(cleaner: &UrlCleaner, node: &mut Node) -> Result<(), Error> {
307 if let Some(link) = node.cast_mut::<Autolink>() {
308 replace_url(cleaner, &mut link.url)?;
309 node.children = alloc::vec![Node::new(Text {
310 content: link.url.clone()
311 })];
312 }
313 if let Some(link) = node.cast_mut::<Linkified>() {
314 replace_url(cleaner, &mut link.url)?;
315 node.children = alloc::vec![Node::new(Text {
316 content: link.url.clone()
317 })];
318 }
319 if let Some(link) = node.cast_mut::<Link>() {
320 replace_url(cleaner, &mut link.url)?;
321 }
322 if let Some(link) = node.cast_mut::<Image>() {
323 replace_url(cleaner, &mut link.url)?;
324 }
325 Ok(())
326 }
327
328 let mut result = alloc::vec![];
329 doc.walk_mut(|node, _| {
330 if let Err(e) = callback(self, node) {
331 result.push(e);
332 };
333 });
334
335 if result.is_empty() {
336 Ok(())
337 } else {
338 Err(result)
339 }
340 }
341}
342
/// Errors that can occur while loading rules or cleaning URLs.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// The rules could not be read; wraps the underlying I/O error.
    #[cfg(feature = "std")]
    FileRead(std::io::Error),
    /// The rules could not be parsed; wraps the deserializer's error.
    RuleSyntax(serde_json::Error),
    /// A URL could not be parsed.
    UrlSyntax(ParseError),
    /// A redirection regex has no capture group; carries the offending regex.
    RedirectionHasNoCapturingGroup(Regex),
    /// Percent decoding produced bytes that are not valid UTF-8.
    PercentDecodeUtf8Error(Utf8Error),
}
359
360impl Display for Error {
361 fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
362 match self {
363 #[cfg(feature = "std")]
364 Self::FileRead(x) => write!(f, "error reading rules: {x}"),
365 Self::RuleSyntax(x) => write!(f, "error parsing rules: {x}"),
366 Self::UrlSyntax(x) => write!(f, "error parsing url: {x}"),
367 Self::RedirectionHasNoCapturingGroup(x) => {
368 write!(f, "redirection regex {x} has no capture group")
369 }
370 Self::PercentDecodeUtf8Error(x) => {
371 write!(f, "percent decoding resulted in non-UTF-8 bytes: {x}")
372 }
373 }
374 }
375}
376
#[cfg(feature = "std")]
impl From<std::io::Error> for Error {
    /// Wraps an I/O error as [`Error::FileRead`], so `?` can be used on I/O results.
    fn from(source: std::io::Error) -> Self {
        Self::FileRead(source)
    }
}
383
384impl From<serde_json::Error> for Error {
385 fn from(value: serde_json::Error) -> Self {
386 Self::RuleSyntax(value)
387 }
388}
389
390impl From<ParseError> for Error {
391 fn from(value: ParseError) -> Self {
392 Self::UrlSyntax(value)
393 }
394}
395
396impl From<Utf8Error> for Error {
397 fn from(value: Utf8Error) -> Self {
398 Self::PercentDecodeUtf8Error(value)
399 }
400}
401
#[cfg(feature = "std")]
impl std::error::Error for Error {
    /// Returns the wrapped lower-level error, if the variant carries one.
    ///
    /// `RedirectionHasNoCapturingGroup` holds a regex rather than an error,
    /// so it has no source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        let cause: &(dyn std::error::Error + 'static) = match self {
            Self::RedirectionHasNoCapturingGroup(_) => return None,
            Self::FileRead(inner) => inner,
            Self::RuleSyntax(inner) => inner,
            Self::UrlSyntax(inner) => inner,
            Self::PercentDecodeUtf8Error(inner) => inner,
        };
        Some(cause)
    }
}