1mod regex_filter;
2
3use regex::RegexSet;
4use std::collections::HashSet;
5use std::sync::LazyLock;
6
/// Regex-based include rules, stored in [`Filter::includes`].
///
/// Alias of [`regex_filter::RegexFilter`]; `Filter::is_excluded` keeps any URI
/// whose string form matches one of these rules.
pub type Includes = regex_filter::RegexFilter;
10
/// Regex-based exclude rules, stored in [`Filter::excludes`].
///
/// Alias of [`regex_filter::RegexFilter`]; `Filter::is_excluded` skips a URI
/// whose string form matches one of these rules (unless an include rule
/// matched first).
pub type Excludes = regex_filter::RegexFilter;
14
/// Regex-based exclude rules for paths.
///
/// Alias of [`regex_filter::RegexFilter`]. Not used anywhere in this file;
/// NOTE(review): presumably consumed by input collection to skip paths —
/// confirm against the callers.
pub type PathExcludes = regex_filter::RegexFilter;
17
18use crate::Uri;
19
/// Domains that [`is_example_domain`] treats as example domains.
///
/// Only compiled for regular builds without the `check_example_domains`
/// feature; otherwise the empty variant below is used instead.
#[cfg(all(not(test), not(feature = "check_example_domains")))]
static EXAMPLE_DOMAINS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    ["example.com", "example.org", "example.net", "example.edu"]
        .into_iter()
        .collect()
});

/// Top-level domain suffixes (with leading dot) that [`is_example_domain`]
/// treats as example domains.
///
/// Only compiled for regular builds without the `check_example_domains`
/// feature; otherwise the empty variant below is used instead.
#[cfg(all(not(test), not(feature = "check_example_domains")))]
static EXAMPLE_TLDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [".test", ".example", ".invalid", ".localhost"]
        .into_iter()
        .collect()
});

/// Empty stand-in used in tests and when `check_example_domains` is enabled,
/// so that no domain is ever classified as an example domain.
#[cfg(any(test, feature = "check_example_domains"))]
static EXAMPLE_DOMAINS: LazyLock<HashSet<&'static str>> = LazyLock::new(HashSet::default);

/// Empty stand-in used in tests and when `check_example_domains` is enabled,
/// so that no TLD is ever classified as an example TLD.
#[cfg(any(test, feature = "check_example_domains"))]
static EXAMPLE_TLDS: LazyLock<HashSet<&'static str>> = LazyLock::new(HashSet::default);
40
/// Domains that [`is_unsupported_domain`] reports as unsupported; URIs on
/// these domains are excluded by `Filter::is_excluded`.
static UNSUPPORTED_DOMAINS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| HashSet::from(["twitter.com"]));
48
/// Regex patterns for URLs that [`is_false_positive`] reports as false
/// positives.
///
/// Every pattern is anchored at the start of the input and accepts both
/// `http` and `https`. The list is compiled once into `FALSE_POSITIVE_SET`.
const FALSE_POSITIVE_PAT: &[&str] = &[
    // Schema / namespace style URLs.
    r"^https?://schemas\.openxmlformats\.org",
    r"^https?://schemas\.microsoft\.com",
    r"^https?://schemas\.zune\.net",
    r"^https?://www\.w3\.org/1999/xhtml",
    r"^https?://www\.w3\.org/1999/xlink",
    r"^https?://www\.w3\.org/2000/svg",
    r"^https?://www\.w3\.org/2001/XMLSchema-instance",
    r"^https?://ogp\.me/ns#",
    // Any URL that ends in `/xmlrpc.php` (anchored at the end as well).
    r"^https?://(.*)/xmlrpc\.php$",
];
61
62static FALSE_POSITIVE_SET: LazyLock<RegexSet> =
63 LazyLock::new(|| regex::RegexSet::new(FALSE_POSITIVE_PAT).expect("Failed to create RegexSet"));
64
/// Returns `true` if `input` matches at least one of the known
/// false-positive patterns compiled into `FALSE_POSITIVE_SET`.
///
/// `input` is matched as a plain string; the caller in this file passes the
/// full URI string (`uri.as_str()`).
#[inline]
#[must_use]
pub fn is_false_positive(input: &str) -> bool {
    FALSE_POSITIVE_SET.is_match(input)
}
73
74#[inline]
77#[must_use]
78pub fn is_example_domain(uri: &Uri) -> bool {
79 match uri.domain() {
80 Some(domain) => {
81 EXAMPLE_DOMAINS.iter().any(|&example| {
83 domain == example
84 || domain
85 .split_once('.')
86 .is_some_and(|(_subdomain, tld_part)| tld_part == example)
87 }) || EXAMPLE_TLDS
88 .iter()
89 .any(|&example_tld| domain.ends_with(example_tld))
90 }
91 None => {
92 if uri.is_mail() {
96 EXAMPLE_DOMAINS.iter().any(|tld| uri.path().ends_with(tld))
97 } else {
98 false
99 }
100 }
101 }
102}
103
104#[inline]
106#[must_use]
107pub fn is_unsupported_domain(uri: &Uri) -> bool {
108 if let Some(domain) = uri.domain() {
109 UNSUPPORTED_DOMAINS.iter().any(|tld| domain.ends_with(tld))
113 } else {
114 false
115 }
116}
117
/// URI filter configuration; [`Filter::is_excluded`] uses it to decide
/// whether a [`Uri`] is skipped.
///
/// The derived `Default` yields a filter with no include/exclude rules, no
/// scheme restriction, all IP exclusion flags off and mail links excluded
/// (`include_mail == false`).
// The independent on/off switches below trip `struct_excessive_bools`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug, Default)]
pub struct Filter {
    /// Rules for URIs that are kept explicitly. In `is_excluded` a match here
    /// wins over `excludes` and over the false-positive list, but not over the
    /// scheme/host/IP/mail/tel/example/unsupported-domain checks.
    pub includes: Option<Includes>,
    /// Rules for URIs that are skipped explicitly.
    pub excludes: Option<Excludes>,
    /// If non-empty, only URIs whose scheme is contained in this set pass
    /// `is_scheme_excluded`; an empty set disables the scheme check.
    pub schemes: HashSet<String>,
    /// Skip URIs for which `Uri::is_private` is true.
    pub exclude_private_ips: bool,
    /// Skip URIs for which `Uri::is_link_local` is true.
    pub exclude_link_local_ips: bool,
    /// Skip URIs for which `Uri::is_loopback` is true, as well as URIs whose
    /// domain is exactly `localhost`.
    pub exclude_loopback_ips: bool,
    /// Keep mail URIs; when `false`, every mail URI is excluded.
    pub include_mail: bool,
}
141
142impl Filter {
143 #[inline]
144 #[must_use]
145 pub fn is_mail_excluded(&self, uri: &Uri) -> bool {
147 uri.is_mail() && !self.include_mail
148 }
149
150 #[must_use]
151 pub fn is_ip_excluded(&self, uri: &Uri) -> bool {
153 if (self.exclude_loopback_ips && uri.is_loopback())
154 || (self.exclude_private_ips && uri.is_private())
155 || (self.exclude_link_local_ips && uri.is_link_local())
156 {
157 return true;
158 }
159
160 false
161 }
162
163 #[must_use]
164 pub fn is_host_excluded(&self, uri: &Uri) -> bool {
166 self.exclude_loopback_ips && uri.domain() == Some("localhost")
168 }
169
170 #[inline]
171 #[must_use]
172 pub fn is_scheme_excluded(&self, uri: &Uri) -> bool {
174 if self.schemes.is_empty() {
175 return false;
176 }
177 !self.schemes.contains(uri.scheme())
178 }
179
180 #[inline]
181 fn is_includes_empty(&self) -> bool {
182 !matches!(self.includes, Some(ref includes) if !includes.is_empty())
183 }
184
185 #[inline]
186 fn is_excludes_empty(&self) -> bool {
187 !matches!(self.excludes, Some(ref excludes) if !excludes.is_empty())
188 }
189
190 #[inline]
191 fn is_includes_match(&self, input: &str) -> bool {
192 matches!(self.includes, Some(ref includes) if includes.is_match(input))
193 }
194
195 #[inline]
196 fn is_excludes_match(&self, input: &str) -> bool {
197 matches!(self.excludes, Some(ref excludes) if excludes.is_match(input))
198 }
199
200 #[must_use]
220 pub fn is_excluded(&self, uri: &Uri) -> bool {
221 if self.is_scheme_excluded(uri)
223 || self.is_host_excluded(uri)
224 || self.is_ip_excluded(uri)
225 || self.is_mail_excluded(uri)
226 || uri.is_tel()
227 || is_example_domain(uri)
228 || is_unsupported_domain(uri)
229 {
230 return true;
231 }
232
233 let input = uri.as_str();
234
235 if self.is_includes_empty() {
236 if self.is_excludes_empty() {
237 return is_false_positive(input);
240 }
241 } else if self.is_includes_match(input) {
242 return false;
244 }
245
246 if is_false_positive(input)
249 || self.is_excludes_empty()
252 || self.is_excludes_match(input)
254 {
255 return true;
256 }
257
258 false
259 }
260}
261
// Unit tests for `Filter::is_excluded`.
//
// Under `cfg(test)` the example-domain sets defined in this file are empty, so
// `example.com` and friends are treated like any other domain here.
#[cfg(test)]
mod tests {
    use reqwest::Url;
    use test_utils::{mail, website};
    use url::Host;

    use super::{Excludes, Filter, Includes};
    use crate::Uri;

    // Private IPv4 addresses (one per range that `test_const_sanity` checks
    // with `is_private`).
    const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
    const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
    const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";

    // Loopback addresses.
    const V4_LOOPBACK: &str = "http://127.0.0.1";
    const V6_LOOPBACK: &str = "http://[::1]";

    // Link-local IPv4 addresses, without and with an explicit port.
    const V4_LINK_LOCAL_1: &str = "http://169.254.0.1";
    const V4_LINK_LOCAL_2: &str = "http://169.254.10.1:8080";

    // IPv4-mapped IPv6 addresses; these are not covered by `test_const_sanity`.
    const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
    const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";

    // Parses `$ip` as a URL and asserts that its host is an IPv4 (`v4:`) or
    // IPv6 (`v6:`) address for which the std predicate `$predicate` holds.
    // Uses `?`, so it can only be invoked inside a function returning
    // `Result<_, ()>`; a parse failure or a missing host returns `Err(())`.
    macro_rules! assert_ip_address {
        (v4: $ip:expr, $predicate:tt) => {
            let res = if let Host::Ipv4(ipv4) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv4.$predicate()
            } else {
                // Host is a domain or an IPv6 address: wrong kind of constant.
                false
            };
            std::assert!(res);
        };
        (v6: $ip:expr, $predicate:tt) => {
            let res = if let Host::Ipv6(ipv6) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv6.$predicate()
            } else {
                // Host is a domain or an IPv4 address: wrong kind of constant.
                false
            };
            std::assert!(res);
        };
    }

    /// Verifies that the address constants above really belong to the
    /// category their name claims, according to the standard library.
    // The macro expands to repeated `let res = ...` bindings in one scope.
    #[allow(clippy::shadow_unrelated)]
    #[test]
    fn test_const_sanity() -> Result<(), ()> {
        assert_ip_address!(v4: V4_PRIVATE_CLASS_A, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_B, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_C, is_private);

        assert_ip_address!(v4: V4_LOOPBACK, is_loopback);
        assert_ip_address!(v6: V6_LOOPBACK, is_loopback);

        assert_ip_address!(v4: V4_LINK_LOCAL_1, is_link_local);
        assert_ip_address!(v4: V4_LINK_LOCAL_2, is_link_local);

        Ok(())
    }

    /// Loopback exclusion applies to URIs built via `Uri::try_from`, for both
    /// an IPv6 and an IPv4 loopback host (the latter with a `/8` path).
    #[test]
    fn test_exclude_loopback_ips() {
        let filter = Filter {
            exclude_loopback_ips: true,
            ..Filter::default()
        };
        let uri = Uri::try_from("https://[::1]").unwrap();
        assert!(filter.is_excluded(&uri));
        let uri = Uri::try_from("https://127.0.0.1/8").unwrap();
        assert!(filter.is_excluded(&uri));
    }

    /// A default filter (no include and no exclude rules) keeps an ordinary
    /// website.
    #[test]
    fn test_includes_and_excludes_empty() {
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website!("https://example.com")));
    }

    /// A default filter excludes URLs on the built-in false-positive list and
    /// nothing else.
    #[test]
    fn test_false_positives() {
        let filter = Filter::default();

        assert!(filter.is_excluded(&website!("http://www.w3.org/1999/xhtml")));
        assert!(filter.is_excluded(&website!(
            "http://schemas.openxmlformats.org/markup-compatibility/2006"
        )));
        assert!(!filter.is_excluded(&website!("https://example.com")));
    }

    /// An include rule takes precedence over the false-positive list.
    #[test]
    fn test_overwrite_false_positives() {
        let includes = Includes::new([r"http://www.w3.org/1999/xhtml"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            ..Filter::default()
        };
        assert!(!filter.is_excluded(&website!("http://www.w3.org/1999/xhtml")));
    }

    /// With include rules but no exclude rules, only matching URIs are kept.
    #[test]
    fn test_include_regex() {
        let includes = Includes::new([r"foo.example.com"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            ..Filter::default()
        };

        // Matches the include rule.
        assert!(!filter.is_excluded(&website!("https://foo.example.com")));
        // Not matched by any include rule, hence excluded.
        assert!(filter.is_excluded(&website!("https://bar.example.com")));
        assert!(filter.is_excluded(&website!("https://example.com")));
    }

    /// Mail addresses are excluded unless `include_mail` is set; websites on
    /// the same domain are unaffected.
    #[test]
    fn test_exclude_mail_by_default() {
        let filter = Filter {
            ..Filter::default()
        };

        assert!(filter.is_excluded(&mail!("mail@example.com")));
        assert!(filter.is_excluded(&mail!("foo@bar.dev")));
        assert!(!filter.is_excluded(&website!("http://bar.dev")));
    }

    /// With `include_mail` set, mail addresses are kept.
    #[test]
    fn test_include_mail() {
        let filter = Filter {
            include_mail: true,
            ..Filter::default()
        };

        assert!(!filter.is_excluded(&mail!("mail@example.com")));
        assert!(!filter.is_excluded(&mail!("foo@bar.dev")));
        assert!(!filter.is_excluded(&website!("http://bar.dev")));
    }

    /// Exclude rules skip matching URIs and leave the rest alone.
    #[test]
    fn test_exclude_regex() {
        let excludes =
            Excludes::new([r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap();
        let filter = Filter {
            excludes: Some(excludes),
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!("https://github.com")));
        assert!(filter.is_excluded(&website!("http://exclude.org")));
        assert!(filter.is_excluded(&mail!("mail@example.com")));

        assert!(!filter.is_excluded(&website!("http://bar.dev")));
        // Matches no exclude rule, but mail is excluded by default
        // (`include_mail` is false).
        assert!(filter.is_excluded(&mail!("foo@bar.dev")));
    }

    /// When both rule sets are configured, an include match wins over an
    /// exclude match.
    #[test]
    fn test_exclude_include_regex() {
        let includes = Includes::new([r"foo.example.com"]).unwrap();
        let excludes = Excludes::new([r"example.com"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            excludes: Some(excludes),
            ..Filter::default()
        };

        // Matches both rule sets; the include rule wins.
        assert!(!filter.is_excluded(&website!("https://foo.example.com")),);

        // Matched by the exclude rule only.
        assert!(filter.is_excluded(&website!("https://example.com")));
        assert!(filter.is_excluded(&website!("https://bar.example.com")));
    }

    /// A default filter does not exclude private, link-local or loopback
    /// addresses, nor `localhost`.
    #[test]
    fn test_excludes_no_private_ips_by_default() {
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_B)));
        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_C)));
        assert!(!filter.is_excluded(&website!(V4_LINK_LOCAL_1)));
        assert!(!filter.is_excluded(&website!(V4_LINK_LOCAL_2)));
        assert!(!filter.is_excluded(&website!(V4_LOOPBACK)));
        assert!(!filter.is_excluded(&website!(V6_LOOPBACK)));
        assert!(!filter.is_excluded(&website!("http://localhost")));
    }

    /// `exclude_private_ips` excludes all three private IPv4 constants.
    #[test]
    fn test_exclude_private_ips() {
        let filter = Filter {
            exclude_private_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_A)));
        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_B)));
        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_C)));
    }

    /// `exclude_link_local_ips` excludes link-local addresses, with or without
    /// a port.
    #[test]
    fn test_exclude_link_local() {
        let filter = Filter {
            exclude_link_local_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_LINK_LOCAL_1)));
        assert!(filter.is_excluded(&website!(V4_LINK_LOCAL_2)));
    }

    /// `exclude_loopback_ips` excludes loopback addresses and, through
    /// `is_host_excluded`, the `localhost` domain.
    #[test]
    fn test_exclude_loopback() {
        let filter = Filter {
            exclude_loopback_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_LOOPBACK)));
        assert!(filter.is_excluded(&website!(V6_LOOPBACK)));
        assert!(filter.is_excluded(&website!("http://localhost")));
    }

    /// Pins the current limitation that IPv4-mapped IPv6 addresses are not
    /// recognised as private or link-local, so they are not excluded.
    #[test]
    fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
        let filter = Filter {
            exclude_private_ips: true,
            exclude_link_local_ips: true,
            ..Filter::default()
        };

        assert!(!filter.is_excluded(&website!(V6_MAPPED_V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website!(V6_MAPPED_V4_LINK_LOCAL)));
    }
}