lychee_lib/filter/
mod.rs

1mod regex_filter;
2
3use regex::RegexSet;
4use std::collections::HashSet;
5use std::sync::LazyLock;
6
/// Include configuration for the link checker.
/// You can include links based on regex patterns.
///
/// This is an alias for `regex_filter::RegexFilter`. It is the type of the
/// `includes` field of `Filter`, where a matching include pattern takes
/// precedence over the exclude patterns.
pub type Includes = regex_filter::RegexFilter;
10
/// Exclude configuration for the link checker.
/// You can ignore links based on regex patterns.
///
/// This is an alias for `regex_filter::RegexFilter`. It is the type of the
/// `excludes` field of `Filter`.
pub type Excludes = regex_filter::RegexFilter;
14
/// Path exclude configuration for the link checker.
/// You can exclude paths and files based on regex patterns.
///
/// This is an alias for `regex_filter::RegexFilter`.
pub type PathExcludes = regex_filter::RegexFilter;
17
18use crate::Uri;
19
/// Second-level domains reserved for documentation and examples by RFC 2606,
/// section 3 ("Reserved Example Second Level Domain Names"). They should not
/// be dereferenced, as they are not expected to have content.
///
/// NOTE(review): `example.edu` is listed here as well, although it does not
/// appear to be part of RFC 2606 section 3 — confirm the source for it.
#[cfg(all(not(test), not(feature = "check_example_domains")))]
static EXAMPLE_DOMAINS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from(["example.com", "example.org", "example.net", "example.edu"])
});

/// Empty replacement for the list above, so that example domains can be used
/// in tests and are checked when the `check_example_domains` feature is on.
#[cfg(any(test, feature = "check_example_domains"))]
static EXAMPLE_DOMAINS: LazyLock<HashSet<&'static str>> = LazyLock::new(HashSet::new);

/// The reserved example TLDs from section 2 of the same RFC. Every entry
/// carries its leading dot.
/// This exclusion is switched off (the set is empty) in test builds and when
/// the `check_example_domains` feature is enabled.
#[cfg(all(not(test), not(feature = "check_example_domains")))]
static EXAMPLE_TLDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([".test", ".example", ".invalid", ".localhost"])
});

/// Empty replacement for the TLD list above, used in test builds and when the
/// `check_example_domains` feature is enabled.
#[cfg(any(test, feature = "check_example_domains"))]
static EXAMPLE_TLDS: LazyLock<HashSet<&'static str>> = LazyLock::new(HashSet::new);
40
/// Domains that are never checked because they are known not to be checkable.
/// Consulted by `is_unsupported_domain`.
static UNSUPPORTED_DOMAINS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Twitter requires an account to view tweets
        // https://news.ycombinator.com/item?id=36540957
        "twitter.com",
    ])
});
48
/// Pre-defined exclusions for known false-positives
///
/// Each entry is a regular expression; the list is compiled into
/// `FALSE_POSITIVE_SET`. Every pattern is anchored at the start and accepts
/// both `http` and `https`. All but the last are prefix matches, so anything
/// below the given URL matches too.
const FALSE_POSITIVE_PAT: &[&str] = &[
    // NOTE(review): these look like XML namespace / schema identifiers rather
    // than pages that are meant to be fetched — confirm.
    r"^https?://schemas\.openxmlformats\.org",
    r"^https?://schemas\.microsoft\.com",
    r"^https?://schemas\.zune\.net",
    r"^https?://www\.w3\.org/1999/xhtml",
    r"^https?://www\.w3\.org/1999/xlink",
    r"^https?://www\.w3\.org/2000/svg",
    r"^https?://www\.w3\.org/2001/XMLSchema-instance",
    r"^https?://ogp\.me/ns#",
    // Anchored at both ends: any URL, on any host, that ends in `/xmlrpc.php`
    r"^https?://(.*)/xmlrpc\.php$",
];
61
62static FALSE_POSITIVE_SET: LazyLock<RegexSet> =
63    LazyLock::new(|| regex::RegexSet::new(FALSE_POSITIVE_PAT).expect("Failed to create RegexSet"));
64
65/// The given input is a well-known false-positive, which won't be checked by
66/// default. This behavior can be explicitly overwritten by defining an
67/// `Include` pattern, which will match on a false positive
68#[inline]
69#[must_use]
70pub fn is_false_positive(input: &str) -> bool {
71    FALSE_POSITIVE_SET.is_match(input)
72}
73
74/// Check if the host belongs to a known example domain as defined in
75/// [RFC 2606](https://datatracker.ietf.org/doc/html/rfc2606)
76#[inline]
77#[must_use]
78pub fn is_example_domain(uri: &Uri) -> bool {
79    match uri.domain() {
80        Some(domain) => {
81            // Check if the domain is exactly an example domain or a subdomain of it.
82            EXAMPLE_DOMAINS.iter().any(|&example| {
83                domain == example
84                    || domain
85                        .split_once('.')
86                        .is_some_and(|(_subdomain, tld_part)| tld_part == example)
87            }) || EXAMPLE_TLDS
88                .iter()
89                .any(|&example_tld| domain.ends_with(example_tld))
90        }
91        None => {
92            // Check if the URI is an email address.
93            // e.g. `mailto:mail@example.com`
94            // In this case, the domain is part of the path
95            if uri.is_mail() {
96                EXAMPLE_DOMAINS.iter().any(|tld| uri.path().ends_with(tld))
97            } else {
98                false
99            }
100        }
101    }
102}
103
104/// Check if the host belongs to a known unsupported domain
105#[inline]
106#[must_use]
107pub fn is_unsupported_domain(uri: &Uri) -> bool {
108    if let Some(domain) = uri.domain() {
109        // It is not enough to use `UNSUPPORTED_DOMAINS.contains(domain)` here
110        // as this would not include checks for subdomains, such as
111        // `foo.example.com`
112        UNSUPPORTED_DOMAINS.iter().any(|tld| domain.ends_with(tld))
113    } else {
114        false
115    }
116}
117
/// A generic URI filter
/// Used to decide if a given URI should be checked or skipped
///
/// The derived `Default` has no include/exclude rules, no scheme restriction
/// and all boolean switches set to `false`.
// The four independent on/off switches below trip `struct_excessive_bools`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug, Default)]
pub struct Filter {
    /// URIs explicitly included for checking. This takes precedence over excludes
    pub includes: Option<Includes>,
    /// URIs excluded from checking
    pub excludes: Option<Excludes>,
    /// Only check URIs with the given schemes (e.g. `https` and `http`).
    /// An empty set means that no scheme is excluded.
    // TODO: includes_scheme and excludes_scheme
    // TODO: excludes_mail should be an alias for exclude_scheme=mailto
    pub schemes: HashSet<String>,
    /// Skip URIs that point to a private IP address.
    /// Example: 192.168.0.1
    pub exclude_private_ips: bool,
    /// Skip URIs that point to a link-local IP address.
    /// Example: 169.254.0.0
    pub exclude_link_local_ips: bool,
    /// Skip URIs that point to a loopback IP address or to the host
    /// `localhost`.
    /// For IPv4: 127.0.0.0/8
    /// For IPv6: `::1/128`
    pub exclude_loopback_ips: bool,
    /// Check mail addresses as well; they are skipped unless this is set.
    /// Example: octocat@github.com
    pub include_mail: bool,
}
141
142impl Filter {
143    #[inline]
144    #[must_use]
145    /// Whether e-mails aren't checked (which is the default)
146    pub fn is_mail_excluded(&self, uri: &Uri) -> bool {
147        uri.is_mail() && !self.include_mail
148    }
149
150    #[must_use]
151    /// Whether the IP address is excluded from checking
152    pub fn is_ip_excluded(&self, uri: &Uri) -> bool {
153        if (self.exclude_loopback_ips && uri.is_loopback())
154            || (self.exclude_private_ips && uri.is_private())
155            || (self.exclude_link_local_ips && uri.is_link_local())
156        {
157            return true;
158        }
159
160        false
161    }
162
163    #[must_use]
164    /// Whether the host is excluded from checking
165    pub fn is_host_excluded(&self, uri: &Uri) -> bool {
166        // If loopback IPs are excluded, exclude localhost as well, which usually maps to a loopback IP
167        self.exclude_loopback_ips && uri.domain() == Some("localhost")
168    }
169
170    #[inline]
171    #[must_use]
172    /// Whether the scheme of the given URI is excluded
173    pub fn is_scheme_excluded(&self, uri: &Uri) -> bool {
174        if self.schemes.is_empty() {
175            return false;
176        }
177        !self.schemes.contains(uri.scheme())
178    }
179
180    #[inline]
181    fn is_includes_empty(&self) -> bool {
182        !matches!(self.includes, Some(ref includes) if !includes.is_empty())
183    }
184
185    #[inline]
186    fn is_excludes_empty(&self) -> bool {
187        !matches!(self.excludes, Some(ref excludes) if !excludes.is_empty())
188    }
189
190    #[inline]
191    fn is_includes_match(&self, input: &str) -> bool {
192        matches!(self.includes, Some(ref includes) if includes.is_match(input))
193    }
194
195    #[inline]
196    fn is_excludes_match(&self, input: &str) -> bool {
197        matches!(self.excludes, Some(ref excludes) if excludes.is_match(input))
198    }
199
200    /// Determine whether a given [`Uri`] should be excluded.
201    ///
202    /// # Details
203    ///
204    /// 1. If any of the following conditions are met, the URI is excluded:
205    ///   - If it's a mail address and it's not configured to include mail addresses.
206    ///   - If the IP address belongs to a type that is configured to exclude.
207    ///   - If the host belongs to a type that is configured to exclude.
208    ///   - If the scheme of URI is not the allowed scheme.
209    /// 2. Decide whether the URI is *presumably included* or *explicitly included*:
210    ///    - When both excludes and includes rules are empty, it's *presumably included* unless
211    ///      it's a known false positive.
212    ///    - When the includes rules matches the URI, it's *explicitly included*.
213    /// 3. When it's a known *false positive* pattern, it's *explicitly excluded*.
214    /// 4. Decide whether the URI is *presumably excluded* or *explicitly excluded*:
215    ///    - When excludes rules is empty, but includes rules doesn't match the URI, it's
216    ///      *presumably excluded*.
217    ///    - When the excludes rules matches the URI, it's *explicitly excluded*.
218    ///    - When the excludes rules matches the URI, it's *explicitly excluded*.
219    #[must_use]
220    pub fn is_excluded(&self, uri: &Uri) -> bool {
221        // Skip mail address, specific IP, specific host and scheme
222        if self.is_scheme_excluded(uri)
223            || self.is_host_excluded(uri)
224            || self.is_ip_excluded(uri)
225            || self.is_mail_excluded(uri)
226            || uri.is_tel()
227            || is_example_domain(uri)
228            || is_unsupported_domain(uri)
229        {
230            return true;
231        }
232
233        let input = uri.as_str();
234
235        if self.is_includes_empty() {
236            if self.is_excludes_empty() {
237                // Both excludes and includes rules are empty:
238                // *Presumably included* unless it's a false positive
239                return is_false_positive(input);
240            }
241        } else if self.is_includes_match(input) {
242            // *Explicitly included* (Includes take precedence over excludes)
243            return false;
244        }
245
246        // Exclude well-known false-positives
247        // Performed after checking includes to allow user-overwrites
248        if is_false_positive(input)
249                // Previous checks imply input is not explicitly included.
250                // If exclude rules are empty, then *presumably excluded*
251                || self.is_excludes_empty()
252                // If exclude rules match input, then *explicitly excluded*
253                || self.is_excludes_match(input)
254        {
255            return true;
256        }
257
258        false
259    }
260}
261
// Unit tests for `Filter::is_excluded`.
//
// In test builds the example-domain sets are empty, so `example.com` behaves
// like any other domain here.
#[cfg(test)]
mod tests {
    use reqwest::Url;
    use test_utils::{mail, website};
    use url::Host;

    use super::{Excludes, Filter, Includes};
    use crate::Uri;

    // Note: when this was written (Rust stable 1.47.0), the standard library
    // did not expose "link-local" or "private" IPv6 checks. However, one might
    // argue that these concepts do exist in IPv6, albeit the naming is different.
    // NOTE(review): newer toolchains may provide such checks — verify.
    // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
    // See: https://en.wikipedia.org/wiki/Private_network#IPv6
    // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
    const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
    const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
    const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";

    const V4_LOOPBACK: &str = "http://127.0.0.1";
    const V6_LOOPBACK: &str = "http://[::1]";

    const V4_LINK_LOCAL_1: &str = "http://169.254.0.1";
    const V4_LINK_LOCAL_2: &str = "http://169.254.10.1:8080";

    // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
    const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
    const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";

    // Parses `$ip` as a URL and asserts that its host is an IPv4 (`v4:`) or
    // IPv6 (`v6:`) address for which `$predicate` returns `true`.
    // The expansion uses `?` with a `()` error, so it can only be used inside
    // a function that returns `Result<_, ()>`.
    macro_rules! assert_ip_address {
        (v4: $ip:expr, $predicate:tt) => {
            let res = if let Host::Ipv4(ipv4) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv4.$predicate()
            } else {
                false
            };
            std::assert!(res);
        };
        (v6: $ip:expr, $predicate:tt) => {
            let res = if let Host::Ipv6(ipv6) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? {
                ipv6.$predicate()
            } else {
                false
            };
            std::assert!(res);
        };
    }

    /// Sanity check: the URL constants above really belong to the address
    /// class their names claim.
    #[allow(clippy::shadow_unrelated)]
    #[test]
    fn test_const_sanity() -> Result<(), ()> {
        assert_ip_address!(v4: V4_PRIVATE_CLASS_A, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_B, is_private);
        assert_ip_address!(v4: V4_PRIVATE_CLASS_C, is_private);

        assert_ip_address!(v4: V4_LOOPBACK, is_loopback);
        assert_ip_address!(v6: V6_LOOPBACK, is_loopback);

        assert_ip_address!(v4: V4_LINK_LOCAL_1, is_link_local);
        assert_ip_address!(v4: V4_LINK_LOCAL_2, is_link_local);

        Ok(())
    }

    /// Loopback addresses are excluded when `exclude_loopback_ips` is set.
    #[test]
    fn test_exclude_loopback_ips() {
        let filter = Filter {
            exclude_loopback_ips: true,
            ..Filter::default()
        };
        let uri = Uri::try_from("https://[::1]").unwrap();
        assert!(filter.is_excluded(&uri));
        // The trailing `/8` is just the URL path; the host is `127.0.0.1`
        let uri = Uri::try_from("https://127.0.0.1/8").unwrap();
        assert!(filter.is_excluded(&uri));
    }

    /// With the default filter an ordinary website is not excluded.
    #[test]
    fn test_includes_and_excludes_empty() {
        // This is the default filter: both the include and the exclude rules
        // are empty. In this case every URI is checked unless it is a known
        // false positive (or hits one of the hard exclusions).
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website!("https://example.com")));
    }

    /// Known false positives are excluded by the default filter.
    #[test]
    fn test_false_positives() {
        let filter = Filter::default();

        assert!(filter.is_excluded(&website!("http://www.w3.org/1999/xhtml")));
        assert!(filter.is_excluded(&website!(
            "http://schemas.openxmlformats.org/markup-compatibility/2006"
        )));
        assert!(!filter.is_excluded(&website!("https://example.com")));
    }

    /// A matching include rule overrides the false-positive exclusion.
    #[test]
    fn test_overwrite_false_positives() {
        let includes = Includes::new([r"http://www.w3.org/1999/xhtml"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            ..Filter::default()
        };
        assert!(!filter.is_excluded(&website!("http://www.w3.org/1999/xhtml")));
    }

    /// With include rules but no exclude rules, everything that does not
    /// match an include rule is excluded.
    #[test]
    fn test_include_regex() {
        let includes = Includes::new([r"foo.example.com"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            ..Filter::default()
        };

        // Only the requests matching the include set will be checked
        assert!(!filter.is_excluded(&website!("https://foo.example.com")));
        assert!(filter.is_excluded(&website!("https://bar.example.com")));
        assert!(filter.is_excluded(&website!("https://example.com")));
    }

    /// Mail addresses are excluded unless `include_mail` is set.
    #[test]
    fn test_exclude_mail_by_default() {
        let filter = Filter {
            ..Filter::default()
        };

        assert!(filter.is_excluded(&mail!("mail@example.com")));
        assert!(filter.is_excluded(&mail!("foo@bar.dev")));
        assert!(!filter.is_excluded(&website!("http://bar.dev")));
    }

    /// With `include_mail` set, mail addresses are checked like websites.
    #[test]
    fn test_include_mail() {
        let filter = Filter {
            include_mail: true,
            ..Filter::default()
        };

        assert!(!filter.is_excluded(&mail!("mail@example.com")));
        assert!(!filter.is_excluded(&mail!("foo@bar.dev")));
        assert!(!filter.is_excluded(&website!("http://bar.dev")));
    }

    /// URIs matching an exclude rule are excluded; others are still checked.
    #[test]
    fn test_exclude_regex() {
        let excludes =
            Excludes::new([r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap();
        let filter = Filter {
            excludes: Some(excludes),
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!("https://github.com")));
        assert!(filter.is_excluded(&website!("http://exclude.org")));
        assert!(filter.is_excluded(&mail!("mail@example.com")));

        assert!(!filter.is_excluded(&website!("http://bar.dev")));
        // Matches no exclude rule, but `include_mail` is not set here, so mail
        // addresses are excluded anyway
        assert!(filter.is_excluded(&mail!("foo@bar.dev")));
    }
    /// When both rule sets are given, a matching include rule wins.
    #[test]
    fn test_exclude_include_regex() {
        let includes = Includes::new([r"foo.example.com"]).unwrap();
        let excludes = Excludes::new([r"example.com"]).unwrap();
        let filter = Filter {
            includes: Some(includes),
            excludes: Some(excludes),
            ..Filter::default()
        };

        // Includes take preference over excludes
        assert!(!filter.is_excluded(&website!("https://foo.example.com")),);

        assert!(filter.is_excluded(&website!("https://example.com")));
        assert!(filter.is_excluded(&website!("https://bar.example.com")));
    }

    /// The default filter excludes no private, link-local or loopback
    /// address, nor `localhost`.
    #[test]
    fn test_excludes_no_private_ips_by_default() {
        let filter = Filter::default();

        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_B)));
        assert!(!filter.is_excluded(&website!(V4_PRIVATE_CLASS_C)));
        assert!(!filter.is_excluded(&website!(V4_LINK_LOCAL_1)));
        assert!(!filter.is_excluded(&website!(V4_LINK_LOCAL_2)));
        assert!(!filter.is_excluded(&website!(V4_LOOPBACK)));
        assert!(!filter.is_excluded(&website!(V6_LOOPBACK)));
        assert!(!filter.is_excluded(&website!("http://localhost")));
    }

    /// Private IPv4 addresses are excluded when `exclude_private_ips` is set.
    #[test]
    fn test_exclude_private_ips() {
        let filter = Filter {
            exclude_private_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_A)));
        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_B)));
        assert!(filter.is_excluded(&website!(V4_PRIVATE_CLASS_C)));
    }

    /// Link-local IPv4 addresses are excluded when `exclude_link_local_ips`
    /// is set.
    #[test]
    fn test_exclude_link_local() {
        let filter = Filter {
            exclude_link_local_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_LINK_LOCAL_1)));
        assert!(filter.is_excluded(&website!(V4_LINK_LOCAL_2)));
    }

    /// Loopback addresses and `localhost` are excluded when
    /// `exclude_loopback_ips` is set.
    #[test]
    fn test_exclude_loopback() {
        let filter = Filter {
            exclude_loopback_ips: true,
            ..Filter::default()
        };

        assert!(filter.is_excluded(&website!(V4_LOOPBACK)));
        assert!(filter.is_excluded(&website!(V6_LOOPBACK)));
        assert!(filter.is_excluded(&website!("http://localhost")));
    }

    /// Pins the current behavior: IPv4-mapped IPv6 addresses are not
    /// recognized as private or link-local.
    #[test]
    fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
        let filter = Filter {
            exclude_private_ips: true,
            exclude_link_local_ips: true,
            ..Filter::default()
        };

        // if these were pure IPv4, we would exclude
        assert!(!filter.is_excluded(&website!(V6_MAPPED_V4_PRIVATE_CLASS_A)));
        assert!(!filter.is_excluded(&website!(V6_MAPPED_V4_LINK_LOCAL)));
    }
}