lychee_lib/extract/html/
mod.rs

1//! Extract links and fragments from HTML documents
2pub(crate) mod html5ever;
3pub(crate) mod html5gum;
4mod srcset;
5
6use linkify::{LinkFinder, LinkKind};
7
8/// Check if the given URL is an email link.
9///
10/// This operates on the raw URL strings, not the linkified version because it
11/// gets used in the HTML extractors, which parse the HTML attributes directly
12/// and return the raw strings.
13///
14/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
15/// so there should be no big performance penalty for calling this function.
16pub(crate) fn is_email_link(input: &str) -> bool {
17    let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
18    let email = match findings.next() {
19        None => return false,
20        Some(email) => email.as_str(),
21    };
22
23    // Email needs to match the full string.
24    // Strip the "mailto:" prefix if it exists.
25    input.strip_prefix("mailto:").unwrap_or(input) == email
26}
27
/// Returns `true` if `name` is one of the preformatted ("verbatim") HTML tags.
///
/// Elements with these tags are excluded from link checking by default.
// Whether <script> belongs here is debatable. The alternative would be a
// second tag list with its own config setting, which seems worse.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    // Tag names are compared exactly, byte for byte.
    const VERBATIM_TAGS: &[&str] = &[
        "code",
        "kbd",
        "listing",
        "noscript",
        "plaintext",
        "pre",
        "samp",
        "script",
        "textarea",
        "var",
        "xmp",
    ];

    VERBATIM_TAGS.contains(&name)
}
50
#[cfg(test)]
mod tests {
    use super::*;

    /// An input counts as an email link only if the whole string, apart from
    /// an optional `mailto:` prefix, is a single email address.
    #[test]
    fn test_is_email_link() {
        assert!(is_email_link("mailto:steve@apple.com"));
        assert!(!is_email_link("mailto:steve@apple.com in a sentence"));

        assert!(is_email_link("foo@example.org"));
        assert!(!is_email_link("foo@example.org in sentence"));
        assert!(!is_email_link("https://example.org"));
    }

    /// Every tag in the verbatim list must be recognized.
    #[test]
    fn test_verbatim_matching() {
        for &tag in &[
            "code",
            "kbd",
            "listing",
            "noscript",
            "plaintext",
            "pre",
            "samp",
            "script",
            "textarea",
            "var",
            "xmp",
        ] {
            assert!(is_verbatim_elem(tag), "`{}` should be verbatim", tag);
        }
    }

    /// Ordinary elements (and the empty string) must not be treated as
    /// verbatim; without this, an always-`true` implementation would pass.
    #[test]
    fn test_non_verbatim_elements() {
        for &tag in &["", "a", "div", "p", "span"] {
            assert!(!is_verbatim_elem(tag), "`{}` should not be verbatim", tag);
        }
    }
}