lychee_lib/extract/
mod.rs

1use crate::types::{
2    FileType, InputContent,
3    uri::raw::{RawUri, SourceSpanProvider},
4};
5
6pub mod html;
7pub mod markdown;
8mod plaintext;
9
10use markdown::extract_markdown;
11use plaintext::extract_raw_uri_from_plaintext;
12
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations should be avoided if possible as this is a
/// performance-critical section of the library.
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
    /// Parse HTML with the alternative `html5ever` engine instead of the
    /// default `html5gum` (only consulted for `FileType::Html` inputs)
    use_html5ever: bool,
    /// Also extract links found inside verbatim sections
    /// (e.g. Markdown code blocks); forwarded to the Markdown and HTML extractors
    include_verbatim: bool,
    /// Also extract wikilink-style links from Markdown
    /// (forwarded to `extract_markdown`; ignored for HTML and plaintext)
    include_wikilinks: bool,
}
22
23impl Extractor {
24    /// Creates a new extractor
25    ///
26    /// The extractor can be configured with the following settings:
27    ///
28    /// - `use_html5ever` enables the alternative HTML parser engine html5ever, that
29    ///   is also used in the Servo browser by Mozilla.
30    ///   The default is `html5gum`, which is more performant and well maintained.
31    ///
32    /// - `include_verbatim` ignores links inside Markdown code blocks.
33    ///   These can be denoted as a block starting with three backticks or an indented block.
34    ///   For more information, consult the `pulldown_cmark` documentation about code blocks
35    ///   [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
36    #[must_use]
37    pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
38        Self {
39            use_html5ever,
40            include_verbatim,
41            include_wikilinks,
42        }
43    }
44
45    /// Main entrypoint for extracting links from various sources
46    /// (Markdown, HTML, and plaintext)
47    #[must_use]
48    pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
49        match input_content.file_type {
50            FileType::Markdown => extract_markdown(
51                &input_content.content,
52                self.include_verbatim,
53                self.include_wikilinks,
54            ),
55            FileType::Html => {
56                if self.use_html5ever {
57                    html::html5ever::extract_html(&input_content.content, self.include_verbatim)
58                } else {
59                    html::html5gum::extract_html(&input_content.content, self.include_verbatim)
60                }
61            }
62            FileType::Plaintext => extract_raw_uri_from_plaintext(
63                &input_content.content,
64                &SourceSpanProvider::from_input(&input_content.content),
65            ),
66        }
67    }
68}
69
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;
    use reqwest::Url;
    use std::{collections::HashSet, path::Path};
    use test_utils::{fixtures_path, load_fixture, mail, website};

    use super::*;
    use crate::{
        Uri,
        types::{FileType, InputContent, ResolvedInputSource},
        utils::url::find_links,
    };

    /// Extract all URIs from `input`, parsing with both HTML engines
    /// (`html5gum` and `html5ever`) and asserting that they agree.
    ///
    /// Returns the `html5gum` result.
    fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
        let input_content = InputContent::from_string(input, file_type);

        // Run the full extraction pipeline with the given engine selection.
        let extract_with = |use_html5ever: bool| -> HashSet<Uri> {
            Extractor::new(use_html5ever, false, false)
                .extract(&input_content)
                .into_iter()
                .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
                .collect()
        };
        // Sort for a stable, readable diff on mismatch.
        let sorted = |uris: &HashSet<Uri>| -> Vec<Uri> {
            let mut uris: Vec<Uri> = uris.iter().cloned().collect();
            uris.sort();
            uris
        };

        let uris_html5gum = extract_with(false);
        let uris_html5ever = extract_with(true);

        assert_eq!(
            sorted(&uris_html5gum),
            sorted(&uris_html5ever),
            "Mismatch between html5gum and html5ever"
        );
        uris_html5gum
    }

    #[test]
    fn verbatim_elem() {
        // Links inside <pre> are verbatim and excluded by default.
        let input = "<pre>https://example.com</pre>";
        let uris = extract_uris(input, FileType::Html);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_file_type() {
        assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
        assert_eq!(FileType::from("test.md"), FileType::Markdown);
        assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
        assert_eq!(FileType::from("test.html"), FileType::Html);
        assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
        assert_eq!(FileType::from("test.something"), FileType::Plaintext);
        assert_eq!(
            FileType::from("/absolute/path/to/test.something"),
            FileType::Plaintext
        );
    }

    #[test]
    fn test_skip_markdown_anchors() {
        // Fragment-only links cannot be converted into a `Uri`.
        let links = extract_uris("This is [a test](#lol).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_internal_urls() {
        // Relative links cannot be converted into a `Uri` without a base.
        let links = extract_uris("This is [a test](./internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_email() {
        let input = "Get in touch - [Contact Us](mailto:test@test.com)";
        let links = extract_uris(input, FileType::Markdown);
        let expected = HashSet::from([mail!("test@test.com")]);

        assert_eq!(links, expected);
    }

    #[test]
    fn relative_urls() {
        let links = extract_uris("This is [a test](/internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
        let links = extract_uris(input, FileType::Plaintext);

        let expected = HashSet::from([
            website!("https://endler.dev"),
            website!("https://hello-rust.show/foo/bar?lol=1"),
            mail!("test@example.com"),
        ]);

        assert_eq!(links, expected);
    }

    #[test]
    fn test_md_escape() {
        let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
        let links: Vec<_> = find_links(input).collect();
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

        // Fix: the result of `matches!` was previously discarded, so this test
        // could never fail. Assert it so regressions are actually caught.
        // NOTE(review): the trailing `)` in `expected` looks suspicious —
        // verify against the actual `find_links` output.
        assert!(
            matches!(&links[..], [link] if link.as_str() == expected),
            "unexpected links extracted from escaped markdown URL: {links:?}"
        );
    }

    #[test]
    fn test_extract_html5_not_valid_xml() {
        let input = load_fixture!("TEST_HTML5.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = HashSet::from([
            website!("https://example.com/head/home"),
            website!("https://example.com/css/style_full_url.css"),
            // the body links wouldn't be present if the file was parsed strictly as XML
            website!("https://example.com/body/a"),
            website!("https://example.com/body/div_empty_a"),
        ]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_relative_url() {
        let source = ResolvedInputSource::RemoteUrl(Box::new(
            Url::parse("https://example.com/some-post").unwrap(),
        ));

        let contents = r#"<html>
            <div class="row">
                <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
                <a href="/about">About</a>
            </div>
        </html>"#;

        let input_content = &InputContent {
            source,
            file_type: FileType::Html,
            content: contents.to_string(),
        };

        // Relative URLs must survive extraction with either engine;
        // resolution against the base happens later in the pipeline.
        for use_html5ever in [true, false] {
            let extractor = Extractor::new(use_html5ever, false, false);
            let links = extractor.extract(input_content);

            let urls = links
                .into_iter()
                .map(|raw_uri| raw_uri.text)
                .collect::<HashSet<_>>();

            let expected_urls = HashSet::from([
                String::from("https://github.com/lycheeverse/lychee/"),
                String::from("/about"),
            ]);

            assert_eq!(urls, expected_urls);
        }
    }

    #[test]
    fn test_extract_html5_lowercase_doctype() {
        // this has been problematic with previous XML based parser
        let input = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = HashSet::from([website!("https://example.com/body/a")]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_minified() {
        // minified HTML with some quirky elements such as href attribute values specified without quotes
        let input = load_fixture!("TEST_HTML5_MINIFIED.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = HashSet::from([
            website!("https://example.com/"),
            website!("https://example.com/favicon.ico"),
            // Note that we exclude `preconnect` links:
            // website!("https://fonts.externalsite.com"),
            website!("https://example.com/docs/"),
            website!("https://example.com/forum"),
        ]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_malformed() {
        // malformed links shouldn't stop the parser from further parsing
        let input = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = HashSet::from([website!("https://example.com/valid")]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_custom_elements() {
        // the element name shouldn't matter for attributes like href, src, cite etc
        let input = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = HashSet::from([
            website!("https://example.com/some-weird-element"),
            website!("https://example.com/even-weirder-src"),
            website!("https://example.com/even-weirder-href"),
            website!("https://example.com/citations"),
        ]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_urls_with_at_sign_properly() {
        // note that these used to parse as emails
        let input = "https://example.com/@test/test http://otherdomain.com/test/@test";
        let links = extract_uris(input, FileType::Plaintext);

        let expected_links = HashSet::from([
            website!("https://example.com/@test/test"),
            website!("http://otherdomain.com/test/@test"),
        ]);

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_link_at_end_of_line() {
        // Trailing newline must not become part of the extracted link.
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let links = extract_uris(input, FileType::Plaintext);

        let expected_links =
            HashSet::from([website!("https://www.apache.org/licenses/LICENSE-2.0")]);

        assert_eq!(links, expected_links);
    }
}