//! lychee_lib/extract/mod.rs — link extraction from various input formats.
use crate::types::{
    FileType, InputContent,
    uri::raw::{RawUri, SourceSpanProvider},
};

pub mod css;
pub mod html;
pub mod markdown;
mod plaintext;
pub mod xml;

use css::extract_css;
use markdown::extract_markdown;
use plaintext::extract_raw_uri_from_plaintext;
use xml::extract_xml;
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations should be avoided if possible as this is a
/// performance-critical section of the library.
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
    /// Parse HTML with the `html5ever` engine instead of the default `html5gum`.
    use_html5ever: bool,
    /// Also extract links from verbatim sections (e.g. Markdown code blocks,
    /// `<pre>` content); when `false` (the default) such links are skipped.
    include_verbatim: bool,
    /// Also extract `[[wikilink]]`-style links when parsing Markdown.
    include_wikilinks: bool,
}
26
27impl Extractor {
28    /// Creates a new extractor
29    ///
30    /// The extractor can be configured with the following settings:
31    ///
32    /// - `use_html5ever` enables the alternative HTML parser engine html5ever, that
33    ///   is also used in the Servo browser by Mozilla.
34    ///   The default is `html5gum`, which is more performant and well maintained.
35    ///
36    /// - `include_verbatim` ignores links inside Markdown code blocks.
37    ///   These can be denoted as a block starting with three backticks or an indented block.
38    ///   For more information, consult the `pulldown_cmark` documentation about code blocks
39    ///   [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
40    #[must_use]
41    pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
42        Self {
43            use_html5ever,
44            include_verbatim,
45            include_wikilinks,
46        }
47    }
48
49    /// Main entrypoint for extracting links from various sources
50    /// (Markdown, HTML, CSS, and plaintext)
51    #[must_use]
52    pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
53        let content = &input_content.content;
54        match input_content.file_type {
55            FileType::Markdown => {
56                extract_markdown(content, self.include_verbatim, self.include_wikilinks)
57            }
58            FileType::Html => {
59                if self.use_html5ever {
60                    html::html5ever::extract_html(content, self.include_verbatim)
61                } else {
62                    html::html5gum::extract_html(content, self.include_verbatim)
63                }
64            }
65            FileType::Css => extract_css(content, &SourceSpanProvider::from_input(content)),
66            FileType::Plaintext => {
67                extract_raw_uri_from_plaintext(content, &SourceSpanProvider::from_input(content))
68            }
69            FileType::Xml => extract_xml(content, &SourceSpanProvider::from_input(content)),
70        }
71    }
72}
73
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;
    use reqwest::Url;
    use std::{collections::HashSet, path::Path};
    use test_utils::{fixtures_path, load_fixture, mail, website};

    use super::*;
    use crate::{
        Uri,
        types::{
            FileType, InputContent, ResolvedInputSource,
            uri::raw::{RawUriSpan, span},
        },
        utils::url::find_links,
    };

    /// Extracts URIs from `input` with both HTML parser engines (html5gum and
    /// html5ever), asserts that the two engines agree, and returns the set.
    fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
        let input_content = InputContent::from_string(input, file_type);

        let extractor = Extractor::new(false, false, false);
        let uris_html5gum: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        let uris_html5gum_sorted: Vec<Uri> = {
            let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        let extractor = Extractor::new(true, false, false);
        let uris_html5ever: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        let uris_html5ever_sorted: Vec<Uri> = {
            let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        // Compare sorted vectors so a mismatch prints a readable diff.
        assert_eq!(
            uris_html5gum_sorted, uris_html5ever_sorted,
            "Mismatch between html5gum and html5ever"
        );
        uris_html5gum
    }

    #[test]
    fn verbatim_elem() {
        let input = "<pre>https://example.com</pre>";
        let uris = extract_uris(input, FileType::Html);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_file_type() {
        assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
        assert_eq!(FileType::from("test.md"), FileType::Markdown);
        assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
        assert_eq!(FileType::from("test.html"), FileType::Html);
        assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
        assert_eq!(FileType::from("test.something"), FileType::Plaintext);
        assert_eq!(
            FileType::from("/absolute/path/to/test.something"),
            FileType::Plaintext
        );
    }

    #[test]
    fn test_skip_markdown_anchors() {
        let links = extract_uris("This is [a test](#lol).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_internal_urls() {
        let links = extract_uris("This is [a test](./internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_email() {
        let input = "Get in touch - [Contact Us](mailto:test@test.com)";
        let links = extract_uris(input, FileType::Markdown);
        let expected = IntoIterator::into_iter([mail!("test@test.com")]).collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    #[test]
    fn relative_urls() {
        let links = extract_uris("This is [a test](/internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
        let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext);

        let expected = IntoIterator::into_iter([
            website!("https://endler.dev"),
            website!("https://hello-rust.show/foo/bar?lol=1"),
            mail!("test@example.com"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    #[test]
    fn test_md_escape() {
        let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
        let links: Vec<_> = find_links(input).collect();
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

        // NOTE(review): the boolean result of `matches!` is discarded here, so
        // this test currently asserts nothing. Wrapping it in `assert!` would
        // make it meaningful, but the trailing `)` in `expected` looks like a
        // typo — verify the actual `find_links` output before tightening this.
        matches!(&links[..], [link] if link.as_str() == expected);
    }

    #[test]
    fn test_extract_html5_not_valid_xml() {
        let input = load_fixture!("TEST_HTML5.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/head/home"),
            website!("https://example.com/css/style_full_url.css"),
            // the body links wouldn't be present if the file was parsed strictly as XML
            website!("https://example.com/body/a"),
            website!("https://example.com/body/div_empty_a"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_relative_url() {
        let source = ResolvedInputSource::RemoteUrl(Box::new(
            Url::parse("https://example.com/some-post").unwrap(),
        ));

        let contents = r#"<html>
            <div class="row">
                <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
                <a href="/about">About</a>
            </div>
        </html>"#;

        let input_content = &InputContent {
            source,
            file_type: FileType::Html,
            content: contents.to_string(),
        };

        // Relative URLs must survive extraction with either parser engine.
        for use_html5ever in [true, false] {
            let extractor = Extractor::new(use_html5ever, false, false);
            let links = extractor.extract(input_content);

            let urls = links
                .into_iter()
                .map(|raw_uri| raw_uri.text)
                .collect::<HashSet<_>>();

            let expected_urls = IntoIterator::into_iter([
                String::from("https://github.com/lycheeverse/lychee/"),
                String::from("/about"),
            ])
            .collect::<HashSet<_>>();

            assert_eq!(urls, expected_urls);
        }
    }

    #[test]
    fn test_extract_html5_lowercase_doctype() {
        // this has been problematic with previous XML based parser
        let input = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/body/a")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_minified() {
        // minified HTML with some quirky elements such as href attribute values specified without quotes
        let input = load_fixture!("TEST_HTML5_MINIFIED.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/"),
            website!("https://example.com/favicon.ico"),
            // Note that we exclude `preconnect` links:
            // website!("https://fonts.externalsite.com"),
            website!("https://example.com/docs/"),
            website!("https://example.com/forum"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_malformed() {
        // malformed links shouldn't stop the parser from further parsing
        let input = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/valid")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_custom_elements() {
        // the element name shouldn't matter for attributes like href, src, cite etc
        let input = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/some-weird-element"),
            website!("https://example.com/even-weirder-src"),
            website!("https://example.com/even-weirder-href"),
            website!("https://example.com/citations"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_urls_with_at_sign_properly() {
        // note that these used to parse as emails
        let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
        let links = extract_uris(&input, FileType::Plaintext);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/@test/test"),
            website!("http://otherdomain.com/test/@test"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_link_at_end_of_line() {
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let links = extract_uris(input, FileType::Plaintext);

        let expected_links =
            IntoIterator::into_iter([website!("https://www.apache.org/licenses/LICENSE-2.0")])
                .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_css_from_style_tag() {
        // Test case from issue #1485
        let input = r#"<html>
   <head>
      <style>
         div {
             background-image: url("./lychee.png");
         }
      </style>
   </head>
</html>"#;
        let input_content = InputContent::from_string(input, FileType::Html);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);
        assert_eq!(raw_uris, vec![css_url("./lychee.png", span(5, 32))]);
    }

    #[test]
    fn test_extract_css_from_css_file() {
        let input = r#"
.example {
    background-image: url("./image.png");
    background: url('/absolute/path.jpg');
}
@import url(https://example.com/style.css);
"#;
        let input_content = InputContent::from_string(input, FileType::Css);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);
        assert_eq!(
            raw_uris,
            vec![
                css_url("./image.png", span(3, 23)),
                css_url("/absolute/path.jpg", span(4, 17)),
                css_url("https://example.com/style.css", span(6, 9)),
            ]
        );
    }

    #[test]
    fn test_extract_multiple_css_urls_from_style_tag() {
        let input = r#"<html>
   <head>
      <style>
         .background {
             background-image: url("./bg.png");
         }
         @font-face {
             src: url(../fonts/font.woff2);
         }
      </style>
   </head>
</html>"#;
        let input_content = InputContent::from_string(input, FileType::Html);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);

        assert_eq!(
            raw_uris,
            vec![
                css_url("./bg.png", span(5, 32)),
                css_url("../fonts/font.woff2", span(8, 19)),
            ]
        );
    }

    /// Builds the `RawUri` shape produced by the CSS extractor for a `url(...)`
    /// found inside a `<style>` element, for concise test expectations.
    fn css_url(text: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: text.into(),
            element: Some("style".into()),
            attribute: Some("url".into()),
            span,
        }
    }
}