lychee_lib/extract/mod.rs

use crate::types::{
    FileType, InputContent,
    uri::raw::{RawUri, SourceSpanProvider},
};

pub mod css;
pub mod html;
pub mod markdown;
mod plaintext;

use css::extract_css;
use markdown::extract_markdown;
use plaintext::extract_raw_uri_from_plaintext;

/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations should be avoided if possible as this is a
/// performance-critical section of the library.
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
    use_html5ever: bool,
    include_verbatim: bool,
    include_wikilinks: bool,
}

impl Extractor {
    /// Creates a new extractor.
    ///
    /// The extractor can be configured with the following settings:
    ///
    /// - `use_html5ever` enables the alternative HTML parser engine html5ever,
    ///   which is also used in the Servo browser engine.
    ///   The default is `html5gum`, which is more performant and well maintained.
    ///
    /// - `include_verbatim` determines whether links inside Markdown code blocks
    ///   are extracted; when disabled, such links are ignored. Code blocks can be
    ///   denoted as a block starting with three backticks or as an indented block.
    ///   For more information, consult the `pulldown_cmark` documentation about code blocks
    ///   [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html).
    ///
    /// - `include_wikilinks` determines whether wikilink-style links (`[[example]]`)
    ///   are extracted.
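    ///
    /// # Example
    ///
    /// A minimal configuration sketch (illustrative only; the `lychee_lib::extract`
    /// import path is an assumption about how the crate exposes this module):
    ///
    /// ```ignore
    /// use lychee_lib::extract::Extractor;
    ///
    /// // Default parser (html5gum), skip links in verbatim blocks, no wikilinks.
    /// let strict = Extractor::new(false, false, false);
    ///
    /// // html5ever parser; also extract links from code blocks and wikilinks.
    /// let permissive = Extractor::new(true, true, true);
    /// ```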
    #[must_use]
    pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
        Self {
            use_html5ever,
            include_verbatim,
            include_wikilinks,
        }
    }

    /// Main entrypoint for extracting links from various sources
    /// (Markdown, HTML, CSS, and plaintext)
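    ///
    /// # Example
    ///
    /// A usage sketch (illustrative only; it assumes `InputContent` and `FileType`
    /// are re-exported from the crate root):
    ///
    /// ```ignore
    /// use lychee_lib::extract::Extractor;
    /// use lychee_lib::{FileType, InputContent};
    ///
    /// let extractor = Extractor::new(false, false, false);
    /// let content = InputContent::from_string("[docs](https://example.com)", FileType::Markdown);
    /// let raw_uris = extractor.extract(&content);
    /// assert_eq!(raw_uris.len(), 1);
    /// ```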
    #[must_use]
    pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
        match input_content.file_type {
            FileType::Markdown => extract_markdown(
                &input_content.content,
                self.include_verbatim,
                self.include_wikilinks,
            ),
            FileType::Html => {
                if self.use_html5ever {
                    html::html5ever::extract_html(&input_content.content, self.include_verbatim)
                } else {
                    html::html5gum::extract_html(&input_content.content, self.include_verbatim)
                }
            }
            FileType::Css => extract_css(
                &input_content.content,
                &SourceSpanProvider::from_input(&input_content.content),
            ),
            FileType::Plaintext => extract_raw_uri_from_plaintext(
                &input_content.content,
                &SourceSpanProvider::from_input(&input_content.content),
            ),
        }
    }
}

#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;
    use reqwest::Url;
    use std::{collections::HashSet, path::Path};
    use test_utils::{fixtures_path, load_fixture, mail, website};

    use super::*;
    use crate::{
        Uri,
        types::{
            FileType, InputContent, ResolvedInputSource,
            uri::raw::{RawUriSpan, span},
        },
        utils::url::find_links,
    };

    fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
        let input_content = InputContent::from_string(input, file_type);

        let extractor = Extractor::new(false, false, false);
        let uris_html5gum: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        let uris_html5gum_sorted: Vec<Uri> = {
            let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        let extractor = Extractor::new(true, false, false);
        let uris_html5ever: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        let uris_html5ever_sorted: Vec<Uri> = {
            let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        assert_eq!(
            uris_html5gum_sorted, uris_html5ever_sorted,
            "Mismatch between html5gum and html5ever"
        );
        uris_html5gum
    }

    #[test]
    fn verbatim_elem() {
        let input = "<pre>https://example.com</pre>";
        let uris = extract_uris(input, FileType::Html);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_file_type() {
        assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
        assert_eq!(FileType::from("test.md"), FileType::Markdown);
        assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
        assert_eq!(FileType::from("test.html"), FileType::Html);
        assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
        assert_eq!(FileType::from("test.something"), FileType::Plaintext);
        assert_eq!(
            FileType::from("/absolute/path/to/test.something"),
            FileType::Plaintext
        );
    }

    #[test]
    fn test_skip_markdown_anchors() {
        let links = extract_uris("This is [a test](#lol).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_internal_urls() {
        let links = extract_uris("This is [a test](./internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_skip_markdown_email() {
        let input = "Get in touch - [Contact Us](mailto:test@test.com)";
        let links = extract_uris(input, FileType::Markdown);
        let expected = IntoIterator::into_iter([mail!("test@test.com")]).collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    #[test]
    fn relative_urls() {
        let links = extract_uris("This is [a test](/internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
        let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext);

        let expected = IntoIterator::into_iter([
            website!("https://endler.dev"),
            website!("https://hello-rust.show/foo/bar?lol=1"),
            mail!("test@example.com"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    #[test]
    fn test_md_escape() {
        let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
        let links: Vec<_> = find_links(input).collect();
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

        matches!(&links[..], [link] if link.as_str() == expected);
    }

    #[test]
    fn test_extract_html5_not_valid_xml() {
        let input = load_fixture!("TEST_HTML5.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/head/home"),
            website!("https://example.com/css/style_full_url.css"),
            // the body links wouldn't be present if the file was parsed strictly as XML
            website!("https://example.com/body/a"),
            website!("https://example.com/body/div_empty_a"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_relative_url() {
        let source = ResolvedInputSource::RemoteUrl(Box::new(
            Url::parse("https://example.com/some-post").unwrap(),
        ));

        let contents = r#"<html>
            <div class="row">
                <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
                <a href="/about">About</a>
            </div>
        </html>"#;

        let input_content = &InputContent {
            source,
            file_type: FileType::Html,
            content: contents.to_string(),
        };

        for use_html5ever in [true, false] {
            let extractor = Extractor::new(use_html5ever, false, false);
            let links = extractor.extract(input_content);

            let urls = links
                .into_iter()
                .map(|raw_uri| raw_uri.text)
                .collect::<HashSet<_>>();

            let expected_urls = IntoIterator::into_iter([
                String::from("https://github.com/lycheeverse/lychee/"),
                String::from("/about"),
            ])
            .collect::<HashSet<_>>();

            assert_eq!(urls, expected_urls);
        }
    }

    #[test]
    fn test_extract_html5_lowercase_doctype() {
        // this has been problematic with the previous XML-based parser
        let input = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/body/a")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_minified() {
        // minified HTML with some quirky elements such as href attribute values specified without quotes
        let input = load_fixture!("TEST_HTML5_MINIFIED.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/"),
            website!("https://example.com/favicon.ico"),
            // Note that we exclude `preconnect` links:
            // website!("https://fonts.externalsite.com"),
            website!("https://example.com/docs/"),
            website!("https://example.com/forum"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_malformed() {
        // malformed links shouldn't stop the parser from further parsing
        let input = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/valid")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_html5_custom_elements() {
        // the element name shouldn't matter for attributes like href, src, cite, etc.
        let input = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/some-weird-element"),
            website!("https://example.com/even-weirder-src"),
            website!("https://example.com/even-weirder-href"),
            website!("https://example.com/citations"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_urls_with_at_sign_properly() {
        // note that these used to parse as emails
        let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
        let links = extract_uris(&input, FileType::Plaintext);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/@test/test"),
            website!("http://otherdomain.com/test/@test"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_link_at_end_of_line() {
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let links = extract_uris(input, FileType::Plaintext);

        let expected_links =
            IntoIterator::into_iter([website!("https://www.apache.org/licenses/LICENSE-2.0")])
                .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    #[test]
    fn test_extract_css_from_style_tag() {
        // Test case from issue #1485
        let input = r#"<html>
   <head>
      <style>
         div {
             background-image: url("./lychee.png");
         }
      </style>
   </head>
</html>"#;
        let input_content = InputContent::from_string(input, FileType::Html);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);
        assert_eq!(raw_uris, vec![css_url("./lychee.png", span(5, 32))]);
    }

    #[test]
    fn test_extract_css_from_css_file() {
        let input = r#"
.example {
    background-image: url("./image.png");
    background: url('/absolute/path.jpg');
}
@import url(https://example.com/style.css);
"#;
        let input_content = InputContent::from_string(input, FileType::Css);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);
        assert_eq!(
            raw_uris,
            vec![
                css_url("./image.png", span(3, 23)),
                css_url("/absolute/path.jpg", span(4, 17)),
                css_url("https://example.com/style.css", span(6, 9)),
            ]
        );
    }

    #[test]
    fn test_extract_multiple_css_urls_from_style_tag() {
        let input = r#"<html>
   <head>
      <style>
         .background {
             background-image: url("./bg.png");
         }
         @font-face {
             src: url(../fonts/font.woff2);
         }
      </style>
   </head>
</html>"#;
        let input_content = InputContent::from_string(input, FileType::Html);
        let extractor = Extractor::new(false, false, false);
        let raw_uris = extractor.extract(&input_content);

        assert_eq!(
            raw_uris,
            vec![
                css_url("./bg.png", span(5, 32)),
                css_url("../fonts/font.woff2", span(8, 19)),
            ]
        );
    }

    fn css_url(text: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: text.into(),
            element: Some("style".into()),
            attribute: Some("url".into()),
            span,
        }
    }
}