1use crate::types::{
2 FileType, InputContent,
3 uri::raw::{RawUri, SourceSpanProvider},
4};
5
6pub mod css;
7pub mod html;
8pub mod markdown;
9mod plaintext;
10
11use css::extract_css;
12use markdown::extract_markdown;
13use plaintext::extract_raw_uri_from_plaintext;
14
/// Configuration for extracting raw URIs from a piece of input content.
///
/// The flags are stored as given and forwarded to the format-specific
/// extractors by [`Extractor::extract`]. All flags default to `false`.
#[derive(Clone, Copy, Debug, Default)]
pub struct Extractor {
    /// When `true`, HTML input is handled by the `html5ever` extractor;
    /// otherwise the `html5gum` extractor is used.
    use_html5ever: bool,
    /// Forwarded to the Markdown and HTML extractors.
    /// Presumably enables links inside verbatim/code sections.
    include_verbatim: bool,
    /// Forwarded to the Markdown extractor only.
    include_wikilinks: bool,
}
24
25impl Extractor {
26 #[must_use]
39 pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
40 Self {
41 use_html5ever,
42 include_verbatim,
43 include_wikilinks,
44 }
45 }
46
47 #[must_use]
50 pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
51 match input_content.file_type {
52 FileType::Markdown => extract_markdown(
53 &input_content.content,
54 self.include_verbatim,
55 self.include_wikilinks,
56 ),
57 FileType::Html => {
58 if self.use_html5ever {
59 html::html5ever::extract_html(&input_content.content, self.include_verbatim)
60 } else {
61 html::html5gum::extract_html(&input_content.content, self.include_verbatim)
62 }
63 }
64 FileType::Css => extract_css(
65 &input_content.content,
66 &SourceSpanProvider::from_input(&input_content.content),
67 ),
68 FileType::Plaintext => extract_raw_uri_from_plaintext(
69 &input_content.content,
70 &SourceSpanProvider::from_input(&input_content.content),
71 ),
72 }
73 }
74}
75
76#[cfg(test)]
77mod tests {
78 use pretty_assertions::assert_eq;
79 use reqwest::Url;
80 use std::{collections::HashSet, path::Path};
81 use test_utils::{fixtures_path, load_fixture, mail, website};
82
83 use super::*;
84 use crate::{
85 Uri,
86 types::{
87 FileType, InputContent, ResolvedInputSource,
88 uri::raw::{RawUriSpan, span},
89 },
90 utils::url::find_links,
91 };
92
93 fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
94 let input_content = InputContent::from_string(input, file_type);
95
96 let extractor = Extractor::new(false, false, false);
97 let uris_html5gum: HashSet<Uri> = extractor
98 .extract(&input_content)
99 .into_iter()
100 .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
101 .collect();
102 let uris_html5gum_sorted: Vec<Uri> = {
103 let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
104 uris.sort();
105 uris
106 };
107
108 let extractor = Extractor::new(true, false, false);
109 let uris_html5ever: HashSet<Uri> = extractor
110 .extract(&input_content)
111 .into_iter()
112 .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
113 .collect();
114 let uris_html5ever_sorted: Vec<Uri> = {
115 let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
116 uris.sort();
117 uris
118 };
119
120 assert_eq!(
121 uris_html5gum_sorted, uris_html5ever_sorted,
122 "Mismatch between html5gum and html5ever"
123 );
124 uris_html5gum
125 }
126
/// A URL inside `<pre>` is not reported, because `extract_uris` builds its
/// extractors with `include_verbatim` set to `false`.
#[test]
fn verbatim_elem() {
    let found = extract_uris("<pre>https://example.com</pre>", FileType::Html);
    assert!(found.is_empty());
}
133
/// Checks the file-type detection performed by `FileType::from`.
#[test]
fn test_file_type() {
    // The root path is classified as plaintext.
    assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);

    let cases = [
        ("test.md", FileType::Markdown),
        ("test.markdown", FileType::Markdown),
        ("test.html", FileType::Html),
        ("test.txt", FileType::Plaintext),
        ("test.something", FileType::Plaintext),
        ("/absolute/path/to/test.something", FileType::Plaintext),
    ];
    for (path, expected) in cases {
        assert_eq!(
            FileType::from(path),
            expected,
            "unexpected file type for {path}"
        );
    }
}
147
/// A Markdown link that is only a fragment (`#lol`) yields no URIs.
#[test]
fn test_skip_markdown_anchors() {
    let markdown = "This is [a test](#lol).";
    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
154
/// A Markdown link to a `./`-relative target yields no URIs.
#[test]
fn test_skip_markdown_internal_urls() {
    let markdown = "This is [a test](./internal).";
    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
161
/// A `mailto:` link in Markdown is reported as a mail URI.
///
/// Despite the "skip" in the test name, the assertion expects the address
/// to be extracted.
#[test]
fn test_skip_markdown_email() {
    let markdown = "Get in touch - [Contact Us](mailto:test@test.com)";
    let expected: HashSet<Uri> = HashSet::from([mail!("test@test.com")]);

    assert_eq!(extract_uris(markdown, FileType::Markdown), expected);
}
170
/// A Markdown link to a root-relative path (`/internal`) yields no URIs.
#[test]
fn relative_urls() {
    let markdown = "This is [a test](/internal).";
    assert!(extract_uris(markdown, FileType::Markdown).is_empty());
}
177
/// Plaintext input yields both the bare URLs and the bare mail address.
#[test]
fn test_non_markdown_links() {
    let text = "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";

    let expected: HashSet<Uri> = HashSet::from([
        website!("https://endler.dev"),
        website!("https://hello-rust.show/foo/bar?lol=1"),
        mail!("test@example.com"),
    ]);

    assert_eq!(extract_uris(text, FileType::Plaintext), expected);
}
193
// Runs `find_links` on a URL whose parentheses are backslash-escaped.
#[test]
fn test_md_escape() {
    let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
    let links: Vec<_> = find_links(input).collect();
    // NOTE(review): this literal ends in an extra `)` that is not present in
    // `input` - confirm the intended expectation.
    let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

    // NOTE(review): the boolean produced by `matches!` is discarded, so this
    // test currently only checks that `find_links` does not panic. Wrap it in
    // `assert!` once the correct `expected` value has been confirmed.
    matches!(&links[..], [link] if link.as_str() == expected);
}
202
/// Links are extracted from the `TEST_HTML5.html` fixture.
#[test]
fn test_extract_html5_not_valid_xml() {
    let html = load_fixture!("TEST_HTML5.html");

    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/head/home"),
        website!("https://example.com/css/style_full_url.css"),
        website!("https://example.com/body/a"),
        website!("https://example.com/body/div_empty_a"),
    ]);

    assert_eq!(extract_uris(&html, FileType::Html), expected);
}
219
/// Both HTML backends return the absolute and the relative `href` as raw,
/// unresolved text.
#[test]
fn test_extract_relative_url() {
    let html = r#"<html>
        <div class="row">
            <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
            <a href="/about">About</a>
        </div>
    </html>"#;

    let remote = Url::parse("https://example.com/some-post").unwrap();
    let input_content = InputContent {
        source: ResolvedInputSource::RemoteUrl(Box::new(remote)),
        file_type: FileType::Html,
        content: String::from(html),
    };

    let expected_urls: HashSet<String> = HashSet::from([
        String::from("https://github.com/lycheeverse/lychee/"),
        String::from("/about"),
    ]);

    for use_html5ever in [true, false] {
        let urls: HashSet<String> = Extractor::new(use_html5ever, false, false)
            .extract(&input_content)
            .into_iter()
            .map(|raw_uri| raw_uri.text)
            .collect();

        assert_eq!(urls, expected_urls);
    }
}
257
/// The body link is found in the `TEST_HTML5_LOWERCASE_DOCTYPE.html`
/// fixture.
#[test]
fn test_extract_html5_lowercase_doctype() {
    let html = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");

    let expected: HashSet<Uri> = HashSet::from([website!("https://example.com/body/a")]);

    assert_eq!(extract_uris(&html, FileType::Html), expected);
}
269
/// Links are extracted from the `TEST_HTML5_MINIFIED.html` fixture.
#[test]
fn test_extract_html5_minified() {
    let html = load_fixture!("TEST_HTML5_MINIFIED.html");

    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/"),
        website!("https://example.com/favicon.ico"),
        website!("https://example.com/docs/"),
        website!("https://example.com/forum"),
    ]);

    assert_eq!(extract_uris(&html, FileType::Html), expected);
}
288
/// Only the valid link is reported for the
/// `TEST_HTML5_MALFORMED_LINKS.html` fixture.
#[test]
fn test_extract_html5_malformed() {
    let html = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");

    let expected: HashSet<Uri> = HashSet::from([website!("https://example.com/valid")]);

    assert_eq!(extract_uris(&html, FileType::Html), expected);
}
300
/// Links are extracted from the `TEST_HTML5_CUSTOM_ELEMENTS.html` fixture.
#[test]
fn test_extract_html5_custom_elements() {
    let html = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");

    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/some-weird-element"),
        website!("https://example.com/even-weirder-src"),
        website!("https://example.com/even-weirder-href"),
        website!("https://example.com/citations"),
    ]);

    assert_eq!(extract_uris(&html, FileType::Html), expected);
}
317
/// URLs whose path contains `@` are reported as websites in full.
#[test]
fn test_extract_urls_with_at_sign_properly() {
    let text = "https://example.com/@test/test http://otherdomain.com/test/@test";

    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/@test/test"),
        website!("http://otherdomain.com/test/@test"),
    ]);

    assert_eq!(extract_uris(text, FileType::Plaintext), expected);
}
332
/// A trailing newline after a URL does not become part of the link.
#[test]
fn test_extract_link_at_end_of_line() {
    let text = "https://www.apache.org/licenses/LICENSE-2.0\n";

    let expected: HashSet<Uri> =
        HashSet::from([website!("https://www.apache.org/licenses/LICENSE-2.0")]);

    assert_eq!(extract_uris(text, FileType::Plaintext), expected);
}
344
// A `url(...)` inside an HTML `<style>` element is reported as a raw URI
// with element `style` and attribute `url` (see `css_url`).
#[test]
fn test_extract_css_from_style_tag() {
    // NOTE(review): whitespace inside this raw string is significant. The
    // expected `span(5, 32)` is presumably a 1-based (line, column) position
    // of `url(`, so re-indenting the markup changes the expected value. The
    // indentation here was reconstructed to match that span - confirm it
    // against the committed file.
    let input = r#"<html>
 <head>
     <style>
         div {
             background-image: url("./lychee.png");
         }
     </style>
 </head>
</html>"#;
    let input_content = InputContent::from_string(input, FileType::Html);
    // html5gum backend, verbatim and wikilinks disabled.
    let extractor = Extractor::new(false, false, false);
    let raw_uris = extractor.extract(&input_content);
    assert_eq!(raw_uris, vec![css_url("./lychee.png", span(5, 32))]);
}
362
// Double-quoted, single-quoted and unquoted `url(...)` values are all
// extracted from CSS input, in source order.
#[test]
fn test_extract_css_from_css_file() {
    // NOTE(review): whitespace inside this raw string is significant. The
    // expected spans are presumably 1-based (line, column) positions of each
    // `url(`; the leading newline makes `.example {` line 2. The indentation
    // here was reconstructed to match those spans - confirm it against the
    // committed file.
    let input = r#"
.example {
    background-image: url("./image.png");
    background: url('/absolute/path.jpg');
}
@import url(https://example.com/style.css);
"#;
    let input_content = InputContent::from_string(input, FileType::Css);
    let extractor = Extractor::new(false, false, false);
    let raw_uris = extractor.extract(&input_content);
    assert_eq!(
        raw_uris,
        vec![
            css_url("./image.png", span(3, 23)),
            css_url("/absolute/path.jpg", span(4, 17)),
            css_url("https://example.com/style.css", span(6, 9)),
        ]
    );
}
384
// Every `url(...)` inside a single HTML `<style>` element is reported, in
// source order.
#[test]
fn test_extract_multiple_css_urls_from_style_tag() {
    // NOTE(review): whitespace inside this raw string is significant. The
    // expected spans are presumably 1-based (line, column) positions of each
    // `url(`, so re-indenting the markup changes the expected values. The
    // indentation here was reconstructed to match those spans - confirm it
    // against the committed file.
    let input = r#"<html>
 <head>
     <style>
         .background {
             background-image: url("./bg.png");
         }
         @font-face {
             src: url(../fonts/font.woff2);
         }
     </style>
 </head>
</html>"#;
    let input_content = InputContent::from_string(input, FileType::Html);
    // html5gum backend, verbatim and wikilinks disabled.
    let extractor = Extractor::new(false, false, false);
    let raw_uris = extractor.extract(&input_content);

    assert_eq!(
        raw_uris,
        vec![
            css_url("./bg.png", span(5, 32)),
            css_url("../fonts/font.woff2", span(8, 19)),
        ]
    );
}
411
412 fn css_url(text: &str, span: RawUriSpan) -> RawUri {
413 RawUri {
414 text: text.into(),
415 element: Some("style".into()),
416 attribute: Some("url".into()),
417 span,
418 }
419 }
420}