1use crate::types::{
2 FileType, InputContent,
3 uri::raw::{RawUri, SourceSpanProvider},
4};
5
6pub mod css;
7pub mod html;
8pub mod markdown;
9mod plaintext;
10pub mod xml;
11
12use css::extract_css;
13use markdown::extract_markdown;
14use plaintext::extract_raw_uri_from_plaintext;
15use xml::extract_xml;
16
/// Option flags that steer how links are pulled out of an input.
///
/// The derived `Default` leaves every flag switched off.
#[derive(Clone, Copy, Debug, Default)]
pub struct Extractor {
    /// Selects the `html5ever`-based HTML extractor when `true`; otherwise the
    /// `html5gum`-based one is used.
    use_html5ever: bool,
    /// Handed to the Markdown and HTML extractors.
    /// Presumably enables extraction from verbatim/code sections — confirm in those modules.
    include_verbatim: bool,
    /// Handed to the Markdown extractor.
    /// Presumably enables wikilink extraction — confirm in the `markdown` module.
    include_wikilinks: bool,
}
26
27impl Extractor {
28 #[must_use]
41 pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
42 Self {
43 use_html5ever,
44 include_verbatim,
45 include_wikilinks,
46 }
47 }
48
49 #[must_use]
52 pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
53 let content = &input_content.content;
54 match input_content.file_type {
55 FileType::Markdown => {
56 extract_markdown(content, self.include_verbatim, self.include_wikilinks)
57 }
58 FileType::Html => {
59 if self.use_html5ever {
60 html::html5ever::extract_html(content, self.include_verbatim)
61 } else {
62 html::html5gum::extract_html(content, self.include_verbatim)
63 }
64 }
65 FileType::Css => extract_css(content, &SourceSpanProvider::from_input(content)),
66 FileType::Plaintext => {
67 extract_raw_uri_from_plaintext(content, &SourceSpanProvider::from_input(content))
68 }
69 FileType::Xml => extract_xml(content, &SourceSpanProvider::from_input(content)),
70 }
71 }
72}
73
74#[cfg(test)]
75mod tests {
76 use pretty_assertions::assert_eq;
77 use reqwest::Url;
78 use std::{collections::HashSet, path::Path};
79 use test_utils::{fixtures_path, load_fixture, mail, website};
80
81 use super::*;
82 use crate::{
83 Uri,
84 types::{
85 FileType, InputContent, ResolvedInputSource,
86 uri::raw::{RawUriSpan, span},
87 },
88 utils::url::find_links,
89 };
90
91 fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
92 let input_content = InputContent::from_string(input, file_type);
93
94 let extractor = Extractor::new(false, false, false);
95 let uris_html5gum: HashSet<Uri> = extractor
96 .extract(&input_content)
97 .into_iter()
98 .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
99 .collect();
100 let uris_html5gum_sorted: Vec<Uri> = {
101 let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
102 uris.sort();
103 uris
104 };
105
106 let extractor = Extractor::new(true, false, false);
107 let uris_html5ever: HashSet<Uri> = extractor
108 .extract(&input_content)
109 .into_iter()
110 .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
111 .collect();
112 let uris_html5ever_sorted: Vec<Uri> = {
113 let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
114 uris.sort();
115 uris
116 };
117
118 assert_eq!(
119 uris_html5gum_sorted, uris_html5ever_sorted,
120 "Mismatch between html5gum and html5ever"
121 );
122 uris_html5gum
123 }
124
125 #[test]
126 fn verbatim_elem() {
127 let input = "<pre>https://example.com</pre>";
128 let uris = extract_uris(input, FileType::Html);
129 assert!(uris.is_empty());
130 }
131
132 #[test]
133 fn test_file_type() {
134 assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
135 assert_eq!(FileType::from("test.md"), FileType::Markdown);
136 assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
137 assert_eq!(FileType::from("test.html"), FileType::Html);
138 assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
139 assert_eq!(FileType::from("test.something"), FileType::Plaintext);
140 assert_eq!(
141 FileType::from("/absolute/path/to/test.something"),
142 FileType::Plaintext
143 );
144 }
145
146 #[test]
147 fn test_skip_markdown_anchors() {
148 let links = extract_uris("This is [a test](#lol).", FileType::Markdown);
149
150 assert!(links.is_empty());
151 }
152
153 #[test]
154 fn test_skip_markdown_internal_urls() {
155 let links = extract_uris("This is [a test](./internal).", FileType::Markdown);
156
157 assert!(links.is_empty());
158 }
159
160 #[test]
161 fn test_skip_markdown_email() {
162 let input = "Get in touch - [Contact Us](mailto:test@test.com)";
163 let links = extract_uris(input, FileType::Markdown);
164 let expected = IntoIterator::into_iter([mail!("test@test.com")]).collect::<HashSet<Uri>>();
165
166 assert_eq!(links, expected);
167 }
168
169 #[test]
170 fn relative_urls() {
171 let links = extract_uris("This is [a test](/internal).", FileType::Markdown);
172
173 assert!(links.is_empty());
174 }
175
176 #[test]
177 fn test_non_markdown_links() {
178 let input =
179 "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
180 let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext);
181
182 let expected = IntoIterator::into_iter([
183 website!("https://endler.dev"),
184 website!("https://hello-rust.show/foo/bar?lol=1"),
185 mail!("test@example.com"),
186 ])
187 .collect::<HashSet<Uri>>();
188
189 assert_eq!(links, expected);
190 }
191
192 #[test]
193 fn test_md_escape() {
194 let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
195 let links: Vec<_> = find_links(input).collect();
196 let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
197
198 matches!(&links[..], [link] if link.as_str() == expected);
199 }
200
201 #[test]
202 fn test_extract_html5_not_valid_xml() {
203 let input = load_fixture!("TEST_HTML5.html");
204 let links = extract_uris(&input, FileType::Html);
205
206 let expected_links = IntoIterator::into_iter([
207 website!("https://example.com/head/home"),
208 website!("https://example.com/css/style_full_url.css"),
209 website!("https://example.com/body/a"),
211 website!("https://example.com/body/div_empty_a"),
212 ])
213 .collect::<HashSet<Uri>>();
214
215 assert_eq!(links, expected_links);
216 }
217
218 #[test]
219 fn test_extract_relative_url() {
220 let source = ResolvedInputSource::RemoteUrl(Box::new(
221 Url::parse("https://example.com/some-post").unwrap(),
222 ));
223
224 let contents = r#"<html>
225 <div class="row">
226 <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
227 <a href="/about">About</a>
228 </div>
229 </html>"#;
230
231 let input_content = &InputContent {
232 source,
233 file_type: FileType::Html,
234 content: contents.to_string(),
235 };
236
237 for use_html5ever in [true, false] {
238 let extractor = Extractor::new(use_html5ever, false, false);
239 let links = extractor.extract(input_content);
240
241 let urls = links
242 .into_iter()
243 .map(|raw_uri| raw_uri.text)
244 .collect::<HashSet<_>>();
245
246 let expected_urls = IntoIterator::into_iter([
247 String::from("https://github.com/lycheeverse/lychee/"),
248 String::from("/about"),
249 ])
250 .collect::<HashSet<_>>();
251
252 assert_eq!(urls, expected_urls);
253 }
254 }
255
256 #[test]
257 fn test_extract_html5_lowercase_doctype() {
258 let input = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
260 let links = extract_uris(&input, FileType::Html);
261
262 let expected_links = IntoIterator::into_iter([website!("https://example.com/body/a")])
263 .collect::<HashSet<Uri>>();
264
265 assert_eq!(links, expected_links);
266 }
267
268 #[test]
269 fn test_extract_html5_minified() {
270 let input = load_fixture!("TEST_HTML5_MINIFIED.html");
272 let links = extract_uris(&input, FileType::Html);
273
274 let expected_links = IntoIterator::into_iter([
275 website!("https://example.com/"),
276 website!("https://example.com/favicon.ico"),
277 website!("https://example.com/docs/"),
280 website!("https://example.com/forum"),
281 ])
282 .collect::<HashSet<Uri>>();
283
284 assert_eq!(links, expected_links);
285 }
286
287 #[test]
288 fn test_extract_html5_malformed() {
289 let input = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
291 let links = extract_uris(&input, FileType::Html);
292
293 let expected_links = IntoIterator::into_iter([website!("https://example.com/valid")])
294 .collect::<HashSet<Uri>>();
295
296 assert_eq!(links, expected_links);
297 }
298
299 #[test]
300 fn test_extract_html5_custom_elements() {
301 let input = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
303 let links = extract_uris(&input, FileType::Html);
304
305 let expected_links = IntoIterator::into_iter([
306 website!("https://example.com/some-weird-element"),
307 website!("https://example.com/even-weirder-src"),
308 website!("https://example.com/even-weirder-href"),
309 website!("https://example.com/citations"),
310 ])
311 .collect::<HashSet<Uri>>();
312
313 assert_eq!(links, expected_links);
314 }
315
316 #[test]
317 fn test_extract_urls_with_at_sign_properly() {
318 let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
320 let links = extract_uris(&input, FileType::Plaintext);
321
322 let expected_links = IntoIterator::into_iter([
323 website!("https://example.com/@test/test"),
324 website!("http://otherdomain.com/test/@test"),
325 ])
326 .collect::<HashSet<Uri>>();
327
328 assert_eq!(links, expected_links);
329 }
330
331 #[test]
332 fn test_extract_link_at_end_of_line() {
333 let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
334 let links = extract_uris(input, FileType::Plaintext);
335
336 let expected_links =
337 IntoIterator::into_iter([website!("https://www.apache.org/licenses/LICENSE-2.0")])
338 .collect::<HashSet<Uri>>();
339
340 assert_eq!(links, expected_links);
341 }
342
343 #[test]
344 fn test_extract_css_from_style_tag() {
345 let input = r#"<html>
347 <head>
348 <style>
349 div {
350 background-image: url("./lychee.png");
351 }
352 </style>
353 </head>
354</html>"#;
355 let input_content = InputContent::from_string(input, FileType::Html);
356 let extractor = Extractor::new(false, false, false);
357 let raw_uris = extractor.extract(&input_content);
358 assert_eq!(raw_uris, vec![css_url("./lychee.png", span(5, 32))]);
359 }
360
361 #[test]
362 fn test_extract_css_from_css_file() {
363 let input = r#"
364.example {
365 background-image: url("./image.png");
366 background: url('/absolute/path.jpg');
367}
368@import url(https://example.com/style.css);
369"#;
370 let input_content = InputContent::from_string(input, FileType::Css);
371 let extractor = Extractor::new(false, false, false);
372 let raw_uris = extractor.extract(&input_content);
373 assert_eq!(
374 raw_uris,
375 vec![
376 css_url("./image.png", span(3, 23)),
377 css_url("/absolute/path.jpg", span(4, 17)),
378 css_url("https://example.com/style.css", span(6, 9)),
379 ]
380 );
381 }
382
383 #[test]
384 fn test_extract_multiple_css_urls_from_style_tag() {
385 let input = r#"<html>
386 <head>
387 <style>
388 .background {
389 background-image: url("./bg.png");
390 }
391 @font-face {
392 src: url(../fonts/font.woff2);
393 }
394 </style>
395 </head>
396</html>"#;
397 let input_content = InputContent::from_string(input, FileType::Html);
398 let extractor = Extractor::new(false, false, false);
399 let raw_uris = extractor.extract(&input_content);
400
401 assert_eq!(
402 raw_uris,
403 vec![
404 css_url("./bg.png", span(5, 32)),
405 css_url("../fonts/font.woff2", span(8, 19)),
406 ]
407 );
408 }
409
410 fn css_url(text: &str, span: RawUriSpan) -> RawUri {
411 RawUri {
412 text: text.into(),
413 element: Some("style".into()),
414 attribute: Some("url".into()),
415 span,
416 }
417 }
418}