1use crate::types::{
2 FileType, InputContent,
3 uri::raw::{RawUri, SourceSpanProvider},
4};
5
6pub mod html;
7pub mod markdown;
8mod plaintext;
9
10use markdown::extract_markdown;
11use plaintext::extract_raw_uri_from_plaintext;
12
#[derive(Default, Debug, Clone, Copy)]
/// Configuration for pulling raw URIs out of a piece of input content.
///
/// The three flags are set once through [`Extractor::new`] and are read by
/// [`Extractor::extract`] when it dispatches on the input's file type.
/// The derived `Default` leaves every flag `false`.
pub struct Extractor {
    /// Selects the HTML backend: `true` sends HTML input to
    /// `html::html5ever::extract_html`, `false` to `html::html5gum::extract_html`.
    /// Not consulted for Markdown or plaintext input.
    use_html5ever: bool,
    /// Forwarded unchanged to the Markdown extractor and to both HTML extractors.
    /// NOTE(review): presumably controls whether links inside verbatim/code
    /// sections (e.g. `<pre>`) are reported — confirm against the extractors.
    include_verbatim: bool,
    /// Forwarded unchanged to the Markdown extractor only.
    /// NOTE(review): presumably toggles wikilink extraction — confirm against
    /// `markdown::extract_markdown`.
    include_wikilinks: bool,
}
22
23impl Extractor {
24 #[must_use]
37 pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
38 Self {
39 use_html5ever,
40 include_verbatim,
41 include_wikilinks,
42 }
43 }
44
45 #[must_use]
48 pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
49 match input_content.file_type {
50 FileType::Markdown => extract_markdown(
51 &input_content.content,
52 self.include_verbatim,
53 self.include_wikilinks,
54 ),
55 FileType::Html => {
56 if self.use_html5ever {
57 html::html5ever::extract_html(&input_content.content, self.include_verbatim)
58 } else {
59 html::html5gum::extract_html(&input_content.content, self.include_verbatim)
60 }
61 }
62 FileType::Plaintext => extract_raw_uri_from_plaintext(
63 &input_content.content,
64 &SourceSpanProvider::from_input(&input_content.content),
65 ),
66 }
67 }
68}
69
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;
    use reqwest::Url;
    use std::{collections::HashSet, path::Path};
    use test_utils::{fixtures_path, load_fixture, mail, website};

    use super::*;
    use crate::{
        Uri,
        types::{FileType, InputContent, ResolvedInputSource},
        utils::url::find_links,
    };

    /// Test helper: extracts URIs from `input` twice, once per HTML backend,
    /// asserts that both runs agree, and returns the html5gum result.
    ///
    /// Both extractors are built with `include_verbatim = false` and
    /// `include_wikilinks = false`. Raw URIs that cannot be converted into a
    /// [`Uri`] are dropped rather than reported. For non-HTML file types
    /// `Extractor::extract` never reads `use_html5ever`, so both runs take the
    /// same code path.
    fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
        let input_content = InputContent::from_string(input, file_type);

        // Pass 1: html5gum backend (`use_html5ever = false`).
        let extractor = Extractor::new(false, false, false);
        let uris_html5gum: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            // Conversion failures are discarded via `.ok()`.
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        // Sorted copy so the comparison below is order-stable; the set itself
        // is cloned because it is also the return value.
        let uris_html5gum_sorted: Vec<Uri> = {
            let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        // Pass 2: html5ever backend (`use_html5ever = true`).
        let extractor = Extractor::new(true, false, false);
        let uris_html5ever: HashSet<Uri> = extractor
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect();
        let uris_html5ever_sorted: Vec<Uri> = {
            let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
            uris.sort();
            uris
        };

        assert_eq!(
            uris_html5gum_sorted, uris_html5ever_sorted,
            "Mismatch between html5gum and html5ever"
        );
        uris_html5gum
    }

    /// With `include_verbatim = false` (as set by `extract_uris`), a URL that
    /// only appears inside `<pre>` yields no URIs from either HTML backend.
    #[test]
    fn verbatim_elem() {
        let input = "<pre>https://example.com</pre>";
        let uris = extract_uris(input, FileType::Html);
        assert!(uris.is_empty());
    }

    /// Pins the `FileType::from` mapping: `.md`/`.markdown` are Markdown,
    /// `.html` is Html, and everything else checked here (including `/` and an
    /// unknown extension) is Plaintext.
    #[test]
    fn test_file_type() {
        assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);
        assert_eq!(FileType::from("test.md"), FileType::Markdown);
        assert_eq!(FileType::from("test.markdown"), FileType::Markdown);
        assert_eq!(FileType::from("test.html"), FileType::Html);
        assert_eq!(FileType::from("test.txt"), FileType::Plaintext);
        assert_eq!(FileType::from("test.something"), FileType::Plaintext);
        assert_eq!(
            FileType::from("/absolute/path/to/test.something"),
            FileType::Plaintext
        );
    }

    /// A Markdown link whose target is only a fragment (`#lol`) produces no
    /// URI from `extract_uris`.
    #[test]
    fn test_skip_markdown_anchors() {
        let links = extract_uris("This is [a test](#lol).", FileType::Markdown);

        assert!(links.is_empty());
    }

    /// A Markdown link with a `./`-relative target produces no URI from
    /// `extract_uris`.
    #[test]
    fn test_skip_markdown_internal_urls() {
        let links = extract_uris("This is [a test](./internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    /// A `mailto:` Markdown link is returned as a mail URI.
    ///
    /// NOTE(review): the name says "skip", but the assertion requires the
    /// address to be extracted — consider renaming.
    #[test]
    fn test_skip_markdown_email() {
        let input = "Get in touch - [Contact Us](mailto:test@test.com)";
        let links = extract_uris(input, FileType::Markdown);
        let expected = IntoIterator::into_iter([mail!("test@test.com")]).collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    /// A Markdown link with a root-relative target (`/internal`) produces no
    /// URI from `extract_uris`.
    #[test]
    fn relative_urls() {
        let links = extract_uris("This is [a test](/internal).", FileType::Markdown);

        assert!(links.is_empty());
    }

    /// Plaintext input: both bare URLs and the bare e-mail address are found.
    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
        let links: HashSet<Uri> = extract_uris(input, FileType::Plaintext);

        let expected = IntoIterator::into_iter([
            website!("https://endler.dev"),
            website!("https://hello-rust.show/foo/bar?lol=1"),
            mail!("test@example.com"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected);
    }

    /// Runs `find_links` over a URL containing backslash-escaped parentheses.
    ///
    /// NOTE(review): the `matches!` result below is discarded, so this test
    /// currently asserts nothing and cannot fail on a wrong link. It was
    /// presumably meant to be `assert!(matches!(..))`. Also, `expected` ends
    /// with an extra `)` and contains no backslashes, so it may not equal what
    /// `find_links` actually yields — confirm the intended value before
    /// turning this into a real assertion.
    #[test]
    fn test_md_escape() {
        let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
        let links: Vec<_> = find_links(input).collect();
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";

        matches!(&links[..], [link] if link.as_str() == expected);
    }

    /// Fixture `TEST_HTML5.html`: both backends must yield exactly the four
    /// listed URLs.
    #[test]
    fn test_extract_html5_not_valid_xml() {
        let input = load_fixture!("TEST_HTML5.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/head/home"),
            website!("https://example.com/css/style_full_url.css"),
            website!("https://example.com/body/a"),
            website!("https://example.com/body/div_empty_a"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// The extractor returns link text as written: the root-relative `/about`
    /// is reported verbatim and is not resolved against the remote source
    /// URL. Checked for both HTML backends.
    #[test]
    fn test_extract_relative_url() {
        let source = ResolvedInputSource::RemoteUrl(Box::new(
            Url::parse("https://example.com/some-post").unwrap(),
        ));

        let contents = r#"<html>
 <div class="row">
 <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
 <a href="/about">About</a>
 </div>
 </html>"#;

        let input_content = &InputContent {
            source,
            file_type: FileType::Html,
            content: contents.to_string(),
        };

        for use_html5ever in [true, false] {
            let extractor = Extractor::new(use_html5ever, false, false);
            let links = extractor.extract(input_content);

            // Compare on the raw text only; no `Uri` conversion is involved here.
            let urls = links
                .into_iter()
                .map(|raw_uri| raw_uri.text)
                .collect::<HashSet<_>>();

            let expected_urls = IntoIterator::into_iter([
                String::from("https://github.com/lycheeverse/lychee/"),
                String::from("/about"),
            ])
            .collect::<HashSet<_>>();

            assert_eq!(urls, expected_urls);
        }
    }

    /// Fixture `TEST_HTML5_LOWERCASE_DOCTYPE.html`: exactly one URL is expected.
    #[test]
    fn test_extract_html5_lowercase_doctype() {
        let input = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/body/a")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// Fixture `TEST_HTML5_MINIFIED.html`: exactly the four listed URLs are
    /// expected.
    #[test]
    fn test_extract_html5_minified() {
        let input = load_fixture!("TEST_HTML5_MINIFIED.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/"),
            website!("https://example.com/favicon.ico"),
            website!("https://example.com/docs/"),
            website!("https://example.com/forum"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// Fixture `TEST_HTML5_MALFORMED_LINKS.html`: only the single valid URL
    /// survives extraction and `Uri` conversion.
    #[test]
    fn test_extract_html5_malformed() {
        let input = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([website!("https://example.com/valid")])
            .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// Fixture `TEST_HTML5_CUSTOM_ELEMENTS.html`: exactly the four listed URLs
    /// are expected.
    #[test]
    fn test_extract_html5_custom_elements() {
        let input = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
        let links = extract_uris(&input, FileType::Html);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/some-weird-element"),
            website!("https://example.com/even-weirder-src"),
            website!("https://example.com/even-weirder-href"),
            website!("https://example.com/citations"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// URLs with an `@` in the path are reported as websites; the expected
    /// set contains no mail URIs.
    #[test]
    fn test_extract_urls_with_at_sign_properly() {
        let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
        let links = extract_uris(&input, FileType::Plaintext);

        let expected_links = IntoIterator::into_iter([
            website!("https://example.com/@test/test"),
            website!("http://otherdomain.com/test/@test"),
        ])
        .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }

    /// A URL directly followed by a newline is extracted without the trailing
    /// line break.
    #[test]
    fn test_extract_link_at_end_of_line() {
        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
        let links = extract_uris(input, FileType::Plaintext);

        let expected_links =
            IntoIterator::into_iter([website!("https://www.apache.org/licenses/LICENSE-2.0")])
                .collect::<HashSet<Uri>>();

        assert_eq!(links, expected_links);
    }
}