1use crate::Preprocessor;
2use crate::filter::PathExcludes;
3use crate::ratelimit::HostPool;
4use crate::types::resolver::UrlContentResolver;
5use crate::{
6 BaseInfo, Input, LycheeResult, Request, RequestError, basic_auth::BasicAuthExtractor,
7 extract::Extractor, types::FileExtensions, types::uri::raw::RawUri, utils::request,
8};
9use futures::TryStreamExt;
10use futures::{
11 StreamExt,
12 stream::{self, Stream},
13};
14use http::HeaderMap;
15use log::warn;
16use par_stream::ParStreamExt;
17use std::collections::HashSet;
18use std::path::{Path, PathBuf};
19use std::sync::Arc;
20
/// Configuration and entry point for turning a set of inputs into a stream of
/// link-check requests.
///
/// A collector is built with [`Collector::new`] (or `Default`), adjusted with
/// the consuming builder methods, and finally consumed by
/// `collect_links` / `collect_links_from_file_types`.
// The struct carries five independent `bool` switches, which is what the
// `struct_excessive_bools` lint objects to.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct Collector {
    /// Optional credentials extractor. It is cloned into the
    /// `UrlContentResolver` used to read inputs and also handed to
    /// `request::create` when requests are built.
    basic_auth_extractor: Option<BasicAuthExtractor>,
    /// Forwarded as-is to `Input::get_contents`; `false` unless changed.
    // NOTE(review): presumably "skip inputs that do not exist" — confirm there.
    skip_missing_inputs: bool,
    /// Forwarded as-is to `Input::get_contents`; `true` unless changed.
    skip_ignored: bool,
    /// Forwarded as-is to `Input::get_contents`; `true` unless changed.
    skip_hidden: bool,
    /// Second argument of `Extractor::new`; `false` unless changed.
    include_verbatim: bool,
    /// Third argument of `Extractor::new`; `false` unless changed.
    include_wikilinks: bool,
    /// First argument of `Extractor::new`; `false` unless changed.
    use_html5ever: bool,
    /// Optional root directory, normalised by `Collector::new` and passed to
    /// `request::create` for every piece of extracted content.
    root_dir: Option<PathBuf>,
    /// Base information passed to `request::create` for every piece of
    /// extracted content.
    base: BaseInfo,
    /// Path exclusions, forwarded to `Input::get_contents`.
    excluded_paths: PathExcludes,
    /// Headers placed into the `UrlContentResolver` used to read inputs.
    headers: HeaderMap,
    /// Shared host pool placed into the `UrlContentResolver` used to read
    /// inputs.
    host_pool: Arc<HostPool>,
    /// Optional preprocessor, forwarded to `Input::get_contents`.
    preprocessor: Option<Preprocessor>,
}
48
49impl Default for Collector {
50 fn default() -> Self {
57 Collector {
58 basic_auth_extractor: None,
59 skip_missing_inputs: false,
60 include_verbatim: false,
61 include_wikilinks: false,
62 use_html5ever: false,
63 skip_hidden: true,
64 skip_ignored: true,
65 root_dir: None,
66 base: BaseInfo::none(),
67 headers: HeaderMap::new(),
68 host_pool: Arc::new(HostPool::default()),
69 excluded_paths: PathExcludes::empty(),
70 preprocessor: None,
71 }
72 }
73}
74
75impl Collector {
76 pub fn new(root_dir: Option<PathBuf>, base: BaseInfo) -> LycheeResult<Self> {
83 let (root_dir, base) = match (root_dir, base) {
86 (Some(root_dir), BaseInfo::Full { origin, path })
87 if origin.scheme() == "file" && path.is_empty() =>
88 {
89 let root_dir = root_dir
90 .strip_prefix("/")
91 .map(Path::to_path_buf)
92 .unwrap_or(root_dir)
93 .join("");
94
95 match origin.to_file_path() {
96 Ok(base_path) => (Some(base_path.join(root_dir)), BaseInfo::full(origin, path)),
97 Err(()) => (Some(root_dir), BaseInfo::full(origin, path)),
98 }
99 }
100 (Some(root_dir), base) => {
101 let root_dir = std::path::absolute(&root_dir).unwrap_or(root_dir);
102
103 if !root_dir.exists() {
104 warn!("Root dir '{}' does not exist", root_dir.to_string_lossy());
105 } else if !root_dir.is_dir() {
106 warn!("Root dir '{}' not a directory", root_dir.to_string_lossy());
107 }
108 (Some(root_dir), base)
109 }
110 (None, base) => (None, base),
111 };
112 Ok(Collector {
113 basic_auth_extractor: None,
114 skip_missing_inputs: false,
115 include_verbatim: false,
116 include_wikilinks: false,
117 use_html5ever: false,
118 skip_hidden: true,
119 skip_ignored: true,
120 preprocessor: None,
121 headers: HeaderMap::new(),
122 host_pool: Arc::new(HostPool::default()),
123 excluded_paths: PathExcludes::empty(),
124 root_dir,
125 base,
126 })
127 }
128
    /// Sets the `skip_missing_inputs` switch and returns the updated
    /// collector.
    ///
    /// The switch is `false` on a freshly constructed collector and is
    /// forwarded unchanged to `Input::get_contents` during collection.
    // NOTE(review): presumably `true` means inputs that cannot be found are
    // skipped rather than reported — confirm in `Input::get_contents`.
    #[must_use]
    pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
        self.skip_missing_inputs = yes;
        self
    }
135
    /// Sets the `skip_hidden` switch and returns the updated collector.
    ///
    /// The switch is `true` on a freshly constructed collector and is
    /// forwarded unchanged to `Input::get_contents` during collection.
    // NOTE(review): presumably controls whether hidden files are skipped
    // while walking inputs — confirm in `Input::get_contents`.
    #[must_use]
    pub const fn skip_hidden(mut self, yes: bool) -> Self {
        self.skip_hidden = yes;
        self
    }
142
    /// Sets the `skip_ignored` switch and returns the updated collector.
    ///
    /// The switch is `true` on a freshly constructed collector and is
    /// forwarded unchanged to `Input::get_contents` during collection.
    // NOTE(review): presumably controls whether files matched by ignore rules
    // are skipped — confirm in `Input::get_contents`.
    #[must_use]
    pub const fn skip_ignored(mut self, yes: bool) -> Self {
        self.skip_ignored = yes;
        self
    }
149
150 #[must_use]
152 pub fn headers(mut self, headers: HeaderMap) -> Self {
153 self.headers = headers;
154 self
155 }
156
157 #[must_use]
171 pub fn host_pool(mut self, host_pool: Arc<HostPool>) -> Self {
172 self.host_pool = host_pool;
173 self
174 }
175
    /// Sets the `use_html5ever` switch and returns the updated collector.
    ///
    /// The switch is `false` on a freshly constructed collector and becomes
    /// the first argument of `Extractor::new` during collection.
    // NOTE(review): presumably selects the html5ever-based HTML parser —
    // confirm in `Extractor`.
    #[must_use]
    pub const fn use_html5ever(mut self, yes: bool) -> Self {
        self.use_html5ever = yes;
        self
    }
182
    /// Sets the `include_verbatim` switch and returns the updated collector.
    ///
    /// The switch is `false` on a freshly constructed collector and becomes
    /// the second argument of `Extractor::new` during collection.
    // NOTE(review): presumably controls whether links inside verbatim/code
    // sections are extracted — confirm in `Extractor`.
    #[must_use]
    pub const fn include_verbatim(mut self, yes: bool) -> Self {
        self.include_verbatim = yes;
        self
    }
189
    /// Sets the `include_wikilinks` switch and returns the updated collector.
    ///
    /// The switch is `false` on a freshly constructed collector and becomes
    /// the third argument of `Extractor::new` during collection.
    // NOTE(review): presumably controls whether wiki-style links are
    // extracted — confirm in `Extractor`. The `doc_markdown` allowance below
    // is kept as found; its original motivation is not visible here.
    #[allow(clippy::doc_markdown)]
    #[must_use]
    pub const fn include_wikilinks(mut self, yes: bool) -> Self {
        self.include_wikilinks = yes;
        self
    }
197
198 #[must_use]
200 pub fn preprocessor(mut self, preprocessor: Option<Preprocessor>) -> Self {
201 self.preprocessor = preprocessor;
202 self
203 }
204
205 #[must_use]
209 #[allow(clippy::missing_const_for_fn)]
210 pub fn basic_auth_extractor(mut self, extractor: BasicAuthExtractor) -> Self {
211 self.basic_auth_extractor = Some(extractor);
212 self
213 }
214
215 #[must_use]
217 pub fn excluded_paths(mut self, excluded_paths: PathExcludes) -> Self {
218 self.excluded_paths = excluded_paths;
219 self
220 }
221
222 pub fn collect_links(
225 self,
226 inputs: HashSet<Input>,
227 ) -> impl Stream<Item = Result<Request, RequestError>> {
228 self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions())
229 }
230
231 pub fn collect_links_from_file_types(
239 self,
240 inputs: HashSet<Input>,
241 extensions: FileExtensions,
242 ) -> impl Stream<Item = Result<Request, RequestError>> {
243 let skip_missing_inputs = self.skip_missing_inputs;
244 let skip_hidden = self.skip_hidden;
245 let skip_ignored = self.skip_ignored;
246 let global_base = self.base;
247 let excluded_paths = self.excluded_paths;
248
249 let resolver = UrlContentResolver {
250 basic_auth_extractor: self.basic_auth_extractor.clone(),
251 headers: self.headers.clone(),
252 host_pool: self.host_pool,
253 };
254
255 let extractor = Extractor::new(
256 self.use_html5ever,
257 self.include_verbatim,
258 self.include_wikilinks,
259 );
260
261 stream::iter(inputs)
262 .par_then_unordered(None, move |input| {
263 let extensions = extensions.clone();
264 let resolver = resolver.clone();
265 let excluded_paths = excluded_paths.clone();
266 let preprocessor = self.preprocessor.clone();
267
268 async move {
269 input.get_contents(
270 skip_missing_inputs,
271 skip_hidden,
272 skip_ignored,
273 extensions,
274 resolver,
275 excluded_paths,
276 preprocessor,
277 )
278 }
279 })
280 .flatten()
281 .par_then_unordered(None, move |content| {
282 let global_base = global_base.clone();
283 let root_dir = self.root_dir.clone();
284 let basic_auth_extractor = self.basic_auth_extractor.clone();
285 async move {
286 let content = content?;
287 let uris: Vec<RawUri> = extractor.extract(&content);
288 let requests = request::create(
289 uris,
290 &content.source,
291 root_dir.as_deref(),
292 &global_base,
293 basic_auth_extractor.as_ref(),
294 );
295 Result::Ok(stream::iter(requests))
296 }
297 })
298 .try_flatten()
299 }
300}
301
302#[cfg(test)]
303mod tests {
304 use std::borrow::Cow;
305 use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};
306 use test_utils::{fixtures_path, load_fixture, mail, mock_server, website};
307
308 use http::StatusCode;
309 use reqwest::Url;
310
311 use super::*;
312 use crate::{
313 LycheeResult, Uri,
314 filter::PathExcludes,
315 types::{FileType, Input, InputSource},
316 };
317
318 async fn collect(
320 inputs: HashSet<Input>,
321 root_dir: Option<PathBuf>,
322 base: BaseInfo,
323 ) -> LycheeResult<HashSet<Uri>> {
324 let responses = Collector::new(root_dir, base)?.collect_links(inputs);
325 Ok(responses.map(|r| r.unwrap().uri).collect().await)
326 }
327
328 async fn collect_verbatim(
333 inputs: HashSet<Input>,
334 root_dir: Option<PathBuf>,
335 base: BaseInfo,
336 extensions: FileExtensions,
337 ) -> LycheeResult<HashSet<Uri>> {
338 let responses = Collector::new(root_dir, base)?
339 .include_verbatim(true)
340 .collect_links_from_file_types(inputs, extensions);
341 Ok(responses.map(|r| r.unwrap().uri).collect().await)
342 }
343
    // Fixture links used by `test_collect_links`, one per kind of input.
    /// Link supplied directly as a string input.
    const TEST_STRING: &str = "http://test-string.com";
    /// Link served as the body of the mock server (remote URL input).
    const TEST_URL: &str = "https://test-url.org";
    /// Link written into a file that is passed as a path input.
    const TEST_FILE: &str = "https://test-file.io";
    /// Link written into the first file matched by the glob input.
    const TEST_GLOB_1: &str = "https://test-glob-1.io";
    /// Mail address written into the second file matched by the glob input.
    const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
349
350 #[tokio::test]
351 async fn test_file_without_extension_is_plaintext() -> LycheeResult<()> {
352 let temp_dir = tempfile::tempdir().unwrap();
353 let file_path = temp_dir.path().join("README");
355 let _file = File::create(&file_path).unwrap();
356 let input = Input::new(&file_path.as_path().display().to_string(), None, true)?;
357 let contents: Vec<_> = input
358 .get_contents(
359 true,
360 true,
361 true,
362 FileType::default_extensions(),
363 UrlContentResolver::default(),
364 PathExcludes::empty(),
365 None,
366 )
367 .collect::<Vec<_>>()
368 .await;
369
370 assert_eq!(contents.len(), 1);
371 assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext);
372 Ok(())
373 }
374
375 #[tokio::test]
376 async fn test_url_without_extension_is_html() -> LycheeResult<()> {
377 let input = Input::new("https://example.com/", None, true)?;
378 let contents: Vec<_> = input
379 .get_contents(
380 true,
381 true,
382 true,
383 FileType::default_extensions(),
384 UrlContentResolver::default(),
385 PathExcludes::empty(),
386 None,
387 )
388 .collect::<Vec<_>>()
389 .await;
390
391 assert_eq!(contents.len(), 1);
392 assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html);
393 Ok(())
394 }
395
396 #[tokio::test]
397 async fn test_collect_links() -> LycheeResult<()> {
398 let temp_dir = tempfile::tempdir().unwrap();
399 let temp_dir_path = temp_dir.path();
400
401 let file_path = temp_dir_path.join("f");
402 let file_glob_1_path = temp_dir_path.join("glob-1");
403 let file_glob_2_path = temp_dir_path.join("glob-2");
404
405 let mut file = File::create(&file_path).unwrap();
406 let mut file_glob_1 = File::create(file_glob_1_path).unwrap();
407 let mut file_glob_2 = File::create(file_glob_2_path).unwrap();
408
409 writeln!(file, "{TEST_FILE}").unwrap();
410 writeln!(file_glob_1, "{TEST_GLOB_1}").unwrap();
411 writeln!(file_glob_2, "{TEST_GLOB_2_MAIL}").unwrap();
412
413 let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
414
415 let inputs = HashSet::from_iter([
416 Input::from_input_source(InputSource::String(Cow::Borrowed(TEST_STRING))),
417 Input::from_input_source(InputSource::RemoteUrl(Box::new(
418 Url::parse(&mock_server.uri())
419 .map_err(|e| (mock_server.uri(), e))
420 .unwrap(),
421 ))),
422 Input::from_input_source(InputSource::FsPath(file_path)),
423 Input::from_input_source(InputSource::FsGlob {
424 pattern: glob::Pattern::new(&temp_dir_path.join("glob*").to_string_lossy())?,
425 ignore_case: true,
426 }),
427 ]);
428
429 let links = collect_verbatim(
430 inputs,
431 None,
432 BaseInfo::none(),
433 FileType::default_extensions(),
434 )
435 .await
436 .ok()
437 .unwrap();
438
439 let expected_links = HashSet::from_iter([
440 website!(TEST_STRING),
441 website!(TEST_URL),
442 website!(TEST_FILE),
443 website!(TEST_GLOB_1),
444 mail!(TEST_GLOB_2_MAIL),
445 ]);
446
447 assert_eq!(links, expected_links);
448
449 Ok(())
450 }
451
452 #[tokio::test]
453 async fn test_collect_markdown_links() {
454 let base = BaseInfo::try_from("https://github.com/hello-rust/lychee/").unwrap();
455 let input = Input {
456 source: InputSource::String(Cow::Borrowed(
457 "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)",
458 )),
459 file_type_hint: Some(FileType::Markdown),
460 };
461 let inputs = HashSet::from_iter([input]);
462
463 let links = collect(inputs, None, base).await.ok().unwrap();
464
465 let expected_links = HashSet::from_iter([
466 website!("https://endler.dev"),
467 website!("https://github.com/hello-rust/lychee/relative_link"),
468 ]);
469
470 assert_eq!(links, expected_links);
471 }
472
473 #[tokio::test]
474 async fn test_collect_html_links() {
475 let base = BaseInfo::try_from("https://github.com/lycheeverse/").unwrap();
476 let input = Input {
477 source: InputSource::String(Cow::Borrowed(
478 r#"<html>
479 <div class="row">
480 <a href="https://github.com/lycheeverse/lychee/">
481 <a href="blob/master/README.md">README</a>
482 </div>
483 </html>"#,
484 )),
485 file_type_hint: Some(FileType::Html),
486 };
487 let inputs = HashSet::from_iter([input]);
488
489 let links = collect(inputs, None, base).await.ok().unwrap();
490
491 let expected_links = HashSet::from_iter([
492 website!("https://github.com/lycheeverse/lychee/"),
493 website!("https://github.com/lycheeverse/blob/master/README.md"),
494 ]);
495
496 assert_eq!(links, expected_links);
497 }
498
499 #[tokio::test]
500 async fn test_collect_html_srcset() {
501 let base = BaseInfo::try_from("https://example.com/").unwrap();
502 let input = Input {
503 source: InputSource::String(Cow::Borrowed(
504 r#"
505 <img
506 src="/static/image.png"
507 srcset="
508 /static/image300.png 300w,
509 /static/image600.png 600w,
510 "
511 />
512 "#,
513 )),
514 file_type_hint: Some(FileType::Html),
515 };
516 let inputs = HashSet::from_iter([input]);
517
518 let links = collect(inputs, None, base).await.ok().unwrap();
519
520 let expected_links = HashSet::from_iter([
521 website!("https://example.com/static/image.png"),
522 website!("https://example.com/static/image300.png"),
523 website!("https://example.com/static/image600.png"),
524 ]);
525
526 assert_eq!(links, expected_links);
527 }
528
529 #[tokio::test]
530 async fn test_markdown_internal_url() {
531 let base = BaseInfo::try_from("https://localhost.com/").unwrap();
532
533 let input = Input {
534 source: InputSource::String(Cow::Borrowed(
535 "This is [an internal url](@/internal.md)
536 This is [an internal url](@/internal.markdown)
537 This is [an internal url](@/internal.markdown#example)
538 This is [an internal url](@/internal.md#example)",
539 )),
540 file_type_hint: Some(FileType::Markdown),
541 };
542 let inputs = HashSet::from_iter([input]);
543
544 let links = collect(inputs, None, base).await.ok().unwrap();
545
546 let expected = HashSet::from_iter([
547 website!("https://localhost.com/@/internal.md"),
548 website!("https://localhost.com/@/internal.markdown"),
549 website!("https://localhost.com/@/internal.md#example"),
550 website!("https://localhost.com/@/internal.markdown#example"),
551 ]);
552
553 assert_eq!(links, expected);
554 }
555
556 #[tokio::test]
557 async fn test_extract_html5_not_valid_xml_relative_links() {
558 let base = BaseInfo::try_from("https://example.com").unwrap();
559 let input = load_fixture!("TEST_HTML5.html");
560
561 let input = Input {
562 source: InputSource::String(Cow::Owned(input)),
563 file_type_hint: Some(FileType::Html),
564 };
565 let inputs = HashSet::from_iter([input]);
566
567 let links = collect(inputs, None, base).await.ok().unwrap();
568
569 let expected_links = HashSet::from_iter([
570 website!("https://example.com/body/a"),
572 website!("https://example.com/body/div_empty_a"),
573 website!("https://example.com/css/style_full_url.css"),
574 website!("https://example.com/css/style_relative_url.css"),
575 website!("https://example.com/head/home"),
576 website!("https://example.com/images/icon.png"),
577 ]);
578
579 assert_eq!(links, expected_links);
580 }
581
582 #[tokio::test]
583 async fn test_relative_url_with_base_extracted_from_input() {
584 let contents = r#"<html>
585 <div class="row">
586 <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
587 <a href="/about">About</a>
588 </div>
589 </html>"#;
590 let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));
591
592 let server_uri = Url::parse(&mock_server.uri()).unwrap();
593
594 let input = Input::from_input_source(InputSource::RemoteUrl(Box::new(server_uri.clone())));
595
596 let inputs = HashSet::from_iter([input]);
597
598 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
599
600 let expected_urls = HashSet::from_iter([
601 website!("https://github.com/lycheeverse/lychee/"),
602 website!(&format!("{server_uri}about")),
603 ]);
604
605 assert_eq!(links, expected_urls);
606 }
607
608 #[tokio::test]
609 async fn test_email_with_query_params() {
610 let input = Input::from_input_source(InputSource::String(Cow::Borrowed(
611 "This is a mailto:user@example.com?subject=Hello link",
612 )));
613
614 let inputs = HashSet::from_iter([input]);
615
616 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
617
618 let expected_links = HashSet::from_iter([mail!("user@example.com")]);
619
620 assert_eq!(links, expected_links);
621 }
622
623 #[tokio::test]
624 async fn test_user_agent_is_sent_for_remote_input_url() {
625 use wiremock::matchers::{header, method, path};
626 use wiremock::{Mock, MockServer, ResponseTemplate};
627
628 let mock_server = MockServer::start().await;
629 let uri = Uri::try_from("https://example.com").unwrap();
630
631 Mock::given(method("GET"))
632 .and(path("/"))
633 .and(header("user-agent", "test-agent/1.0"))
634 .respond_with(
635 ResponseTemplate::new(200).set_body_string(format!(r#"<a href="{uri}">Link</a>"#)),
636 )
637 .expect(1)
638 .mount(&mock_server)
639 .await;
640
641 let url = Url::parse(&mock_server.uri()).unwrap();
642 let inputs = std::collections::HashSet::from_iter([Input {
643 source: InputSource::RemoteUrl(Box::new(url)),
644 file_type_hint: Some(FileType::Html),
645 }]);
646
647 let client = crate::ClientBuilder::builder()
648 .user_agent("test-agent/1.0".to_string())
649 .build()
650 .client()
651 .unwrap();
652
653 let links = Collector::new(None, BaseInfo::none())
654 .unwrap()
655 .host_pool(client.host_pool())
656 .collect_links_from_file_types(inputs, crate::FileExtensions::default())
657 .map(|r| r.unwrap().uri)
658 .collect::<std::collections::HashSet<_>>()
659 .await;
660
661 assert_eq!(links, HashSet::from([uri]));
662 }
663
664 #[tokio::test]
665 async fn test_multiple_remote_urls() {
666 let mock_server_1 = mock_server!(
667 StatusCode::OK,
668 set_body_string(r#"<a href="relative.html">Link</a>"#)
669 );
670 let mock_server_2 = mock_server!(
671 StatusCode::OK,
672 set_body_string(r#"<a href="relative.html">Link</a>"#)
673 );
674
675 let inputs = HashSet::from_iter([
676 Input {
677 source: InputSource::RemoteUrl(Box::new(
678 Url::parse(&format!(
679 "{}/foo/index.html",
680 mock_server_1.uri().trim_end_matches('/')
681 ))
682 .unwrap(),
683 )),
684 file_type_hint: Some(FileType::Html),
685 },
686 Input {
687 source: InputSource::RemoteUrl(Box::new(
688 Url::parse(&format!(
689 "{}/bar/index.html",
690 mock_server_2.uri().trim_end_matches('/')
691 ))
692 .unwrap(),
693 )),
694 file_type_hint: Some(FileType::Html),
695 },
696 ]);
697
698 let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
699
700 let expected_links = HashSet::from_iter([
701 website!(&format!(
702 "{}/foo/relative.html",
703 mock_server_1.uri().trim_end_matches('/')
704 )),
705 website!(&format!(
706 "{}/bar/relative.html",
707 mock_server_2.uri().trim_end_matches('/')
708 )),
709 ]);
710
711 assert_eq!(links, expected_links);
712 }
713
714 #[tokio::test]
715 async fn test_file_path_with_base() {
716 let base = BaseInfo::try_from("/path/to/root").unwrap();
717
718 let input = Input {
719 source: InputSource::String(Cow::Borrowed(
720 r#"
721 <a href="index.html">Index</a>
722 <a href="about.html">About</a>
723 <a href="../up.html">About</a>
724 <a href="/another.html">Another</a>
725 "#,
726 )),
727 file_type_hint: Some(FileType::Html),
728 };
729
730 let inputs = HashSet::from_iter([input]);
731
732 let links = collect(inputs, None, base).await.ok().unwrap();
733 let links_str: HashSet<_> = links.iter().map(|x| x.url.as_str()).collect();
734
735 let expected_links: HashSet<_> = HashSet::from_iter([
736 ("file:///path/to/root/index.html"),
737 ("file:///path/to/root/about.html"),
738 ("file:///path/to/up.html"),
739 ("file:///path/to/root/another.html"),
740 ]);
741
742 assert_eq!(links_str, expected_links);
743 }
744}