Skip to main content

lychee_lib/
collector.rs

1use crate::Preprocessor;
2use crate::filter::PathExcludes;
3use crate::ratelimit::HostPool;
4use crate::types::resolver::UrlContentResolver;
5use crate::{
6    BaseInfo, Input, LycheeResult, Request, RequestError, basic_auth::BasicAuthExtractor,
7    extract::Extractor, types::FileExtensions, types::uri::raw::RawUri, utils::request,
8};
9use futures::TryStreamExt;
10use futures::{
11    StreamExt,
12    stream::{self, Stream},
13};
14use http::HeaderMap;
15use log::warn;
16use par_stream::ParStreamExt;
17use std::collections::HashSet;
18use std::path::{Path, PathBuf};
19use std::sync::Arc;
20
21/// Collector keeps the state of link collection
22/// It drives the link extraction from inputs
23#[allow(clippy::struct_excessive_bools)]
24#[derive(Debug, Clone)]
25pub struct Collector {
26    basic_auth_extractor: Option<BasicAuthExtractor>,
27    skip_missing_inputs: bool,
28    skip_ignored: bool,
29    skip_hidden: bool,
30    include_verbatim: bool,
31    include_wikilinks: bool,
32    use_html5ever: bool,
33    root_dir: Option<PathBuf>,
34    base: BaseInfo,
35    excluded_paths: PathExcludes,
36    /// Custom headers forwarded to the resolver for remote input fetches.
37    /// Note: when a `host_pool` is set, per-host headers configured there
38    /// take precedence over these global headers for known hosts.
39    headers: HeaderMap,
40    /// Shared host pool used to fetch remote input documents.
41    ///
42    /// Using the same pool as the link checker means that input URL fetches
43    /// use the configured user-agent, TLS settings, cookies, per-host rate
44    /// limits, and custom headers.
45    host_pool: Arc<HostPool>,
46    preprocessor: Option<Preprocessor>,
47}
48
49impl Default for Collector {
50    /// # Panics
51    ///
52    /// We call [`Collector::new()`] which can panic in certain scenarios.
53    ///
54    /// Use `Collector::new()` instead if you need to handle
55    /// [`ClientBuilder`](crate::ClientBuilder) errors gracefully.
56    fn default() -> Self {
57        Collector {
58            basic_auth_extractor: None,
59            skip_missing_inputs: false,
60            include_verbatim: false,
61            include_wikilinks: false,
62            use_html5ever: false,
63            skip_hidden: true,
64            skip_ignored: true,
65            root_dir: None,
66            base: BaseInfo::none(),
67            headers: HeaderMap::new(),
68            host_pool: Arc::new(HostPool::default()),
69            excluded_paths: PathExcludes::empty(),
70            preprocessor: None,
71        }
72    }
73}
74
75impl Collector {
76    /// Create a new collector with an empty cache
77    ///
78    /// # Errors
79    ///
80    /// Returns an `Err` if the `root_dir` is not a valid path
81    /// or if the reqwest `Client` fails to build
82    pub fn new(root_dir: Option<PathBuf>, base: BaseInfo) -> LycheeResult<Self> {
83        // HACK: if root-dir and base-url are given together and the base is a full file path,
84        // then join the root dir onto the base to match old behaviour.........
85        let (root_dir, base) = match (root_dir, base) {
86            (Some(root_dir), BaseInfo::Full { origin, path })
87                if origin.scheme() == "file" && path.is_empty() =>
88            {
89                let root_dir = root_dir
90                    .strip_prefix("/")
91                    .map(Path::to_path_buf)
92                    .unwrap_or(root_dir)
93                    .join("");
94
95                match origin.to_file_path() {
96                    Ok(base_path) => (Some(base_path.join(root_dir)), BaseInfo::full(origin, path)),
97                    Err(()) => (Some(root_dir), BaseInfo::full(origin, path)),
98                }
99            }
100            (Some(root_dir), base) => {
101                let root_dir = std::path::absolute(&root_dir).unwrap_or(root_dir);
102
103                if !root_dir.exists() {
104                    warn!("Root dir '{}' does not exist", root_dir.to_string_lossy());
105                } else if !root_dir.is_dir() {
106                    warn!("Root dir '{}' not a directory", root_dir.to_string_lossy());
107                }
108                (Some(root_dir), base)
109            }
110            (None, base) => (None, base),
111        };
112        Ok(Collector {
113            basic_auth_extractor: None,
114            skip_missing_inputs: false,
115            include_verbatim: false,
116            include_wikilinks: false,
117            use_html5ever: false,
118            skip_hidden: true,
119            skip_ignored: true,
120            preprocessor: None,
121            headers: HeaderMap::new(),
122            host_pool: Arc::new(HostPool::default()),
123            excluded_paths: PathExcludes::empty(),
124            root_dir,
125            base,
126        })
127    }
128
129    /// Skip missing input files (default is to error if they don't exist)
130    #[must_use]
131    pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
132        self.skip_missing_inputs = yes;
133        self
134    }
135
136    /// Skip files that are hidden
137    #[must_use]
138    pub const fn skip_hidden(mut self, yes: bool) -> Self {
139        self.skip_hidden = yes;
140        self
141    }
142
143    /// Skip files that are ignored
144    #[must_use]
145    pub const fn skip_ignored(mut self, yes: bool) -> Self {
146        self.skip_ignored = yes;
147        self
148    }
149
150    /// Set headers to use when resolving input URLs
151    #[must_use]
152    pub fn headers(mut self, headers: HeaderMap) -> Self {
153        self.headers = headers;
154        self
155    }
156
157    /// Set the [`HostPool`] to use when fetching remote input URLs.
158    ///
159    /// Pass the pool from a fully-configured [`crate::Client`] so that input
160    /// fetches share the same user-agent, TLS settings, cookies, per-host
161    /// rate limits and headers as regular link checks:
162    ///
163    /// ```
164    /// # use lychee_lib::{BaseInfo, ClientBuilder, Collector, ErrorKind};
165    /// let client = ClientBuilder::builder().build().client()?;
166    /// let collector = Collector::new(None, BaseInfo::none())?
167    ///     .host_pool(client.host_pool());
168    /// # Ok::<(), ErrorKind>(())
169    /// ```
170    #[must_use]
171    pub fn host_pool(mut self, host_pool: Arc<HostPool>) -> Self {
172        self.host_pool = host_pool;
173        self
174    }
175
176    /// Use `html5ever` to parse HTML instead of `html5gum`.
177    #[must_use]
178    pub const fn use_html5ever(mut self, yes: bool) -> Self {
179        self.use_html5ever = yes;
180        self
181    }
182
183    /// Skip over links in verbatim sections (like Markdown code blocks)
184    #[must_use]
185    pub const fn include_verbatim(mut self, yes: bool) -> Self {
186        self.include_verbatim = yes;
187        self
188    }
189
190    /// Check WikiLinks in Markdown files
191    #[allow(clippy::doc_markdown)]
192    #[must_use]
193    pub const fn include_wikilinks(mut self, yes: bool) -> Self {
194        self.include_wikilinks = yes;
195        self
196    }
197
198    /// Configure a file [`Preprocessor`]
199    #[must_use]
200    pub fn preprocessor(mut self, preprocessor: Option<Preprocessor>) -> Self {
201        self.preprocessor = preprocessor;
202        self
203    }
204
205    /// Pass a [`BasicAuthExtractor`] which is capable to match found
206    /// URIs to basic auth credentials. These credentials get passed to the
207    /// request in question.
208    #[must_use]
209    #[allow(clippy::missing_const_for_fn)]
210    pub fn basic_auth_extractor(mut self, extractor: BasicAuthExtractor) -> Self {
211        self.basic_auth_extractor = Some(extractor);
212        self
213    }
214
215    /// Configure which paths to exclude
216    #[must_use]
217    pub fn excluded_paths(mut self, excluded_paths: PathExcludes) -> Self {
218        self.excluded_paths = excluded_paths;
219        self
220    }
221
222    /// Convenience method to fetch all unique links from inputs
223    /// with the default extensions.
224    pub fn collect_links(
225        self,
226        inputs: HashSet<Input>,
227    ) -> impl Stream<Item = Result<Request, RequestError>> {
228        self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions())
229    }
230
231    /// Fetch all unique links from inputs
232    /// All relative URLs get prefixed with `base` (if given).
233    /// (This can be a directory or a base URL)
234    ///
235    /// # Errors
236    ///
237    /// Will return `Err` if links cannot be extracted from an input
238    pub fn collect_links_from_file_types(
239        self,
240        inputs: HashSet<Input>,
241        extensions: FileExtensions,
242    ) -> impl Stream<Item = Result<Request, RequestError>> {
243        let skip_missing_inputs = self.skip_missing_inputs;
244        let skip_hidden = self.skip_hidden;
245        let skip_ignored = self.skip_ignored;
246        let global_base = self.base;
247        let excluded_paths = self.excluded_paths;
248
249        let resolver = UrlContentResolver {
250            basic_auth_extractor: self.basic_auth_extractor.clone(),
251            headers: self.headers.clone(),
252            host_pool: self.host_pool,
253        };
254
255        let extractor = Extractor::new(
256            self.use_html5ever,
257            self.include_verbatim,
258            self.include_wikilinks,
259        );
260
261        stream::iter(inputs)
262            .par_then_unordered(None, move |input| {
263                let extensions = extensions.clone();
264                let resolver = resolver.clone();
265                let excluded_paths = excluded_paths.clone();
266                let preprocessor = self.preprocessor.clone();
267
268                async move {
269                    input.get_contents(
270                        skip_missing_inputs,
271                        skip_hidden,
272                        skip_ignored,
273                        extensions,
274                        resolver,
275                        excluded_paths,
276                        preprocessor,
277                    )
278                }
279            })
280            .flatten()
281            .par_then_unordered(None, move |content| {
282                let global_base = global_base.clone();
283                let root_dir = self.root_dir.clone();
284                let basic_auth_extractor = self.basic_auth_extractor.clone();
285                async move {
286                    let content = content?;
287                    let uris: Vec<RawUri> = extractor.extract(&content);
288                    let requests = request::create(
289                        uris,
290                        &content.source,
291                        root_dir.as_deref(),
292                        &global_base,
293                        basic_auth_extractor.as_ref(),
294                    );
295                    Result::Ok(stream::iter(requests))
296                }
297            })
298            .try_flatten()
299    }
300}
301
302#[cfg(test)]
303mod tests {
304    use std::borrow::Cow;
305    use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};
306    use test_utils::{fixtures_path, load_fixture, mail, mock_server, website};
307
308    use http::StatusCode;
309    use reqwest::Url;
310
311    use super::*;
312    use crate::{
313        LycheeResult, Uri,
314        filter::PathExcludes,
315        types::{FileType, Input, InputSource},
316    };
317
318    // Helper function to run the collector on the given inputs
319    async fn collect(
320        inputs: HashSet<Input>,
321        root_dir: Option<PathBuf>,
322        base: BaseInfo,
323    ) -> LycheeResult<HashSet<Uri>> {
324        let responses = Collector::new(root_dir, base)?.collect_links(inputs);
325        Ok(responses.map(|r| r.unwrap().uri).collect().await)
326    }
327
328    /// Helper function for collecting verbatim links
329    ///
330    /// A verbatim link is a link that is not parsed by the HTML parser.
331    /// For example, a link in a code block or a script tag.
332    async fn collect_verbatim(
333        inputs: HashSet<Input>,
334        root_dir: Option<PathBuf>,
335        base: BaseInfo,
336        extensions: FileExtensions,
337    ) -> LycheeResult<HashSet<Uri>> {
338        let responses = Collector::new(root_dir, base)?
339            .include_verbatim(true)
340            .collect_links_from_file_types(inputs, extensions);
341        Ok(responses.map(|r| r.unwrap().uri).collect().await)
342    }
343
344    const TEST_STRING: &str = "http://test-string.com";
345    const TEST_URL: &str = "https://test-url.org";
346    const TEST_FILE: &str = "https://test-file.io";
347    const TEST_GLOB_1: &str = "https://test-glob-1.io";
348    const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
349
350    #[tokio::test]
351    async fn test_file_without_extension_is_plaintext() -> LycheeResult<()> {
352        let temp_dir = tempfile::tempdir().unwrap();
353        // Treat as plaintext file (no extension)
354        let file_path = temp_dir.path().join("README");
355        let _file = File::create(&file_path).unwrap();
356        let input = Input::new(&file_path.as_path().display().to_string(), None, true)?;
357        let contents: Vec<_> = input
358            .get_contents(
359                true,
360                true,
361                true,
362                FileType::default_extensions(),
363                UrlContentResolver::default(),
364                PathExcludes::empty(),
365                None,
366            )
367            .collect::<Vec<_>>()
368            .await;
369
370        assert_eq!(contents.len(), 1);
371        assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Plaintext);
372        Ok(())
373    }
374
375    #[tokio::test]
376    async fn test_url_without_extension_is_html() -> LycheeResult<()> {
377        let input = Input::new("https://example.com/", None, true)?;
378        let contents: Vec<_> = input
379            .get_contents(
380                true,
381                true,
382                true,
383                FileType::default_extensions(),
384                UrlContentResolver::default(),
385                PathExcludes::empty(),
386                None,
387            )
388            .collect::<Vec<_>>()
389            .await;
390
391        assert_eq!(contents.len(), 1);
392        assert_eq!(contents[0].as_ref().unwrap().file_type, FileType::Html);
393        Ok(())
394    }
395
396    #[tokio::test]
397    async fn test_collect_links() -> LycheeResult<()> {
398        let temp_dir = tempfile::tempdir().unwrap();
399        let temp_dir_path = temp_dir.path();
400
401        let file_path = temp_dir_path.join("f");
402        let file_glob_1_path = temp_dir_path.join("glob-1");
403        let file_glob_2_path = temp_dir_path.join("glob-2");
404
405        let mut file = File::create(&file_path).unwrap();
406        let mut file_glob_1 = File::create(file_glob_1_path).unwrap();
407        let mut file_glob_2 = File::create(file_glob_2_path).unwrap();
408
409        writeln!(file, "{TEST_FILE}").unwrap();
410        writeln!(file_glob_1, "{TEST_GLOB_1}").unwrap();
411        writeln!(file_glob_2, "{TEST_GLOB_2_MAIL}").unwrap();
412
413        let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));
414
415        let inputs = HashSet::from_iter([
416            Input::from_input_source(InputSource::String(Cow::Borrowed(TEST_STRING))),
417            Input::from_input_source(InputSource::RemoteUrl(Box::new(
418                Url::parse(&mock_server.uri())
419                    .map_err(|e| (mock_server.uri(), e))
420                    .unwrap(),
421            ))),
422            Input::from_input_source(InputSource::FsPath(file_path)),
423            Input::from_input_source(InputSource::FsGlob {
424                pattern: glob::Pattern::new(&temp_dir_path.join("glob*").to_string_lossy())?,
425                ignore_case: true,
426            }),
427        ]);
428
429        let links = collect_verbatim(
430            inputs,
431            None,
432            BaseInfo::none(),
433            FileType::default_extensions(),
434        )
435        .await
436        .ok()
437        .unwrap();
438
439        let expected_links = HashSet::from_iter([
440            website!(TEST_STRING),
441            website!(TEST_URL),
442            website!(TEST_FILE),
443            website!(TEST_GLOB_1),
444            mail!(TEST_GLOB_2_MAIL),
445        ]);
446
447        assert_eq!(links, expected_links);
448
449        Ok(())
450    }
451
452    #[tokio::test]
453    async fn test_collect_markdown_links() {
454        let base = BaseInfo::try_from("https://github.com/hello-rust/lychee/").unwrap();
455        let input = Input {
456            source: InputSource::String(Cow::Borrowed(
457                "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)",
458            )),
459            file_type_hint: Some(FileType::Markdown),
460        };
461        let inputs = HashSet::from_iter([input]);
462
463        let links = collect(inputs, None, base).await.ok().unwrap();
464
465        let expected_links = HashSet::from_iter([
466            website!("https://endler.dev"),
467            website!("https://github.com/hello-rust/lychee/relative_link"),
468        ]);
469
470        assert_eq!(links, expected_links);
471    }
472
473    #[tokio::test]
474    async fn test_collect_html_links() {
475        let base = BaseInfo::try_from("https://github.com/lycheeverse/").unwrap();
476        let input = Input {
477            source: InputSource::String(Cow::Borrowed(
478                r#"<html>
479                <div class="row">
480                    <a href="https://github.com/lycheeverse/lychee/">
481                    <a href="blob/master/README.md">README</a>
482                </div>
483            </html>"#,
484            )),
485            file_type_hint: Some(FileType::Html),
486        };
487        let inputs = HashSet::from_iter([input]);
488
489        let links = collect(inputs, None, base).await.ok().unwrap();
490
491        let expected_links = HashSet::from_iter([
492            website!("https://github.com/lycheeverse/lychee/"),
493            website!("https://github.com/lycheeverse/blob/master/README.md"),
494        ]);
495
496        assert_eq!(links, expected_links);
497    }
498
499    #[tokio::test]
500    async fn test_collect_html_srcset() {
501        let base = BaseInfo::try_from("https://example.com/").unwrap();
502        let input = Input {
503            source: InputSource::String(Cow::Borrowed(
504                r#"
505            <img
506                src="/static/image.png"
507                srcset="
508                /static/image300.png  300w,
509                /static/image600.png  600w,
510                "
511            />
512          "#,
513            )),
514            file_type_hint: Some(FileType::Html),
515        };
516        let inputs = HashSet::from_iter([input]);
517
518        let links = collect(inputs, None, base).await.ok().unwrap();
519
520        let expected_links = HashSet::from_iter([
521            website!("https://example.com/static/image.png"),
522            website!("https://example.com/static/image300.png"),
523            website!("https://example.com/static/image600.png"),
524        ]);
525
526        assert_eq!(links, expected_links);
527    }
528
529    #[tokio::test]
530    async fn test_markdown_internal_url() {
531        let base = BaseInfo::try_from("https://localhost.com/").unwrap();
532
533        let input = Input {
534            source: InputSource::String(Cow::Borrowed(
535                "This is [an internal url](@/internal.md)
536        This is [an internal url](@/internal.markdown)
537        This is [an internal url](@/internal.markdown#example)
538        This is [an internal url](@/internal.md#example)",
539            )),
540            file_type_hint: Some(FileType::Markdown),
541        };
542        let inputs = HashSet::from_iter([input]);
543
544        let links = collect(inputs, None, base).await.ok().unwrap();
545
546        let expected = HashSet::from_iter([
547            website!("https://localhost.com/@/internal.md"),
548            website!("https://localhost.com/@/internal.markdown"),
549            website!("https://localhost.com/@/internal.md#example"),
550            website!("https://localhost.com/@/internal.markdown#example"),
551        ]);
552
553        assert_eq!(links, expected);
554    }
555
556    #[tokio::test]
557    async fn test_extract_html5_not_valid_xml_relative_links() {
558        let base = BaseInfo::try_from("https://example.com").unwrap();
559        let input = load_fixture!("TEST_HTML5.html");
560
561        let input = Input {
562            source: InputSource::String(Cow::Owned(input)),
563            file_type_hint: Some(FileType::Html),
564        };
565        let inputs = HashSet::from_iter([input]);
566
567        let links = collect(inputs, None, base).await.ok().unwrap();
568
569        let expected_links = HashSet::from_iter([
570            // the body links wouldn't be present if the file was parsed strictly as XML
571            website!("https://example.com/body/a"),
572            website!("https://example.com/body/div_empty_a"),
573            website!("https://example.com/css/style_full_url.css"),
574            website!("https://example.com/css/style_relative_url.css"),
575            website!("https://example.com/head/home"),
576            website!("https://example.com/images/icon.png"),
577        ]);
578
579        assert_eq!(links, expected_links);
580    }
581
582    #[tokio::test]
583    async fn test_relative_url_with_base_extracted_from_input() {
584        let contents = r#"<html>
585            <div class="row">
586                <a href="https://github.com/lycheeverse/lychee/">GitHub</a>
587                <a href="/about">About</a>
588            </div>
589        </html>"#;
590        let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));
591
592        let server_uri = Url::parse(&mock_server.uri()).unwrap();
593
594        let input = Input::from_input_source(InputSource::RemoteUrl(Box::new(server_uri.clone())));
595
596        let inputs = HashSet::from_iter([input]);
597
598        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
599
600        let expected_urls = HashSet::from_iter([
601            website!("https://github.com/lycheeverse/lychee/"),
602            website!(&format!("{server_uri}about")),
603        ]);
604
605        assert_eq!(links, expected_urls);
606    }
607
608    #[tokio::test]
609    async fn test_email_with_query_params() {
610        let input = Input::from_input_source(InputSource::String(Cow::Borrowed(
611            "This is a mailto:user@example.com?subject=Hello link",
612        )));
613
614        let inputs = HashSet::from_iter([input]);
615
616        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
617
618        let expected_links = HashSet::from_iter([mail!("user@example.com")]);
619
620        assert_eq!(links, expected_links);
621    }
622
623    #[tokio::test]
624    async fn test_user_agent_is_sent_for_remote_input_url() {
625        use wiremock::matchers::{header, method, path};
626        use wiremock::{Mock, MockServer, ResponseTemplate};
627
628        let mock_server = MockServer::start().await;
629        let uri = Uri::try_from("https://example.com").unwrap();
630
631        Mock::given(method("GET"))
632            .and(path("/"))
633            .and(header("user-agent", "test-agent/1.0"))
634            .respond_with(
635                ResponseTemplate::new(200).set_body_string(format!(r#"<a href="{uri}">Link</a>"#)),
636            )
637            .expect(1)
638            .mount(&mock_server)
639            .await;
640
641        let url = Url::parse(&mock_server.uri()).unwrap();
642        let inputs = std::collections::HashSet::from_iter([Input {
643            source: InputSource::RemoteUrl(Box::new(url)),
644            file_type_hint: Some(FileType::Html),
645        }]);
646
647        let client = crate::ClientBuilder::builder()
648            .user_agent("test-agent/1.0".to_string())
649            .build()
650            .client()
651            .unwrap();
652
653        let links = Collector::new(None, BaseInfo::none())
654            .unwrap()
655            .host_pool(client.host_pool())
656            .collect_links_from_file_types(inputs, crate::FileExtensions::default())
657            .map(|r| r.unwrap().uri)
658            .collect::<std::collections::HashSet<_>>()
659            .await;
660
661        assert_eq!(links, HashSet::from([uri]));
662    }
663
664    #[tokio::test]
665    async fn test_multiple_remote_urls() {
666        let mock_server_1 = mock_server!(
667            StatusCode::OK,
668            set_body_string(r#"<a href="relative.html">Link</a>"#)
669        );
670        let mock_server_2 = mock_server!(
671            StatusCode::OK,
672            set_body_string(r#"<a href="relative.html">Link</a>"#)
673        );
674
675        let inputs = HashSet::from_iter([
676            Input {
677                source: InputSource::RemoteUrl(Box::new(
678                    Url::parse(&format!(
679                        "{}/foo/index.html",
680                        mock_server_1.uri().trim_end_matches('/')
681                    ))
682                    .unwrap(),
683                )),
684                file_type_hint: Some(FileType::Html),
685            },
686            Input {
687                source: InputSource::RemoteUrl(Box::new(
688                    Url::parse(&format!(
689                        "{}/bar/index.html",
690                        mock_server_2.uri().trim_end_matches('/')
691                    ))
692                    .unwrap(),
693                )),
694                file_type_hint: Some(FileType::Html),
695            },
696        ]);
697
698        let links = collect(inputs, None, BaseInfo::none()).await.ok().unwrap();
699
700        let expected_links = HashSet::from_iter([
701            website!(&format!(
702                "{}/foo/relative.html",
703                mock_server_1.uri().trim_end_matches('/')
704            )),
705            website!(&format!(
706                "{}/bar/relative.html",
707                mock_server_2.uri().trim_end_matches('/')
708            )),
709        ]);
710
711        assert_eq!(links, expected_links);
712    }
713
714    #[tokio::test]
715    async fn test_file_path_with_base() {
716        let base = BaseInfo::try_from("/path/to/root").unwrap();
717
718        let input = Input {
719            source: InputSource::String(Cow::Borrowed(
720                r#"
721                <a href="index.html">Index</a>
722                <a href="about.html">About</a>
723                <a href="../up.html">About</a>
724                <a href="/another.html">Another</a>
725            "#,
726            )),
727            file_type_hint: Some(FileType::Html),
728        };
729
730        let inputs = HashSet::from_iter([input]);
731
732        let links = collect(inputs, None, base).await.ok().unwrap();
733        let links_str: HashSet<_> = links.iter().map(|x| x.url.as_str()).collect();
734
735        let expected_links: HashSet<_> = HashSet::from_iter([
736            ("file:///path/to/root/index.html"),
737            ("file:///path/to/root/about.html"),
738            ("file:///path/to/up.html"),
739            ("file:///path/to/root/another.html"),
740        ]);
741
742        assert_eq!(links_str, expected_links);
743    }
744}