Skip to main content

lychee_lib/extract/
markdown.rs

1//! Extract links and fragments from markdown documents
2use std::collections::{HashMap, HashSet};
3
4use log::warn;
5use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset};
6
7use crate::{
8    checker::wikilink::wikilink,
9    extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext},
10    types::uri::raw::{
11        OffsetSpanProvider, RawUri, RawUriSpan, SourceSpanProvider, SpanProvider as _,
12    },
13};
14
15use super::html::html5gum::extract_html_fragments;
16
17/// Returns the default markdown extensions used by lychee.
18/// Sadly, `|` is not const for `Options` so we can't use a const global.
19fn md_extensions() -> Options {
20    Options::ENABLE_HEADING_ATTRIBUTES
21        | Options::ENABLE_MATH
22        | Options::ENABLE_WIKILINKS
23        | Options::ENABLE_FOOTNOTES
24}
25
26/// Extract unparsed URL strings from a Markdown string.
27// TODO: Refactor the extractor to reduce the complexity and number of lines.
28#[expect(clippy::too_many_lines)]
29pub(crate) fn extract_markdown(
30    input: &str,
31    include_verbatim: bool,
32    include_wikilinks: bool,
33) -> Vec<RawUri> {
34    let mut inside_code_block = false;
35    let mut inside_link_label = false; // encountering `X` in `[X]()`
36    let mut inside_extracted_link = false; // prevent double extraction when encountering `Text(X)` in `<X>` or `[[X]]`
37
38    // HTML blocks come in chunks from pulldown_cmark, so we need to accumulate them
39    let mut inside_html_block = false;
40    let mut html_block_buffer = String::new();
41    let mut html_block_start_offset = 0;
42
43    let span_provider = SourceSpanProvider::from_input(input);
44    let parser =
45        TextMergeWithOffset::new(Parser::new_ext(input, md_extensions()).into_offset_iter());
46    parser
47        .filter_map(|(event, span)| match event {
48            // A link.
49            Event::Start(Tag::Link {
50                link_type,
51                dest_url,
52                ..
53            }) => {
54                match link_type {
55                    // Inline link like `[foo](bar)`
56                    // This is the most common link type
57                    LinkType::Inline => {
58                        inside_link_label = true;
59                        Some(raw_uri(&dest_url, span_provider.span(span.start)))
60                    }
61                    // Reference without destination in the document, but resolved by the `broken_link_callback`
62                    LinkType::Reference |
63                    // Collapsed link like `[foo][]`
64                    LinkType::ReferenceUnknown |
65                    // Collapsed link like `[foo][]`
66                    LinkType::Collapsed|
67                    // Collapsed link without destination in the document, but resolved by the `broken_link_callback`
68                    LinkType::CollapsedUnknown |
69                    // Shortcut link like `[foo]`
70                    LinkType::Shortcut |
71                    // Shortcut without destination in the document, but resolved by the `broken_link_callback`
72                    LinkType::ShortcutUnknown => {
73                        inside_link_label = true;
74                        // For reference links, create RawUri directly to handle relative file paths
75                        // that linkify doesn't recognize as URLs
76                        Some(raw_uri(&dest_url, span_provider.span(span.start)))
77                    },
78                    // Autolink like `<http://foo.bar/baz>`
79                    LinkType::Autolink |
80                    // Email address in autolink like `<john@example.org>`
81                    LinkType::Email => {
82                        inside_extracted_link  = true;
83                        let span_provider = get_email_span_provider(&span_provider, &span, link_type);
84                        Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider))
85                    }
86                    // Wiki URL (`[[http://example.com]]`)
87                    LinkType::WikiLink { has_pothole } => {
88                        // Exclude WikiLinks if not explicitly enabled
89                        if !include_wikilinks {
90                            return None;
91                        }
92                        inside_extracted_link = true;
93                        // Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents
94                        if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
95                            return None;
96                        }
97
98                        if let Ok(wikilink) = wikilink(&dest_url, has_pothole) {
99                            Some(vec![RawUri {
100                                text: wikilink.to_string(),
101                                element: Some("a".to_string()),
102                                attribute: Some("wikilink".to_string()),
103                                // wiki links start with `[[`, so offset the span by `2`
104                                span: span_provider.span(span.start + 2)
105                            }])
106                        } else {
107                            warn!("The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments");
108                            None
109                        }
110                    }
111                }
112            }
113
114            Event::Start(Tag::Image { dest_url, .. }) => Some(extract_image(&dest_url, span_provider.span(span.start))),
115
116            // A code block (inline or fenced).
117            Event::Start(Tag::CodeBlock(_)) => {
118                inside_code_block = true;
119                None
120            }
121            Event::End(TagEnd::CodeBlock) => {
122                inside_code_block = false;
123                None
124            }
125
126            // A text node.
127            Event::Text(txt) => {
128                if inside_extracted_link
129                    || (inside_link_label && !include_verbatim)
130                    || (inside_code_block && !include_verbatim) {
131                    None
132                } else {
133                    Some(extract_raw_uri_from_plaintext(
134                        &txt,
135                        &OffsetSpanProvider { offset: span.start, inner: &span_provider }
136                    ))
137                }
138            }
139
140            // Start of an HTML block
141            Event::Start(Tag::HtmlBlock) => {
142                inside_html_block = true;
143                html_block_buffer.clear();
144                html_block_start_offset = span.start;
145                None
146            }
147
148            // End of an HTML block - process accumulated HTML
149            Event::End(TagEnd::HtmlBlock) => {
150                inside_html_block = false;
151                if html_block_buffer.is_empty() {
152                    None
153                } else {
154                    Some(extract_html_with_span(
155                        &html_block_buffer,
156                        include_verbatim,
157                        OffsetSpanProvider {
158                            offset: html_block_start_offset,
159                            inner: &span_provider
160                        }
161                    ))
162                }
163            }
164
165            // An HTML node
166            Event::Html(html) => {
167                if inside_html_block {
168                    // Accumulate HTML chunks within a block
169                    html_block_buffer.push_str(&html);
170                    None
171                } else {
172                    // Standalone HTML (not part of a block) - process immediately
173                    Some(extract_html_with_span(
174                        &html,
175                        include_verbatim,
176                        OffsetSpanProvider { offset: span.start, inner: &span_provider }
177                    ))
178                }
179            }
180
181            // Inline HTML (not part of a block)
182            Event::InlineHtml(html) => {
183                Some(extract_html_with_span(
184                    &html,
185                    include_verbatim,
186                    OffsetSpanProvider { offset: span.start, inner: &span_provider }
187                ))
188            }
189
190            // An inline code node.
191            Event::Code(code) => {
192                if include_verbatim {
193                    // inline code starts with '`', so offset the span by `1`.
194                    Some(extract_raw_uri_from_plaintext(
195                        &code,
196                        &OffsetSpanProvider { offset: span.start + 1, inner: &span_provider }
197                    ))
198                } else {
199                    None
200                }
201            }
202
203            Event::End(TagEnd::Link) => {
204                inside_link_label = false;
205                inside_extracted_link = false;
206                None
207            }
208
209            #[expect(clippy::match_same_arms, reason = "Skip footnote references and definitions explicitly - they're not links to check")]
210            Event::FootnoteReference(_) | Event::Start(Tag::FootnoteDefinition(_)) | Event::End(TagEnd::FootnoteDefinition) => None,
211
212            // Silently skip over other events
213            _ => None,
214        })
215        .flatten()
216        .collect()
217}
218
219fn get_email_span_provider<'a>(
220    span_provider: &'a SourceSpanProvider<'_>,
221    span: &std::ops::Range<usize>,
222    link_type: LinkType,
223) -> OffsetSpanProvider<'a> {
224    let offset = match link_type {
225        // We don't know how the link starts, so don't offset the span.
226        LinkType::Reference | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown => 0,
227        // These start all with `[` or `<`, so offset the span by `1`.
228        LinkType::ReferenceUnknown
229        | LinkType::Collapsed
230        | LinkType::Shortcut
231        | LinkType::Autolink
232        | LinkType::Email => 1,
233        _ => {
234            debug_assert!(false, "Unexpected email link type: {link_type:?}");
235            0
236        }
237    };
238
239    OffsetSpanProvider {
240        offset: span.start + offset,
241        inner: span_provider,
242    }
243}
244
245/// Emulate `<img src="...">` tag to be compatible with HTML links.
246/// We might consider using the actual Markdown `LinkType` for better granularity in the future.
247fn extract_image(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
248    vec![RawUri {
249        text: dest_url.to_string(),
250        element: Some("img".to_string()),
251        attribute: Some("src".to_string()),
252        span,
253    }]
254}
255
256/// Emulate `<a href="...">` tag to be compatible with HTML links.
257/// We might consider using the actual Markdown `LinkType` for better granularity in the future.
258fn raw_uri(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
259    vec![RawUri {
260        text: dest_url.to_string(),
261        element: Some("a".to_string()),
262        attribute: Some("href".to_string()),
263        // Sadly, we don't know how long the `foo` part in `[foo](bar)` is,
264        // so the span points to the `[` and not to the `b`.
265        span,
266    }]
267}
268
269/// Extract fragments/anchors from a Markdown string.
270///
271/// Fragments are generated from headings using the same unique kebab case method as GitHub.
272/// If a [heading attribute](https://github.com/raphlinus/pulldown-cmark/blob/master/specs/heading_attrs.txt)
273/// is present,
274/// this will be added to the fragment set **alongside** the other generated fragment.
275/// It means a single heading such as `## Frag 1 {#frag-2}` would generate two fragments.
276pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
277    let mut in_heading = false;
278    let mut heading_text = String::new();
279    let mut heading_id: Option<CowStr<'_>> = None;
280    let mut id_generator = HeadingIdGenerator::default();
281
282    let mut out = HashSet::new();
283
284    for event in Parser::new_ext(input, md_extensions()) {
285        match event {
286            Event::Start(Tag::Heading { id, .. }) => {
287                heading_id = id;
288                in_heading = true;
289            }
290            Event::End(TagEnd::Heading(_)) => {
291                if let Some(frag) = heading_id.take() {
292                    out.insert(frag.to_string());
293                }
294
295                if !heading_text.is_empty() {
296                    let id = id_generator.generate(&heading_text);
297                    out.insert(id);
298                    heading_text.clear();
299                }
300
301                in_heading = false;
302            }
303            Event::Text(text) | Event::Code(text) if in_heading => {
304                heading_text.push_str(&text);
305            }
306
307            // An HTML node
308            Event::Html(html) | Event::InlineHtml(html) => {
309                out.extend(extract_html_fragments(&html));
310            }
311
312            // Silently skip over other events
313            _ => (),
314        }
315    }
316    out
317}
318
/// Generates unique heading ids (fragments) for the headings of one document.
///
/// The generator remembers every id it has handed out, so each call to
/// [`HeadingIdGenerator::generate`] returns an id that has not been returned before.
#[derive(Default)]
struct HeadingIdGenerator {
    /// Maps every id handed out so far to the highest numeric suffix that has
    /// been tried for it as a base (`0` if it was never used as a base for a suffix).
    counter: HashMap<String, usize>,
}

impl HeadingIdGenerator {
    /// Returns a unique id for `heading`.
    ///
    /// The first occurrence of a heading yields its plain kebab-case form;
    /// later occurrences get `-1`, `-2`, ... appended. A candidate that collides
    /// with an id generated earlier is skipped, e.g. the headings `A`, `A`, `A 1`
    /// yield `a`, `a-1` and `a-1-1` instead of returning `a-1` twice.
    fn generate(&mut self, heading: &str) -> String {
        let base = Self::into_kebab_case(heading);
        let mut id = base.clone();

        // `id` starts out equal to `base`, so whenever the loop body runs,
        // `base` itself is already registered and its suffix counter can be bumped.
        while self.counter.contains_key(&id) {
            let suffix = self.counter.entry(base.clone()).or_insert(0);
            *suffix += 1;
            id = format!("{base}-{suffix}");
        }

        // Register the id that is handed out so later headings cannot reuse it.
        self.counter.insert(id.clone(), 0);

        id
    }

    /// Converts text into kebab case
    ///
    /// The text is lowercased; alphanumeric characters, `_` and `-` are kept,
    /// every whitespace character becomes a `-` (runs of whitespace are not
    /// collapsed) and all remaining characters are dropped.
    #[must_use]
    fn into_kebab_case(text: &str) -> String {
        text.to_lowercase()
            .chars()
            .filter_map(|ch| {
                if ch.is_alphanumeric() || ch == '_' || ch == '-' {
                    Some(ch)
                } else if ch.is_whitespace() {
                    Some('-')
                } else {
                    None
                }
            })
            .collect::<String>()
    }
}
353
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::span;

    use super::*;

    // Shared fixture: headings (one with an explicit id attribute), an inline
    // link, a fenced code block, inline code, and an inline HTML element with an id.
    // The expected spans in the tests below depend on the exact layout of this string.
    const MD_INPUT: &str = r#"
# A Test

Some link in text [here](https://foo.com)

## A test {#well-still-the-same-test}

Code:

```bash
https://bar.com/123
```

or inline like `https://bar.org` for instance.

### Some `code` in a heading.

[example](http://example.com)

<span id="the-end">The End</span>
        "#;

    /// Headings, the explicit heading attribute and the HTML `id` all end up in the fragment set.
    #[test]
    fn test_extract_fragments() {
        let expected = HashSet::from([
            "a-test".to_string(),
            "a-test-1".to_string(),
            "well-still-the-same-test".to_string(),
            "some-code-in-a-heading".to_string(),
            "the-end".to_string(),
        ]);
        let actual = extract_markdown_fragments(MD_INPUT);
        assert_eq!(actual, expected);
    }

    /// With `include_verbatim` disabled, only the two Markdown link destinations are extracted.
    #[test]
    fn test_skip_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, false, false);
        assert_eq!(uris, expected);
    }

    /// With `include_verbatim` enabled, URLs in the code block and in inline code are extracted too.
    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "https://bar.com/123".to_string(),
                element: None,
                attribute: None,
                span: span(11, 1),
            },
            RawUri {
                text: "https://bar.org".to_string(),
                element: None,
                attribute: None,
                span: span(14, 17),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, true, false);
        assert_eq!(uris, expected);
    }

    /// URLs inside HTML `<code>` and `<pre>` elements are skipped when `include_verbatim` is disabled.
    #[test]
    fn test_skip_verbatim_html() {
        let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
</pre>";

        let expected = vec![];

        let uris = extract_markdown(input, false, false);
        assert_eq!(uris, expected);
    }

    /// Pins the kebab-case conversion, including Unicode text, dropped punctuation
    /// and the fact that runs of spaces are not collapsed.
    #[test]
    fn test_kebab_case() {
        let check = |input, expected| {
            let actual = HeadingIdGenerator::into_kebab_case(input);
            assert_eq!(actual, expected);
        };
        check("A Heading", "a-heading");
        check(
            "This header has a :thumbsup: in it",
            "this-header-has-a-thumbsup-in-it",
        );
        check(
            "Header with 한글 characters (using unicode)",
            "header-with-한글-characters-using-unicode",
        );
        check(
            "Underscores foo_bar_, dots . and numbers 1.7e-3",
            "underscores-foo_bar_-dots--and-numbers-17e-3",
        );
        check("Many          spaces", "many----------spaces");
    }

    /// Link-like syntax inside a `$$` math block yields no URIs.
    #[test]
    fn test_markdown_math() {
        let input = r"
$$
[\psi](\mathbf{L})
$$
";
        let uris = extract_markdown(input, true, false);
        assert!(uris.is_empty());
    }

    /// A footnote reference and its definition yield no URIs.
    #[test]
    fn test_single_word_footnote_is_not_detected_as_link() {
        let markdown = "This footnote is[^actually] a link.\n\n[^actually]: not";
        let expected = vec![];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// An underscore path segment in the middle of a plain-text URL is kept intact.
    #[test]
    fn test_underscore_in_urls_middle() {
        let markdown = r"https://example.com/_/foo";
        let expected = vec![RawUri {
            text: "https://example.com/_/foo".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// A trailing underscore at the end of a plain-text URL is kept intact.
    #[test]
    fn test_underscore_in_urls_end() {
        let markdown = r"https://example.com/_";
        let expected = vec![RawUri {
            text: "https://example.com/_".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// A wikilink is extracted with the `wikilink` attribute and a span just past the `[[`.
    #[test]
    fn test_wiki_link() {
        let markdown = r"[[https://example.com/destination]]";
        let expected = vec![RawUri {
            text: "https://example.com/destination".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    /// Two adjacent wikilinks on one line are extracted separately, each with its own span.
    #[test]
    fn test_multiple_wiki_links() {
        let markdown = r"[[https://example.com/destination]][[https://example.com/source]]";
        let expected = vec![
            RawUri {
                text: "https://example.com/destination".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 3),
            },
            RawUri {
                text: "https://example.com/source".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 38),
            },
        ];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    /// GitLab table-of-contents markers yield no URIs.
    #[test]
    fn test_ignore_gitlab_toc() {
        let markdown = r"[[_TOC_]][TOC]";
        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    /// Don't extract the text of autolinks, as this is the link itself already.
    /// Prevents a regression of <https://github.com/lycheeverse/lychee/issues/2150>
    #[test]
    fn test_autolink() {
        let markdown = "<http://example>";
        assert_eq!(extract_markdown(markdown, false, false).len(), 1);
        assert_eq!(extract_markdown(markdown, true, false).len(), 1);
    }

    /// A URL used as link text is not extracted when `include_verbatim` is disabled.
    #[test]
    fn test_link_text_not_checked() {
        // Test that link text is not extracted as a separate link by default
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, false, false);

        // Should only extract the destination URL, not the link text
        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(1, 1),
        }];

        assert_eq!(uris, expected);
        assert_eq!(
            uris.len(),
            1,
            "Should only find destination URL, not link text"
        );
    }

    /// A URL used as link text is extracted in addition to the destination
    /// when `include_verbatim` is enabled.
    #[test]
    fn test_link_text_checked_with_include_verbatim() {
        // Test that link text IS extracted when include_verbatim is true
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, true, false);

        // Should extract both the link text AND the destination URL
        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(1, 1),
            },
            RawUri {
                text: "https://lycheerepublic.gov/notexist".to_string(),
                element: None,
                attribute: None,
                span: span(1, 2),
            },
        ];

        assert_eq!(
            uris.len(),
            2,
            "Should find both destination URL and link text"
        );
        // Check that both expected URLs are present (order might vary)
        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}"
            );
        }
    }

    /// Inline, reference, collapsed and shortcut links all yield their (relative) destination.
    #[test]
    fn test_reference_links_extraction() {
        // Test that all types of reference links are extracted correctly
        let markdown = r"
Inline link: [link1](target1.md)

Reference link: [link2][ref2]
Collapsed link: [link3][]
Shortcut link: [link4]

[ref2]: target2.md
[link3]: target3.md
[link4]: target4.md
";
        let uris = extract_markdown(markdown, false, false);

        let expected = vec![
            RawUri {
                text: "target1.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(2, 14),
            },
            RawUri {
                text: "target2.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 17),
            },
            RawUri {
                text: "target3.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(5, 17),
            },
            RawUri {
                text: "target4.md".to_string(),
                element: Some("a".to_string()),
                span: span(6, 16),
                attribute: Some("href".to_string()),
            },
        ];

        assert_eq!(uris.len(), 4, "Should extract all four link types");

        // Check that all expected URIs are present (order might vary)
        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}. Found: {uris:?}"
            );
        }
    }

    /// Potholes (`|bar`) and fragments (`#bar`) are stripped from wikilink destinations.
    #[test]
    fn test_clean_wikilink() {
        let markdown = r"
[[foo|bar]]
[[foo#bar]]
[[foo#bar|baz]]
";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(2, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(3, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(4, 3),
            },
        ];
        assert_eq!(uris, expected);
    }

    /// A link in an element nested inside a multi-line HTML block is found,
    /// with a span relative to the whole document.
    #[test]
    fn test_nested_html() {
        let input = r#"<Foo>
          <Bar href="https://example.com" >
          Some text
          </Bar>
        </Foo>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("bar".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 22),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    /// Wikilinks whose destination is empty once pothole and fragment are removed yield no URIs.
    #[test]
    fn test_wikilink_extraction_returns_none_on_empty_links() {
        let markdown = r"
[[|bar]]
[[#bar]]
[[#bar|baz]]
";

        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    /// An attribute of a JSX-style tag that spans several lines is still extracted.
    #[test]
    fn test_mdx_multiline_jsx() {
        let input = r#"<CardGroup cols={1}>
  <Card
    title="Example"
    href="https://example.com"
  >
    Some text
  </Card>
</CardGroup>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("card".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 11),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    // Test that Markdown links inside HTML blocks are still parsed correctly.
    // pulldown_cmark parses block-level HTML tags as separate HTML blocks, so
    // Markdown content between them is processed normally.
    #[test]
    fn test_markdown_inside_html_block() {
        let input = r"<div>

[markdown link](https://example.com/markdown)

</div>

<span>[another link](https://example.com/another)</span>";

        let uris = extract_markdown(input, false, false);

        // Verify both Markdown links are extracted
        let expected_urls = vec![
            "https://example.com/markdown",
            "https://example.com/another",
        ];

        assert_eq!(uris.len(), 2, "Should extract both Markdown links");

        for expected_url in expected_urls {
            assert!(
                uris.iter().any(|u| u.text == expected_url),
                "Should find URL: {expected_url}"
            );
        }

        // Verify they're recognized as Markdown links (i.e. element: "a", attribute: "href")
        for uri in &uris {
            assert_eq!(uri.element, Some("a".to_string()));
            assert_eq!(uri.attribute, Some("href".to_string()));
        }
    }

    /// A wikilink with both a fragment and a pothole is reduced to its bare destination.
    #[test]
    fn test_remove_wikilink_potholes_and_fragments() {
        let markdown = r"[[foo#bar|baz]]";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![RawUri {
            text: "foo".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        assert_eq!(uris, expected);
    }
}