1use std::collections::{HashMap, HashSet};
3
4use log::warn;
5use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset};
6
7use crate::{
8 checker::wikilink::wikilink,
9 extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext},
10 types::uri::raw::{
11 OffsetSpanProvider, RawUri, RawUriSpan, SourceSpanProvider, SpanProvider as _,
12 },
13};
14
15use super::html::html5gum::extract_html_fragments;
16
17fn md_extensions() -> Options {
20 Options::ENABLE_HEADING_ATTRIBUTES
21 | Options::ENABLE_MATH
22 | Options::ENABLE_WIKILINKS
23 | Options::ENABLE_FOOTNOTES
24}
25
26#[expect(clippy::too_many_lines)]
29pub(crate) fn extract_markdown(
30 input: &str,
31 include_verbatim: bool,
32 include_wikilinks: bool,
33) -> Vec<RawUri> {
34 let mut inside_code_block = false;
35 let mut inside_link_label = false; let mut inside_extracted_link = false; let mut inside_html_block = false;
40 let mut html_block_buffer = String::new();
41 let mut html_block_start_offset = 0;
42
43 let span_provider = SourceSpanProvider::from_input(input);
44 let parser =
45 TextMergeWithOffset::new(Parser::new_ext(input, md_extensions()).into_offset_iter());
46 parser
47 .filter_map(|(event, span)| match event {
48 Event::Start(Tag::Link {
50 link_type,
51 dest_url,
52 ..
53 }) => {
54 match link_type {
55 LinkType::Inline => {
58 inside_link_label = true;
59 Some(raw_uri(&dest_url, span_provider.span(span.start)))
60 }
61 LinkType::Reference |
63 LinkType::ReferenceUnknown |
65 LinkType::Collapsed|
67 LinkType::CollapsedUnknown |
69 LinkType::Shortcut |
71 LinkType::ShortcutUnknown => {
73 inside_link_label = true;
74 Some(raw_uri(&dest_url, span_provider.span(span.start)))
77 },
78 LinkType::Autolink |
80 LinkType::Email => {
82 inside_extracted_link = true;
83 let span_provider = get_email_span_provider(&span_provider, &span, link_type);
84 Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider))
85 }
86 LinkType::WikiLink { has_pothole } => {
88 if !include_wikilinks {
90 return None;
91 }
92 inside_extracted_link = true;
93 if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
95 return None;
96 }
97
98 if let Ok(wikilink) = wikilink(&dest_url, has_pothole) {
99 Some(vec![RawUri {
100 text: wikilink.to_string(),
101 element: Some("a".to_string()),
102 attribute: Some("wikilink".to_string()),
103 span: span_provider.span(span.start + 2)
105 }])
106 } else {
107 warn!("The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments");
108 None
109 }
110 }
111 }
112 }
113
114 Event::Start(Tag::Image { dest_url, .. }) => Some(extract_image(&dest_url, span_provider.span(span.start))),
115
116 Event::Start(Tag::CodeBlock(_)) => {
118 inside_code_block = true;
119 None
120 }
121 Event::End(TagEnd::CodeBlock) => {
122 inside_code_block = false;
123 None
124 }
125
126 Event::Text(txt) => {
128 if inside_extracted_link
129 || (inside_link_label && !include_verbatim)
130 || (inside_code_block && !include_verbatim) {
131 None
132 } else {
133 Some(extract_raw_uri_from_plaintext(
134 &txt,
135 &OffsetSpanProvider { offset: span.start, inner: &span_provider }
136 ))
137 }
138 }
139
140 Event::Start(Tag::HtmlBlock) => {
142 inside_html_block = true;
143 html_block_buffer.clear();
144 html_block_start_offset = span.start;
145 None
146 }
147
148 Event::End(TagEnd::HtmlBlock) => {
150 inside_html_block = false;
151 if html_block_buffer.is_empty() {
152 None
153 } else {
154 Some(extract_html_with_span(
155 &html_block_buffer,
156 include_verbatim,
157 OffsetSpanProvider {
158 offset: html_block_start_offset,
159 inner: &span_provider
160 }
161 ))
162 }
163 }
164
165 Event::Html(html) => {
167 if inside_html_block {
168 html_block_buffer.push_str(&html);
170 None
171 } else {
172 Some(extract_html_with_span(
174 &html,
175 include_verbatim,
176 OffsetSpanProvider { offset: span.start, inner: &span_provider }
177 ))
178 }
179 }
180
181 Event::InlineHtml(html) => {
183 Some(extract_html_with_span(
184 &html,
185 include_verbatim,
186 OffsetSpanProvider { offset: span.start, inner: &span_provider }
187 ))
188 }
189
190 Event::Code(code) => {
192 if include_verbatim {
193 Some(extract_raw_uri_from_plaintext(
195 &code,
196 &OffsetSpanProvider { offset: span.start + 1, inner: &span_provider }
197 ))
198 } else {
199 None
200 }
201 }
202
203 Event::End(TagEnd::Link) => {
204 inside_link_label = false;
205 inside_extracted_link = false;
206 None
207 }
208
209 #[expect(clippy::match_same_arms, reason = "Skip footnote references and definitions explicitly - they're not links to check")]
210 Event::FootnoteReference(_) | Event::Start(Tag::FootnoteDefinition(_)) | Event::End(TagEnd::FootnoteDefinition) => None,
211
212 _ => None,
214 })
215 .flatten()
216 .collect()
217}
218
219fn get_email_span_provider<'a>(
220 span_provider: &'a SourceSpanProvider<'_>,
221 span: &std::ops::Range<usize>,
222 link_type: LinkType,
223) -> OffsetSpanProvider<'a> {
224 let offset = match link_type {
225 LinkType::Reference | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown => 0,
227 LinkType::ReferenceUnknown
229 | LinkType::Collapsed
230 | LinkType::Shortcut
231 | LinkType::Autolink
232 | LinkType::Email => 1,
233 _ => {
234 debug_assert!(false, "Unexpected email link type: {link_type:?}");
235 0
236 }
237 };
238
239 OffsetSpanProvider {
240 offset: span.start + offset,
241 inner: span_provider,
242 }
243}
244
245fn extract_image(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
248 vec![RawUri {
249 text: dest_url.to_string(),
250 element: Some("img".to_string()),
251 attribute: Some("src".to_string()),
252 span,
253 }]
254}
255
256fn raw_uri(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
259 vec![RawUri {
260 text: dest_url.to_string(),
261 element: Some("a".to_string()),
262 attribute: Some("href".to_string()),
263 span,
266 }]
267}
268
269pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
277 let mut in_heading = false;
278 let mut heading_text = String::new();
279 let mut heading_id: Option<CowStr<'_>> = None;
280 let mut id_generator = HeadingIdGenerator::default();
281
282 let mut out = HashSet::new();
283
284 for event in Parser::new_ext(input, md_extensions()) {
285 match event {
286 Event::Start(Tag::Heading { id, .. }) => {
287 heading_id = id;
288 in_heading = true;
289 }
290 Event::End(TagEnd::Heading(_)) => {
291 if let Some(frag) = heading_id.take() {
292 out.insert(frag.to_string());
293 }
294
295 if !heading_text.is_empty() {
296 let id = id_generator.generate(&heading_text);
297 out.insert(id);
298 heading_text.clear();
299 }
300
301 in_heading = false;
302 }
303 Event::Text(text) | Event::Code(text) if in_heading => {
304 heading_text.push_str(&text);
305 }
306
307 Event::Html(html) | Event::InlineHtml(html) => {
309 out.extend(extract_html_fragments(&html));
310 }
311
312 _ => (),
314 }
315 }
316 out
317}
318
/// Generates unique fragment ids for Markdown headings.
///
/// The heading text is turned into a kebab-case slug; repeated slugs get an
/// incrementing `-N` suffix (`a-test`, `a-test-1`, `a-test-2`, ...).
#[derive(Default)]
struct HeadingIdGenerator {
    /// Maps every id handed out so far to the highest numeric suffix that has
    /// been appended to it (`0` if it has only been used without a suffix).
    counter: HashMap<String, usize>,
}

impl HeadingIdGenerator {
    /// Returns a fragment id for `heading` that differs from every id this
    /// generator has returned before.
    ///
    /// If the plain slug is already taken, `-1`, `-2`, ... is appended until
    /// an unused id is found. This also covers a generated id clashing with
    /// the slug of a literal heading: "Foo", "Foo", "Foo 1" yields `foo`,
    /// `foo-1`, `foo-1-1`. The scheme follows the de-duplication used by
    /// `github-slugger`.
    fn generate(&mut self, heading: &str) -> String {
        let base = Self::into_kebab_case(heading);
        let mut id = base.clone();

        // Whenever this loop runs, `base` itself is already registered, so
        // `or_insert` never actually inserts; it merely avoids an `unwrap`.
        while self.counter.contains_key(&id) {
            let suffix = self.counter.entry(base.clone()).or_insert(0);
            *suffix += 1;
            id = format!("{base}-{suffix}");
        }

        self.counter.insert(id.clone(), 0);
        id
    }

    /// Converts heading text into a kebab-case slug.
    ///
    /// The text is lowercased, alphanumeric characters, `_` and `-` are kept,
    /// every whitespace character becomes a single `-` (runs of whitespace are
    /// not collapsed), and all other characters are dropped.
    #[must_use]
    fn into_kebab_case(text: &str) -> String {
        text.to_lowercase()
            .chars()
            .filter_map(|ch| match ch {
                c if c.is_alphanumeric() || c == '_' || c == '-' => Some(c),
                c if c.is_whitespace() => Some('-'),
                _ => None,
            })
            .collect()
    }
}
353
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::span;

    use super::*;

    /// Shared fixture with headings, an inline link, a fenced code block,
    /// inline code and inline HTML. No line is indented, so the expected
    /// span columns below are counted from the first character of each line.
    const MD_INPUT: &str = r#"
# A Test

Some link in text [here](https://foo.com)

## A test {#well-still-the-same-test}

Code:

```bash
https://bar.com/123
```

or inline like `https://bar.org` for instance.

### Some `code` in a heading.

[example](http://example.com)

<span id="the-end">The End</span>
        "#;

    #[test]
    fn test_extract_fragments() {
        let expected = HashSet::from([
            "a-test".to_string(),
            "a-test-1".to_string(),
            "well-still-the-same-test".to_string(),
            "some-code-in-a-heading".to_string(),
            "the-end".to_string(),
        ]);
        let actual = extract_markdown_fragments(MD_INPUT);
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_skip_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, false, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "https://bar.com/123".to_string(),
                element: None,
                attribute: None,
                span: span(11, 1),
            },
            RawUri {
                text: "https://bar.org".to_string(),
                element: None,
                attribute: None,
                span: span(14, 17),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, true, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_skip_verbatim_html() {
        let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
</pre>";

        let expected = vec![];

        let uris = extract_markdown(input, false, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_kebab_case() {
        let check = |input, expected| {
            let actual = HeadingIdGenerator::into_kebab_case(input);
            assert_eq!(actual, expected);
        };
        check("A Heading", "a-heading");
        check(
            "This header has a :thumbsup: in it",
            "this-header-has-a-thumbsup-in-it",
        );
        check(
            "Header with 한글 characters (using unicode)",
            "header-with-한글-characters-using-unicode",
        );
        check(
            "Underscores foo_bar_, dots . and numbers 1.7e-3",
            "underscores-foo_bar_-dots--and-numbers-17e-3",
        );
        // Ten spaces in, ten hyphens out: whitespace runs are not collapsed.
        check("Many          spaces", "many----------spaces");
    }

    #[test]
    fn test_markdown_math() {
        let input = r"
$$
[\psi](\mathbf{L})
$$
";
        let uris = extract_markdown(input, true, false);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_single_word_footnote_is_not_detected_as_link() {
        let markdown = "This footnote is[^actually] a link.\n\n[^actually]: not";
        let expected = vec![];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_underscore_in_urls_middle() {
        let markdown = r"https://example.com/_/foo";
        let expected = vec![RawUri {
            text: "https://example.com/_/foo".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_underscore_in_urls_end() {
        let markdown = r"https://example.com/_";
        let expected = vec![RawUri {
            text: "https://example.com/_".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_wiki_link() {
        let markdown = r"[[https://example.com/destination]]";
        let expected = vec![RawUri {
            text: "https://example.com/destination".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_multiple_wiki_links() {
        let markdown = r"[[https://example.com/destination]][[https://example.com/source]]";
        let expected = vec![
            RawUri {
                text: "https://example.com/destination".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 3),
            },
            RawUri {
                text: "https://example.com/source".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 38),
            },
        ];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_ignore_gitlab_toc() {
        let markdown = r"[[_TOC_]][TOC]";
        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    /// Autolinks are extracted with and without `include_verbatim`.
    #[test]
    fn test_autolink() {
        let markdown = "<http://example>";
        assert_eq!(extract_markdown(markdown, false, false).len(), 1);
        assert_eq!(extract_markdown(markdown, true, false).len(), 1);
    }

    #[test]
    fn test_link_text_not_checked() {
        // The link label itself looks like a URL but must be ignored.
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, false, false);

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(1, 1),
        }];

        assert_eq!(uris, expected);
        assert_eq!(
            uris.len(),
            1,
            "Should only find destination URL, not link text"
        );
    }

    #[test]
    fn test_link_text_checked_with_include_verbatim() {
        // With `include_verbatim` the URL inside the label is reported too.
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, true, false);

        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(1, 1),
            },
            RawUri {
                text: "https://lycheerepublic.gov/notexist".to_string(),
                element: None,
                attribute: None,
                span: span(1, 2),
            },
        ];

        assert_eq!(
            uris.len(),
            2,
            "Should find both destination URL and link text"
        );
        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}"
            );
        }
    }

    #[test]
    fn test_reference_links_extraction() {
        let markdown = r"
Inline link: [link1](target1.md)

Reference link: [link2][ref2]
Collapsed link: [link3][]
Shortcut link: [link4]

[ref2]: target2.md
[link3]: target3.md
[link4]: target4.md
";
        let uris = extract_markdown(markdown, false, false);

        let expected = vec![
            RawUri {
                text: "target1.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(2, 14),
            },
            RawUri {
                text: "target2.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 17),
            },
            RawUri {
                text: "target3.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(5, 17),
            },
            RawUri {
                text: "target4.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(6, 16),
            },
        ];

        assert_eq!(uris.len(), 4, "Should extract all four link types");

        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}. Found: {uris:?}"
            );
        }
    }

    #[test]
    fn test_clean_wikilink() {
        let markdown = r"
[[foo|bar]]
[[foo#bar]]
[[foo#bar|baz]]
";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(2, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(3, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(4, 3),
            },
        ];
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_nested_html() {
        // The `<Bar` line is indented by ten spaces so that the `href` value
        // starts at column 22, as asserted below.
        let input = r#"<Foo>
          <Bar href="https://example.com" >
            Some text
          </Bar>
        </Foo>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("bar".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 22),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    #[test]
    fn test_wikilink_extraction_returns_none_on_empty_links() {
        let markdown = r"
[[|bar]]
[[#bar]]
[[#bar|baz]]
";

        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_mdx_multiline_jsx() {
        // The `href` line is indented by four spaces so that its value starts
        // at column 11, as asserted below.
        let input = r#"<CardGroup cols={1}>
  <Card
    title="Example"
    href="https://example.com"
  >
    Some text
  </Card>
</CardGroup>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("card".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 11),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    /// Markdown links placed between or inside raw HTML tags are still
    /// reported as regular `a`/`href` links.
    #[test]
    fn test_markdown_inside_html_block() {
        let input = r"<div>

[markdown link](https://example.com/markdown)

</div>

<span>[another link](https://example.com/another)</span>";

        let uris = extract_markdown(input, false, false);

        let expected_urls = vec![
            "https://example.com/markdown",
            "https://example.com/another",
        ];

        assert_eq!(uris.len(), 2, "Should extract both Markdown links");

        for expected_url in expected_urls {
            assert!(
                uris.iter().any(|u| u.text == expected_url),
                "Should find URL: {expected_url}"
            );
        }

        for uri in &uris {
            assert_eq!(uri.element, Some("a".to_string()));
            assert_eq!(uri.attribute, Some("href".to_string()));
        }
    }

    #[test]
    fn test_remove_wikilink_potholes_and_fragments() {
        let markdown = r"[[foo#bar|baz]]";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![RawUri {
            text: "foo".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        assert_eq!(uris, expected);
    }
}