1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
// Evaluates `$code`, and when `$profile` is true prints how long the
// evaluation took to stderr. The value of `$code` is always returned.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let output = $code;
        if $profile {
            let elapsed = timer.elapsed();
            eprintln!("[PROFILE] {}: {:?}", $name, elapsed);
        }
        output
    }};
}

// wasm32 variant: only `$code` is expanded; `$name` and `$profile` are
// accepted so call sites stay identical, but no timing is performed.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        $code
    }};
}
26
27static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
30 Regex::new(
31 r#"(?sx)
32 \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
33 (?:
34 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
35 |
36 \[([^\]]*)\] # Reference ID in group 6
37 )"#
38 ).unwrap()
39});
40
41static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
44 Regex::new(
45 r#"(?sx)
46 !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
47 (?:
48 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
49 |
50 \[([^\]]*)\] # Reference ID in group 6
51 )"#
52 ).unwrap()
53});
54
55static REF_DEF_PATTERN: LazyLock<Regex> =
57 LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
59static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
61 Regex::new(
62 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
63 ).unwrap()
64});
65
66static BARE_EMAIL_PATTERN: LazyLock<Regex> =
68 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
70static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
73#[derive(Debug, Clone)]
75pub struct LineInfo {
76 pub byte_offset: usize,
78 pub byte_len: usize,
80 pub indent: usize,
82 pub is_blank: bool,
84 pub in_code_block: bool,
86 pub in_front_matter: bool,
88 pub in_html_block: bool,
90 pub in_html_comment: bool,
92 pub list_item: Option<ListItemInfo>,
94 pub heading: Option<HeadingInfo>,
96 pub blockquote: Option<BlockquoteInfo>,
98 pub in_mkdocstrings: bool,
100 pub in_esm_block: bool,
102}
103
104impl LineInfo {
105 pub fn content<'a>(&self, source: &'a str) -> &'a str {
107 &source[self.byte_offset..self.byte_offset + self.byte_len]
108 }
109}
110
/// Details of a list item found on a line.
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// The list marker text.
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item number, when one is recorded.
    pub number: Option<usize>,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the item content starts.
    pub content_column: usize,
}
125
/// The syntactic style a heading was written in.
///
/// `Eq` is derived alongside `PartialEq`: the enum has no fields, so
/// equality is total and the type can be used where `Eq` is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX-style heading.
    ATX,
    /// First setext heading style.
    Setext1,
    /// Second setext heading style.
    Setext2,
}
136
137#[derive(Debug, Clone)]
139pub struct ParsedLink<'a> {
140 pub line: usize,
142 pub start_col: usize,
144 pub end_col: usize,
146 pub byte_offset: usize,
148 pub byte_end: usize,
150 pub text: Cow<'a, str>,
152 pub url: Cow<'a, str>,
154 pub is_reference: bool,
156 pub reference_id: Option<Cow<'a, str>>,
158 pub link_type: LinkType,
160}
161
/// A reference the markdown parser reported as unresolved.
#[derive(Clone, Debug)]
pub struct BrokenLinkInfo {
    /// The reference text reported by the parser.
    pub reference: String,
    /// Byte range of the broken link in the document.
    pub span: std::ops::Range<usize>,
}
170
/// A footnote reference found in the document.
#[derive(Clone, Debug)]
pub struct FootnoteRef {
    /// The footnote identifier.
    pub id: String,
    /// 1-based line number of the reference.
    pub line: usize,
    /// Byte offset of the reference start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the reference in the document.
    pub byte_end: usize,
}
183
184#[derive(Debug, Clone)]
186pub struct ParsedImage<'a> {
187 pub line: usize,
189 pub start_col: usize,
191 pub end_col: usize,
193 pub byte_offset: usize,
195 pub byte_end: usize,
197 pub alt_text: Cow<'a, str>,
199 pub url: Cow<'a, str>,
201 pub is_reference: bool,
203 pub reference_id: Option<Cow<'a, str>>,
205 pub link_type: LinkType,
207}
208
/// A reference definition (`[id]: url "title"`) found in the document.
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// The reference id, lowercased.
    pub id: String,
    /// The URL the reference points at.
    pub url: String,
    /// The optional quoted title.
    pub title: Option<String>,
    /// Byte offset of the definition start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition in the document.
    pub byte_end: usize,
}
225
/// An inline code span found in the document.
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// Line number of the span.
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the span in the document.
    pub byte_end: usize,
    /// Number of backticks in the span delimiter.
    pub backtick_count: usize,
    /// The span content.
    pub content: String,
}
244
245#[derive(Debug, Clone)]
247pub struct HeadingInfo {
248 pub level: u8,
250 pub style: HeadingStyle,
252 pub marker: String,
254 pub marker_column: usize,
256 pub content_column: usize,
258 pub text: String,
260 pub custom_id: Option<String>,
262 pub raw_text: String,
264 pub has_closing_sequence: bool,
266 pub closing_sequence: String,
268}
269
/// Details of a blockquote found on a line.
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Blockquote nesting depth.
    pub nesting_level: usize,
    /// Indentation text preceding the blockquote marker.
    pub indent: String,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// The blockquote prefix text.
    pub prefix: String,
    /// The content following the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs the MD028 fix.
    pub needs_md028_fix: bool,
}
290
/// A run of lines forming one list.
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// First line of the block (inclusive).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the list is ordered.
    pub is_ordered: bool,
    /// The list marker, when one is recorded.
    pub marker: Option<String>,
    /// Blockquote prefix associated with the list.
    pub blockquote_prefix: String,
    /// Line numbers of the list items.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the list.
    pub nesting_level: usize,
    /// Largest marker width in the block.
    pub max_marker_width: usize,
}
311
312use std::sync::{Arc, Mutex};
313
/// Precomputed counts of markdown-significant characters in a document.
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    /// NOTE(review): whether `]` is also counted is decided where the counts are computed — confirm.
    pub bracket_count: usize,
    /// Count reported for `` ` ``.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for newline characters.
    pub newline_count: usize,
}
342
/// An HTML tag found in the document.
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// Line number of the tag.
    pub line: usize,
    /// Column at which the tag starts.
    pub start_col: usize,
    /// Column at which the tag ends.
    pub end_col: usize,
    /// Byte offset of the tag start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the tag in the document.
    pub byte_end: usize,
    /// The tag name.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether this is a self-closing tag.
    pub is_self_closing: bool,
    /// The tag as written in the source.
    pub raw_content: String,
}
365
/// An emphasis span found in the document.
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// Line number of the span.
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the span in the document.
    pub byte_end: usize,
    /// The emphasis marker character.
    pub marker: char,
    /// How many marker characters delimit the span.
    pub marker_count: usize,
    /// The emphasised content.
    pub content: String,
}
386
/// A table row found in the document.
#[derive(Clone, Debug)]
pub struct TableRow {
    /// Line number of the row.
    pub line: usize,
    /// Whether the row is a separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Column alignments, one string per column.
    pub column_alignments: Vec<String>,
}
399
/// A bare URL or e-mail address found in the document.
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// Line number of the URL.
    pub line: usize,
    /// Column at which the URL starts.
    pub start_col: usize,
    /// Column at which the URL ends.
    pub end_col: usize,
    /// Byte offset of the URL start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the URL in the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// The kind of URL, as a string.
    pub url_type: String,
}
418
419pub struct LintContext<'a> {
420 pub content: &'a str,
421 pub line_offsets: Vec<usize>,
422 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink<'a>>, pub images: Vec<ParsedImage<'a>>, pub broken_links: Vec<BrokenLinkInfo>, pub footnote_refs: Vec<FootnoteRef>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, pub line_index: crate::utils::range_utils::LineIndex<'a>, jinja_ranges: Vec<(usize, usize)>, pub flavor: MarkdownFlavor, }
442
/// The four consecutive pieces of a blockquote line, each a slice of the
/// original line.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}

/// Splits `line` into indent, `>` markers, the whitespace after them, and
/// the remaining content.
///
/// Returns `None` when the first character after any leading spaces/tabs is
/// not `>`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    // Peel the line apart from the left; every split point sits next to an
    // ASCII character, so the slices are always valid UTF-8 boundaries.
    let after_indent = line.trim_start_matches(is_blank);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    let after_markers = after_indent.trim_start_matches('>');
    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        return None;
    }
    let (markers, _) = after_indent.split_at(marker_len);

    let content = after_markers.trim_start_matches(is_blank);
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
487
488impl<'a> LintContext<'a> {
489 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
490 #[cfg(not(target_arch = "wasm32"))]
491 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
492 #[cfg(target_arch = "wasm32")]
493 let profile = false;
494
495 let line_offsets = profile_section!("Line offsets", profile, {
496 let mut offsets = vec![0];
497 for (i, c) in content.char_indices() {
498 if c == '\n' {
499 offsets.push(i + 1);
500 }
501 }
502 offsets
503 });
504
505 let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
507
508 let html_comment_ranges = profile_section!(
510 "HTML comment ranges",
511 profile,
512 crate::utils::skip_context::compute_html_comment_ranges(content)
513 );
514
515 let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
517 if flavor == MarkdownFlavor::MkDocs {
518 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
519 } else {
520 Vec::new()
521 }
522 });
523
524 let mut lines = profile_section!(
526 "Basic line info",
527 profile,
528 Self::compute_basic_line_info(
529 content,
530 &line_offsets,
531 &code_blocks,
532 flavor,
533 &html_comment_ranges,
534 &autodoc_ranges,
535 )
536 );
537
538 profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
540
541 profile_section!(
543 "ESM blocks",
544 profile,
545 Self::detect_esm_blocks(content, &mut lines, flavor)
546 );
547
548 profile_section!(
550 "Headings & blockquotes",
551 profile,
552 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
553 );
554
555 let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
557
558 let (links, broken_links, footnote_refs) = profile_section!(
560 "Links",
561 profile,
562 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
563 );
564
565 let images = profile_section!(
566 "Images",
567 profile,
568 Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
569 );
570
571 let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
572
573 let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
574
575 let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
577
578 let table_blocks = profile_section!(
580 "Table blocks",
581 profile,
582 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
583 content,
584 &code_blocks,
585 &code_spans,
586 &html_comment_ranges,
587 )
588 );
589
590 let line_index = profile_section!(
592 "Line index",
593 profile,
594 crate::utils::range_utils::LineIndex::new(content)
595 );
596
597 let jinja_ranges = profile_section!(
599 "Jinja ranges",
600 profile,
601 crate::utils::jinja_utils::find_jinja_ranges(content)
602 );
603
604 Self {
605 content,
606 line_offsets,
607 code_blocks,
608 lines,
609 links,
610 images,
611 broken_links,
612 footnote_refs,
613 reference_defs,
614 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
615 list_blocks,
616 char_frequency,
617 html_tags_cache: Mutex::new(None),
618 emphasis_spans_cache: Mutex::new(None),
619 table_rows_cache: Mutex::new(None),
620 bare_urls_cache: Mutex::new(None),
621 html_comment_ranges,
622 table_blocks,
623 line_index,
624 jinja_ranges,
625 flavor,
626 }
627 }
628
629 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
631 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
632
633 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
634 }
635
636 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
638 &self.html_comment_ranges
639 }
640
641 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
643 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
644
645 Arc::clone(cache.get_or_insert_with(|| {
646 Arc::new(Self::parse_html_tags(
647 self.content,
648 &self.lines,
649 &self.code_blocks,
650 self.flavor,
651 ))
652 }))
653 }
654
655 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
657 let mut cache = self
658 .emphasis_spans_cache
659 .lock()
660 .expect("Emphasis spans cache mutex poisoned");
661
662 Arc::clone(
663 cache.get_or_insert_with(|| {
664 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
665 }),
666 )
667 }
668
669 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
671 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
672
673 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
674 }
675
676 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
678 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
679
680 Arc::clone(
681 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
682 )
683 }
684
685 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
687 match self.line_offsets.binary_search(&offset) {
688 Ok(line) => (line + 1, 1),
689 Err(line) => {
690 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
691 (line, offset - line_start + 1)
692 }
693 }
694 }
695
696 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
698 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
700 return true;
701 }
702
703 self.code_spans()
705 .iter()
706 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
707 }
708
709 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
711 if line_num > 0 {
712 self.lines.get(line_num - 1)
713 } else {
714 None
715 }
716 }
717
718 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
720 self.line_info(line_num).map(|info| info.byte_offset)
721 }
722
723 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
725 let normalized_id = ref_id.to_lowercase();
726 self.reference_defs
727 .iter()
728 .find(|def| def.id == normalized_id)
729 .map(|def| def.url.as_str())
730 }
731
732 pub fn is_in_list_block(&self, line_num: usize) -> bool {
734 self.list_blocks
735 .iter()
736 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
737 }
738
739 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
741 self.list_blocks
742 .iter()
743 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
744 }
745
746 pub fn is_in_code_block(&self, line_num: usize) -> bool {
750 if line_num == 0 || line_num > self.lines.len() {
751 return false;
752 }
753 self.lines[line_num - 1].in_code_block
754 }
755
756 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
758 if line_num == 0 || line_num > self.lines.len() {
759 return false;
760 }
761 self.lines[line_num - 1].in_front_matter
762 }
763
764 pub fn is_in_html_block(&self, line_num: usize) -> bool {
766 if line_num == 0 || line_num > self.lines.len() {
767 return false;
768 }
769 self.lines[line_num - 1].in_html_block
770 }
771
772 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
774 if line_num == 0 || line_num > self.lines.len() {
775 return false;
776 }
777
778 let col_0indexed = if col > 0 { col - 1 } else { 0 };
782 let code_spans = self.code_spans();
783 code_spans
784 .iter()
785 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
786 }
787
788 #[inline]
791 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
792 self.reference_defs
793 .iter()
794 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
795 }
796
797 #[inline]
801 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
802 self.html_comment_ranges
803 .iter()
804 .any(|range| byte_pos >= range.start && byte_pos < range.end)
805 }
806
807 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
809 self.jinja_ranges
810 .iter()
811 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
812 }
813
814 pub fn has_char(&self, ch: char) -> bool {
816 match ch {
817 '#' => self.char_frequency.hash_count > 0,
818 '*' => self.char_frequency.asterisk_count > 0,
819 '_' => self.char_frequency.underscore_count > 0,
820 '-' => self.char_frequency.hyphen_count > 0,
821 '+' => self.char_frequency.plus_count > 0,
822 '>' => self.char_frequency.gt_count > 0,
823 '|' => self.char_frequency.pipe_count > 0,
824 '[' => self.char_frequency.bracket_count > 0,
825 '`' => self.char_frequency.backtick_count > 0,
826 '<' => self.char_frequency.lt_count > 0,
827 '!' => self.char_frequency.exclamation_count > 0,
828 '\n' => self.char_frequency.newline_count > 0,
829 _ => self.content.contains(ch), }
831 }
832
833 pub fn char_count(&self, ch: char) -> usize {
835 match ch {
836 '#' => self.char_frequency.hash_count,
837 '*' => self.char_frequency.asterisk_count,
838 '_' => self.char_frequency.underscore_count,
839 '-' => self.char_frequency.hyphen_count,
840 '+' => self.char_frequency.plus_count,
841 '>' => self.char_frequency.gt_count,
842 '|' => self.char_frequency.pipe_count,
843 '[' => self.char_frequency.bracket_count,
844 '`' => self.char_frequency.backtick_count,
845 '<' => self.char_frequency.lt_count,
846 '!' => self.char_frequency.exclamation_count,
847 '\n' => self.char_frequency.newline_count,
848 _ => self.content.matches(ch).count(), }
850 }
851
852 pub fn likely_has_headings(&self) -> bool {
854 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
856
857 pub fn likely_has_lists(&self) -> bool {
859 self.char_frequency.asterisk_count > 0
860 || self.char_frequency.hyphen_count > 0
861 || self.char_frequency.plus_count > 0
862 }
863
864 pub fn likely_has_emphasis(&self) -> bool {
866 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
867 }
868
869 pub fn likely_has_tables(&self) -> bool {
871 self.char_frequency.pipe_count > 2
872 }
873
874 pub fn likely_has_blockquotes(&self) -> bool {
876 self.char_frequency.gt_count > 0
877 }
878
879 pub fn likely_has_code(&self) -> bool {
881 self.char_frequency.backtick_count > 0
882 }
883
884 pub fn likely_has_links_or_images(&self) -> bool {
886 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
887 }
888
889 pub fn likely_has_html(&self) -> bool {
891 self.char_frequency.lt_count > 0
892 }
893
894 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
896 self.html_tags()
897 .iter()
898 .filter(|tag| tag.line == line_num)
899 .cloned()
900 .collect()
901 }
902
903 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
905 self.emphasis_spans()
906 .iter()
907 .filter(|span| span.line == line_num)
908 .cloned()
909 .collect()
910 }
911
912 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
914 self.table_rows()
915 .iter()
916 .filter(|row| row.line == line_num)
917 .cloned()
918 .collect()
919 }
920
921 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
923 self.bare_urls()
924 .iter()
925 .filter(|url| url.line == line_num)
926 .cloned()
927 .collect()
928 }
929
930 #[inline]
936 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
937 let idx = match lines.binary_search_by(|line| {
939 if byte_offset < line.byte_offset {
940 std::cmp::Ordering::Greater
941 } else if byte_offset > line.byte_offset + line.byte_len {
942 std::cmp::Ordering::Less
943 } else {
944 std::cmp::Ordering::Equal
945 }
946 }) {
947 Ok(idx) => idx,
948 Err(idx) => idx.saturating_sub(1),
949 };
950
951 let line = &lines[idx];
952 let line_num = idx + 1;
953 let col = byte_offset.saturating_sub(line.byte_offset);
954
955 (idx, line_num, col)
956 }
957
958 #[inline]
960 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
961 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
963
964 if idx > 0 {
966 let span = &code_spans[idx - 1];
967 if offset >= span.byte_offset && offset < span.byte_end {
968 return true;
969 }
970 }
971
972 false
973 }
974
975 fn parse_links(
977 content: &'a str,
978 lines: &[LineInfo],
979 code_blocks: &[(usize, usize)],
980 code_spans: &[CodeSpan],
981 flavor: MarkdownFlavor,
982 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
983 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
984 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
985 use std::collections::HashSet;
986
987 let mut links = Vec::with_capacity(content.len() / 500);
988 let mut broken_links = Vec::new();
989 let mut footnote_refs = Vec::new();
990
991 let mut found_positions = HashSet::new();
993
994 let mut options = Options::empty();
1004 options.insert(Options::ENABLE_WIKILINKS);
1005 options.insert(Options::ENABLE_FOOTNOTES);
1006
1007 let parser = Parser::new_with_broken_link_callback(
1008 content,
1009 options,
1010 Some(|link: BrokenLink<'_>| {
1011 broken_links.push(BrokenLinkInfo {
1012 reference: link.reference.to_string(),
1013 span: link.span.clone(),
1014 });
1015 None
1016 }),
1017 )
1018 .into_offset_iter();
1019
1020 let mut link_stack: Vec<(
1021 usize,
1022 usize,
1023 pulldown_cmark::CowStr<'a>,
1024 LinkType,
1025 pulldown_cmark::CowStr<'a>,
1026 )> = Vec::new();
1027 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1030 match event {
1031 Event::Start(Tag::Link {
1032 link_type,
1033 dest_url,
1034 id,
1035 ..
1036 }) => {
1037 link_stack.push((range.start, range.end, dest_url, link_type, id));
1039 text_chunks.clear();
1040 }
1041 Event::Text(text) if !link_stack.is_empty() => {
1042 text_chunks.push((text.to_string(), range.start, range.end));
1044 }
1045 Event::Code(code) if !link_stack.is_empty() => {
1046 let code_text = format!("`{code}`");
1048 text_chunks.push((code_text, range.start, range.end));
1049 }
1050 Event::End(TagEnd::Link) => {
1051 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1052 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1054 text_chunks.clear();
1055 continue;
1056 }
1057
1058 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1060
1061 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1063 text_chunks.clear();
1064 continue;
1065 }
1066
1067 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1068
1069 let is_reference = matches!(
1070 link_type,
1071 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1072 );
1073
1074 let link_text = if start_pos < content.len() {
1077 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1078
1079 let mut close_pos = None;
1083 let mut depth = 0;
1084 let mut in_code_span = false;
1085
1086 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1087 let mut backslash_count = 0;
1089 let mut j = i;
1090 while j > 0 && link_bytes[j - 1] == b'\\' {
1091 backslash_count += 1;
1092 j -= 1;
1093 }
1094 let is_escaped = backslash_count % 2 != 0;
1095
1096 if byte == b'`' && !is_escaped {
1098 in_code_span = !in_code_span;
1099 }
1100
1101 if !is_escaped && !in_code_span {
1103 if byte == b'[' {
1104 depth += 1;
1105 } else if byte == b']' {
1106 if depth == 0 {
1107 close_pos = Some(i);
1109 break;
1110 } else {
1111 depth -= 1;
1112 }
1113 }
1114 }
1115 }
1116
1117 if let Some(pos) = close_pos {
1118 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1119 } else {
1120 Cow::Borrowed("")
1121 }
1122 } else {
1123 Cow::Borrowed("")
1124 };
1125
1126 let reference_id = if is_reference && !ref_id.is_empty() {
1128 Some(Cow::Owned(ref_id.to_lowercase()))
1129 } else if is_reference {
1130 Some(Cow::Owned(link_text.to_lowercase()))
1132 } else {
1133 None
1134 };
1135
1136 let has_escaped_bang = start_pos >= 2
1140 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1141 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1142
1143 let has_escaped_bracket =
1146 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1147
1148 if has_escaped_bang || has_escaped_bracket {
1149 text_chunks.clear();
1150 continue; }
1152
1153 found_positions.insert(start_pos);
1155
1156 links.push(ParsedLink {
1157 line: line_num,
1158 start_col: col_start,
1159 end_col: col_end,
1160 byte_offset: start_pos,
1161 byte_end: range.end,
1162 text: link_text,
1163 url: Cow::Owned(url.to_string()),
1164 is_reference,
1165 reference_id,
1166 link_type,
1167 });
1168
1169 text_chunks.clear();
1170 }
1171 }
1172 Event::FootnoteReference(footnote_id) => {
1173 if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1176 continue;
1177 }
1178
1179 let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1180 footnote_refs.push(FootnoteRef {
1181 id: footnote_id.to_string(),
1182 line: line_num,
1183 byte_offset: range.start,
1184 byte_end: range.end,
1185 });
1186 }
1187 _ => {}
1188 }
1189 }
1190
1191 for cap in LINK_PATTERN.captures_iter(content) {
1195 let full_match = cap.get(0).unwrap();
1196 let match_start = full_match.start();
1197 let match_end = full_match.end();
1198
1199 if found_positions.contains(&match_start) {
1201 continue;
1202 }
1203
1204 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1206 continue;
1207 }
1208
1209 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1211 continue;
1212 }
1213
1214 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1216 continue;
1217 }
1218
1219 if Self::is_offset_in_code_span(code_spans, match_start) {
1221 continue;
1222 }
1223
1224 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1226 continue;
1227 }
1228
1229 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1231
1232 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1234 continue;
1235 }
1236
1237 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1238
1239 let text = cap.get(1).map_or("", |m| m.as_str());
1240
1241 if let Some(ref_id) = cap.get(6) {
1243 let ref_id_str = ref_id.as_str();
1244 let normalized_ref = if ref_id_str.is_empty() {
1245 Cow::Owned(text.to_lowercase()) } else {
1247 Cow::Owned(ref_id_str.to_lowercase())
1248 };
1249
1250 links.push(ParsedLink {
1252 line: line_num,
1253 start_col: col_start,
1254 end_col: col_end,
1255 byte_offset: match_start,
1256 byte_end: match_end,
1257 text: Cow::Borrowed(text),
1258 url: Cow::Borrowed(""), is_reference: true,
1260 reference_id: Some(normalized_ref),
1261 link_type: LinkType::Reference, });
1263 }
1264 }
1265
1266 (links, broken_links, footnote_refs)
1267 }
1268
1269 fn parse_images(
1271 content: &'a str,
1272 lines: &[LineInfo],
1273 code_blocks: &[(usize, usize)],
1274 code_spans: &[CodeSpan],
1275 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1276 ) -> Vec<ParsedImage<'a>> {
1277 use crate::utils::skip_context::is_in_html_comment_ranges;
1278 use std::collections::HashSet;
1279
1280 let mut images = Vec::with_capacity(content.len() / 1000);
1282 let mut found_positions = HashSet::new();
1283
1284 let parser = Parser::new(content).into_offset_iter();
1286 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1287 Vec::new();
1288 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1291 match event {
1292 Event::Start(Tag::Image {
1293 link_type,
1294 dest_url,
1295 id,
1296 ..
1297 }) => {
1298 image_stack.push((range.start, dest_url, link_type, id));
1299 text_chunks.clear();
1300 }
1301 Event::Text(text) if !image_stack.is_empty() => {
1302 text_chunks.push((text.to_string(), range.start, range.end));
1303 }
1304 Event::Code(code) if !image_stack.is_empty() => {
1305 let code_text = format!("`{code}`");
1306 text_chunks.push((code_text, range.start, range.end));
1307 }
1308 Event::End(TagEnd::Image) => {
1309 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1310 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1312 continue;
1313 }
1314
1315 if Self::is_offset_in_code_span(code_spans, start_pos) {
1317 continue;
1318 }
1319
1320 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1322 continue;
1323 }
1324
1325 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1327 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1328
1329 let is_reference = matches!(
1330 link_type,
1331 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1332 );
1333
1334 let alt_text = if start_pos < content.len() {
1337 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1338
1339 let mut close_pos = None;
1342 let mut depth = 0;
1343
1344 if image_bytes.len() > 2 {
1345 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1346 let mut backslash_count = 0;
1348 let mut j = i;
1349 while j > 0 && image_bytes[j - 1] == b'\\' {
1350 backslash_count += 1;
1351 j -= 1;
1352 }
1353 let is_escaped = backslash_count % 2 != 0;
1354
1355 if !is_escaped {
1356 if byte == b'[' {
1357 depth += 1;
1358 } else if byte == b']' {
1359 if depth == 0 {
1360 close_pos = Some(i);
1362 break;
1363 } else {
1364 depth -= 1;
1365 }
1366 }
1367 }
1368 }
1369 }
1370
1371 if let Some(pos) = close_pos {
1372 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1373 } else {
1374 Cow::Borrowed("")
1375 }
1376 } else {
1377 Cow::Borrowed("")
1378 };
1379
1380 let reference_id = if is_reference && !ref_id.is_empty() {
1381 Some(Cow::Owned(ref_id.to_lowercase()))
1382 } else if is_reference {
1383 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1385 None
1386 };
1387
1388 found_positions.insert(start_pos);
1389 images.push(ParsedImage {
1390 line: line_num,
1391 start_col: col_start,
1392 end_col: col_end,
1393 byte_offset: start_pos,
1394 byte_end: range.end,
1395 alt_text,
1396 url: Cow::Owned(url.to_string()),
1397 is_reference,
1398 reference_id,
1399 link_type,
1400 });
1401 }
1402 }
1403 _ => {}
1404 }
1405 }
1406
1407 for cap in IMAGE_PATTERN.captures_iter(content) {
1409 let full_match = cap.get(0).unwrap();
1410 let match_start = full_match.start();
1411 let match_end = full_match.end();
1412
1413 if found_positions.contains(&match_start) {
1415 continue;
1416 }
1417
1418 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1420 continue;
1421 }
1422
1423 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1425 || Self::is_offset_in_code_span(code_spans, match_start)
1426 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1427 {
1428 continue;
1429 }
1430
1431 if let Some(ref_id) = cap.get(6) {
1433 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1434 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1435 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1436 let ref_id_str = ref_id.as_str();
1437 let normalized_ref = if ref_id_str.is_empty() {
1438 Cow::Owned(alt_text.to_lowercase())
1439 } else {
1440 Cow::Owned(ref_id_str.to_lowercase())
1441 };
1442
1443 images.push(ParsedImage {
1444 line: line_num,
1445 start_col: col_start,
1446 end_col: col_end,
1447 byte_offset: match_start,
1448 byte_end: match_end,
1449 alt_text: Cow::Borrowed(alt_text),
1450 url: Cow::Borrowed(""),
1451 is_reference: true,
1452 reference_id: Some(normalized_ref),
1453 link_type: LinkType::Reference, });
1455 }
1456 }
1457
1458 images
1459 }
1460
1461 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1463 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1467 if line_info.in_code_block {
1469 continue;
1470 }
1471
1472 let line = line_info.content(content);
1473 let line_num = line_idx + 1;
1474
1475 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1476 let id = cap.get(1).unwrap().as_str().to_lowercase();
1477 let url = cap.get(2).unwrap().as_str().to_string();
1478 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1479
1480 let match_obj = cap.get(0).unwrap();
1483 let byte_offset = line_info.byte_offset + match_obj.start();
1484 let byte_end = line_info.byte_offset + match_obj.end();
1485
1486 refs.push(ReferenceDef {
1487 line: line_num,
1488 id,
1489 url,
1490 title,
1491 byte_offset,
1492 byte_end,
1493 });
1494 }
1495 }
1496
1497 refs
1498 }
1499
/// Splits a line that starts (after optional whitespace) with a single `>`
/// marker into its blockquote prefix and the remaining content.
///
/// The prefix covers the leading whitespace, the first `>` and any whitespace
/// that follows it. Only one marker is consumed, so nested markers stay in the
/// content part. Returns `None` when the line is not a blockquote line.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let after_marker = line.trim_start().strip_prefix('>')?;
    let body = after_marker.trim_start();

    // `body` is a suffix of `line`, so everything in front of it is the prefix.
    let prefix_len = line.len() - body.len();
    Some((&line[..prefix_len], body))
}
1518
/// Recognises an unordered list marker (`-`, `*` or `+`) at the start of a
/// line, after optional spaces/tabs.
///
/// Returns `(leading_whitespace, marker, spacing_after_marker, rest)`, or
/// `None` when the first non-blank byte is not a list marker. The spacing may
/// be empty; callers decide whether that still counts as a list item.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    fn blank_run(bytes: &[u8]) -> usize {
        bytes.iter().take_while(|&&b| b == b' ' || b == b'\t').count()
    }

    let raw = line.as_bytes();
    let marker_pos = blank_run(raw);

    let marker = match *raw.get(marker_pos)? {
        b'-' => '-',
        b'*' => '*',
        b'+' => '+',
        _ => return None,
    };

    // The marker is a single ASCII byte, so the following offsets are all
    // valid string boundaries.
    let spacing_start = marker_pos + 1;
    let rest_start = spacing_start + blank_run(&raw[spacing_start..]);

    Some((
        &line[..marker_pos],
        marker,
        &line[spacing_start..rest_start],
        &line[rest_start..],
    ))
}
1551
/// Recognises an ordered list marker (ASCII digits followed by `.` or `)`) at
/// the start of a line, after optional spaces/tabs.
///
/// Returns `(leading_whitespace, number, delimiter, spacing_after_marker,
/// rest)`, or `None` when there are no digits or the digits are not followed
/// by one of the two delimiters. The spacing may be empty.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    fn run_len(bytes: &[u8], keep: impl Fn(u8) -> bool) -> usize {
        bytes.iter().take_while(|&&b| keep(b)).count()
    }
    let is_blank = |b: u8| b == b' ' || b == b'\t';

    let raw = line.as_bytes();
    let number_start = run_len(raw, is_blank);
    let delimiter_pos = number_start + run_len(&raw[number_start..], |b| b.is_ascii_digit());

    // At least one digit is required.
    if delimiter_pos == number_start {
        return None;
    }

    let delimiter = match *raw.get(delimiter_pos)? {
        b'.' => '.',
        b')' => ')',
        _ => return None,
    };

    let spacing_start = delimiter_pos + 1;
    let rest_start = spacing_start + run_len(&raw[spacing_start..], is_blank);

    Some((
        &line[..number_start],
        &line[number_start..delimiter_pos],
        delimiter,
        &line[spacing_start..rest_start],
        &line[rest_start..],
    ))
}
1599
/// Builds a per-line flag vector telling whether each line overlaps one of the
/// given code block byte ranges.
///
/// `line_offsets` holds the starting byte offset of every line and is searched
/// with `partition_point`, so it is expected to be in ascending order. Range
/// ends are clamped to the content length and both ends are nudged onto UTF-8
/// character boundaries (start backwards, end forwards) before the lookup.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(raw_start, raw_end) in code_blocks {
        // Walk the start back to the nearest character boundary.
        let mut lo = raw_start;
        while lo > 0 && !content.is_char_boundary(lo) {
            lo -= 1;
        }

        // Clamp the end, then walk it forward to the nearest boundary.
        let mut hi = raw_end.min(total);
        while hi < total && !content.is_char_boundary(hi) {
            hi += 1;
        }

        // First affected line: the last one starting at or before `lo`.
        let first = line_offsets
            .partition_point(|&offset| offset <= lo)
            .saturating_sub(1);
        // One past the last affected line: the first line starting at or after `hi`.
        let end_exclusive = line_offsets.partition_point(|&offset| offset < hi);

        // `get_mut` yields `None` for an inverted range, which marks nothing.
        if let Some(covered) = flags.get_mut(first..end_exclusive) {
            covered.fill(true);
        }
    }

    flags
}
1659
1660 fn compute_basic_line_info(
1662 content: &str,
1663 line_offsets: &[usize],
1664 code_blocks: &[(usize, usize)],
1665 flavor: MarkdownFlavor,
1666 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1667 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1668 ) -> Vec<LineInfo> {
1669 let content_lines: Vec<&str> = content.lines().collect();
1670 let mut lines = Vec::with_capacity(content_lines.len());
1671
1672 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1674
1675 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1678
1679 for (i, line) in content_lines.iter().enumerate() {
1680 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1681 let indent = line.len() - line.trim_start().len();
1682
1683 let blockquote_parse = Self::parse_blockquote_prefix(line);
1685
1686 let is_blank = if let Some((_, content)) = blockquote_parse {
1688 content.trim().is_empty()
1690 } else {
1691 line.trim().is_empty()
1692 };
1693
1694 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1696
1697 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1699 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1700 let in_html_comment =
1702 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1703 let list_item = if !(in_code_block
1704 || is_blank
1705 || in_mkdocstrings
1706 || in_html_comment
1707 || (front_matter_end > 0 && i < front_matter_end))
1708 {
1709 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1711 (content, prefix.len())
1712 } else {
1713 (&**line, 0)
1714 };
1715
1716 if let Some((leading_spaces, marker, spacing, _content)) =
1717 Self::parse_unordered_list(line_for_list_check)
1718 {
1719 let marker_column = blockquote_prefix_len + leading_spaces.len();
1720 let content_column = marker_column + 1 + spacing.len();
1721
1722 if spacing.is_empty() {
1729 None
1730 } else {
1731 Some(ListItemInfo {
1732 marker: marker.to_string(),
1733 is_ordered: false,
1734 number: None,
1735 marker_column,
1736 content_column,
1737 })
1738 }
1739 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1740 Self::parse_ordered_list(line_for_list_check)
1741 {
1742 let marker = format!("{number_str}{delimiter}");
1743 let marker_column = blockquote_prefix_len + leading_spaces.len();
1744 let content_column = marker_column + marker.len() + spacing.len();
1745
1746 if spacing.is_empty() {
1749 None
1750 } else {
1751 Some(ListItemInfo {
1752 marker,
1753 is_ordered: true,
1754 number: number_str.parse().ok(),
1755 marker_column,
1756 content_column,
1757 })
1758 }
1759 } else {
1760 None
1761 }
1762 } else {
1763 None
1764 };
1765
1766 lines.push(LineInfo {
1767 byte_offset,
1768 byte_len: line.len(),
1769 indent,
1770 is_blank,
1771 in_code_block,
1772 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1773 in_html_block: false, in_html_comment,
1775 list_item,
1776 heading: None, blockquote: None, in_mkdocstrings,
1779 in_esm_block: false, });
1781 }
1782
1783 lines
1784 }
1785
1786 fn detect_headings_and_blockquotes(
1788 content: &str,
1789 lines: &mut [LineInfo],
1790 flavor: MarkdownFlavor,
1791 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1792 ) {
1793 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1795 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1796 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1797 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1798
1799 let content_lines: Vec<&str> = content.lines().collect();
1800
1801 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1803
1804 for i in 0..lines.len() {
1806 if lines[i].in_code_block {
1807 continue;
1808 }
1809
1810 if front_matter_end > 0 && i < front_matter_end {
1812 continue;
1813 }
1814
1815 if lines[i].in_html_block {
1817 continue;
1818 }
1819
1820 let line = content_lines[i];
1821
1822 if let Some(bq) = parse_blockquote_detailed(line) {
1824 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1826
1827 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1829
1830 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1832 let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1835
1836 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1840
1841 lines[i].blockquote = Some(BlockquoteInfo {
1842 nesting_level,
1843 indent: bq.indent.to_string(),
1844 marker_column,
1845 prefix,
1846 content: bq.content.to_string(),
1847 has_no_space_after_marker: has_no_space,
1848 has_multiple_spaces_after_marker: has_multiple_spaces,
1849 needs_md028_fix,
1850 });
1851 }
1852
1853 if lines[i].is_blank {
1855 continue;
1856 }
1857
1858 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1861 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1862 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1863 } else {
1864 false
1865 };
1866
1867 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1868 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1870 continue;
1871 }
1872 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1873 let hashes = caps.get(2).map_or("", |m| m.as_str());
1874 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1875 let rest = caps.get(4).map_or("", |m| m.as_str());
1876
1877 let level = hashes.len() as u8;
1878 let marker_column = leading_spaces.len();
1879
1880 let (text, has_closing, closing_seq) = {
1882 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1884 if rest[id_start..].trim_end().ends_with('}') {
1886 (&rest[..id_start], &rest[id_start..])
1888 } else {
1889 (rest, "")
1890 }
1891 } else {
1892 (rest, "")
1893 };
1894
1895 let trimmed_rest = rest_without_id.trim_end();
1897 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1898 let mut start_of_hashes = last_hash_pos;
1900 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1901 start_of_hashes -= 1;
1902 }
1903
1904 let has_space_before = start_of_hashes == 0
1906 || trimmed_rest
1907 .chars()
1908 .nth(start_of_hashes - 1)
1909 .is_some_and(|c| c.is_whitespace());
1910
1911 let potential_closing = &trimmed_rest[start_of_hashes..];
1913 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1914
1915 if is_all_hashes && has_space_before {
1916 let closing_hashes = potential_closing.to_string();
1918 let text_part = if !custom_id_part.is_empty() {
1921 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1924 } else {
1925 rest_without_id[..start_of_hashes].trim_end().to_string()
1926 };
1927 (text_part, true, closing_hashes)
1928 } else {
1929 (rest.to_string(), false, String::new())
1931 }
1932 } else {
1933 (rest.to_string(), false, String::new())
1935 }
1936 };
1937
1938 let content_column = marker_column + hashes.len() + spaces_after.len();
1939
1940 let raw_text = text.trim().to_string();
1942 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1943
1944 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1946 let next_line = content_lines[i + 1];
1947 if !lines[i + 1].in_code_block
1948 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1949 && let Some(next_line_id) =
1950 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1951 {
1952 custom_id = Some(next_line_id);
1953 }
1954 }
1955
1956 lines[i].heading = Some(HeadingInfo {
1957 level,
1958 style: HeadingStyle::ATX,
1959 marker: hashes.to_string(),
1960 marker_column,
1961 content_column,
1962 text: clean_text,
1963 custom_id,
1964 raw_text,
1965 has_closing_sequence: has_closing,
1966 closing_sequence: closing_seq,
1967 });
1968 }
1969 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1971 let next_line = content_lines[i + 1];
1972 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1973 if front_matter_end > 0 && i < front_matter_end {
1975 continue;
1976 }
1977
1978 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1980 {
1981 continue;
1982 }
1983
1984 let underline = next_line.trim();
1985
1986 if underline == "---" {
1989 continue;
1990 }
1991
1992 let current_line_trimmed = line.trim();
1994 if current_line_trimmed.contains(':')
1995 && !current_line_trimmed.starts_with('#')
1996 && !current_line_trimmed.contains('[')
1997 && !current_line_trimmed.contains("](")
1998 {
1999 continue;
2001 }
2002
2003 let level = if underline.starts_with('=') { 1 } else { 2 };
2004 let style = if level == 1 {
2005 HeadingStyle::Setext1
2006 } else {
2007 HeadingStyle::Setext2
2008 };
2009
2010 let raw_text = line.trim().to_string();
2012 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2013
2014 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2016 let attr_line = content_lines[i + 2];
2017 if !lines[i + 2].in_code_block
2018 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2019 && let Some(attr_line_id) =
2020 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2021 {
2022 custom_id = Some(attr_line_id);
2023 }
2024 }
2025
2026 lines[i].heading = Some(HeadingInfo {
2027 level,
2028 style,
2029 marker: underline.to_string(),
2030 marker_column: next_line.len() - next_line.trim_start().len(),
2031 content_column: lines[i].indent,
2032 text: clean_text,
2033 custom_id,
2034 raw_text,
2035 has_closing_sequence: false,
2036 closing_sequence: String::new(),
2037 });
2038 }
2039 }
2040 }
2041 }
2042
2043 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2045 const BLOCK_ELEMENTS: &[&str] = &[
2047 "address",
2048 "article",
2049 "aside",
2050 "blockquote",
2051 "details",
2052 "dialog",
2053 "dd",
2054 "div",
2055 "dl",
2056 "dt",
2057 "fieldset",
2058 "figcaption",
2059 "figure",
2060 "footer",
2061 "form",
2062 "h1",
2063 "h2",
2064 "h3",
2065 "h4",
2066 "h5",
2067 "h6",
2068 "header",
2069 "hr",
2070 "li",
2071 "main",
2072 "nav",
2073 "ol",
2074 "p",
2075 "picture",
2076 "pre",
2077 "script",
2078 "section",
2079 "style",
2080 "table",
2081 "tbody",
2082 "td",
2083 "textarea",
2084 "tfoot",
2085 "th",
2086 "thead",
2087 "tr",
2088 "ul",
2089 ];
2090
2091 let mut i = 0;
2092 while i < lines.len() {
2093 if lines[i].in_code_block || lines[i].in_front_matter {
2095 i += 1;
2096 continue;
2097 }
2098
2099 let trimmed = lines[i].content(content).trim_start();
2100
2101 if trimmed.starts_with('<') && trimmed.len() > 1 {
2103 let after_bracket = &trimmed[1..];
2105 let is_closing = after_bracket.starts_with('/');
2106 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2107
2108 let tag_name = tag_start
2110 .chars()
2111 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2112 .collect::<String>()
2113 .to_lowercase();
2114
2115 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2117 lines[i].in_html_block = true;
2119
2120 if !is_closing {
2123 let closing_tag = format!("</{tag_name}>");
2124 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2126 let mut j = i + 1;
2127 while j < lines.len() && j < i + 100 {
2128 if !allow_blank_lines && lines[j].is_blank {
2131 break;
2132 }
2133
2134 lines[j].in_html_block = true;
2135
2136 if lines[j].content(content).contains(&closing_tag) {
2138 break;
2139 }
2140 j += 1;
2141 }
2142 }
2143 }
2144 }
2145
2146 i += 1;
2147 }
2148 }
2149
2150 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2153 if !flavor.supports_esm_blocks() {
2155 return;
2156 }
2157
2158 for line in lines.iter_mut() {
2159 if line.is_blank || line.in_html_comment {
2161 continue;
2162 }
2163
2164 let trimmed = line.content(content).trim_start();
2166 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2167 line.in_esm_block = true;
2168 } else {
2169 break;
2171 }
2172 }
2173 }
2174
2175 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2177 let mut code_spans = Vec::new();
2178
2179 if !content.contains('`') {
2181 return code_spans;
2182 }
2183
2184 let parser = Parser::new(content).into_offset_iter();
2186
2187 for (event, range) in parser {
2188 if let Event::Code(_) = event {
2189 let start_pos = range.start;
2190 let end_pos = range.end;
2191
2192 let full_span = &content[start_pos..end_pos];
2194 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2195
2196 let content_start = start_pos + backtick_count;
2198 let content_end = end_pos - backtick_count;
2199 let span_content = if content_start < content_end {
2200 content[content_start..content_end].to_string()
2201 } else {
2202 String::new()
2203 };
2204
2205 let line_idx = lines
2208 .partition_point(|line| line.byte_offset <= start_pos)
2209 .saturating_sub(1);
2210 let line_num = line_idx + 1;
2211 let byte_col_start = start_pos - lines[line_idx].byte_offset;
2212
2213 let end_line_idx = lines
2215 .partition_point(|line| line.byte_offset <= end_pos)
2216 .saturating_sub(1);
2217 let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2218
2219 let line_content = lines[line_idx].content(content);
2222 let col_start = if byte_col_start <= line_content.len() {
2223 line_content[..byte_col_start].chars().count()
2224 } else {
2225 line_content.chars().count()
2226 };
2227
2228 let end_line_content = lines[end_line_idx].content(content);
2229 let col_end = if byte_col_end <= end_line_content.len() {
2230 end_line_content[..byte_col_end].chars().count()
2231 } else {
2232 end_line_content.chars().count()
2233 };
2234
2235 code_spans.push(CodeSpan {
2236 line: line_num,
2237 start_col: col_start,
2238 end_col: col_end,
2239 byte_offset: start_pos,
2240 byte_end: end_pos,
2241 backtick_count,
2242 content: span_content,
2243 });
2244 }
2245 }
2246
2247 code_spans.sort_by_key(|span| span.byte_offset);
2249
2250 code_spans
2251 }
2252
2253 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2264 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2266
2267 #[inline]
2270 fn reset_tracking_state(
2271 list_item: &ListItemInfo,
2272 has_list_breaking_content: &mut bool,
2273 min_continuation: &mut usize,
2274 ) {
2275 *has_list_breaking_content = false;
2276 let marker_width = if list_item.is_ordered {
2277 list_item.marker.len() + 1 } else {
2279 list_item.marker.len()
2280 };
2281 *min_continuation = if list_item.is_ordered {
2282 marker_width
2283 } else {
2284 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2285 };
2286 }
2287
2288 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2291 let mut last_list_item_line = 0;
2292 let mut current_indent_level = 0;
2293 let mut last_marker_width = 0;
2294
2295 let mut has_list_breaking_content_since_last_item = false;
2297 let mut min_continuation_for_tracking = 0;
2298
2299 for (line_idx, line_info) in lines.iter().enumerate() {
2300 let line_num = line_idx + 1;
2301
2302 if line_info.in_code_block {
2304 if let Some(ref mut block) = current_block {
2305 let min_continuation_indent =
2307 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2308
2309 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2311
2312 match context {
2313 CodeBlockContext::Indented => {
2314 block.end_line = line_num;
2316 continue;
2317 }
2318 CodeBlockContext::Standalone => {
2319 let completed_block = current_block.take().unwrap();
2321 list_blocks.push(completed_block);
2322 continue;
2323 }
2324 CodeBlockContext::Adjacent => {
2325 block.end_line = line_num;
2327 continue;
2328 }
2329 }
2330 } else {
2331 continue;
2333 }
2334 }
2335
2336 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2338 caps.get(0).unwrap().as_str().to_string()
2339 } else {
2340 String::new()
2341 };
2342
2343 if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2345 let line_content = line_info.content(content).trim();
2346
2347 let breaks_list = line_info.heading.is_some()
2349 || line_content.starts_with("---")
2350 || line_content.starts_with("***")
2351 || line_content.starts_with("___")
2352 || (line_content.contains('|')
2353 && !line_content.contains("](")
2354 && !line_content.contains("http")
2355 && (line_content.matches('|').count() > 1
2356 || line_content.starts_with('|')
2357 || line_content.ends_with('|')))
2358 || line_content.starts_with(">")
2359 || (line_info.indent < min_continuation_for_tracking);
2360
2361 if breaks_list {
2362 has_list_breaking_content_since_last_item = true;
2363 }
2364 }
2365
2366 if let Some(list_item) = &line_info.list_item {
2368 let item_indent = list_item.marker_column;
2370 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2373 let is_nested = nesting > block.nesting_level;
2377 let same_type =
2378 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2379 let same_context = block.blockquote_prefix == blockquote_prefix;
2380 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2384 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2385
2386 let has_non_list_content = has_list_breaking_content_since_last_item;
2389
2390 let mut continues_list = if is_nested {
2394 same_context && reasonable_distance && !has_non_list_content
2396 } else {
2397 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2399 };
2400
2401 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2404 if block.item_lines.contains(&(line_num - 1)) {
2406 continues_list = true;
2408 }
2409 }
2410
2411 if continues_list {
2412 block.end_line = line_num;
2414 block.item_lines.push(line_num);
2415
2416 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2418 list_item.marker.len() + 1
2419 } else {
2420 list_item.marker.len()
2421 });
2422
2423 if !block.is_ordered
2425 && block.marker.is_some()
2426 && block.marker.as_ref() != Some(&list_item.marker)
2427 {
2428 block.marker = None;
2430 }
2431
2432 reset_tracking_state(
2434 list_item,
2435 &mut has_list_breaking_content_since_last_item,
2436 &mut min_continuation_for_tracking,
2437 );
2438 } else {
2439 list_blocks.push(block.clone());
2442
2443 *block = ListBlock {
2444 start_line: line_num,
2445 end_line: line_num,
2446 is_ordered: list_item.is_ordered,
2447 marker: if list_item.is_ordered {
2448 None
2449 } else {
2450 Some(list_item.marker.clone())
2451 },
2452 blockquote_prefix: blockquote_prefix.clone(),
2453 item_lines: vec![line_num],
2454 nesting_level: nesting,
2455 max_marker_width: if list_item.is_ordered {
2456 list_item.marker.len() + 1
2457 } else {
2458 list_item.marker.len()
2459 },
2460 };
2461
2462 reset_tracking_state(
2464 list_item,
2465 &mut has_list_breaking_content_since_last_item,
2466 &mut min_continuation_for_tracking,
2467 );
2468 }
2469 } else {
2470 current_block = Some(ListBlock {
2472 start_line: line_num,
2473 end_line: line_num,
2474 is_ordered: list_item.is_ordered,
2475 marker: if list_item.is_ordered {
2476 None
2477 } else {
2478 Some(list_item.marker.clone())
2479 },
2480 blockquote_prefix,
2481 item_lines: vec![line_num],
2482 nesting_level: nesting,
2483 max_marker_width: list_item.marker.len(),
2484 });
2485
2486 reset_tracking_state(
2488 list_item,
2489 &mut has_list_breaking_content_since_last_item,
2490 &mut min_continuation_for_tracking,
2491 );
2492 }
2493
2494 last_list_item_line = line_num;
2495 current_indent_level = item_indent;
2496 last_marker_width = if list_item.is_ordered {
2497 list_item.marker.len() + 1 } else {
2499 list_item.marker.len()
2500 };
2501 } else if let Some(ref mut block) = current_block {
2502 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2512 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2513 } else {
2514 false
2515 };
2516
2517 let min_continuation_indent = if block.is_ordered {
2521 current_indent_level + last_marker_width
2522 } else {
2523 current_indent_level + 2 };
2525
2526 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2527 block.end_line = line_num;
2529 } else if line_info.is_blank {
2530 let mut check_idx = line_idx + 1;
2533 let mut found_continuation = false;
2534
2535 while check_idx < lines.len() && lines[check_idx].is_blank {
2537 check_idx += 1;
2538 }
2539
2540 if check_idx < lines.len() {
2541 let next_line = &lines[check_idx];
2542 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2544 found_continuation = true;
2545 }
2546 else if !next_line.in_code_block
2548 && next_line.list_item.is_some()
2549 && let Some(item) = &next_line.list_item
2550 {
2551 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2552 .find(next_line.content(content))
2553 .map_or(String::new(), |m| m.as_str().to_string());
2554 if item.marker_column == current_indent_level
2555 && item.is_ordered == block.is_ordered
2556 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2557 {
2558 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2561 if let Some(between_line) = lines.get(idx) {
2562 let between_content = between_line.content(content);
2563 let trimmed = between_content.trim();
2564 if trimmed.is_empty() {
2566 return false;
2567 }
2568 let line_indent = between_content.len() - between_content.trim_start().len();
2570
2571 if trimmed.starts_with("```")
2573 || trimmed.starts_with("~~~")
2574 || trimmed.starts_with("---")
2575 || trimmed.starts_with("***")
2576 || trimmed.starts_with("___")
2577 || trimmed.starts_with(">")
2578 || trimmed.contains('|') || between_line.heading.is_some()
2580 {
2581 return true; }
2583
2584 line_indent >= min_continuation_indent
2586 } else {
2587 false
2588 }
2589 });
2590
2591 if block.is_ordered {
2592 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2595 if let Some(between_line) = lines.get(idx) {
2596 let trimmed = between_line.content(content).trim();
2597 if trimmed.is_empty() {
2598 return false;
2599 }
2600 trimmed.starts_with("```")
2602 || trimmed.starts_with("~~~")
2603 || trimmed.starts_with("---")
2604 || trimmed.starts_with("***")
2605 || trimmed.starts_with("___")
2606 || trimmed.starts_with(">")
2607 || trimmed.contains('|') || between_line.heading.is_some()
2609 } else {
2610 false
2611 }
2612 });
2613 found_continuation = !has_structural_separators;
2614 } else {
2615 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2617 if let Some(between_line) = lines.get(idx) {
2618 let trimmed = between_line.content(content).trim();
2619 if trimmed.is_empty() {
2620 return false;
2621 }
2622 trimmed.starts_with("```")
2624 || trimmed.starts_with("~~~")
2625 || trimmed.starts_with("---")
2626 || trimmed.starts_with("***")
2627 || trimmed.starts_with("___")
2628 || trimmed.starts_with(">")
2629 || trimmed.contains('|') || between_line.heading.is_some()
2631 } else {
2632 false
2633 }
2634 });
2635 found_continuation = !has_structural_separators;
2636 }
2637 }
2638 }
2639 }
2640
2641 if found_continuation {
2642 block.end_line = line_num;
2644 } else {
2645 list_blocks.push(block.clone());
2647 current_block = None;
2648 }
2649 } else {
2650 let min_required_indent = if block.is_ordered {
2653 current_indent_level + last_marker_width
2654 } else {
2655 current_indent_level + 2
2656 };
2657
2658 let line_content = line_info.content(content).trim();
2663 let is_structural_separator = line_info.heading.is_some()
2664 || line_content.starts_with("```")
2665 || line_content.starts_with("~~~")
2666 || line_content.starts_with("---")
2667 || line_content.starts_with("***")
2668 || line_content.starts_with("___")
2669 || line_content.starts_with(">")
2670 || (line_content.contains('|')
2671 && !line_content.contains("](")
2672 && !line_content.contains("http")
2673 && (line_content.matches('|').count() > 1
2674 || line_content.starts_with('|')
2675 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2680 && !line_info.is_blank
2681 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2682
2683 if is_lazy_continuation {
2684 let content_to_check = if !blockquote_prefix.is_empty() {
2687 line_info
2689 .content(content)
2690 .strip_prefix(&blockquote_prefix)
2691 .unwrap_or(line_info.content(content))
2692 .trim()
2693 } else {
2694 line_info.content(content).trim()
2695 };
2696
2697 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2698
2699 if starts_with_uppercase && last_list_item_line > 0 {
2702 list_blocks.push(block.clone());
2704 current_block = None;
2705 } else {
2706 block.end_line = line_num;
2708 }
2709 } else {
2710 list_blocks.push(block.clone());
2712 current_block = None;
2713 }
2714 }
2715 }
2716 }
2717
2718 if let Some(block) = current_block {
2720 list_blocks.push(block);
2721 }
2722
2723 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2725
2726 list_blocks
2727 }
2728
2729 fn compute_char_frequency(content: &str) -> CharFrequency {
2731 let mut frequency = CharFrequency::default();
2732
2733 for ch in content.chars() {
2734 match ch {
2735 '#' => frequency.hash_count += 1,
2736 '*' => frequency.asterisk_count += 1,
2737 '_' => frequency.underscore_count += 1,
2738 '-' => frequency.hyphen_count += 1,
2739 '+' => frequency.plus_count += 1,
2740 '>' => frequency.gt_count += 1,
2741 '|' => frequency.pipe_count += 1,
2742 '[' => frequency.bracket_count += 1,
2743 '`' => frequency.backtick_count += 1,
2744 '<' => frequency.lt_count += 1,
2745 '!' => frequency.exclamation_count += 1,
2746 '\n' => frequency.newline_count += 1,
2747 _ => {}
2748 }
2749 }
2750
2751 frequency
2752 }
2753
2754 fn parse_html_tags(
2756 content: &str,
2757 lines: &[LineInfo],
2758 code_blocks: &[(usize, usize)],
2759 flavor: MarkdownFlavor,
2760 ) -> Vec<HtmlTag> {
2761 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2762 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2763
2764 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2765
2766 for cap in HTML_TAG_REGEX.captures_iter(content) {
2767 let full_match = cap.get(0).unwrap();
2768 let match_start = full_match.start();
2769 let match_end = full_match.end();
2770
2771 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2773 continue;
2774 }
2775
2776 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2777 let tag_name_original = cap.get(2).unwrap().as_str();
2778 let tag_name = tag_name_original.to_lowercase();
2779 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2780
2781 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2784 continue;
2785 }
2786
2787 let mut line_num = 1;
2789 let mut col_start = match_start;
2790 let mut col_end = match_end;
2791 for (idx, line_info) in lines.iter().enumerate() {
2792 if match_start >= line_info.byte_offset {
2793 line_num = idx + 1;
2794 col_start = match_start - line_info.byte_offset;
2795 col_end = match_end - line_info.byte_offset;
2796 } else {
2797 break;
2798 }
2799 }
2800
2801 html_tags.push(HtmlTag {
2802 line: line_num,
2803 start_col: col_start,
2804 end_col: col_end,
2805 byte_offset: match_start,
2806 byte_end: match_end,
2807 tag_name,
2808 is_closing,
2809 is_self_closing,
2810 raw_content: full_match.as_str().to_string(),
2811 });
2812 }
2813
2814 html_tags
2815 }
2816
2817 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2819 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2820 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2821
2822 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2823
2824 for cap in EMPHASIS_REGEX.captures_iter(content) {
2825 let full_match = cap.get(0).unwrap();
2826 let match_start = full_match.start();
2827 let match_end = full_match.end();
2828
2829 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2831 continue;
2832 }
2833
2834 let opening_markers = cap.get(1).unwrap().as_str();
2835 let content_part = cap.get(2).unwrap().as_str();
2836 let closing_markers = cap.get(3).unwrap().as_str();
2837
2838 if opening_markers.chars().next() != closing_markers.chars().next()
2840 || opening_markers.len() != closing_markers.len()
2841 {
2842 continue;
2843 }
2844
2845 let marker = opening_markers.chars().next().unwrap();
2846 let marker_count = opening_markers.len();
2847
2848 let mut line_num = 1;
2850 let mut col_start = match_start;
2851 let mut col_end = match_end;
2852 for (idx, line_info) in lines.iter().enumerate() {
2853 if match_start >= line_info.byte_offset {
2854 line_num = idx + 1;
2855 col_start = match_start - line_info.byte_offset;
2856 col_end = match_end - line_info.byte_offset;
2857 } else {
2858 break;
2859 }
2860 }
2861
2862 emphasis_spans.push(EmphasisSpan {
2863 line: line_num,
2864 start_col: col_start,
2865 end_col: col_end,
2866 byte_offset: match_start,
2867 byte_end: match_end,
2868 marker,
2869 marker_count,
2870 content: content_part.to_string(),
2871 });
2872 }
2873
2874 emphasis_spans
2875 }
2876
2877 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2879 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2880
2881 for (line_idx, line_info) in lines.iter().enumerate() {
2882 if line_info.in_code_block || line_info.is_blank {
2884 continue;
2885 }
2886
2887 let line = line_info.content(content);
2888 let line_num = line_idx + 1;
2889
2890 if !line.contains('|') {
2892 continue;
2893 }
2894
2895 let parts: Vec<&str> = line.split('|').collect();
2897 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2898
2899 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2901 let mut column_alignments = Vec::new();
2902
2903 if is_separator {
2904 for part in &parts[1..parts.len() - 1] {
2905 let trimmed = part.trim();
2907 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2908 "center".to_string()
2909 } else if trimmed.ends_with(':') {
2910 "right".to_string()
2911 } else if trimmed.starts_with(':') {
2912 "left".to_string()
2913 } else {
2914 "none".to_string()
2915 };
2916 column_alignments.push(alignment);
2917 }
2918 }
2919
2920 table_rows.push(TableRow {
2921 line: line_num,
2922 is_separator,
2923 column_count,
2924 column_alignments,
2925 });
2926 }
2927
2928 table_rows
2929 }
2930
2931 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2933 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2934
2935 for cap in BARE_URL_PATTERN.captures_iter(content) {
2937 let full_match = cap.get(0).unwrap();
2938 let match_start = full_match.start();
2939 let match_end = full_match.end();
2940
2941 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2943 continue;
2944 }
2945
2946 let preceding_char = if match_start > 0 {
2948 content.chars().nth(match_start - 1)
2949 } else {
2950 None
2951 };
2952 let following_char = content.chars().nth(match_end);
2953
2954 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2955 continue;
2956 }
2957 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2958 continue;
2959 }
2960
2961 let url = full_match.as_str();
2962 let url_type = if url.starts_with("https://") {
2963 "https"
2964 } else if url.starts_with("http://") {
2965 "http"
2966 } else if url.starts_with("ftp://") {
2967 "ftp"
2968 } else {
2969 "other"
2970 };
2971
2972 let mut line_num = 1;
2974 let mut col_start = match_start;
2975 let mut col_end = match_end;
2976 for (idx, line_info) in lines.iter().enumerate() {
2977 if match_start >= line_info.byte_offset {
2978 line_num = idx + 1;
2979 col_start = match_start - line_info.byte_offset;
2980 col_end = match_end - line_info.byte_offset;
2981 } else {
2982 break;
2983 }
2984 }
2985
2986 bare_urls.push(BareUrl {
2987 line: line_num,
2988 start_col: col_start,
2989 end_col: col_end,
2990 byte_offset: match_start,
2991 byte_end: match_end,
2992 url: url.to_string(),
2993 url_type: url_type.to_string(),
2994 });
2995 }
2996
2997 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2999 let full_match = cap.get(0).unwrap();
3000 let match_start = full_match.start();
3001 let match_end = full_match.end();
3002
3003 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3005 continue;
3006 }
3007
3008 let preceding_char = if match_start > 0 {
3010 content.chars().nth(match_start - 1)
3011 } else {
3012 None
3013 };
3014 let following_char = content.chars().nth(match_end);
3015
3016 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3017 continue;
3018 }
3019 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3020 continue;
3021 }
3022
3023 let email = full_match.as_str();
3024
3025 let mut line_num = 1;
3027 let mut col_start = match_start;
3028 let mut col_end = match_end;
3029 for (idx, line_info) in lines.iter().enumerate() {
3030 if match_start >= line_info.byte_offset {
3031 line_num = idx + 1;
3032 col_start = match_start - line_info.byte_offset;
3033 col_end = match_end - line_info.byte_offset;
3034 } else {
3035 break;
3036 }
3037 }
3038
3039 bare_urls.push(BareUrl {
3040 line: line_num,
3041 start_col: col_start,
3042 end_col: col_end,
3043 byte_offset: match_start,
3044 byte_end: match_end,
3045 url: email.to_string(),
3046 url_type: "email".to_string(),
3047 });
3048 }
3049
3050 bare_urls
3051 }
3052}
3053
3054fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3056 if list_blocks.len() < 2 {
3057 return;
3058 }
3059
3060 let mut merger = ListBlockMerger::new(content, lines);
3061 *list_blocks = merger.merge(list_blocks);
3062}
3063
3064struct ListBlockMerger<'a> {
3066 content: &'a str,
3067 lines: &'a [LineInfo],
3068}
3069
3070impl<'a> ListBlockMerger<'a> {
3071 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3072 Self { content, lines }
3073 }
3074
3075 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3076 let mut merged = Vec::with_capacity(list_blocks.len());
3077 let mut current = list_blocks[0].clone();
3078
3079 for next in list_blocks.iter().skip(1) {
3080 if self.should_merge_blocks(¤t, next) {
3081 current = self.merge_two_blocks(current, next);
3082 } else {
3083 merged.push(current);
3084 current = next.clone();
3085 }
3086 }
3087
3088 merged.push(current);
3089 merged
3090 }
3091
3092 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3094 if !self.blocks_are_compatible(current, next) {
3096 return false;
3097 }
3098
3099 let spacing = self.analyze_spacing_between(current, next);
3101 match spacing {
3102 BlockSpacing::Consecutive => true,
3103 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3104 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3105 self.can_merge_with_content_between(current, next)
3106 }
3107 }
3108 }
3109
3110 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3112 current.is_ordered == next.is_ordered
3113 && current.blockquote_prefix == next.blockquote_prefix
3114 && current.nesting_level == next.nesting_level
3115 }
3116
3117 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3119 let gap = next.start_line - current.end_line;
3120
3121 match gap {
3122 1 => BlockSpacing::Consecutive,
3123 2 => BlockSpacing::SingleBlank,
3124 _ if gap > 2 => {
3125 if self.has_only_blank_lines_between(current, next) {
3126 BlockSpacing::MultipleBlanks
3127 } else {
3128 BlockSpacing::ContentBetween
3129 }
3130 }
3131 _ => BlockSpacing::Consecutive, }
3133 }
3134
3135 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3137 if has_meaningful_content_between(self.content, current, next, self.lines) {
3140 return false; }
3142
3143 !current.is_ordered && current.marker == next.marker
3145 }
3146
3147 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3149 if has_meaningful_content_between(self.content, current, next, self.lines) {
3151 return false; }
3153
3154 current.is_ordered && next.is_ordered
3156 }
3157
3158 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3160 for line_num in (current.end_line + 1)..next.start_line {
3161 if let Some(line_info) = self.lines.get(line_num - 1)
3162 && !line_info.content(self.content).trim().is_empty()
3163 {
3164 return false;
3165 }
3166 }
3167 true
3168 }
3169
3170 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3172 current.end_line = next.end_line;
3173 current.item_lines.extend_from_slice(&next.item_lines);
3174
3175 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3177
3178 if !current.is_ordered && self.markers_differ(¤t, next) {
3180 current.marker = None; }
3182
3183 current
3184 }
3185
3186 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3188 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3189 }
3190}
3191
/// How two list blocks are separated, based on the difference between the
/// second block's start line and the first block's end line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The blocks touch: the line difference is 0 or 1.
    Consecutive,
    /// Exactly one line lies between the blocks (line difference of 2).
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
3200
3201fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3203 for line_num in (current.end_line + 1)..next.start_line {
3205 if let Some(line_info) = lines.get(line_num - 1) {
3206 let trimmed = line_info.content(content).trim();
3208
3209 if trimmed.is_empty() {
3211 continue;
3212 }
3213
3214 if line_info.heading.is_some() {
3218 return true; }
3220
3221 if is_horizontal_rule(trimmed) {
3223 return true; }
3225
3226 if trimmed.contains('|') && trimmed.len() > 1 {
3229 if !trimmed.contains("](") && !trimmed.contains("http") {
3231 let pipe_count = trimmed.matches('|').count();
3233 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3234 return true; }
3236 }
3237 }
3238
3239 if trimmed.starts_with('>') {
3241 return true; }
3243
3244 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3246 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3247
3248 let min_continuation_indent = if current.is_ordered {
3250 current.nesting_level + current.max_marker_width + 1 } else {
3252 current.nesting_level + 2
3253 };
3254
3255 if line_indent < min_continuation_indent {
3256 return true; }
3259 }
3260
3261 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3263
3264 let min_indent = if current.is_ordered {
3266 current.nesting_level + current.max_marker_width
3267 } else {
3268 current.nesting_level + 2
3269 };
3270
3271 if line_indent < min_indent {
3273 return true; }
3275
3276 }
3279 }
3280
3281 false
3283}
3284
/// Returns `true` when `trimmed` looks like a Markdown horizontal rule.
///
/// The text must be at least three bytes long, start with `-`, `*` or `_`,
/// contain that same character at least three times, and contain nothing else
/// except spaces and tabs. The caller is expected to pass text with the
/// surrounding whitespace already removed.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Walk the characters directly rather than collecting them first; bail out
    // on the first character that is neither the marker nor blank space.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
3308
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no lines but still has a single line offset.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline has one line offset.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Offsets map to 1-based (line, column) pairs across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),  // start of line 1
            (8, (2, 1)),  // the empty line 2
            (9, (3, 1)),  // start of line 3
            (15, (3, 7)), // inside line 3
            (21, (4, 1)), // start of line 4
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Per-line metadata: content, byte offset, indent, blank and code flags.
    #[test]
    fn test_line_info() {
        // The four leading spaces on line 2 are significant: the test asserts
        // an indent of 4 for that line.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // The line-number based accessors are 1-indexed.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered and ordered list items are detected along with their markers.
    #[test]
    fn test_list_item_detection() {
        // The two spaces before `*` are significant: the test asserts a marker
        // column of 2 for the nested item.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let list1 = ctx.lines[0].list_item.as_ref().expect("line 1 should be a list item");
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let list2 = ctx.lines[1].list_item.as_ref().expect("line 2 should be a list item");
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let list3 = ctx.lines[2].list_item.as_ref().expect("line 3 should be a list item");
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets that point at newline characters map to the end of their line.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let expectations = [
            (0, (1, 1)), // "a"
            (1, (1, 2)), // newline after "a"
            (2, (2, 1)), // "b"
            (3, (2, 2)), // newline after "b"
            (4, (3, 1)), // "c"
            (5, (3, 2)), // one past the end
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// In the MDX flavor, leading import/export lines are flagged as ESM.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// Outside the MDX flavor, import/export lines are ordinary text.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}