1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
9static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
12 Regex::new(
13 r#"(?sx)
14 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
15 (?:
16 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
17 |
18 \[([^\]]*)\] # Reference ID in group 6
19 )"#
20 ).unwrap()
21});
22
23static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
26 Regex::new(
27 r#"(?sx)
28 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
29 (?:
30 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
31 |
32 \[([^\]]*)\] # Reference ID in group 6
33 )"#
34 ).unwrap()
35});
36
37static REF_DEF_PATTERN: LazyLock<Regex> =
39 LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
40
41static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
43 Regex::new(
44 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
45 ).unwrap()
46});
47
48static BARE_EMAIL_PATTERN: LazyLock<Regex> =
50 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
51
52static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
54
55#[derive(Debug, Clone)]
57pub struct LineInfo {
58 pub byte_offset: usize,
60 pub byte_len: usize,
62 pub indent: usize,
64 pub is_blank: bool,
66 pub in_code_block: bool,
68 pub in_front_matter: bool,
70 pub in_html_block: bool,
72 pub in_html_comment: bool,
74 pub list_item: Option<ListItemInfo>,
76 pub heading: Option<HeadingInfo>,
78 pub blockquote: Option<BlockquoteInfo>,
80 pub in_mkdocstrings: bool,
82 pub in_esm_block: bool,
84}
85
86impl LineInfo {
87 pub fn content<'a>(&self, source: &'a str) -> &'a str {
89 &source[self.byte_offset..self.byte_offset + self.byte_len]
90 }
91}
92
/// Details about a list item found on a line.
///
/// NOTE(review): the code that fills this struct is outside this view; field
/// descriptions follow the field names and should be confirmed there.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text (e.g. a bullet character or a number with delimiter).
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item's number, for ordered items.
    pub number: Option<usize>,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the item's content starts.
    pub content_column: usize,
}
107
/// Syntactic style of a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (written with leading `#` markers).
    ATX,
    /// Setext heading, first variant (presumably the `=` underline, level 1 — confirm with the producer).
    Setext1,
    /// Setext heading, second variant (presumably the `-` underline, level 2 — confirm with the producer).
    Setext2,
}
118
119#[derive(Debug, Clone)]
121pub struct ParsedLink<'a> {
122 pub line: usize,
124 pub start_col: usize,
126 pub end_col: usize,
128 pub byte_offset: usize,
130 pub byte_end: usize,
132 pub text: Cow<'a, str>,
134 pub url: Cow<'a, str>,
136 pub is_reference: bool,
138 pub reference_id: Option<Cow<'a, str>>,
140 pub link_type: LinkType,
142}
143
/// A reference-style link whose reference could not be resolved, as reported
/// by the Markdown parser's broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The unresolved reference text.
    pub reference: String,
    /// Byte range of the broken link within the document.
    pub span: std::ops::Range<usize>,
}
152
153#[derive(Debug, Clone)]
155pub struct ParsedImage<'a> {
156 pub line: usize,
158 pub start_col: usize,
160 pub end_col: usize,
162 pub byte_offset: usize,
164 pub byte_end: usize,
166 pub alt_text: Cow<'a, str>,
168 pub url: Cow<'a, str>,
170 pub is_reference: bool,
172 pub reference_id: Option<Cow<'a, str>>,
174 pub link_type: LinkType,
176}
177
/// A link reference definition (`[id]: url "title"`).
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based number of the line holding the definition.
    pub line: usize,
    /// Lower-cased reference label.
    pub id: String,
    /// Destination exactly as written.
    pub url: String,
    /// Optional title, without the surrounding quotes.
    pub title: Option<String>,
    /// Byte offset of the definition's start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition within the document.
    pub byte_end: usize,
}
194
/// An inline code span.
///
/// NOTE(review): the parser that fills this struct is outside this view;
/// column conventions should be confirmed there.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-based line numbers by `is_in_code_span`).
    pub line: usize,
    /// Start column of the span (compared against a 0-based column by `is_in_code_span`).
    pub start_col: usize,
    /// End column of the span, exclusive in `is_in_code_span`'s comparison.
    pub end_col: usize,
    /// Byte offset of the span's start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the span within the document.
    pub byte_end: usize,
    /// Number of backticks delimiting the span.
    pub backtick_count: usize,
    /// Text of the span.
    pub content: String,
}
213
214#[derive(Debug, Clone)]
216pub struct HeadingInfo {
217 pub level: u8,
219 pub style: HeadingStyle,
221 pub marker: String,
223 pub marker_column: usize,
225 pub content_column: usize,
227 pub text: String,
229 pub custom_id: Option<String>,
231 pub raw_text: String,
233 pub has_closing_sequence: bool,
235 pub closing_sequence: String,
237}
238
/// Details about a blockquote line.
///
/// NOTE(review): the code that fills this struct is outside this view; field
/// descriptions follow the field names and should be confirmed there.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Blockquote nesting depth.
    pub nesting_level: usize,
    /// Whitespace preceding the first `>` marker.
    pub indent: String,
    /// Column at which the first marker starts.
    pub marker_column: usize,
    /// The blockquote prefix text.
    pub prefix: String,
    /// Line content after the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs a fix for rule MD028.
    pub needs_md028_fix: bool,
}
259
/// A contiguous list in the document.
///
/// NOTE(review): the code that fills this struct is outside this view; field
/// descriptions beyond `start_line`/`end_line` follow the field names.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive; compared against line numbers by `is_in_list_block`).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the list is ordered.
    pub is_ordered: bool,
    /// The list's marker, if one is recorded.
    pub marker: Option<String>,
    /// Blockquote prefix in front of the list's lines.
    pub blockquote_prefix: String,
    /// Line numbers of the list's items.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the list.
    pub nesting_level: usize,
    /// Widest marker among the list's items.
    pub max_marker_width: usize,
}
280
281use std::sync::{Arc, Mutex};
282
/// Occurrence counts of selected characters in the document.
///
/// `LintContext::has_char`, `LintContext::char_count` and the `likely_has_*`
/// helpers read these counters instead of rescanning the content.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[` (NOTE(review): whether `]` is also counted is
    /// decided by the counting code, which is outside this view).
    pub bracket_count: usize,
    /// Count reported for `` ` ``.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for the newline character.
    pub newline_count: usize,
}
311
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this struct is outside this view;
/// column conventions should be confirmed there.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (matched against the argument of `html_tags_on_line`).
    pub line: usize,
    /// Start column of the tag.
    pub start_col: usize,
    /// End column of the tag.
    pub end_col: usize,
    /// Byte offset of the tag's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag's end within the document.
    pub byte_end: usize,
    /// Tag name.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether this is a self-closing tag.
    pub is_self_closing: bool,
    /// The tag text as written.
    pub raw_content: String,
}
334
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this struct is outside this view;
/// column conventions should be confirmed there.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (matched against the argument of `emphasis_spans_on_line`).
    pub line: usize,
    /// Start column of the span.
    pub start_col: usize,
    /// End column of the span.
    pub end_col: usize,
    /// Byte offset of the span's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span's end within the document.
    pub byte_end: usize,
    /// The emphasis marker character.
    pub marker: char,
    /// Number of marker characters delimiting the span.
    pub marker_count: usize,
    /// Text inside the span.
    pub content: String,
}
355
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this struct is outside this view; the
/// encoding of `column_alignments` should be confirmed there.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (matched against the argument of `table_rows_on_line`).
    pub line: usize,
    /// Whether the row is a header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Per-column alignment descriptors.
    pub column_alignments: Vec<String>,
}
368
/// A bare (unlinked) URL or e-mail address found in the document.
///
/// NOTE(review): the parser that fills this struct is outside this view; the
/// set of `url_type` values should be confirmed there.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (matched against the argument of `bare_urls_on_line`).
    pub line: usize,
    /// Start column of the URL.
    pub start_col: usize,
    /// End column of the URL.
    pub end_col: usize,
    /// Byte offset of the URL's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL's end within the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL, as a string tag.
    pub url_type: String,
}
387
388pub struct LintContext<'a> {
389 pub content: &'a str,
390 pub line_offsets: Vec<usize>,
391 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink<'a>>, pub images: Vec<ParsedImage<'a>>, pub broken_links: Vec<BrokenLinkInfo>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, pub line_index: crate::utils::range_utils::LineIndex<'a>, jinja_ranges: Vec<(usize, usize)>, pub flavor: MarkdownFlavor, }
410
/// The four consecutive pieces of a blockquote line, all borrowed from the
/// line they were parsed from.
struct BlockquoteComponents<'a> {
    /// Spaces and tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces and tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}

/// Splits a blockquote line into indent, `>` markers, the whitespace after
/// the markers, and the remaining content.
///
/// Only ASCII spaces and tabs count as whitespace here. Returns `None` when
/// the first character after the indent is not `>`. Markers separated by
/// whitespace (`> >`) are not merged: the second `>` ends up in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: &[char] = &[' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // No `>` was stripped, so this is not a blockquote line.
        return None;
    }
    let content = after_markers.trim_start_matches(BLANKS);

    // Each remainder is a suffix of `line`, so its length gives the split point.
    let indent_end = line.len() - after_indent.len();
    let markers_end = line.len() - after_markers.len();
    let spaces_end = line.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content,
    })
}
455
456impl<'a> LintContext<'a> {
457 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
458 use std::time::Instant;
459 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
460
461 let start = Instant::now();
462 let mut line_offsets = vec![0];
463 for (i, c) in content.char_indices() {
464 if c == '\n' {
465 line_offsets.push(i + 1);
466 }
467 }
468 if profile {
469 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
470 }
471
472 let start = Instant::now();
474 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
475 if profile {
476 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
477 }
478
479 let start = Instant::now();
481 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
482 if profile {
483 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
484 }
485
486 let start = Instant::now();
488 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
489 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
490 } else {
491 Vec::new()
492 };
493 if profile {
494 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
495 }
496
497 let start = Instant::now();
499 let mut lines = Self::compute_basic_line_info(
500 content,
501 &line_offsets,
502 &code_blocks,
503 flavor,
504 &html_comment_ranges,
505 &autodoc_ranges,
506 );
507 if profile {
508 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
509 }
510
511 let start = Instant::now();
513 Self::detect_html_blocks(content, &mut lines);
514 if profile {
515 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
516 }
517
518 let start = Instant::now();
520 Self::detect_esm_blocks(content, &mut lines, flavor);
521 if profile {
522 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
523 }
524
525 let start = Instant::now();
527 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
528 if profile {
529 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
530 }
531
532 let start = Instant::now();
534 let code_spans = Self::parse_code_spans(content, &lines);
535 if profile {
536 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
537 }
538
539 let start = Instant::now();
541 let (links, broken_links) =
542 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
543 if profile {
544 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
545 }
546
547 let start = Instant::now();
548 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
549 if profile {
550 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
551 }
552
553 let start = Instant::now();
554 let reference_defs = Self::parse_reference_defs(content, &lines);
555 if profile {
556 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
557 }
558
559 let start = Instant::now();
560 let list_blocks = Self::parse_list_blocks(content, &lines);
561 if profile {
562 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
563 }
564
565 let start = Instant::now();
567 let char_frequency = Self::compute_char_frequency(content);
568 if profile {
569 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
570 }
571
572 let start = Instant::now();
574 let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
575 content,
576 &code_blocks,
577 &code_spans,
578 &html_comment_ranges,
579 );
580 if profile {
581 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
582 }
583
584 let start = Instant::now();
586 let line_index = crate::utils::range_utils::LineIndex::new(content);
587 if profile {
588 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
589 }
590
591 let start = Instant::now();
593 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
594 if profile {
595 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
596 }
597
598 Self {
599 content,
600 line_offsets,
601 code_blocks,
602 lines,
603 links,
604 images,
605 broken_links,
606 reference_defs,
607 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
608 list_blocks,
609 char_frequency,
610 html_tags_cache: Mutex::new(None),
611 emphasis_spans_cache: Mutex::new(None),
612 table_rows_cache: Mutex::new(None),
613 bare_urls_cache: Mutex::new(None),
614 html_comment_ranges,
615 table_blocks,
616 line_index,
617 jinja_ranges,
618 flavor,
619 }
620 }
621
622 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
624 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
625
626 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
627 }
628
629 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
631 &self.html_comment_ranges
632 }
633
634 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
636 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
637
638 Arc::clone(cache.get_or_insert_with(|| {
639 Arc::new(Self::parse_html_tags(
640 self.content,
641 &self.lines,
642 &self.code_blocks,
643 self.flavor,
644 ))
645 }))
646 }
647
648 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
650 let mut cache = self
651 .emphasis_spans_cache
652 .lock()
653 .expect("Emphasis spans cache mutex poisoned");
654
655 Arc::clone(
656 cache.get_or_insert_with(|| {
657 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
658 }),
659 )
660 }
661
662 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
664 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
665
666 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
667 }
668
669 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
671 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
672
673 Arc::clone(
674 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
675 )
676 }
677
678 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
680 match self.line_offsets.binary_search(&offset) {
681 Ok(line) => (line + 1, 1),
682 Err(line) => {
683 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
684 (line, offset - line_start + 1)
685 }
686 }
687 }
688
689 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
691 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
693 return true;
694 }
695
696 self.code_spans()
698 .iter()
699 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
700 }
701
702 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
704 if line_num > 0 {
705 self.lines.get(line_num - 1)
706 } else {
707 None
708 }
709 }
710
711 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
713 self.line_info(line_num).map(|info| info.byte_offset)
714 }
715
716 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
718 let normalized_id = ref_id.to_lowercase();
719 self.reference_defs
720 .iter()
721 .find(|def| def.id == normalized_id)
722 .map(|def| def.url.as_str())
723 }
724
725 pub fn is_in_list_block(&self, line_num: usize) -> bool {
727 self.list_blocks
728 .iter()
729 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
730 }
731
732 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
734 self.list_blocks
735 .iter()
736 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
737 }
738
739 pub fn is_in_code_block(&self, line_num: usize) -> bool {
743 if line_num == 0 || line_num > self.lines.len() {
744 return false;
745 }
746 self.lines[line_num - 1].in_code_block
747 }
748
749 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
751 if line_num == 0 || line_num > self.lines.len() {
752 return false;
753 }
754 self.lines[line_num - 1].in_front_matter
755 }
756
757 pub fn is_in_html_block(&self, line_num: usize) -> bool {
759 if line_num == 0 || line_num > self.lines.len() {
760 return false;
761 }
762 self.lines[line_num - 1].in_html_block
763 }
764
765 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
767 if line_num == 0 || line_num > self.lines.len() {
768 return false;
769 }
770
771 let col_0indexed = if col > 0 { col - 1 } else { 0 };
775 let code_spans = self.code_spans();
776 code_spans
777 .iter()
778 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
779 }
780
781 #[inline]
784 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
785 self.reference_defs
786 .iter()
787 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
788 }
789
790 #[inline]
794 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
795 self.html_comment_ranges
796 .iter()
797 .any(|range| byte_pos >= range.start && byte_pos < range.end)
798 }
799
800 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
802 self.jinja_ranges
803 .iter()
804 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
805 }
806
807 pub fn has_char(&self, ch: char) -> bool {
809 match ch {
810 '#' => self.char_frequency.hash_count > 0,
811 '*' => self.char_frequency.asterisk_count > 0,
812 '_' => self.char_frequency.underscore_count > 0,
813 '-' => self.char_frequency.hyphen_count > 0,
814 '+' => self.char_frequency.plus_count > 0,
815 '>' => self.char_frequency.gt_count > 0,
816 '|' => self.char_frequency.pipe_count > 0,
817 '[' => self.char_frequency.bracket_count > 0,
818 '`' => self.char_frequency.backtick_count > 0,
819 '<' => self.char_frequency.lt_count > 0,
820 '!' => self.char_frequency.exclamation_count > 0,
821 '\n' => self.char_frequency.newline_count > 0,
822 _ => self.content.contains(ch), }
824 }
825
826 pub fn char_count(&self, ch: char) -> usize {
828 match ch {
829 '#' => self.char_frequency.hash_count,
830 '*' => self.char_frequency.asterisk_count,
831 '_' => self.char_frequency.underscore_count,
832 '-' => self.char_frequency.hyphen_count,
833 '+' => self.char_frequency.plus_count,
834 '>' => self.char_frequency.gt_count,
835 '|' => self.char_frequency.pipe_count,
836 '[' => self.char_frequency.bracket_count,
837 '`' => self.char_frequency.backtick_count,
838 '<' => self.char_frequency.lt_count,
839 '!' => self.char_frequency.exclamation_count,
840 '\n' => self.char_frequency.newline_count,
841 _ => self.content.matches(ch).count(), }
843 }
844
845 pub fn likely_has_headings(&self) -> bool {
847 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
849
850 pub fn likely_has_lists(&self) -> bool {
852 self.char_frequency.asterisk_count > 0
853 || self.char_frequency.hyphen_count > 0
854 || self.char_frequency.plus_count > 0
855 }
856
857 pub fn likely_has_emphasis(&self) -> bool {
859 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
860 }
861
862 pub fn likely_has_tables(&self) -> bool {
864 self.char_frequency.pipe_count > 2
865 }
866
867 pub fn likely_has_blockquotes(&self) -> bool {
869 self.char_frequency.gt_count > 0
870 }
871
872 pub fn likely_has_code(&self) -> bool {
874 self.char_frequency.backtick_count > 0
875 }
876
877 pub fn likely_has_links_or_images(&self) -> bool {
879 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
880 }
881
882 pub fn likely_has_html(&self) -> bool {
884 self.char_frequency.lt_count > 0
885 }
886
887 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
889 self.html_tags()
890 .iter()
891 .filter(|tag| tag.line == line_num)
892 .cloned()
893 .collect()
894 }
895
896 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
898 self.emphasis_spans()
899 .iter()
900 .filter(|span| span.line == line_num)
901 .cloned()
902 .collect()
903 }
904
905 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
907 self.table_rows()
908 .iter()
909 .filter(|row| row.line == line_num)
910 .cloned()
911 .collect()
912 }
913
914 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
916 self.bare_urls()
917 .iter()
918 .filter(|url| url.line == line_num)
919 .cloned()
920 .collect()
921 }
922
923 #[inline]
929 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
930 let idx = match lines.binary_search_by(|line| {
932 if byte_offset < line.byte_offset {
933 std::cmp::Ordering::Greater
934 } else if byte_offset > line.byte_offset + line.byte_len {
935 std::cmp::Ordering::Less
936 } else {
937 std::cmp::Ordering::Equal
938 }
939 }) {
940 Ok(idx) => idx,
941 Err(idx) => idx.saturating_sub(1),
942 };
943
944 let line = &lines[idx];
945 let line_num = idx + 1;
946 let col = byte_offset.saturating_sub(line.byte_offset);
947
948 (idx, line_num, col)
949 }
950
951 #[inline]
953 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
954 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
956
957 if idx > 0 {
959 let span = &code_spans[idx - 1];
960 if offset >= span.byte_offset && offset < span.byte_end {
961 return true;
962 }
963 }
964
965 false
966 }
967
968 fn parse_links(
970 content: &'a str,
971 lines: &[LineInfo],
972 code_blocks: &[(usize, usize)],
973 code_spans: &[CodeSpan],
974 flavor: MarkdownFlavor,
975 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
976 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
977 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
978 use std::collections::HashSet;
979
980 let mut links = Vec::with_capacity(content.len() / 500);
981 let mut broken_links = Vec::new();
982
983 let mut found_positions = HashSet::new();
985
986 let mut options = Options::empty();
996 options.insert(Options::ENABLE_WIKILINKS);
997
998 let parser = Parser::new_with_broken_link_callback(
999 content,
1000 options,
1001 Some(|link: BrokenLink<'_>| {
1002 broken_links.push(BrokenLinkInfo {
1003 reference: link.reference.to_string(),
1004 span: link.span.clone(),
1005 });
1006 None
1007 }),
1008 )
1009 .into_offset_iter();
1010
1011 let mut link_stack: Vec<(
1012 usize,
1013 usize,
1014 pulldown_cmark::CowStr<'a>,
1015 LinkType,
1016 pulldown_cmark::CowStr<'a>,
1017 )> = Vec::new();
1018 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1021 match event {
1022 Event::Start(Tag::Link {
1023 link_type,
1024 dest_url,
1025 id,
1026 ..
1027 }) => {
1028 link_stack.push((range.start, range.end, dest_url, link_type, id));
1030 text_chunks.clear();
1031 }
1032 Event::Text(text) if !link_stack.is_empty() => {
1033 text_chunks.push((text.to_string(), range.start, range.end));
1035 }
1036 Event::Code(code) if !link_stack.is_empty() => {
1037 let code_text = format!("`{code}`");
1039 text_chunks.push((code_text, range.start, range.end));
1040 }
1041 Event::End(TagEnd::Link) => {
1042 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1043 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1045 text_chunks.clear();
1046 continue;
1047 }
1048
1049 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1051
1052 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1054 text_chunks.clear();
1055 continue;
1056 }
1057
1058 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1059
1060 let is_reference = matches!(
1061 link_type,
1062 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1063 );
1064
1065 let link_text = if start_pos < content.len() {
1068 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1069
1070 let mut close_pos = None;
1074 let mut depth = 0;
1075 let mut in_code_span = false;
1076
1077 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1078 let mut backslash_count = 0;
1080 let mut j = i;
1081 while j > 0 && link_bytes[j - 1] == b'\\' {
1082 backslash_count += 1;
1083 j -= 1;
1084 }
1085 let is_escaped = backslash_count % 2 != 0;
1086
1087 if byte == b'`' && !is_escaped {
1089 in_code_span = !in_code_span;
1090 }
1091
1092 if !is_escaped && !in_code_span {
1094 if byte == b'[' {
1095 depth += 1;
1096 } else if byte == b']' {
1097 if depth == 0 {
1098 close_pos = Some(i);
1100 break;
1101 } else {
1102 depth -= 1;
1103 }
1104 }
1105 }
1106 }
1107
1108 if let Some(pos) = close_pos {
1109 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1110 } else {
1111 Cow::Borrowed("")
1112 }
1113 } else {
1114 Cow::Borrowed("")
1115 };
1116
1117 let reference_id = if is_reference && !ref_id.is_empty() {
1119 Some(Cow::Owned(ref_id.to_lowercase()))
1120 } else if is_reference {
1121 Some(Cow::Owned(link_text.to_lowercase()))
1123 } else {
1124 None
1125 };
1126
1127 let has_escaped_bang = start_pos >= 2
1131 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1132 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1133
1134 let has_escaped_bracket =
1137 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1138
1139 if has_escaped_bang || has_escaped_bracket {
1140 text_chunks.clear();
1141 continue; }
1143
1144 found_positions.insert(start_pos);
1146
1147 links.push(ParsedLink {
1148 line: line_num,
1149 start_col: col_start,
1150 end_col: col_end,
1151 byte_offset: start_pos,
1152 byte_end: range.end,
1153 text: link_text,
1154 url: Cow::Owned(url.to_string()),
1155 is_reference,
1156 reference_id,
1157 link_type,
1158 });
1159
1160 text_chunks.clear();
1161 }
1162 }
1163 _ => {}
1164 }
1165 }
1166
1167 for cap in LINK_PATTERN.captures_iter(content) {
1171 let full_match = cap.get(0).unwrap();
1172 let match_start = full_match.start();
1173 let match_end = full_match.end();
1174
1175 if found_positions.contains(&match_start) {
1177 continue;
1178 }
1179
1180 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1182 continue;
1183 }
1184
1185 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1187 continue;
1188 }
1189
1190 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1192 continue;
1193 }
1194
1195 if Self::is_offset_in_code_span(code_spans, match_start) {
1197 continue;
1198 }
1199
1200 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1202 continue;
1203 }
1204
1205 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1207
1208 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1210 continue;
1211 }
1212
1213 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1214
1215 let text = cap.get(1).map_or("", |m| m.as_str());
1216
1217 if let Some(ref_id) = cap.get(6) {
1219 let ref_id_str = ref_id.as_str();
1220 let normalized_ref = if ref_id_str.is_empty() {
1221 Cow::Owned(text.to_lowercase()) } else {
1223 Cow::Owned(ref_id_str.to_lowercase())
1224 };
1225
1226 links.push(ParsedLink {
1228 line: line_num,
1229 start_col: col_start,
1230 end_col: col_end,
1231 byte_offset: match_start,
1232 byte_end: match_end,
1233 text: Cow::Borrowed(text),
1234 url: Cow::Borrowed(""), is_reference: true,
1236 reference_id: Some(normalized_ref),
1237 link_type: LinkType::Reference, });
1239 }
1240 }
1241
1242 (links, broken_links)
1243 }
1244
1245 fn parse_images(
1247 content: &'a str,
1248 lines: &[LineInfo],
1249 code_blocks: &[(usize, usize)],
1250 code_spans: &[CodeSpan],
1251 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1252 ) -> Vec<ParsedImage<'a>> {
1253 use crate::utils::skip_context::is_in_html_comment_ranges;
1254 use std::collections::HashSet;
1255
1256 let mut images = Vec::with_capacity(content.len() / 1000);
1258 let mut found_positions = HashSet::new();
1259
1260 let parser = Parser::new(content).into_offset_iter();
1262 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1263 Vec::new();
1264 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1267 match event {
1268 Event::Start(Tag::Image {
1269 link_type,
1270 dest_url,
1271 id,
1272 ..
1273 }) => {
1274 image_stack.push((range.start, dest_url, link_type, id));
1275 text_chunks.clear();
1276 }
1277 Event::Text(text) if !image_stack.is_empty() => {
1278 text_chunks.push((text.to_string(), range.start, range.end));
1279 }
1280 Event::Code(code) if !image_stack.is_empty() => {
1281 let code_text = format!("`{code}`");
1282 text_chunks.push((code_text, range.start, range.end));
1283 }
1284 Event::End(TagEnd::Image) => {
1285 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1286 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1288 continue;
1289 }
1290
1291 if Self::is_offset_in_code_span(code_spans, start_pos) {
1293 continue;
1294 }
1295
1296 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1298 continue;
1299 }
1300
1301 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1304
1305 let is_reference = matches!(
1306 link_type,
1307 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1308 );
1309
1310 let alt_text = if start_pos < content.len() {
1313 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1314
1315 let mut close_pos = None;
1318 let mut depth = 0;
1319
1320 if image_bytes.len() > 2 {
1321 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1322 let mut backslash_count = 0;
1324 let mut j = i;
1325 while j > 0 && image_bytes[j - 1] == b'\\' {
1326 backslash_count += 1;
1327 j -= 1;
1328 }
1329 let is_escaped = backslash_count % 2 != 0;
1330
1331 if !is_escaped {
1332 if byte == b'[' {
1333 depth += 1;
1334 } else if byte == b']' {
1335 if depth == 0 {
1336 close_pos = Some(i);
1338 break;
1339 } else {
1340 depth -= 1;
1341 }
1342 }
1343 }
1344 }
1345 }
1346
1347 if let Some(pos) = close_pos {
1348 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1349 } else {
1350 Cow::Borrowed("")
1351 }
1352 } else {
1353 Cow::Borrowed("")
1354 };
1355
1356 let reference_id = if is_reference && !ref_id.is_empty() {
1357 Some(Cow::Owned(ref_id.to_lowercase()))
1358 } else if is_reference {
1359 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1361 None
1362 };
1363
1364 found_positions.insert(start_pos);
1365 images.push(ParsedImage {
1366 line: line_num,
1367 start_col: col_start,
1368 end_col: col_end,
1369 byte_offset: start_pos,
1370 byte_end: range.end,
1371 alt_text,
1372 url: Cow::Owned(url.to_string()),
1373 is_reference,
1374 reference_id,
1375 link_type,
1376 });
1377 }
1378 }
1379 _ => {}
1380 }
1381 }
1382
1383 for cap in IMAGE_PATTERN.captures_iter(content) {
1385 let full_match = cap.get(0).unwrap();
1386 let match_start = full_match.start();
1387 let match_end = full_match.end();
1388
1389 if found_positions.contains(&match_start) {
1391 continue;
1392 }
1393
1394 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1396 continue;
1397 }
1398
1399 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1401 || Self::is_offset_in_code_span(code_spans, match_start)
1402 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1403 {
1404 continue;
1405 }
1406
1407 if let Some(ref_id) = cap.get(6) {
1409 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1410 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1411 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1412 let ref_id_str = ref_id.as_str();
1413 let normalized_ref = if ref_id_str.is_empty() {
1414 Cow::Owned(alt_text.to_lowercase())
1415 } else {
1416 Cow::Owned(ref_id_str.to_lowercase())
1417 };
1418
1419 images.push(ParsedImage {
1420 line: line_num,
1421 start_col: col_start,
1422 end_col: col_end,
1423 byte_offset: match_start,
1424 byte_end: match_end,
1425 alt_text: Cow::Borrowed(alt_text),
1426 url: Cow::Borrowed(""),
1427 is_reference: true,
1428 reference_id: Some(normalized_ref),
1429 link_type: LinkType::Reference, });
1431 }
1432 }
1433
1434 images
1435 }
1436
1437 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1439 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1443 if line_info.in_code_block {
1445 continue;
1446 }
1447
1448 let line = line_info.content(content);
1449 let line_num = line_idx + 1;
1450
1451 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1452 let id = cap.get(1).unwrap().as_str().to_lowercase();
1453 let url = cap.get(2).unwrap().as_str().to_string();
1454 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1455
1456 let match_obj = cap.get(0).unwrap();
1459 let byte_offset = line_info.byte_offset + match_obj.start();
1460 let byte_end = line_info.byte_offset + match_obj.end();
1461
1462 refs.push(ReferenceDef {
1463 line: line_num,
1464 id,
1465 url,
1466 title,
1467 byte_offset,
1468 byte_end,
1469 });
1470 }
1471 }
1472
1473 refs
1474 }
1475
/// Splits a blockquote line into `(prefix, content)`.
///
/// The prefix spans any leading whitespace, every `>` marker and the
/// whitespace following the last marker. Nested quotes written either as `>>`
/// or as `> >` are consumed as a whole, so the returned content never starts
/// with a quote marker. Returns `None` when the first non-whitespace character
/// of `line` is not `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let mut rest = line.trim_start();
    if !rest.starts_with('>') {
        return None;
    }

    // Peel off one marker (plus the whitespace after it) per iteration so that
    // every nesting level ends up in the prefix.
    while let Some(after_marker) = rest.strip_prefix('>') {
        rest = after_marker.trim_start();
    }

    // `rest` is a suffix of `line`, so the prefix is everything before it.
    let prefix_len = line.len() - rest.len();
    Some((&line[..prefix_len], rest))
}
1494
/// Splits a line that starts with a bullet marker.
///
/// Returns `(leading_whitespace, marker, spacing, content)` where `marker` is
/// one of `-`, `*` or `+`, and `spacing` is the (possibly empty) run of spaces
/// and tabs that follows it. Returns `None` when the first byte after the
/// leading spaces/tabs is not a bullet marker or the line has nothing else.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    fn is_blank_byte(byte: u8) -> bool {
        byte == b' ' || byte == b'\t'
    }

    let raw = line.as_bytes();
    let marker_pos = raw.iter().take_while(|&&b| is_blank_byte(b)).count();

    // Only ASCII markers are accepted, so every index used for slicing below
    // sits on a character boundary.
    let marker = match *raw.get(marker_pos)? {
        b'-' => '-',
        b'*' => '*',
        b'+' => '+',
        _ => return None,
    };

    let spacing_start = marker_pos + 1;
    let spacing_len = raw[spacing_start..].iter().take_while(|&&b| is_blank_byte(b)).count();
    let content_start = spacing_start + spacing_len;

    Some((
        &line[..marker_pos],
        marker,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1527
/// Splits a line that starts with an ordered-list marker.
///
/// Returns `(leading_whitespace, number, delimiter, spacing, content)` where
/// `number` is a non-empty run of ASCII digits, `delimiter` is `.` or `)`, and
/// `spacing` is the (possibly empty) run of spaces and tabs after the
/// delimiter. Returns `None` when there are no digits or no valid delimiter.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    fn is_blank_byte(byte: u8) -> bool {
        byte == b' ' || byte == b'\t'
    }

    let raw = line.as_bytes();

    let number_start = raw.iter().take_while(|&&b| is_blank_byte(b)).count();
    let digit_count = raw[number_start..].iter().take_while(|b| b.is_ascii_digit()).count();
    if digit_count == 0 {
        return None;
    }

    let delimiter_pos = number_start + digit_count;
    let delimiter = match *raw.get(delimiter_pos)? {
        b'.' => '.',
        b')' => ')',
        _ => return None,
    };

    let spacing_start = delimiter_pos + 1;
    let spacing_len = raw[spacing_start..].iter().take_while(|&&b| is_blank_byte(b)).count();
    let content_start = spacing_start + spacing_len;

    Some((
        &line[..number_start],
        &line[number_start..delimiter_pos],
        delimiter,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1575
/// Builds a per-line flag vector telling whether each line belongs to a code block.
///
/// `line_offsets` holds the byte offset at which each line starts (ascending);
/// `code_blocks` holds `(start, end)` byte ranges. The result has one entry per
/// element of `line_offsets`.
///
/// A line is flagged when it overlaps a range, including the line that
/// *contains* the range start. Ranges may begin in the middle of a line (for
/// example an indented fence, or a fence that follows a list marker), and that
/// opening line must be flagged too.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let mut in_code_block = vec![false; line_offsets.len()];

    for &(start, end) in code_blocks {
        // Snap the start backwards onto a UTF-8 character boundary.
        let mut safe_start = start;
        while safe_start > 0 && !content.is_char_boundary(safe_start) {
            safe_start -= 1;
        }

        // Clamp the end to the content and snap it forwards onto a boundary.
        let mut safe_end = end.min(content.len());
        while safe_end < content.len() && !content.is_char_boundary(safe_end) {
            safe_end += 1;
        }

        // Empty or inverted ranges cover no line.
        if safe_start >= safe_end {
            continue;
        }

        // The containing line is the last one whose start offset is <= safe_start:
        // `partition_point` yields the first line starting after it, hence the -1.
        let first_line = line_offsets
            .partition_point(|&offset| offset <= safe_start)
            .saturating_sub(1);
        // Every line that starts before the range end overlaps the range.
        let last_line = line_offsets.partition_point(|&offset| offset < safe_end);

        for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
            *flag = true;
        }
    }

    in_code_block
}
1630
1631 fn compute_basic_line_info(
1633 content: &str,
1634 line_offsets: &[usize],
1635 code_blocks: &[(usize, usize)],
1636 flavor: MarkdownFlavor,
1637 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1638 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1639 ) -> Vec<LineInfo> {
1640 let content_lines: Vec<&str> = content.lines().collect();
1641 let mut lines = Vec::with_capacity(content_lines.len());
1642
1643 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1645
1646 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1649
1650 for (i, line) in content_lines.iter().enumerate() {
1651 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1652 let indent = line.len() - line.trim_start().len();
1653
1654 let blockquote_parse = Self::parse_blockquote_prefix(line);
1656
1657 let is_blank = if let Some((_, content)) = blockquote_parse {
1659 content.trim().is_empty()
1661 } else {
1662 line.trim().is_empty()
1663 };
1664
1665 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1667
1668 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1670 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1671 let in_html_comment =
1673 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1674 let list_item = if !(in_code_block
1675 || is_blank
1676 || in_mkdocstrings
1677 || in_html_comment
1678 || (front_matter_end > 0 && i < front_matter_end))
1679 {
1680 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1682 (content, prefix.len())
1683 } else {
1684 (&**line, 0)
1685 };
1686
1687 if let Some((leading_spaces, marker, spacing, _content)) =
1688 Self::parse_unordered_list(line_for_list_check)
1689 {
1690 let marker_column = blockquote_prefix_len + leading_spaces.len();
1691 let content_column = marker_column + 1 + spacing.len();
1692
1693 if spacing.is_empty() {
1700 None
1701 } else {
1702 Some(ListItemInfo {
1703 marker: marker.to_string(),
1704 is_ordered: false,
1705 number: None,
1706 marker_column,
1707 content_column,
1708 })
1709 }
1710 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1711 Self::parse_ordered_list(line_for_list_check)
1712 {
1713 let marker = format!("{number_str}{delimiter}");
1714 let marker_column = blockquote_prefix_len + leading_spaces.len();
1715 let content_column = marker_column + marker.len() + spacing.len();
1716
1717 if spacing.is_empty() {
1720 None
1721 } else {
1722 Some(ListItemInfo {
1723 marker,
1724 is_ordered: true,
1725 number: number_str.parse().ok(),
1726 marker_column,
1727 content_column,
1728 })
1729 }
1730 } else {
1731 None
1732 }
1733 } else {
1734 None
1735 };
1736
1737 lines.push(LineInfo {
1738 byte_offset,
1739 byte_len: line.len(),
1740 indent,
1741 is_blank,
1742 in_code_block,
1743 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1744 in_html_block: false, in_html_comment,
1746 list_item,
1747 heading: None, blockquote: None, in_mkdocstrings,
1750 in_esm_block: false, });
1752 }
1753
1754 lines
1755 }
1756
1757 fn detect_headings_and_blockquotes(
1759 content: &str,
1760 lines: &mut [LineInfo],
1761 flavor: MarkdownFlavor,
1762 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1763 ) {
1764 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1766 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1767 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1768 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1769
1770 let content_lines: Vec<&str> = content.lines().collect();
1771
1772 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1774
1775 for i in 0..lines.len() {
1777 if lines[i].in_code_block {
1778 continue;
1779 }
1780
1781 if front_matter_end > 0 && i < front_matter_end {
1783 continue;
1784 }
1785
1786 if lines[i].in_html_block {
1788 continue;
1789 }
1790
1791 let line = content_lines[i];
1792
1793 if let Some(bq) = parse_blockquote_detailed(line) {
1795 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1797
1798 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1800
1801 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1803 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1805
1806 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1810
1811 lines[i].blockquote = Some(BlockquoteInfo {
1812 nesting_level,
1813 indent: bq.indent.to_string(),
1814 marker_column,
1815 prefix,
1816 content: bq.content.to_string(),
1817 has_no_space_after_marker: has_no_space,
1818 has_multiple_spaces_after_marker: has_multiple_spaces,
1819 needs_md028_fix,
1820 });
1821 }
1822
1823 if lines[i].is_blank {
1825 continue;
1826 }
1827
1828 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1831 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1832 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1833 } else {
1834 false
1835 };
1836
1837 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1838 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1840 continue;
1841 }
1842 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1843 let hashes = caps.get(2).map_or("", |m| m.as_str());
1844 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1845 let rest = caps.get(4).map_or("", |m| m.as_str());
1846
1847 let level = hashes.len() as u8;
1848 let marker_column = leading_spaces.len();
1849
1850 let (text, has_closing, closing_seq) = {
1852 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1854 if rest[id_start..].trim_end().ends_with('}') {
1856 (&rest[..id_start], &rest[id_start..])
1858 } else {
1859 (rest, "")
1860 }
1861 } else {
1862 (rest, "")
1863 };
1864
1865 let trimmed_rest = rest_without_id.trim_end();
1867 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1868 let mut start_of_hashes = last_hash_pos;
1870 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1871 start_of_hashes -= 1;
1872 }
1873
1874 let has_space_before = start_of_hashes == 0
1876 || trimmed_rest
1877 .chars()
1878 .nth(start_of_hashes - 1)
1879 .is_some_and(|c| c.is_whitespace());
1880
1881 let potential_closing = &trimmed_rest[start_of_hashes..];
1883 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1884
1885 if is_all_hashes && has_space_before {
1886 let closing_hashes = potential_closing.to_string();
1888 let text_part = if !custom_id_part.is_empty() {
1891 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1894 } else {
1895 rest_without_id[..start_of_hashes].trim_end().to_string()
1896 };
1897 (text_part, true, closing_hashes)
1898 } else {
1899 (rest.to_string(), false, String::new())
1901 }
1902 } else {
1903 (rest.to_string(), false, String::new())
1905 }
1906 };
1907
1908 let content_column = marker_column + hashes.len() + spaces_after.len();
1909
1910 let raw_text = text.trim().to_string();
1912 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1913
1914 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1916 let next_line = content_lines[i + 1];
1917 if !lines[i + 1].in_code_block
1918 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1919 && let Some(next_line_id) =
1920 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1921 {
1922 custom_id = Some(next_line_id);
1923 }
1924 }
1925
1926 lines[i].heading = Some(HeadingInfo {
1927 level,
1928 style: HeadingStyle::ATX,
1929 marker: hashes.to_string(),
1930 marker_column,
1931 content_column,
1932 text: clean_text,
1933 custom_id,
1934 raw_text,
1935 has_closing_sequence: has_closing,
1936 closing_sequence: closing_seq,
1937 });
1938 }
1939 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1941 let next_line = content_lines[i + 1];
1942 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1943 if front_matter_end > 0 && i < front_matter_end {
1945 continue;
1946 }
1947
1948 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1950 {
1951 continue;
1952 }
1953
1954 let underline = next_line.trim();
1955
1956 if underline == "---" {
1959 continue;
1960 }
1961
1962 let current_line_trimmed = line.trim();
1964 if current_line_trimmed.contains(':')
1965 && !current_line_trimmed.starts_with('#')
1966 && !current_line_trimmed.contains('[')
1967 && !current_line_trimmed.contains("](")
1968 {
1969 continue;
1971 }
1972
1973 let level = if underline.starts_with('=') { 1 } else { 2 };
1974 let style = if level == 1 {
1975 HeadingStyle::Setext1
1976 } else {
1977 HeadingStyle::Setext2
1978 };
1979
1980 let raw_text = line.trim().to_string();
1982 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1983
1984 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1986 let attr_line = content_lines[i + 2];
1987 if !lines[i + 2].in_code_block
1988 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1989 && let Some(attr_line_id) =
1990 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1991 {
1992 custom_id = Some(attr_line_id);
1993 }
1994 }
1995
1996 lines[i].heading = Some(HeadingInfo {
1997 level,
1998 style,
1999 marker: underline.to_string(),
2000 marker_column: next_line.len() - next_line.trim_start().len(),
2001 content_column: lines[i].indent,
2002 text: clean_text,
2003 custom_id,
2004 raw_text,
2005 has_closing_sequence: false,
2006 closing_sequence: String::new(),
2007 });
2008 }
2009 }
2010 }
2011 }
2012
2013 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2015 const BLOCK_ELEMENTS: &[&str] = &[
2017 "address",
2018 "article",
2019 "aside",
2020 "blockquote",
2021 "details",
2022 "dialog",
2023 "dd",
2024 "div",
2025 "dl",
2026 "dt",
2027 "fieldset",
2028 "figcaption",
2029 "figure",
2030 "footer",
2031 "form",
2032 "h1",
2033 "h2",
2034 "h3",
2035 "h4",
2036 "h5",
2037 "h6",
2038 "header",
2039 "hr",
2040 "li",
2041 "main",
2042 "nav",
2043 "ol",
2044 "p",
2045 "pre",
2046 "script",
2047 "section",
2048 "style",
2049 "table",
2050 "tbody",
2051 "td",
2052 "tfoot",
2053 "th",
2054 "thead",
2055 "tr",
2056 "ul",
2057 ];
2058
2059 let mut i = 0;
2060 while i < lines.len() {
2061 if lines[i].in_code_block || lines[i].in_front_matter {
2063 i += 1;
2064 continue;
2065 }
2066
2067 let trimmed = lines[i].content(content).trim_start();
2068
2069 if trimmed.starts_with('<') && trimmed.len() > 1 {
2071 let after_bracket = &trimmed[1..];
2073 let is_closing = after_bracket.starts_with('/');
2074 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2075
2076 let tag_name = tag_start
2078 .chars()
2079 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2080 .collect::<String>()
2081 .to_lowercase();
2082
2083 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2085 lines[i].in_html_block = true;
2087
2088 if !is_closing {
2091 let closing_tag = format!("</{tag_name}>");
2092 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2094 let mut j = i + 1;
2095 while j < lines.len() && j < i + 100 {
2096 if !allow_blank_lines && lines[j].is_blank {
2099 break;
2100 }
2101
2102 lines[j].in_html_block = true;
2103
2104 if lines[j].content(content).contains(&closing_tag) {
2106 break;
2107 }
2108 j += 1;
2109 }
2110 }
2111 }
2112 }
2113
2114 i += 1;
2115 }
2116 }
2117
2118 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2121 if !flavor.supports_esm_blocks() {
2123 return;
2124 }
2125
2126 for line in lines.iter_mut() {
2127 if line.is_blank || line.in_html_comment {
2129 continue;
2130 }
2131
2132 let trimmed = line.content(content).trim_start();
2134 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2135 line.in_esm_block = true;
2136 } else {
2137 break;
2139 }
2140 }
2141 }
2142
2143 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2145 let mut code_spans = Vec::new();
2146
2147 if !content.contains('`') {
2149 return code_spans;
2150 }
2151
2152 let parser = Parser::new(content).into_offset_iter();
2154
2155 for (event, range) in parser {
2156 if let Event::Code(_) = event {
2157 let start_pos = range.start;
2158 let end_pos = range.end;
2159
2160 let full_span = &content[start_pos..end_pos];
2162 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2163
2164 let content_start = start_pos + backtick_count;
2166 let content_end = end_pos - backtick_count;
2167 let span_content = if content_start < content_end {
2168 content[content_start..content_end].to_string()
2169 } else {
2170 String::new()
2171 };
2172
2173 let line_idx = lines
2176 .partition_point(|line| line.byte_offset <= start_pos)
2177 .saturating_sub(1);
2178 let line_num = line_idx + 1;
2179 let col_start = start_pos - lines[line_idx].byte_offset;
2180
2181 let end_line_idx = lines
2183 .partition_point(|line| line.byte_offset <= end_pos)
2184 .saturating_sub(1);
2185 let col_end = end_pos - lines[end_line_idx].byte_offset;
2186
2187 code_spans.push(CodeSpan {
2188 line: line_num,
2189 start_col: col_start,
2190 end_col: col_end,
2191 byte_offset: start_pos,
2192 byte_end: end_pos,
2193 backtick_count,
2194 content: span_content,
2195 });
2196 }
2197 }
2198
2199 code_spans.sort_by_key(|span| span.byte_offset);
2201
2202 code_spans
2203 }
2204
2205 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2216 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2218
2219 #[inline]
2222 fn reset_tracking_state(
2223 list_item: &ListItemInfo,
2224 has_list_breaking_content: &mut bool,
2225 min_continuation: &mut usize,
2226 ) {
2227 *has_list_breaking_content = false;
2228 let marker_width = if list_item.is_ordered {
2229 list_item.marker.len() + 1 } else {
2231 list_item.marker.len()
2232 };
2233 *min_continuation = if list_item.is_ordered {
2234 marker_width
2235 } else {
2236 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2237 };
2238 }
2239
2240 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2243 let mut last_list_item_line = 0;
2244 let mut current_indent_level = 0;
2245 let mut last_marker_width = 0;
2246
2247 let mut has_list_breaking_content_since_last_item = false;
2249 let mut min_continuation_for_tracking = 0;
2250
2251 for (line_idx, line_info) in lines.iter().enumerate() {
2252 let line_num = line_idx + 1;
2253
2254 if line_info.in_code_block {
2256 if let Some(ref mut block) = current_block {
2257 let min_continuation_indent =
2259 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2260
2261 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2263
2264 match context {
2265 CodeBlockContext::Indented => {
2266 block.end_line = line_num;
2268 continue;
2269 }
2270 CodeBlockContext::Standalone => {
2271 let completed_block = current_block.take().unwrap();
2273 list_blocks.push(completed_block);
2274 continue;
2275 }
2276 CodeBlockContext::Adjacent => {
2277 block.end_line = line_num;
2279 continue;
2280 }
2281 }
2282 } else {
2283 continue;
2285 }
2286 }
2287
2288 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2290 caps.get(0).unwrap().as_str().to_string()
2291 } else {
2292 String::new()
2293 };
2294
2295 if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2297 let line_content = line_info.content(content).trim();
2298
2299 let breaks_list = line_info.heading.is_some()
2301 || line_content.starts_with("---")
2302 || line_content.starts_with("***")
2303 || line_content.starts_with("___")
2304 || (line_content.contains('|')
2305 && !line_content.contains("](")
2306 && !line_content.contains("http")
2307 && (line_content.matches('|').count() > 1
2308 || line_content.starts_with('|')
2309 || line_content.ends_with('|')))
2310 || line_content.starts_with(">")
2311 || (line_info.indent < min_continuation_for_tracking);
2312
2313 if breaks_list {
2314 has_list_breaking_content_since_last_item = true;
2315 }
2316 }
2317
2318 if let Some(list_item) = &line_info.list_item {
2320 let item_indent = list_item.marker_column;
2322 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2325 let is_nested = nesting > block.nesting_level;
2329 let same_type =
2330 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2331 let same_context = block.blockquote_prefix == blockquote_prefix;
2332 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2336 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2337
2338 let has_non_list_content = has_list_breaking_content_since_last_item;
2341
2342 let mut continues_list = if is_nested {
2346 same_context && reasonable_distance && !has_non_list_content
2348 } else {
2349 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2351 };
2352
2353 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2356 if block.item_lines.contains(&(line_num - 1)) {
2358 continues_list = true;
2360 }
2361 }
2362
2363 if continues_list {
2364 block.end_line = line_num;
2366 block.item_lines.push(line_num);
2367
2368 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2370 list_item.marker.len() + 1
2371 } else {
2372 list_item.marker.len()
2373 });
2374
2375 if !block.is_ordered
2377 && block.marker.is_some()
2378 && block.marker.as_ref() != Some(&list_item.marker)
2379 {
2380 block.marker = None;
2382 }
2383
2384 reset_tracking_state(
2386 list_item,
2387 &mut has_list_breaking_content_since_last_item,
2388 &mut min_continuation_for_tracking,
2389 );
2390 } else {
2391 list_blocks.push(block.clone());
2394
2395 *block = ListBlock {
2396 start_line: line_num,
2397 end_line: line_num,
2398 is_ordered: list_item.is_ordered,
2399 marker: if list_item.is_ordered {
2400 None
2401 } else {
2402 Some(list_item.marker.clone())
2403 },
2404 blockquote_prefix: blockquote_prefix.clone(),
2405 item_lines: vec![line_num],
2406 nesting_level: nesting,
2407 max_marker_width: if list_item.is_ordered {
2408 list_item.marker.len() + 1
2409 } else {
2410 list_item.marker.len()
2411 },
2412 };
2413
2414 reset_tracking_state(
2416 list_item,
2417 &mut has_list_breaking_content_since_last_item,
2418 &mut min_continuation_for_tracking,
2419 );
2420 }
2421 } else {
2422 current_block = Some(ListBlock {
2424 start_line: line_num,
2425 end_line: line_num,
2426 is_ordered: list_item.is_ordered,
2427 marker: if list_item.is_ordered {
2428 None
2429 } else {
2430 Some(list_item.marker.clone())
2431 },
2432 blockquote_prefix,
2433 item_lines: vec![line_num],
2434 nesting_level: nesting,
2435 max_marker_width: list_item.marker.len(),
2436 });
2437
2438 reset_tracking_state(
2440 list_item,
2441 &mut has_list_breaking_content_since_last_item,
2442 &mut min_continuation_for_tracking,
2443 );
2444 }
2445
2446 last_list_item_line = line_num;
2447 current_indent_level = item_indent;
2448 last_marker_width = if list_item.is_ordered {
2449 list_item.marker.len() + 1 } else {
2451 list_item.marker.len()
2452 };
2453 } else if let Some(ref mut block) = current_block {
2454 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2464 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2465 } else {
2466 false
2467 };
2468
2469 let min_continuation_indent = if block.is_ordered {
2473 current_indent_level + last_marker_width
2474 } else {
2475 current_indent_level + 2 };
2477
2478 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2479 block.end_line = line_num;
2481 } else if line_info.is_blank {
2482 let mut check_idx = line_idx + 1;
2485 let mut found_continuation = false;
2486
2487 while check_idx < lines.len() && lines[check_idx].is_blank {
2489 check_idx += 1;
2490 }
2491
2492 if check_idx < lines.len() {
2493 let next_line = &lines[check_idx];
2494 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2496 found_continuation = true;
2497 }
2498 else if !next_line.in_code_block
2500 && next_line.list_item.is_some()
2501 && let Some(item) = &next_line.list_item
2502 {
2503 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2504 .find(next_line.content(content))
2505 .map_or(String::new(), |m| m.as_str().to_string());
2506 if item.marker_column == current_indent_level
2507 && item.is_ordered == block.is_ordered
2508 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2509 {
2510 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2513 if let Some(between_line) = lines.get(idx) {
2514 let between_content = between_line.content(content);
2515 let trimmed = between_content.trim();
2516 if trimmed.is_empty() {
2518 return false;
2519 }
2520 let line_indent = between_content.len() - between_content.trim_start().len();
2522
2523 if trimmed.starts_with("```")
2525 || trimmed.starts_with("~~~")
2526 || trimmed.starts_with("---")
2527 || trimmed.starts_with("***")
2528 || trimmed.starts_with("___")
2529 || trimmed.starts_with(">")
2530 || trimmed.contains('|') || between_line.heading.is_some()
2532 {
2533 return true; }
2535
2536 line_indent >= min_continuation_indent
2538 } else {
2539 false
2540 }
2541 });
2542
2543 if block.is_ordered {
2544 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2547 if let Some(between_line) = lines.get(idx) {
2548 let trimmed = between_line.content(content).trim();
2549 if trimmed.is_empty() {
2550 return false;
2551 }
2552 trimmed.starts_with("```")
2554 || trimmed.starts_with("~~~")
2555 || trimmed.starts_with("---")
2556 || trimmed.starts_with("***")
2557 || trimmed.starts_with("___")
2558 || trimmed.starts_with(">")
2559 || trimmed.contains('|') || between_line.heading.is_some()
2561 } else {
2562 false
2563 }
2564 });
2565 found_continuation = !has_structural_separators;
2566 } else {
2567 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2569 if let Some(between_line) = lines.get(idx) {
2570 let trimmed = between_line.content(content).trim();
2571 if trimmed.is_empty() {
2572 return false;
2573 }
2574 trimmed.starts_with("```")
2576 || trimmed.starts_with("~~~")
2577 || trimmed.starts_with("---")
2578 || trimmed.starts_with("***")
2579 || trimmed.starts_with("___")
2580 || trimmed.starts_with(">")
2581 || trimmed.contains('|') || between_line.heading.is_some()
2583 } else {
2584 false
2585 }
2586 });
2587 found_continuation = !has_structural_separators;
2588 }
2589 }
2590 }
2591 }
2592
2593 if found_continuation {
2594 block.end_line = line_num;
2596 } else {
2597 list_blocks.push(block.clone());
2599 current_block = None;
2600 }
2601 } else {
2602 let min_required_indent = if block.is_ordered {
2605 current_indent_level + last_marker_width
2606 } else {
2607 current_indent_level + 2
2608 };
2609
2610 let line_content = line_info.content(content).trim();
2615 let is_structural_separator = line_info.heading.is_some()
2616 || line_content.starts_with("```")
2617 || line_content.starts_with("~~~")
2618 || line_content.starts_with("---")
2619 || line_content.starts_with("***")
2620 || line_content.starts_with("___")
2621 || line_content.starts_with(">")
2622 || (line_content.contains('|')
2623 && !line_content.contains("](")
2624 && !line_content.contains("http")
2625 && (line_content.matches('|').count() > 1
2626 || line_content.starts_with('|')
2627 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2632 && !line_info.is_blank
2633 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2634
2635 if is_lazy_continuation {
2636 let content_to_check = if !blockquote_prefix.is_empty() {
2639 line_info
2641 .content(content)
2642 .strip_prefix(&blockquote_prefix)
2643 .unwrap_or(line_info.content(content))
2644 .trim()
2645 } else {
2646 line_info.content(content).trim()
2647 };
2648
2649 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2650
2651 if starts_with_uppercase && last_list_item_line > 0 {
2654 list_blocks.push(block.clone());
2656 current_block = None;
2657 } else {
2658 block.end_line = line_num;
2660 }
2661 } else {
2662 list_blocks.push(block.clone());
2664 current_block = None;
2665 }
2666 }
2667 }
2668 }
2669
2670 if let Some(block) = current_block {
2672 list_blocks.push(block);
2673 }
2674
2675 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2677
2678 list_blocks
2679 }
2680
2681 fn compute_char_frequency(content: &str) -> CharFrequency {
2683 let mut frequency = CharFrequency::default();
2684
2685 for ch in content.chars() {
2686 match ch {
2687 '#' => frequency.hash_count += 1,
2688 '*' => frequency.asterisk_count += 1,
2689 '_' => frequency.underscore_count += 1,
2690 '-' => frequency.hyphen_count += 1,
2691 '+' => frequency.plus_count += 1,
2692 '>' => frequency.gt_count += 1,
2693 '|' => frequency.pipe_count += 1,
2694 '[' => frequency.bracket_count += 1,
2695 '`' => frequency.backtick_count += 1,
2696 '<' => frequency.lt_count += 1,
2697 '!' => frequency.exclamation_count += 1,
2698 '\n' => frequency.newline_count += 1,
2699 _ => {}
2700 }
2701 }
2702
2703 frequency
2704 }
2705
2706 fn parse_html_tags(
2708 content: &str,
2709 lines: &[LineInfo],
2710 code_blocks: &[(usize, usize)],
2711 flavor: MarkdownFlavor,
2712 ) -> Vec<HtmlTag> {
2713 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2714 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2715
2716 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2717
2718 for cap in HTML_TAG_REGEX.captures_iter(content) {
2719 let full_match = cap.get(0).unwrap();
2720 let match_start = full_match.start();
2721 let match_end = full_match.end();
2722
2723 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2725 continue;
2726 }
2727
2728 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2729 let tag_name_original = cap.get(2).unwrap().as_str();
2730 let tag_name = tag_name_original.to_lowercase();
2731 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2732
2733 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2736 continue;
2737 }
2738
2739 let mut line_num = 1;
2741 let mut col_start = match_start;
2742 let mut col_end = match_end;
2743 for (idx, line_info) in lines.iter().enumerate() {
2744 if match_start >= line_info.byte_offset {
2745 line_num = idx + 1;
2746 col_start = match_start - line_info.byte_offset;
2747 col_end = match_end - line_info.byte_offset;
2748 } else {
2749 break;
2750 }
2751 }
2752
2753 html_tags.push(HtmlTag {
2754 line: line_num,
2755 start_col: col_start,
2756 end_col: col_end,
2757 byte_offset: match_start,
2758 byte_end: match_end,
2759 tag_name,
2760 is_closing,
2761 is_self_closing,
2762 raw_content: full_match.as_str().to_string(),
2763 });
2764 }
2765
2766 html_tags
2767 }
2768
2769 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2771 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2772 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2773
2774 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2775
2776 for cap in EMPHASIS_REGEX.captures_iter(content) {
2777 let full_match = cap.get(0).unwrap();
2778 let match_start = full_match.start();
2779 let match_end = full_match.end();
2780
2781 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2783 continue;
2784 }
2785
2786 let opening_markers = cap.get(1).unwrap().as_str();
2787 let content_part = cap.get(2).unwrap().as_str();
2788 let closing_markers = cap.get(3).unwrap().as_str();
2789
2790 if opening_markers.chars().next() != closing_markers.chars().next()
2792 || opening_markers.len() != closing_markers.len()
2793 {
2794 continue;
2795 }
2796
2797 let marker = opening_markers.chars().next().unwrap();
2798 let marker_count = opening_markers.len();
2799
2800 let mut line_num = 1;
2802 let mut col_start = match_start;
2803 let mut col_end = match_end;
2804 for (idx, line_info) in lines.iter().enumerate() {
2805 if match_start >= line_info.byte_offset {
2806 line_num = idx + 1;
2807 col_start = match_start - line_info.byte_offset;
2808 col_end = match_end - line_info.byte_offset;
2809 } else {
2810 break;
2811 }
2812 }
2813
2814 emphasis_spans.push(EmphasisSpan {
2815 line: line_num,
2816 start_col: col_start,
2817 end_col: col_end,
2818 byte_offset: match_start,
2819 byte_end: match_end,
2820 marker,
2821 marker_count,
2822 content: content_part.to_string(),
2823 });
2824 }
2825
2826 emphasis_spans
2827 }
2828
2829 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2831 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2832
2833 for (line_idx, line_info) in lines.iter().enumerate() {
2834 if line_info.in_code_block || line_info.is_blank {
2836 continue;
2837 }
2838
2839 let line = line_info.content(content);
2840 let line_num = line_idx + 1;
2841
2842 if !line.contains('|') {
2844 continue;
2845 }
2846
2847 let parts: Vec<&str> = line.split('|').collect();
2849 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2850
2851 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2853 let mut column_alignments = Vec::new();
2854
2855 if is_separator {
2856 for part in &parts[1..parts.len() - 1] {
2857 let trimmed = part.trim();
2859 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2860 "center".to_string()
2861 } else if trimmed.ends_with(':') {
2862 "right".to_string()
2863 } else if trimmed.starts_with(':') {
2864 "left".to_string()
2865 } else {
2866 "none".to_string()
2867 };
2868 column_alignments.push(alignment);
2869 }
2870 }
2871
2872 table_rows.push(TableRow {
2873 line: line_num,
2874 is_separator,
2875 column_count,
2876 column_alignments,
2877 });
2878 }
2879
2880 table_rows
2881 }
2882
2883 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2885 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2886
2887 for cap in BARE_URL_PATTERN.captures_iter(content) {
2889 let full_match = cap.get(0).unwrap();
2890 let match_start = full_match.start();
2891 let match_end = full_match.end();
2892
2893 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2895 continue;
2896 }
2897
2898 let preceding_char = if match_start > 0 {
2900 content.chars().nth(match_start - 1)
2901 } else {
2902 None
2903 };
2904 let following_char = content.chars().nth(match_end);
2905
2906 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2907 continue;
2908 }
2909 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2910 continue;
2911 }
2912
2913 let url = full_match.as_str();
2914 let url_type = if url.starts_with("https://") {
2915 "https"
2916 } else if url.starts_with("http://") {
2917 "http"
2918 } else if url.starts_with("ftp://") {
2919 "ftp"
2920 } else {
2921 "other"
2922 };
2923
2924 let mut line_num = 1;
2926 let mut col_start = match_start;
2927 let mut col_end = match_end;
2928 for (idx, line_info) in lines.iter().enumerate() {
2929 if match_start >= line_info.byte_offset {
2930 line_num = idx + 1;
2931 col_start = match_start - line_info.byte_offset;
2932 col_end = match_end - line_info.byte_offset;
2933 } else {
2934 break;
2935 }
2936 }
2937
2938 bare_urls.push(BareUrl {
2939 line: line_num,
2940 start_col: col_start,
2941 end_col: col_end,
2942 byte_offset: match_start,
2943 byte_end: match_end,
2944 url: url.to_string(),
2945 url_type: url_type.to_string(),
2946 });
2947 }
2948
2949 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2951 let full_match = cap.get(0).unwrap();
2952 let match_start = full_match.start();
2953 let match_end = full_match.end();
2954
2955 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2957 continue;
2958 }
2959
2960 let preceding_char = if match_start > 0 {
2962 content.chars().nth(match_start - 1)
2963 } else {
2964 None
2965 };
2966 let following_char = content.chars().nth(match_end);
2967
2968 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2969 continue;
2970 }
2971 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2972 continue;
2973 }
2974
2975 let email = full_match.as_str();
2976
2977 let mut line_num = 1;
2979 let mut col_start = match_start;
2980 let mut col_end = match_end;
2981 for (idx, line_info) in lines.iter().enumerate() {
2982 if match_start >= line_info.byte_offset {
2983 line_num = idx + 1;
2984 col_start = match_start - line_info.byte_offset;
2985 col_end = match_end - line_info.byte_offset;
2986 } else {
2987 break;
2988 }
2989 }
2990
2991 bare_urls.push(BareUrl {
2992 line: line_num,
2993 start_col: col_start,
2994 end_col: col_end,
2995 byte_offset: match_start,
2996 byte_end: match_end,
2997 url: email.to_string(),
2998 url_type: "email".to_string(),
2999 });
3000 }
3001
3002 bare_urls
3003 }
3004}
3005
3006fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3008 if list_blocks.len() < 2 {
3009 return;
3010 }
3011
3012 let mut merger = ListBlockMerger::new(content, lines);
3013 *list_blocks = merger.merge(list_blocks);
3014}
3015
/// Decides which neighbouring list blocks belong together and merges them.
struct ListBlockMerger<'a> {
    /// Document text; line contents are sliced out of it via `LineInfo::content`.
    content: &'a str,
    /// Per-line metadata, looked up by `line_number - 1`.
    lines: &'a [LineInfo],
}
3021
3022impl<'a> ListBlockMerger<'a> {
3023 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3024 Self { content, lines }
3025 }
3026
3027 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3028 let mut merged = Vec::with_capacity(list_blocks.len());
3029 let mut current = list_blocks[0].clone();
3030
3031 for next in list_blocks.iter().skip(1) {
3032 if self.should_merge_blocks(¤t, next) {
3033 current = self.merge_two_blocks(current, next);
3034 } else {
3035 merged.push(current);
3036 current = next.clone();
3037 }
3038 }
3039
3040 merged.push(current);
3041 merged
3042 }
3043
3044 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3046 if !self.blocks_are_compatible(current, next) {
3048 return false;
3049 }
3050
3051 let spacing = self.analyze_spacing_between(current, next);
3053 match spacing {
3054 BlockSpacing::Consecutive => true,
3055 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3056 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3057 self.can_merge_with_content_between(current, next)
3058 }
3059 }
3060 }
3061
3062 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3064 current.is_ordered == next.is_ordered
3065 && current.blockquote_prefix == next.blockquote_prefix
3066 && current.nesting_level == next.nesting_level
3067 }
3068
3069 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3071 let gap = next.start_line - current.end_line;
3072
3073 match gap {
3074 1 => BlockSpacing::Consecutive,
3075 2 => BlockSpacing::SingleBlank,
3076 _ if gap > 2 => {
3077 if self.has_only_blank_lines_between(current, next) {
3078 BlockSpacing::MultipleBlanks
3079 } else {
3080 BlockSpacing::ContentBetween
3081 }
3082 }
3083 _ => BlockSpacing::Consecutive, }
3085 }
3086
3087 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3089 if has_meaningful_content_between(self.content, current, next, self.lines) {
3092 return false; }
3094
3095 !current.is_ordered && current.marker == next.marker
3097 }
3098
3099 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3101 if has_meaningful_content_between(self.content, current, next, self.lines) {
3103 return false; }
3105
3106 current.is_ordered && next.is_ordered
3108 }
3109
3110 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3112 for line_num in (current.end_line + 1)..next.start_line {
3113 if let Some(line_info) = self.lines.get(line_num - 1)
3114 && !line_info.content(self.content).trim().is_empty()
3115 {
3116 return false;
3117 }
3118 }
3119 true
3120 }
3121
3122 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3124 current.end_line = next.end_line;
3125 current.item_lines.extend_from_slice(&next.item_lines);
3126
3127 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3129
3130 if !current.is_ordered && self.markers_differ(¤t, next) {
3132 current.marker = None; }
3134
3135 current
3136 }
3137
3138 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3140 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3141 }
3142}
3143
/// How two successive list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts on the line right after the current one ends
    /// (also the fallback when the line gap is zero).
    Consecutive,
    /// Exactly one line lies between the blocks (the classification looks only
    /// at the line gap, not at the line's content).
    SingleBlank,
    /// Two or more lines lie between the blocks and all of them are blank.
    MultipleBlanks,
    /// Two or more lines lie between the blocks and at least one is not blank.
    ContentBetween,
}
3152
3153fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3155 for line_num in (current.end_line + 1)..next.start_line {
3157 if let Some(line_info) = lines.get(line_num - 1) {
3158 let trimmed = line_info.content(content).trim();
3160
3161 if trimmed.is_empty() {
3163 continue;
3164 }
3165
3166 if line_info.heading.is_some() {
3170 return true; }
3172
3173 if is_horizontal_rule(trimmed) {
3175 return true; }
3177
3178 if trimmed.contains('|') && trimmed.len() > 1 {
3181 if !trimmed.contains("](") && !trimmed.contains("http") {
3183 let pipe_count = trimmed.matches('|').count();
3185 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3186 return true; }
3188 }
3189 }
3190
3191 if trimmed.starts_with('>') {
3193 return true; }
3195
3196 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3198 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3199
3200 let min_continuation_indent = if current.is_ordered {
3202 current.nesting_level + current.max_marker_width + 1 } else {
3204 current.nesting_level + 2
3205 };
3206
3207 if line_indent < min_continuation_indent {
3208 return true; }
3211 }
3212
3213 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3215
3216 let min_indent = if current.is_ordered {
3218 current.nesting_level + current.max_marker_width
3219 } else {
3220 current.nesting_level + 2
3221 };
3222
3223 if line_indent < min_indent {
3225 return true; }
3227
3228 }
3231 }
3232
3233 false
3235}
3236
/// Returns `true` when `trimmed` is a horizontal rule: at least three copies
/// of one marker character (`-`, `*` or `_`), optionally interleaved with
/// spaces or tabs, and nothing else.
///
/// The caller is expected to pass the line with surrounding whitespace already
/// removed; the first character decides which marker is being counted.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three markers.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Single pass over the characters; no intermediate buffer is needed.
    let mut count = 0;
    for ch in trimmed.chars() {
        if ch == marker {
            count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    count >= 3
}
3260
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: no lines, a single line offset at 0, and offset 0 maps to (1, 1).
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without trailing newline; columns are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines,
    /// including an empty line.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content slice, byte offset, indent and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces, matching the `indent`
        // assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Accessors take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Detection of unordered, nested and ordered list items, and of a plain
    /// paragraph that is not a list item.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet is indented by two spaces (see `marker_column` below).
        // NOTE(review): the three-space indent before `2)` is inferred from the
        // "Nested ordered" label; no assertion depends on it.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that point at newline characters and one past the end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM
    /// block lines while the following Markdown is not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import`/`export` lines are not flagged in the Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}