1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
9static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
12 Regex::new(
13 r#"(?sx)
14 \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
15 (?:
16 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
17 |
18 \[([^\]]*)\] # Reference ID in group 6
19 )"#
20 ).unwrap()
21});
22
23static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
26 Regex::new(
27 r#"(?sx)
28 !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
29 (?:
30 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
31 |
32 \[([^\]]*)\] # Reference ID in group 6
33 )"#
34 ).unwrap()
35});
36
37static REF_DEF_PATTERN: LazyLock<Regex> =
39 LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
40
41static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
43 Regex::new(
44 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
45 ).unwrap()
46});
47
48static BARE_EMAIL_PATTERN: LazyLock<Regex> =
50 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
51
52static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
54
55#[derive(Debug, Clone)]
57pub struct LineInfo {
58 pub byte_offset: usize,
60 pub byte_len: usize,
62 pub indent: usize,
64 pub is_blank: bool,
66 pub in_code_block: bool,
68 pub in_front_matter: bool,
70 pub in_html_block: bool,
72 pub in_html_comment: bool,
74 pub list_item: Option<ListItemInfo>,
76 pub heading: Option<HeadingInfo>,
78 pub blockquote: Option<BlockquoteInfo>,
80 pub in_mkdocstrings: bool,
82 pub in_esm_block: bool,
84}
85
86impl LineInfo {
87 pub fn content<'a>(&self, source: &'a str) -> &'a str {
89 &source[self.byte_offset..self.byte_offset + self.byte_len]
90 }
91}
92
/// Details about a list item found on a line.
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// The list marker text.
    pub marker: String,
    /// `true` for ordered list items, `false` for unordered ones.
    pub is_ordered: bool,
    /// Item number, when one was recorded.
    pub number: Option<usize>,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the item content starts.
    pub content_column: usize,
}
107
/// Syntactic style of a heading.
#[derive(Clone, Debug, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading style.
    ATX,
    /// First Setext heading style.
    Setext1,
    /// Second Setext heading style.
    Setext2,
}
118
119#[derive(Debug, Clone)]
121pub struct ParsedLink<'a> {
122 pub line: usize,
124 pub start_col: usize,
126 pub end_col: usize,
128 pub byte_offset: usize,
130 pub byte_end: usize,
132 pub text: Cow<'a, str>,
134 pub url: Cow<'a, str>,
136 pub is_reference: bool,
138 pub reference_id: Option<Cow<'a, str>>,
140 pub link_type: LinkType,
142}
143
/// A reference the Markdown parser reported as broken (unresolved).
#[derive(Clone, Debug)]
pub struct BrokenLinkInfo {
    /// The reference text reported by the parser.
    pub reference: String,
    /// Byte range of the broken link in the document.
    pub span: std::ops::Range<usize>,
}
152
153#[derive(Debug, Clone)]
155pub struct ParsedImage<'a> {
156 pub line: usize,
158 pub start_col: usize,
160 pub end_col: usize,
162 pub byte_offset: usize,
164 pub byte_end: usize,
166 pub alt_text: Cow<'a, str>,
168 pub url: Cow<'a, str>,
170 pub is_reference: bool,
172 pub reference_id: Option<Cow<'a, str>>,
174 pub link_type: LinkType,
176}
177
/// A reference definition such as `[id]: url "title"`.
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Reference id, stored lower-cased.
    pub id: String,
    /// Target URL.
    pub url: String,
    /// Optional title.
    pub title: Option<String>,
    /// Byte offset of the definition start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition within the document.
    pub byte_end: usize,
}
194
/// An inline code span.
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// Line number the span is on.
    pub line: usize,
    /// Start column of the span.
    pub start_col: usize,
    /// End column of the span (exclusive in the lookups in this file).
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the span within the document.
    pub byte_end: usize,
    /// Number of backticks delimiting the span.
    pub backtick_count: usize,
    /// Text of the span.
    pub content: String,
}
213
214#[derive(Debug, Clone)]
216pub struct HeadingInfo {
217 pub level: u8,
219 pub style: HeadingStyle,
221 pub marker: String,
223 pub marker_column: usize,
225 pub content_column: usize,
227 pub text: String,
229 pub custom_id: Option<String>,
231 pub raw_text: String,
233 pub has_closing_sequence: bool,
235 pub closing_sequence: String,
237}
238
/// Details about a blockquote line.
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Blockquote nesting level.
    pub nesting_level: usize,
    /// Indentation text before the marker.
    pub indent: String,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// The blockquote prefix text.
    pub prefix: String,
    /// Content following the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Flag for the MD028 fix (presumably read by that rule; confirm against its implementation).
    pub needs_md028_fix: bool,
}
259
/// A run of lines forming one list.
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// First line of the block (inclusive in the range lookups in this file).
    pub start_line: usize,
    /// Last line of the block (inclusive in the range lookups in this file).
    pub end_line: usize,
    /// Whether the list is ordered.
    pub is_ordered: bool,
    /// Marker recorded for the block, if any.
    pub marker: Option<String>,
    /// Blockquote prefix recorded for the block.
    pub blockquote_prefix: String,
    /// Line numbers of the items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting level of the list.
    pub nesting_level: usize,
    /// Widest marker width seen in the block.
    pub max_marker_width: usize,
}
280
281use std::sync::{Arc, Mutex};
282
/// Occurrence counts for characters the lint context tracks; every counter
/// starts at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Counter consulted for `#`.
    pub hash_count: usize,
    /// Counter consulted for `*`.
    pub asterisk_count: usize,
    /// Counter consulted for `_`.
    pub underscore_count: usize,
    /// Counter consulted for `-`.
    pub hyphen_count: usize,
    /// Counter consulted for `+`.
    pub plus_count: usize,
    /// Counter consulted for `>`.
    pub gt_count: usize,
    /// Counter consulted for `|`.
    pub pipe_count: usize,
    /// Counter consulted for `[`.
    pub bracket_count: usize,
    /// Counter consulted for the backtick character.
    pub backtick_count: usize,
    /// Counter consulted for `<`.
    pub lt_count: usize,
    /// Counter consulted for `!`.
    pub exclamation_count: usize,
    /// Counter consulted for the newline character.
    pub newline_count: usize,
}
311
/// An HTML tag found in the document.
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// Line number the tag is on.
    pub line: usize,
    /// Start column of the tag.
    pub start_col: usize,
    /// End column of the tag.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// Raw text of the tag.
    pub raw_content: String,
}
334
/// An emphasis span found in the document.
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// Line number the span is on.
    pub line: usize,
    /// Start column of the span.
    pub start_col: usize,
    /// End column of the span.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Marker character used by the span.
    pub marker: char,
    /// Number of marker characters.
    pub marker_count: usize,
    /// Text of the span.
    pub content: String,
}
355
/// A table row found in the document.
#[derive(Clone, Debug)]
pub struct TableRow {
    /// Line number of the row.
    pub line: usize,
    /// Whether the row is a separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Column alignments recorded for the row.
    pub column_alignments: Vec<String>,
}
368
/// A bare URL or address found in the document.
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// Line number the URL is on.
    pub line: usize,
    /// Start column of the URL.
    pub start_col: usize,
    /// End column of the URL.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL, stored as a string.
    pub url_type: String,
}
387
388pub struct LintContext<'a> {
389 pub content: &'a str,
390 pub line_offsets: Vec<usize>,
391 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink<'a>>, pub images: Vec<ParsedImage<'a>>, pub broken_links: Vec<BrokenLinkInfo>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, pub line_index: crate::utils::range_utils::LineIndex<'a>, jinja_ranges: Vec<(usize, usize)>, pub flavor: MarkdownFlavor, }
410
/// The four consecutive pieces of a blockquote line, each borrowed from it.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}
418
419#[inline]
421fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
422 let bytes = line.as_bytes();
423 let mut pos = 0;
424
425 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
427 pos += 1;
428 }
429 let indent_end = pos;
430
431 if pos >= bytes.len() || bytes[pos] != b'>' {
433 return None;
434 }
435
436 while pos < bytes.len() && bytes[pos] == b'>' {
438 pos += 1;
439 }
440 let markers_end = pos;
441
442 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
444 pos += 1;
445 }
446 let spaces_end = pos;
447
448 Some(BlockquoteComponents {
449 indent: &line[0..indent_end],
450 markers: &line[indent_end..markers_end],
451 spaces_after: &line[markers_end..spaces_end],
452 content: &line[spaces_end..],
453 })
454}
455
456impl<'a> LintContext<'a> {
457 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
458 use std::time::Instant;
459 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
460
461 let start = Instant::now();
462 let mut line_offsets = vec![0];
463 for (i, c) in content.char_indices() {
464 if c == '\n' {
465 line_offsets.push(i + 1);
466 }
467 }
468 if profile {
469 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
470 }
471
472 let start = Instant::now();
474 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
475 if profile {
476 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
477 }
478
479 let start = Instant::now();
481 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
482 if profile {
483 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
484 }
485
486 let start = Instant::now();
488 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
489 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
490 } else {
491 Vec::new()
492 };
493 if profile {
494 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
495 }
496
497 let start = Instant::now();
499 let mut lines = Self::compute_basic_line_info(
500 content,
501 &line_offsets,
502 &code_blocks,
503 flavor,
504 &html_comment_ranges,
505 &autodoc_ranges,
506 );
507 if profile {
508 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
509 }
510
511 let start = Instant::now();
513 Self::detect_html_blocks(content, &mut lines);
514 if profile {
515 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
516 }
517
518 let start = Instant::now();
520 Self::detect_esm_blocks(content, &mut lines, flavor);
521 if profile {
522 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
523 }
524
525 let start = Instant::now();
527 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
528 if profile {
529 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
530 }
531
532 let start = Instant::now();
534 let code_spans = Self::parse_code_spans(content, &lines);
535 if profile {
536 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
537 }
538
539 let start = Instant::now();
541 let (links, broken_links) =
542 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
543 if profile {
544 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
545 }
546
547 let start = Instant::now();
548 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
549 if profile {
550 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
551 }
552
553 let start = Instant::now();
554 let reference_defs = Self::parse_reference_defs(content, &lines);
555 if profile {
556 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
557 }
558
559 let start = Instant::now();
560 let list_blocks = Self::parse_list_blocks(content, &lines);
561 if profile {
562 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
563 }
564
565 let start = Instant::now();
567 let char_frequency = Self::compute_char_frequency(content);
568 if profile {
569 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
570 }
571
572 let start = Instant::now();
574 let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
575 content,
576 &code_blocks,
577 &code_spans,
578 &html_comment_ranges,
579 );
580 if profile {
581 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
582 }
583
584 let start = Instant::now();
586 let line_index = crate::utils::range_utils::LineIndex::new(content);
587 if profile {
588 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
589 }
590
591 let start = Instant::now();
593 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
594 if profile {
595 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
596 }
597
598 Self {
599 content,
600 line_offsets,
601 code_blocks,
602 lines,
603 links,
604 images,
605 broken_links,
606 reference_defs,
607 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
608 list_blocks,
609 char_frequency,
610 html_tags_cache: Mutex::new(None),
611 emphasis_spans_cache: Mutex::new(None),
612 table_rows_cache: Mutex::new(None),
613 bare_urls_cache: Mutex::new(None),
614 html_comment_ranges,
615 table_blocks,
616 line_index,
617 jinja_ranges,
618 flavor,
619 }
620 }
621
622 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
624 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
625
626 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
627 }
628
629 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
631 &self.html_comment_ranges
632 }
633
634 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
636 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
637
638 Arc::clone(cache.get_or_insert_with(|| {
639 Arc::new(Self::parse_html_tags(
640 self.content,
641 &self.lines,
642 &self.code_blocks,
643 self.flavor,
644 ))
645 }))
646 }
647
648 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
650 let mut cache = self
651 .emphasis_spans_cache
652 .lock()
653 .expect("Emphasis spans cache mutex poisoned");
654
655 Arc::clone(
656 cache.get_or_insert_with(|| {
657 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
658 }),
659 )
660 }
661
662 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
664 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
665
666 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
667 }
668
669 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
671 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
672
673 Arc::clone(
674 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
675 )
676 }
677
678 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
680 match self.line_offsets.binary_search(&offset) {
681 Ok(line) => (line + 1, 1),
682 Err(line) => {
683 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
684 (line, offset - line_start + 1)
685 }
686 }
687 }
688
689 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
691 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
693 return true;
694 }
695
696 self.code_spans()
698 .iter()
699 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
700 }
701
702 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
704 if line_num > 0 {
705 self.lines.get(line_num - 1)
706 } else {
707 None
708 }
709 }
710
711 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
713 self.line_info(line_num).map(|info| info.byte_offset)
714 }
715
716 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
718 let normalized_id = ref_id.to_lowercase();
719 self.reference_defs
720 .iter()
721 .find(|def| def.id == normalized_id)
722 .map(|def| def.url.as_str())
723 }
724
725 pub fn is_in_list_block(&self, line_num: usize) -> bool {
727 self.list_blocks
728 .iter()
729 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
730 }
731
732 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
734 self.list_blocks
735 .iter()
736 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
737 }
738
739 pub fn is_in_code_block(&self, line_num: usize) -> bool {
743 if line_num == 0 || line_num > self.lines.len() {
744 return false;
745 }
746 self.lines[line_num - 1].in_code_block
747 }
748
749 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
751 if line_num == 0 || line_num > self.lines.len() {
752 return false;
753 }
754 self.lines[line_num - 1].in_front_matter
755 }
756
757 pub fn is_in_html_block(&self, line_num: usize) -> bool {
759 if line_num == 0 || line_num > self.lines.len() {
760 return false;
761 }
762 self.lines[line_num - 1].in_html_block
763 }
764
765 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
767 if line_num == 0 || line_num > self.lines.len() {
768 return false;
769 }
770
771 let col_0indexed = if col > 0 { col - 1 } else { 0 };
775 let code_spans = self.code_spans();
776 code_spans
777 .iter()
778 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
779 }
780
781 #[inline]
784 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
785 self.reference_defs
786 .iter()
787 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
788 }
789
790 #[inline]
794 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
795 self.html_comment_ranges
796 .iter()
797 .any(|range| byte_pos >= range.start && byte_pos < range.end)
798 }
799
800 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
802 self.jinja_ranges
803 .iter()
804 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
805 }
806
807 pub fn has_char(&self, ch: char) -> bool {
809 match ch {
810 '#' => self.char_frequency.hash_count > 0,
811 '*' => self.char_frequency.asterisk_count > 0,
812 '_' => self.char_frequency.underscore_count > 0,
813 '-' => self.char_frequency.hyphen_count > 0,
814 '+' => self.char_frequency.plus_count > 0,
815 '>' => self.char_frequency.gt_count > 0,
816 '|' => self.char_frequency.pipe_count > 0,
817 '[' => self.char_frequency.bracket_count > 0,
818 '`' => self.char_frequency.backtick_count > 0,
819 '<' => self.char_frequency.lt_count > 0,
820 '!' => self.char_frequency.exclamation_count > 0,
821 '\n' => self.char_frequency.newline_count > 0,
822 _ => self.content.contains(ch), }
824 }
825
826 pub fn char_count(&self, ch: char) -> usize {
828 match ch {
829 '#' => self.char_frequency.hash_count,
830 '*' => self.char_frequency.asterisk_count,
831 '_' => self.char_frequency.underscore_count,
832 '-' => self.char_frequency.hyphen_count,
833 '+' => self.char_frequency.plus_count,
834 '>' => self.char_frequency.gt_count,
835 '|' => self.char_frequency.pipe_count,
836 '[' => self.char_frequency.bracket_count,
837 '`' => self.char_frequency.backtick_count,
838 '<' => self.char_frequency.lt_count,
839 '!' => self.char_frequency.exclamation_count,
840 '\n' => self.char_frequency.newline_count,
841 _ => self.content.matches(ch).count(), }
843 }
844
845 pub fn likely_has_headings(&self) -> bool {
847 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
849
850 pub fn likely_has_lists(&self) -> bool {
852 self.char_frequency.asterisk_count > 0
853 || self.char_frequency.hyphen_count > 0
854 || self.char_frequency.plus_count > 0
855 }
856
857 pub fn likely_has_emphasis(&self) -> bool {
859 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
860 }
861
862 pub fn likely_has_tables(&self) -> bool {
864 self.char_frequency.pipe_count > 2
865 }
866
867 pub fn likely_has_blockquotes(&self) -> bool {
869 self.char_frequency.gt_count > 0
870 }
871
872 pub fn likely_has_code(&self) -> bool {
874 self.char_frequency.backtick_count > 0
875 }
876
877 pub fn likely_has_links_or_images(&self) -> bool {
879 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
880 }
881
882 pub fn likely_has_html(&self) -> bool {
884 self.char_frequency.lt_count > 0
885 }
886
887 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
889 self.html_tags()
890 .iter()
891 .filter(|tag| tag.line == line_num)
892 .cloned()
893 .collect()
894 }
895
896 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
898 self.emphasis_spans()
899 .iter()
900 .filter(|span| span.line == line_num)
901 .cloned()
902 .collect()
903 }
904
905 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
907 self.table_rows()
908 .iter()
909 .filter(|row| row.line == line_num)
910 .cloned()
911 .collect()
912 }
913
914 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
916 self.bare_urls()
917 .iter()
918 .filter(|url| url.line == line_num)
919 .cloned()
920 .collect()
921 }
922
923 #[inline]
929 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
930 let idx = match lines.binary_search_by(|line| {
932 if byte_offset < line.byte_offset {
933 std::cmp::Ordering::Greater
934 } else if byte_offset > line.byte_offset + line.byte_len {
935 std::cmp::Ordering::Less
936 } else {
937 std::cmp::Ordering::Equal
938 }
939 }) {
940 Ok(idx) => idx,
941 Err(idx) => idx.saturating_sub(1),
942 };
943
944 let line = &lines[idx];
945 let line_num = idx + 1;
946 let col = byte_offset.saturating_sub(line.byte_offset);
947
948 (idx, line_num, col)
949 }
950
951 #[inline]
953 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
954 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
956
957 if idx > 0 {
959 let span = &code_spans[idx - 1];
960 if offset >= span.byte_offset && offset < span.byte_end {
961 return true;
962 }
963 }
964
965 false
966 }
967
968 fn parse_links(
970 content: &'a str,
971 lines: &[LineInfo],
972 code_blocks: &[(usize, usize)],
973 code_spans: &[CodeSpan],
974 flavor: MarkdownFlavor,
975 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
976 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
977 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
978 use std::collections::HashSet;
979
980 let mut links = Vec::with_capacity(content.len() / 500);
981 let mut broken_links = Vec::new();
982
983 let mut found_positions = HashSet::new();
985
986 let mut options = Options::empty();
996 options.insert(Options::ENABLE_WIKILINKS);
997
998 let parser = Parser::new_with_broken_link_callback(
999 content,
1000 options,
1001 Some(|link: BrokenLink<'_>| {
1002 broken_links.push(BrokenLinkInfo {
1003 reference: link.reference.to_string(),
1004 span: link.span.clone(),
1005 });
1006 None
1007 }),
1008 )
1009 .into_offset_iter();
1010
1011 let mut link_stack: Vec<(
1012 usize,
1013 usize,
1014 pulldown_cmark::CowStr<'a>,
1015 LinkType,
1016 pulldown_cmark::CowStr<'a>,
1017 )> = Vec::new();
1018 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1021 match event {
1022 Event::Start(Tag::Link {
1023 link_type,
1024 dest_url,
1025 id,
1026 ..
1027 }) => {
1028 link_stack.push((range.start, range.end, dest_url, link_type, id));
1030 text_chunks.clear();
1031 }
1032 Event::Text(text) if !link_stack.is_empty() => {
1033 text_chunks.push((text.to_string(), range.start, range.end));
1035 }
1036 Event::Code(code) if !link_stack.is_empty() => {
1037 let code_text = format!("`{code}`");
1039 text_chunks.push((code_text, range.start, range.end));
1040 }
1041 Event::End(TagEnd::Link) => {
1042 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1043 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1045 text_chunks.clear();
1046 continue;
1047 }
1048
1049 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1051
1052 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1054 text_chunks.clear();
1055 continue;
1056 }
1057
1058 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1059
1060 let is_reference = matches!(
1061 link_type,
1062 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1063 );
1064
1065 let link_text = if start_pos < content.len() {
1068 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1069
1070 let mut close_pos = None;
1074 let mut depth = 0;
1075 let mut in_code_span = false;
1076
1077 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1078 let mut backslash_count = 0;
1080 let mut j = i;
1081 while j > 0 && link_bytes[j - 1] == b'\\' {
1082 backslash_count += 1;
1083 j -= 1;
1084 }
1085 let is_escaped = backslash_count % 2 != 0;
1086
1087 if byte == b'`' && !is_escaped {
1089 in_code_span = !in_code_span;
1090 }
1091
1092 if !is_escaped && !in_code_span {
1094 if byte == b'[' {
1095 depth += 1;
1096 } else if byte == b']' {
1097 if depth == 0 {
1098 close_pos = Some(i);
1100 break;
1101 } else {
1102 depth -= 1;
1103 }
1104 }
1105 }
1106 }
1107
1108 if let Some(pos) = close_pos {
1109 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1110 } else {
1111 Cow::Borrowed("")
1112 }
1113 } else {
1114 Cow::Borrowed("")
1115 };
1116
1117 let reference_id = if is_reference && !ref_id.is_empty() {
1119 Some(Cow::Owned(ref_id.to_lowercase()))
1120 } else if is_reference {
1121 Some(Cow::Owned(link_text.to_lowercase()))
1123 } else {
1124 None
1125 };
1126
1127 let has_escaped_bang = start_pos >= 2
1131 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1132 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1133
1134 let has_escaped_bracket =
1137 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1138
1139 if has_escaped_bang || has_escaped_bracket {
1140 text_chunks.clear();
1141 continue; }
1143
1144 found_positions.insert(start_pos);
1146
1147 links.push(ParsedLink {
1148 line: line_num,
1149 start_col: col_start,
1150 end_col: col_end,
1151 byte_offset: start_pos,
1152 byte_end: range.end,
1153 text: link_text,
1154 url: Cow::Owned(url.to_string()),
1155 is_reference,
1156 reference_id,
1157 link_type,
1158 });
1159
1160 text_chunks.clear();
1161 }
1162 }
1163 _ => {}
1164 }
1165 }
1166
1167 for cap in LINK_PATTERN.captures_iter(content) {
1171 let full_match = cap.get(0).unwrap();
1172 let match_start = full_match.start();
1173 let match_end = full_match.end();
1174
1175 if found_positions.contains(&match_start) {
1177 continue;
1178 }
1179
1180 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1182 continue;
1183 }
1184
1185 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1187 continue;
1188 }
1189
1190 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1192 continue;
1193 }
1194
1195 if Self::is_offset_in_code_span(code_spans, match_start) {
1197 continue;
1198 }
1199
1200 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1202 continue;
1203 }
1204
1205 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1207
1208 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1210 continue;
1211 }
1212
1213 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1214
1215 let text = cap.get(1).map_or("", |m| m.as_str());
1216
1217 if let Some(ref_id) = cap.get(6) {
1219 let ref_id_str = ref_id.as_str();
1220 let normalized_ref = if ref_id_str.is_empty() {
1221 Cow::Owned(text.to_lowercase()) } else {
1223 Cow::Owned(ref_id_str.to_lowercase())
1224 };
1225
1226 links.push(ParsedLink {
1228 line: line_num,
1229 start_col: col_start,
1230 end_col: col_end,
1231 byte_offset: match_start,
1232 byte_end: match_end,
1233 text: Cow::Borrowed(text),
1234 url: Cow::Borrowed(""), is_reference: true,
1236 reference_id: Some(normalized_ref),
1237 link_type: LinkType::Reference, });
1239 }
1240 }
1241
1242 (links, broken_links)
1243 }
1244
1245 fn parse_images(
1247 content: &'a str,
1248 lines: &[LineInfo],
1249 code_blocks: &[(usize, usize)],
1250 code_spans: &[CodeSpan],
1251 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1252 ) -> Vec<ParsedImage<'a>> {
1253 use crate::utils::skip_context::is_in_html_comment_ranges;
1254 use std::collections::HashSet;
1255
1256 let mut images = Vec::with_capacity(content.len() / 1000);
1258 let mut found_positions = HashSet::new();
1259
1260 let parser = Parser::new(content).into_offset_iter();
1262 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1263 Vec::new();
1264 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1267 match event {
1268 Event::Start(Tag::Image {
1269 link_type,
1270 dest_url,
1271 id,
1272 ..
1273 }) => {
1274 image_stack.push((range.start, dest_url, link_type, id));
1275 text_chunks.clear();
1276 }
1277 Event::Text(text) if !image_stack.is_empty() => {
1278 text_chunks.push((text.to_string(), range.start, range.end));
1279 }
1280 Event::Code(code) if !image_stack.is_empty() => {
1281 let code_text = format!("`{code}`");
1282 text_chunks.push((code_text, range.start, range.end));
1283 }
1284 Event::End(TagEnd::Image) => {
1285 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1286 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1288 continue;
1289 }
1290
1291 if Self::is_offset_in_code_span(code_spans, start_pos) {
1293 continue;
1294 }
1295
1296 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1298 continue;
1299 }
1300
1301 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1304
1305 let is_reference = matches!(
1306 link_type,
1307 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1308 );
1309
1310 let alt_text = if start_pos < content.len() {
1313 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1314
1315 let mut close_pos = None;
1318 let mut depth = 0;
1319
1320 if image_bytes.len() > 2 {
1321 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1322 let mut backslash_count = 0;
1324 let mut j = i;
1325 while j > 0 && image_bytes[j - 1] == b'\\' {
1326 backslash_count += 1;
1327 j -= 1;
1328 }
1329 let is_escaped = backslash_count % 2 != 0;
1330
1331 if !is_escaped {
1332 if byte == b'[' {
1333 depth += 1;
1334 } else if byte == b']' {
1335 if depth == 0 {
1336 close_pos = Some(i);
1338 break;
1339 } else {
1340 depth -= 1;
1341 }
1342 }
1343 }
1344 }
1345 }
1346
1347 if let Some(pos) = close_pos {
1348 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1349 } else {
1350 Cow::Borrowed("")
1351 }
1352 } else {
1353 Cow::Borrowed("")
1354 };
1355
1356 let reference_id = if is_reference && !ref_id.is_empty() {
1357 Some(Cow::Owned(ref_id.to_lowercase()))
1358 } else if is_reference {
1359 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1361 None
1362 };
1363
1364 found_positions.insert(start_pos);
1365 images.push(ParsedImage {
1366 line: line_num,
1367 start_col: col_start,
1368 end_col: col_end,
1369 byte_offset: start_pos,
1370 byte_end: range.end,
1371 alt_text,
1372 url: Cow::Owned(url.to_string()),
1373 is_reference,
1374 reference_id,
1375 link_type,
1376 });
1377 }
1378 }
1379 _ => {}
1380 }
1381 }
1382
1383 for cap in IMAGE_PATTERN.captures_iter(content) {
1385 let full_match = cap.get(0).unwrap();
1386 let match_start = full_match.start();
1387 let match_end = full_match.end();
1388
1389 if found_positions.contains(&match_start) {
1391 continue;
1392 }
1393
1394 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1396 continue;
1397 }
1398
1399 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1401 || Self::is_offset_in_code_span(code_spans, match_start)
1402 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1403 {
1404 continue;
1405 }
1406
1407 if let Some(ref_id) = cap.get(6) {
1409 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1410 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1411 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1412 let ref_id_str = ref_id.as_str();
1413 let normalized_ref = if ref_id_str.is_empty() {
1414 Cow::Owned(alt_text.to_lowercase())
1415 } else {
1416 Cow::Owned(ref_id_str.to_lowercase())
1417 };
1418
1419 images.push(ParsedImage {
1420 line: line_num,
1421 start_col: col_start,
1422 end_col: col_end,
1423 byte_offset: match_start,
1424 byte_end: match_end,
1425 alt_text: Cow::Borrowed(alt_text),
1426 url: Cow::Borrowed(""),
1427 is_reference: true,
1428 reference_id: Some(normalized_ref),
1429 link_type: LinkType::Reference, });
1431 }
1432 }
1433
1434 images
1435 }
1436
1437 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1439 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1443 if line_info.in_code_block {
1445 continue;
1446 }
1447
1448 let line = line_info.content(content);
1449 let line_num = line_idx + 1;
1450
1451 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1452 let id = cap.get(1).unwrap().as_str().to_lowercase();
1453 let url = cap.get(2).unwrap().as_str().to_string();
1454 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1455
1456 let match_obj = cap.get(0).unwrap();
1459 let byte_offset = line_info.byte_offset + match_obj.start();
1460 let byte_end = line_info.byte_offset + match_obj.end();
1461
1462 refs.push(ReferenceDef {
1463 line: line_num,
1464 id,
1465 url,
1466 title,
1467 byte_offset,
1468 byte_end,
1469 });
1470 }
1471 }
1472
1473 refs
1474 }
1475
1476 #[inline]
1480 fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1481 let trimmed_start = line.trim_start();
1482 if !trimmed_start.starts_with('>') {
1483 return None;
1484 }
1485
1486 let leading_ws_len = line.len() - trimmed_start.len();
1487 let after_gt = &trimmed_start[1..];
1488 let content = after_gt.trim_start();
1489 let ws_after_gt_len = after_gt.len() - content.len();
1490 let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1491
1492 Some((&line[..prefix_len], content))
1493 }
1494
1495 #[inline]
1499 fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1500 let bytes = line.as_bytes();
1501 let mut i = 0;
1502
1503 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1505 i += 1;
1506 }
1507
1508 if i >= bytes.len() {
1510 return None;
1511 }
1512 let marker = bytes[i] as char;
1513 if marker != '-' && marker != '*' && marker != '+' {
1514 return None;
1515 }
1516 let marker_pos = i;
1517 i += 1;
1518
1519 let spacing_start = i;
1521 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1522 i += 1;
1523 }
1524
1525 Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1526 }
1527
1528 #[inline]
1532 fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1533 let bytes = line.as_bytes();
1534 let mut i = 0;
1535
1536 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1538 i += 1;
1539 }
1540
1541 let number_start = i;
1543 while i < bytes.len() && bytes[i].is_ascii_digit() {
1544 i += 1;
1545 }
1546 if i == number_start {
1547 return None; }
1549
1550 if i >= bytes.len() {
1552 return None;
1553 }
1554 let delimiter = bytes[i] as char;
1555 if delimiter != '.' && delimiter != ')' {
1556 return None;
1557 }
1558 let delimiter_pos = i;
1559 i += 1;
1560
1561 let spacing_start = i;
1563 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1564 i += 1;
1565 }
1566
1567 Some((
1568 &line[..number_start],
1569 &line[number_start..delimiter_pos],
1570 delimiter,
1571 &line[spacing_start..i],
1572 &line[i..],
1573 ))
1574 }
1575
1576 fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1579 let num_lines = line_offsets.len();
1580 let mut in_code_block = vec![false; num_lines];
1581
1582 for &(start, end) in code_blocks {
1584 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1586 let mut boundary = start;
1587 while boundary > 0 && !content.is_char_boundary(boundary) {
1588 boundary -= 1;
1589 }
1590 boundary
1591 } else {
1592 start
1593 };
1594
1595 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1596 let mut boundary = end;
1597 while boundary < content.len() && !content.is_char_boundary(boundary) {
1598 boundary += 1;
1599 }
1600 boundary
1601 } else {
1602 end.min(content.len())
1603 };
1604
1605 let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1620 let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1621
1622 for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1624 *flag = true;
1625 }
1626 }
1627
1628 in_code_block
1629 }
1630
1631 fn compute_basic_line_info(
1633 content: &str,
1634 line_offsets: &[usize],
1635 code_blocks: &[(usize, usize)],
1636 flavor: MarkdownFlavor,
1637 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1638 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1639 ) -> Vec<LineInfo> {
1640 let content_lines: Vec<&str> = content.lines().collect();
1641 let mut lines = Vec::with_capacity(content_lines.len());
1642
1643 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1645
1646 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1649
1650 for (i, line) in content_lines.iter().enumerate() {
1651 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1652 let indent = line.len() - line.trim_start().len();
1653
1654 let blockquote_parse = Self::parse_blockquote_prefix(line);
1656
1657 let is_blank = if let Some((_, content)) = blockquote_parse {
1659 content.trim().is_empty()
1661 } else {
1662 line.trim().is_empty()
1663 };
1664
1665 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1667
1668 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1670 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1671 let in_html_comment =
1673 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1674 let list_item = if !(in_code_block
1675 || is_blank
1676 || in_mkdocstrings
1677 || in_html_comment
1678 || (front_matter_end > 0 && i < front_matter_end))
1679 {
1680 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1682 (content, prefix.len())
1683 } else {
1684 (&**line, 0)
1685 };
1686
1687 if let Some((leading_spaces, marker, spacing, _content)) =
1688 Self::parse_unordered_list(line_for_list_check)
1689 {
1690 let marker_column = blockquote_prefix_len + leading_spaces.len();
1691 let content_column = marker_column + 1 + spacing.len();
1692
1693 if spacing.is_empty() {
1700 None
1701 } else {
1702 Some(ListItemInfo {
1703 marker: marker.to_string(),
1704 is_ordered: false,
1705 number: None,
1706 marker_column,
1707 content_column,
1708 })
1709 }
1710 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1711 Self::parse_ordered_list(line_for_list_check)
1712 {
1713 let marker = format!("{number_str}{delimiter}");
1714 let marker_column = blockquote_prefix_len + leading_spaces.len();
1715 let content_column = marker_column + marker.len() + spacing.len();
1716
1717 if spacing.is_empty() {
1720 None
1721 } else {
1722 Some(ListItemInfo {
1723 marker,
1724 is_ordered: true,
1725 number: number_str.parse().ok(),
1726 marker_column,
1727 content_column,
1728 })
1729 }
1730 } else {
1731 None
1732 }
1733 } else {
1734 None
1735 };
1736
1737 lines.push(LineInfo {
1738 byte_offset,
1739 byte_len: line.len(),
1740 indent,
1741 is_blank,
1742 in_code_block,
1743 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1744 in_html_block: false, in_html_comment,
1746 list_item,
1747 heading: None, blockquote: None, in_mkdocstrings,
1750 in_esm_block: false, });
1752 }
1753
1754 lines
1755 }
1756
1757 fn detect_headings_and_blockquotes(
1759 content: &str,
1760 lines: &mut [LineInfo],
1761 flavor: MarkdownFlavor,
1762 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1763 ) {
1764 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1766 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1767 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1768 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1769
1770 let content_lines: Vec<&str> = content.lines().collect();
1771
1772 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1774
1775 for i in 0..lines.len() {
1777 if lines[i].in_code_block {
1778 continue;
1779 }
1780
1781 if front_matter_end > 0 && i < front_matter_end {
1783 continue;
1784 }
1785
1786 if lines[i].in_html_block {
1788 continue;
1789 }
1790
1791 let line = content_lines[i];
1792
1793 if let Some(bq) = parse_blockquote_detailed(line) {
1795 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1797
1798 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1800
1801 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1803 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1805
1806 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1810
1811 lines[i].blockquote = Some(BlockquoteInfo {
1812 nesting_level,
1813 indent: bq.indent.to_string(),
1814 marker_column,
1815 prefix,
1816 content: bq.content.to_string(),
1817 has_no_space_after_marker: has_no_space,
1818 has_multiple_spaces_after_marker: has_multiple_spaces,
1819 needs_md028_fix,
1820 });
1821 }
1822
1823 if lines[i].is_blank {
1825 continue;
1826 }
1827
1828 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1831 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1832 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1833 } else {
1834 false
1835 };
1836
1837 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1838 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1840 continue;
1841 }
1842 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1843 let hashes = caps.get(2).map_or("", |m| m.as_str());
1844 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1845 let rest = caps.get(4).map_or("", |m| m.as_str());
1846
1847 let level = hashes.len() as u8;
1848 let marker_column = leading_spaces.len();
1849
1850 let (text, has_closing, closing_seq) = {
1852 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1854 if rest[id_start..].trim_end().ends_with('}') {
1856 (&rest[..id_start], &rest[id_start..])
1858 } else {
1859 (rest, "")
1860 }
1861 } else {
1862 (rest, "")
1863 };
1864
1865 let trimmed_rest = rest_without_id.trim_end();
1867 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1868 let mut start_of_hashes = last_hash_pos;
1870 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1871 start_of_hashes -= 1;
1872 }
1873
1874 let has_space_before = start_of_hashes == 0
1876 || trimmed_rest
1877 .chars()
1878 .nth(start_of_hashes - 1)
1879 .is_some_and(|c| c.is_whitespace());
1880
1881 let potential_closing = &trimmed_rest[start_of_hashes..];
1883 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1884
1885 if is_all_hashes && has_space_before {
1886 let closing_hashes = potential_closing.to_string();
1888 let text_part = if !custom_id_part.is_empty() {
1891 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1894 } else {
1895 rest_without_id[..start_of_hashes].trim_end().to_string()
1896 };
1897 (text_part, true, closing_hashes)
1898 } else {
1899 (rest.to_string(), false, String::new())
1901 }
1902 } else {
1903 (rest.to_string(), false, String::new())
1905 }
1906 };
1907
1908 let content_column = marker_column + hashes.len() + spaces_after.len();
1909
1910 let raw_text = text.trim().to_string();
1912 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1913
1914 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1916 let next_line = content_lines[i + 1];
1917 if !lines[i + 1].in_code_block
1918 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1919 && let Some(next_line_id) =
1920 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1921 {
1922 custom_id = Some(next_line_id);
1923 }
1924 }
1925
1926 lines[i].heading = Some(HeadingInfo {
1927 level,
1928 style: HeadingStyle::ATX,
1929 marker: hashes.to_string(),
1930 marker_column,
1931 content_column,
1932 text: clean_text,
1933 custom_id,
1934 raw_text,
1935 has_closing_sequence: has_closing,
1936 closing_sequence: closing_seq,
1937 });
1938 }
1939 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1941 let next_line = content_lines[i + 1];
1942 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1943 if front_matter_end > 0 && i < front_matter_end {
1945 continue;
1946 }
1947
1948 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1950 {
1951 continue;
1952 }
1953
1954 let underline = next_line.trim();
1955
1956 if underline == "---" {
1959 continue;
1960 }
1961
1962 let current_line_trimmed = line.trim();
1964 if current_line_trimmed.contains(':')
1965 && !current_line_trimmed.starts_with('#')
1966 && !current_line_trimmed.contains('[')
1967 && !current_line_trimmed.contains("](")
1968 {
1969 continue;
1971 }
1972
1973 let level = if underline.starts_with('=') { 1 } else { 2 };
1974 let style = if level == 1 {
1975 HeadingStyle::Setext1
1976 } else {
1977 HeadingStyle::Setext2
1978 };
1979
1980 let raw_text = line.trim().to_string();
1982 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1983
1984 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1986 let attr_line = content_lines[i + 2];
1987 if !lines[i + 2].in_code_block
1988 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1989 && let Some(attr_line_id) =
1990 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1991 {
1992 custom_id = Some(attr_line_id);
1993 }
1994 }
1995
1996 lines[i].heading = Some(HeadingInfo {
1997 level,
1998 style,
1999 marker: underline.to_string(),
2000 marker_column: next_line.len() - next_line.trim_start().len(),
2001 content_column: lines[i].indent,
2002 text: clean_text,
2003 custom_id,
2004 raw_text,
2005 has_closing_sequence: false,
2006 closing_sequence: String::new(),
2007 });
2008 }
2009 }
2010 }
2011 }
2012
2013 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2015 const BLOCK_ELEMENTS: &[&str] = &[
2017 "address",
2018 "article",
2019 "aside",
2020 "blockquote",
2021 "details",
2022 "dialog",
2023 "dd",
2024 "div",
2025 "dl",
2026 "dt",
2027 "fieldset",
2028 "figcaption",
2029 "figure",
2030 "footer",
2031 "form",
2032 "h1",
2033 "h2",
2034 "h3",
2035 "h4",
2036 "h5",
2037 "h6",
2038 "header",
2039 "hr",
2040 "li",
2041 "main",
2042 "nav",
2043 "ol",
2044 "p",
2045 "picture",
2046 "pre",
2047 "script",
2048 "section",
2049 "style",
2050 "table",
2051 "tbody",
2052 "td",
2053 "textarea",
2054 "tfoot",
2055 "th",
2056 "thead",
2057 "tr",
2058 "ul",
2059 ];
2060
2061 let mut i = 0;
2062 while i < lines.len() {
2063 if lines[i].in_code_block || lines[i].in_front_matter {
2065 i += 1;
2066 continue;
2067 }
2068
2069 let trimmed = lines[i].content(content).trim_start();
2070
2071 if trimmed.starts_with('<') && trimmed.len() > 1 {
2073 let after_bracket = &trimmed[1..];
2075 let is_closing = after_bracket.starts_with('/');
2076 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2077
2078 let tag_name = tag_start
2080 .chars()
2081 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2082 .collect::<String>()
2083 .to_lowercase();
2084
2085 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2087 lines[i].in_html_block = true;
2089
2090 if !is_closing {
2093 let closing_tag = format!("</{tag_name}>");
2094 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2096 let mut j = i + 1;
2097 while j < lines.len() && j < i + 100 {
2098 if !allow_blank_lines && lines[j].is_blank {
2101 break;
2102 }
2103
2104 lines[j].in_html_block = true;
2105
2106 if lines[j].content(content).contains(&closing_tag) {
2108 break;
2109 }
2110 j += 1;
2111 }
2112 }
2113 }
2114 }
2115
2116 i += 1;
2117 }
2118 }
2119
2120 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2123 if !flavor.supports_esm_blocks() {
2125 return;
2126 }
2127
2128 for line in lines.iter_mut() {
2129 if line.is_blank || line.in_html_comment {
2131 continue;
2132 }
2133
2134 let trimmed = line.content(content).trim_start();
2136 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2137 line.in_esm_block = true;
2138 } else {
2139 break;
2141 }
2142 }
2143 }
2144
2145 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2147 let mut code_spans = Vec::new();
2148
2149 if !content.contains('`') {
2151 return code_spans;
2152 }
2153
2154 let parser = Parser::new(content).into_offset_iter();
2156
2157 for (event, range) in parser {
2158 if let Event::Code(_) = event {
2159 let start_pos = range.start;
2160 let end_pos = range.end;
2161
2162 let full_span = &content[start_pos..end_pos];
2164 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2165
2166 let content_start = start_pos + backtick_count;
2168 let content_end = end_pos - backtick_count;
2169 let span_content = if content_start < content_end {
2170 content[content_start..content_end].to_string()
2171 } else {
2172 String::new()
2173 };
2174
2175 let line_idx = lines
2178 .partition_point(|line| line.byte_offset <= start_pos)
2179 .saturating_sub(1);
2180 let line_num = line_idx + 1;
2181 let col_start = start_pos - lines[line_idx].byte_offset;
2182
2183 let end_line_idx = lines
2185 .partition_point(|line| line.byte_offset <= end_pos)
2186 .saturating_sub(1);
2187 let col_end = end_pos - lines[end_line_idx].byte_offset;
2188
2189 code_spans.push(CodeSpan {
2190 line: line_num,
2191 start_col: col_start,
2192 end_col: col_end,
2193 byte_offset: start_pos,
2194 byte_end: end_pos,
2195 backtick_count,
2196 content: span_content,
2197 });
2198 }
2199 }
2200
2201 code_spans.sort_by_key(|span| span.byte_offset);
2203
2204 code_spans
2205 }
2206
2207 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2218 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2220
2221 #[inline]
2224 fn reset_tracking_state(
2225 list_item: &ListItemInfo,
2226 has_list_breaking_content: &mut bool,
2227 min_continuation: &mut usize,
2228 ) {
2229 *has_list_breaking_content = false;
2230 let marker_width = if list_item.is_ordered {
2231 list_item.marker.len() + 1 } else {
2233 list_item.marker.len()
2234 };
2235 *min_continuation = if list_item.is_ordered {
2236 marker_width
2237 } else {
2238 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2239 };
2240 }
2241
2242 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2245 let mut last_list_item_line = 0;
2246 let mut current_indent_level = 0;
2247 let mut last_marker_width = 0;
2248
2249 let mut has_list_breaking_content_since_last_item = false;
2251 let mut min_continuation_for_tracking = 0;
2252
2253 for (line_idx, line_info) in lines.iter().enumerate() {
2254 let line_num = line_idx + 1;
2255
2256 if line_info.in_code_block {
2258 if let Some(ref mut block) = current_block {
2259 let min_continuation_indent =
2261 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2262
2263 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2265
2266 match context {
2267 CodeBlockContext::Indented => {
2268 block.end_line = line_num;
2270 continue;
2271 }
2272 CodeBlockContext::Standalone => {
2273 let completed_block = current_block.take().unwrap();
2275 list_blocks.push(completed_block);
2276 continue;
2277 }
2278 CodeBlockContext::Adjacent => {
2279 block.end_line = line_num;
2281 continue;
2282 }
2283 }
2284 } else {
2285 continue;
2287 }
2288 }
2289
2290 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2292 caps.get(0).unwrap().as_str().to_string()
2293 } else {
2294 String::new()
2295 };
2296
2297 if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2299 let line_content = line_info.content(content).trim();
2300
2301 let breaks_list = line_info.heading.is_some()
2303 || line_content.starts_with("---")
2304 || line_content.starts_with("***")
2305 || line_content.starts_with("___")
2306 || (line_content.contains('|')
2307 && !line_content.contains("](")
2308 && !line_content.contains("http")
2309 && (line_content.matches('|').count() > 1
2310 || line_content.starts_with('|')
2311 || line_content.ends_with('|')))
2312 || line_content.starts_with(">")
2313 || (line_info.indent < min_continuation_for_tracking);
2314
2315 if breaks_list {
2316 has_list_breaking_content_since_last_item = true;
2317 }
2318 }
2319
2320 if let Some(list_item) = &line_info.list_item {
2322 let item_indent = list_item.marker_column;
2324 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2327 let is_nested = nesting > block.nesting_level;
2331 let same_type =
2332 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2333 let same_context = block.blockquote_prefix == blockquote_prefix;
2334 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2338 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2339
2340 let has_non_list_content = has_list_breaking_content_since_last_item;
2343
2344 let mut continues_list = if is_nested {
2348 same_context && reasonable_distance && !has_non_list_content
2350 } else {
2351 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2353 };
2354
2355 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2358 if block.item_lines.contains(&(line_num - 1)) {
2360 continues_list = true;
2362 }
2363 }
2364
2365 if continues_list {
2366 block.end_line = line_num;
2368 block.item_lines.push(line_num);
2369
2370 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2372 list_item.marker.len() + 1
2373 } else {
2374 list_item.marker.len()
2375 });
2376
2377 if !block.is_ordered
2379 && block.marker.is_some()
2380 && block.marker.as_ref() != Some(&list_item.marker)
2381 {
2382 block.marker = None;
2384 }
2385
2386 reset_tracking_state(
2388 list_item,
2389 &mut has_list_breaking_content_since_last_item,
2390 &mut min_continuation_for_tracking,
2391 );
2392 } else {
2393 list_blocks.push(block.clone());
2396
2397 *block = ListBlock {
2398 start_line: line_num,
2399 end_line: line_num,
2400 is_ordered: list_item.is_ordered,
2401 marker: if list_item.is_ordered {
2402 None
2403 } else {
2404 Some(list_item.marker.clone())
2405 },
2406 blockquote_prefix: blockquote_prefix.clone(),
2407 item_lines: vec![line_num],
2408 nesting_level: nesting,
2409 max_marker_width: if list_item.is_ordered {
2410 list_item.marker.len() + 1
2411 } else {
2412 list_item.marker.len()
2413 },
2414 };
2415
2416 reset_tracking_state(
2418 list_item,
2419 &mut has_list_breaking_content_since_last_item,
2420 &mut min_continuation_for_tracking,
2421 );
2422 }
2423 } else {
2424 current_block = Some(ListBlock {
2426 start_line: line_num,
2427 end_line: line_num,
2428 is_ordered: list_item.is_ordered,
2429 marker: if list_item.is_ordered {
2430 None
2431 } else {
2432 Some(list_item.marker.clone())
2433 },
2434 blockquote_prefix,
2435 item_lines: vec![line_num],
2436 nesting_level: nesting,
2437 max_marker_width: list_item.marker.len(),
2438 });
2439
2440 reset_tracking_state(
2442 list_item,
2443 &mut has_list_breaking_content_since_last_item,
2444 &mut min_continuation_for_tracking,
2445 );
2446 }
2447
2448 last_list_item_line = line_num;
2449 current_indent_level = item_indent;
2450 last_marker_width = if list_item.is_ordered {
2451 list_item.marker.len() + 1 } else {
2453 list_item.marker.len()
2454 };
2455 } else if let Some(ref mut block) = current_block {
2456 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2466 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2467 } else {
2468 false
2469 };
2470
2471 let min_continuation_indent = if block.is_ordered {
2475 current_indent_level + last_marker_width
2476 } else {
2477 current_indent_level + 2 };
2479
2480 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2481 block.end_line = line_num;
2483 } else if line_info.is_blank {
2484 let mut check_idx = line_idx + 1;
2487 let mut found_continuation = false;
2488
2489 while check_idx < lines.len() && lines[check_idx].is_blank {
2491 check_idx += 1;
2492 }
2493
2494 if check_idx < lines.len() {
2495 let next_line = &lines[check_idx];
2496 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2498 found_continuation = true;
2499 }
2500 else if !next_line.in_code_block
2502 && next_line.list_item.is_some()
2503 && let Some(item) = &next_line.list_item
2504 {
2505 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2506 .find(next_line.content(content))
2507 .map_or(String::new(), |m| m.as_str().to_string());
2508 if item.marker_column == current_indent_level
2509 && item.is_ordered == block.is_ordered
2510 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2511 {
2512 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2515 if let Some(between_line) = lines.get(idx) {
2516 let between_content = between_line.content(content);
2517 let trimmed = between_content.trim();
2518 if trimmed.is_empty() {
2520 return false;
2521 }
2522 let line_indent = between_content.len() - between_content.trim_start().len();
2524
2525 if trimmed.starts_with("```")
2527 || trimmed.starts_with("~~~")
2528 || trimmed.starts_with("---")
2529 || trimmed.starts_with("***")
2530 || trimmed.starts_with("___")
2531 || trimmed.starts_with(">")
2532 || trimmed.contains('|') || between_line.heading.is_some()
2534 {
2535 return true; }
2537
2538 line_indent >= min_continuation_indent
2540 } else {
2541 false
2542 }
2543 });
2544
2545 if block.is_ordered {
2546 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2549 if let Some(between_line) = lines.get(idx) {
2550 let trimmed = between_line.content(content).trim();
2551 if trimmed.is_empty() {
2552 return false;
2553 }
2554 trimmed.starts_with("```")
2556 || trimmed.starts_with("~~~")
2557 || trimmed.starts_with("---")
2558 || trimmed.starts_with("***")
2559 || trimmed.starts_with("___")
2560 || trimmed.starts_with(">")
2561 || trimmed.contains('|') || between_line.heading.is_some()
2563 } else {
2564 false
2565 }
2566 });
2567 found_continuation = !has_structural_separators;
2568 } else {
2569 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2571 if let Some(between_line) = lines.get(idx) {
2572 let trimmed = between_line.content(content).trim();
2573 if trimmed.is_empty() {
2574 return false;
2575 }
2576 trimmed.starts_with("```")
2578 || trimmed.starts_with("~~~")
2579 || trimmed.starts_with("---")
2580 || trimmed.starts_with("***")
2581 || trimmed.starts_with("___")
2582 || trimmed.starts_with(">")
2583 || trimmed.contains('|') || between_line.heading.is_some()
2585 } else {
2586 false
2587 }
2588 });
2589 found_continuation = !has_structural_separators;
2590 }
2591 }
2592 }
2593 }
2594
2595 if found_continuation {
2596 block.end_line = line_num;
2598 } else {
2599 list_blocks.push(block.clone());
2601 current_block = None;
2602 }
2603 } else {
2604 let min_required_indent = if block.is_ordered {
2607 current_indent_level + last_marker_width
2608 } else {
2609 current_indent_level + 2
2610 };
2611
2612 let line_content = line_info.content(content).trim();
2617 let is_structural_separator = line_info.heading.is_some()
2618 || line_content.starts_with("```")
2619 || line_content.starts_with("~~~")
2620 || line_content.starts_with("---")
2621 || line_content.starts_with("***")
2622 || line_content.starts_with("___")
2623 || line_content.starts_with(">")
2624 || (line_content.contains('|')
2625 && !line_content.contains("](")
2626 && !line_content.contains("http")
2627 && (line_content.matches('|').count() > 1
2628 || line_content.starts_with('|')
2629 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2634 && !line_info.is_blank
2635 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2636
2637 if is_lazy_continuation {
2638 let content_to_check = if !blockquote_prefix.is_empty() {
2641 line_info
2643 .content(content)
2644 .strip_prefix(&blockquote_prefix)
2645 .unwrap_or(line_info.content(content))
2646 .trim()
2647 } else {
2648 line_info.content(content).trim()
2649 };
2650
2651 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2652
2653 if starts_with_uppercase && last_list_item_line > 0 {
2656 list_blocks.push(block.clone());
2658 current_block = None;
2659 } else {
2660 block.end_line = line_num;
2662 }
2663 } else {
2664 list_blocks.push(block.clone());
2666 current_block = None;
2667 }
2668 }
2669 }
2670 }
2671
2672 if let Some(block) = current_block {
2674 list_blocks.push(block);
2675 }
2676
2677 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2679
2680 list_blocks
2681 }
2682
2683 fn compute_char_frequency(content: &str) -> CharFrequency {
2685 let mut frequency = CharFrequency::default();
2686
2687 for ch in content.chars() {
2688 match ch {
2689 '#' => frequency.hash_count += 1,
2690 '*' => frequency.asterisk_count += 1,
2691 '_' => frequency.underscore_count += 1,
2692 '-' => frequency.hyphen_count += 1,
2693 '+' => frequency.plus_count += 1,
2694 '>' => frequency.gt_count += 1,
2695 '|' => frequency.pipe_count += 1,
2696 '[' => frequency.bracket_count += 1,
2697 '`' => frequency.backtick_count += 1,
2698 '<' => frequency.lt_count += 1,
2699 '!' => frequency.exclamation_count += 1,
2700 '\n' => frequency.newline_count += 1,
2701 _ => {}
2702 }
2703 }
2704
2705 frequency
2706 }
2707
2708 fn parse_html_tags(
2710 content: &str,
2711 lines: &[LineInfo],
2712 code_blocks: &[(usize, usize)],
2713 flavor: MarkdownFlavor,
2714 ) -> Vec<HtmlTag> {
2715 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2716 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2717
2718 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2719
2720 for cap in HTML_TAG_REGEX.captures_iter(content) {
2721 let full_match = cap.get(0).unwrap();
2722 let match_start = full_match.start();
2723 let match_end = full_match.end();
2724
2725 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2727 continue;
2728 }
2729
2730 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2731 let tag_name_original = cap.get(2).unwrap().as_str();
2732 let tag_name = tag_name_original.to_lowercase();
2733 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2734
2735 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2738 continue;
2739 }
2740
2741 let mut line_num = 1;
2743 let mut col_start = match_start;
2744 let mut col_end = match_end;
2745 for (idx, line_info) in lines.iter().enumerate() {
2746 if match_start >= line_info.byte_offset {
2747 line_num = idx + 1;
2748 col_start = match_start - line_info.byte_offset;
2749 col_end = match_end - line_info.byte_offset;
2750 } else {
2751 break;
2752 }
2753 }
2754
2755 html_tags.push(HtmlTag {
2756 line: line_num,
2757 start_col: col_start,
2758 end_col: col_end,
2759 byte_offset: match_start,
2760 byte_end: match_end,
2761 tag_name,
2762 is_closing,
2763 is_self_closing,
2764 raw_content: full_match.as_str().to_string(),
2765 });
2766 }
2767
2768 html_tags
2769 }
2770
2771 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2773 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2774 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2775
2776 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2777
2778 for cap in EMPHASIS_REGEX.captures_iter(content) {
2779 let full_match = cap.get(0).unwrap();
2780 let match_start = full_match.start();
2781 let match_end = full_match.end();
2782
2783 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2785 continue;
2786 }
2787
2788 let opening_markers = cap.get(1).unwrap().as_str();
2789 let content_part = cap.get(2).unwrap().as_str();
2790 let closing_markers = cap.get(3).unwrap().as_str();
2791
2792 if opening_markers.chars().next() != closing_markers.chars().next()
2794 || opening_markers.len() != closing_markers.len()
2795 {
2796 continue;
2797 }
2798
2799 let marker = opening_markers.chars().next().unwrap();
2800 let marker_count = opening_markers.len();
2801
2802 let mut line_num = 1;
2804 let mut col_start = match_start;
2805 let mut col_end = match_end;
2806 for (idx, line_info) in lines.iter().enumerate() {
2807 if match_start >= line_info.byte_offset {
2808 line_num = idx + 1;
2809 col_start = match_start - line_info.byte_offset;
2810 col_end = match_end - line_info.byte_offset;
2811 } else {
2812 break;
2813 }
2814 }
2815
2816 emphasis_spans.push(EmphasisSpan {
2817 line: line_num,
2818 start_col: col_start,
2819 end_col: col_end,
2820 byte_offset: match_start,
2821 byte_end: match_end,
2822 marker,
2823 marker_count,
2824 content: content_part.to_string(),
2825 });
2826 }
2827
2828 emphasis_spans
2829 }
2830
2831 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2833 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2834
2835 for (line_idx, line_info) in lines.iter().enumerate() {
2836 if line_info.in_code_block || line_info.is_blank {
2838 continue;
2839 }
2840
2841 let line = line_info.content(content);
2842 let line_num = line_idx + 1;
2843
2844 if !line.contains('|') {
2846 continue;
2847 }
2848
2849 let parts: Vec<&str> = line.split('|').collect();
2851 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2852
2853 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2855 let mut column_alignments = Vec::new();
2856
2857 if is_separator {
2858 for part in &parts[1..parts.len() - 1] {
2859 let trimmed = part.trim();
2861 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2862 "center".to_string()
2863 } else if trimmed.ends_with(':') {
2864 "right".to_string()
2865 } else if trimmed.starts_with(':') {
2866 "left".to_string()
2867 } else {
2868 "none".to_string()
2869 };
2870 column_alignments.push(alignment);
2871 }
2872 }
2873
2874 table_rows.push(TableRow {
2875 line: line_num,
2876 is_separator,
2877 column_count,
2878 column_alignments,
2879 });
2880 }
2881
2882 table_rows
2883 }
2884
2885 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2887 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2888
2889 for cap in BARE_URL_PATTERN.captures_iter(content) {
2891 let full_match = cap.get(0).unwrap();
2892 let match_start = full_match.start();
2893 let match_end = full_match.end();
2894
2895 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2897 continue;
2898 }
2899
2900 let preceding_char = if match_start > 0 {
2902 content.chars().nth(match_start - 1)
2903 } else {
2904 None
2905 };
2906 let following_char = content.chars().nth(match_end);
2907
2908 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2909 continue;
2910 }
2911 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2912 continue;
2913 }
2914
2915 let url = full_match.as_str();
2916 let url_type = if url.starts_with("https://") {
2917 "https"
2918 } else if url.starts_with("http://") {
2919 "http"
2920 } else if url.starts_with("ftp://") {
2921 "ftp"
2922 } else {
2923 "other"
2924 };
2925
2926 let mut line_num = 1;
2928 let mut col_start = match_start;
2929 let mut col_end = match_end;
2930 for (idx, line_info) in lines.iter().enumerate() {
2931 if match_start >= line_info.byte_offset {
2932 line_num = idx + 1;
2933 col_start = match_start - line_info.byte_offset;
2934 col_end = match_end - line_info.byte_offset;
2935 } else {
2936 break;
2937 }
2938 }
2939
2940 bare_urls.push(BareUrl {
2941 line: line_num,
2942 start_col: col_start,
2943 end_col: col_end,
2944 byte_offset: match_start,
2945 byte_end: match_end,
2946 url: url.to_string(),
2947 url_type: url_type.to_string(),
2948 });
2949 }
2950
2951 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2953 let full_match = cap.get(0).unwrap();
2954 let match_start = full_match.start();
2955 let match_end = full_match.end();
2956
2957 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2959 continue;
2960 }
2961
2962 let preceding_char = if match_start > 0 {
2964 content.chars().nth(match_start - 1)
2965 } else {
2966 None
2967 };
2968 let following_char = content.chars().nth(match_end);
2969
2970 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2971 continue;
2972 }
2973 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2974 continue;
2975 }
2976
2977 let email = full_match.as_str();
2978
2979 let mut line_num = 1;
2981 let mut col_start = match_start;
2982 let mut col_end = match_end;
2983 for (idx, line_info) in lines.iter().enumerate() {
2984 if match_start >= line_info.byte_offset {
2985 line_num = idx + 1;
2986 col_start = match_start - line_info.byte_offset;
2987 col_end = match_end - line_info.byte_offset;
2988 } else {
2989 break;
2990 }
2991 }
2992
2993 bare_urls.push(BareUrl {
2994 line: line_num,
2995 start_col: col_start,
2996 end_col: col_end,
2997 byte_offset: match_start,
2998 byte_end: match_end,
2999 url: email.to_string(),
3000 url_type: "email".to_string(),
3001 });
3002 }
3003
3004 bare_urls
3005 }
3006}
3007
3008fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3010 if list_blocks.len() < 2 {
3011 return;
3012 }
3013
3014 let mut merger = ListBlockMerger::new(content, lines);
3015 *list_blocks = merger.merge(list_blocks);
3016}
3017
/// Borrowed view of a document used to decide which neighbouring list blocks
/// should be merged into one.
struct ListBlockMerger<'a> {
    /// Full document text; passed to `LineInfo::content` to obtain line text.
    content: &'a str,
    /// Per-line metadata; looked up with a 1-based line number minus one.
    lines: &'a [LineInfo],
}
3023
3024impl<'a> ListBlockMerger<'a> {
3025 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3026 Self { content, lines }
3027 }
3028
3029 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3030 let mut merged = Vec::with_capacity(list_blocks.len());
3031 let mut current = list_blocks[0].clone();
3032
3033 for next in list_blocks.iter().skip(1) {
3034 if self.should_merge_blocks(¤t, next) {
3035 current = self.merge_two_blocks(current, next);
3036 } else {
3037 merged.push(current);
3038 current = next.clone();
3039 }
3040 }
3041
3042 merged.push(current);
3043 merged
3044 }
3045
3046 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3048 if !self.blocks_are_compatible(current, next) {
3050 return false;
3051 }
3052
3053 let spacing = self.analyze_spacing_between(current, next);
3055 match spacing {
3056 BlockSpacing::Consecutive => true,
3057 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3058 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3059 self.can_merge_with_content_between(current, next)
3060 }
3061 }
3062 }
3063
3064 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3066 current.is_ordered == next.is_ordered
3067 && current.blockquote_prefix == next.blockquote_prefix
3068 && current.nesting_level == next.nesting_level
3069 }
3070
3071 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3073 let gap = next.start_line - current.end_line;
3074
3075 match gap {
3076 1 => BlockSpacing::Consecutive,
3077 2 => BlockSpacing::SingleBlank,
3078 _ if gap > 2 => {
3079 if self.has_only_blank_lines_between(current, next) {
3080 BlockSpacing::MultipleBlanks
3081 } else {
3082 BlockSpacing::ContentBetween
3083 }
3084 }
3085 _ => BlockSpacing::Consecutive, }
3087 }
3088
3089 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3091 if has_meaningful_content_between(self.content, current, next, self.lines) {
3094 return false; }
3096
3097 !current.is_ordered && current.marker == next.marker
3099 }
3100
3101 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3103 if has_meaningful_content_between(self.content, current, next, self.lines) {
3105 return false; }
3107
3108 current.is_ordered && next.is_ordered
3110 }
3111
3112 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3114 for line_num in (current.end_line + 1)..next.start_line {
3115 if let Some(line_info) = self.lines.get(line_num - 1)
3116 && !line_info.content(self.content).trim().is_empty()
3117 {
3118 return false;
3119 }
3120 }
3121 true
3122 }
3123
3124 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3126 current.end_line = next.end_line;
3127 current.item_lines.extend_from_slice(&next.item_lines);
3128
3129 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3131
3132 if !current.is_ordered && self.markers_differ(¤t, next) {
3134 current.marker = None; }
3136
3137 current
3138 }
3139
3140 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3142 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3143 }
3144}
3145
/// Classification of the gap between two neighbouring list blocks, as computed
/// by `ListBlockMerger::analyze_spacing_between` from
/// `next.start_line - current.end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The difference is 1; also used as the fallback when it is 0.
    Consecutive,
    /// The difference is exactly 2, i.e. one line sits between the blocks.
    /// That line is not inspected at classification time.
    SingleBlank,
    /// The difference is greater than 2 and every line in between is empty
    /// after trimming.
    MultipleBlanks,
    /// The difference is greater than 2 and at least one line in between has
    /// non-whitespace text.
    ContentBetween,
}
3154
3155fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3157 for line_num in (current.end_line + 1)..next.start_line {
3159 if let Some(line_info) = lines.get(line_num - 1) {
3160 let trimmed = line_info.content(content).trim();
3162
3163 if trimmed.is_empty() {
3165 continue;
3166 }
3167
3168 if line_info.heading.is_some() {
3172 return true; }
3174
3175 if is_horizontal_rule(trimmed) {
3177 return true; }
3179
3180 if trimmed.contains('|') && trimmed.len() > 1 {
3183 if !trimmed.contains("](") && !trimmed.contains("http") {
3185 let pipe_count = trimmed.matches('|').count();
3187 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3188 return true; }
3190 }
3191 }
3192
3193 if trimmed.starts_with('>') {
3195 return true; }
3197
3198 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3200 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3201
3202 let min_continuation_indent = if current.is_ordered {
3204 current.nesting_level + current.max_marker_width + 1 } else {
3206 current.nesting_level + 2
3207 };
3208
3209 if line_indent < min_continuation_indent {
3210 return true; }
3213 }
3214
3215 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3217
3218 let min_indent = if current.is_ordered {
3220 current.nesting_level + current.max_marker_width
3221 } else {
3222 current.nesting_level + 2
3223 };
3224
3225 if line_indent < min_indent {
3227 return true; }
3229
3230 }
3233 }
3234
3235 false
3237}
3238
/// Check whether an already-trimmed line is a horizontal rule.
///
/// The line must be at least three bytes long, start with `-`, `*` or `_`,
/// consist only of that character plus spaces and tabs, and contain the
/// character at least three times.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    let mut marker_total = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == rule_char => marker_total += 1,
            ' ' | '\t' => {}
            // Any other character disqualifies the line.
            _ => return false,
        }
    }

    marker_total >= 3
}
3262
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still has one line offset (0) but no `LineInfo` entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline maps offsets to 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets point at the first byte of each line, including blank ones.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Start of line 1.
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        // The blank line 2.
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        // Start of line 3.
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        // Inside line 3.
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        // Start of line 4.
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indent, blank flag.
    #[test]
    fn test_line_info() {
        // Line 2 is indented by four spaces so that the fixture agrees with the
        // `indent == 4` assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: heading at the start of the document.
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: indented text.
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: blank.
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Accessor helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List item metadata for unordered, nested and ordered items.
    #[test]
    fn test_list_item_detection() {
        // The nested items are indented to their parent's content column (two
        // spaces under "- ", three under "1. ") so that the fixture agrees with
        // the `marker_column == 2` assertion and with the items' own labels.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item".
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item".
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item".
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: plain paragraph text.
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets on newline bytes and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // "a"
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        // Newline that terminates line 1.
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        // "b"
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        // Newline that terminates line 2.
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        // "c"
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        // One past the last byte.
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The Standard flavor never flags ESM blocks, even for `import`/`export` lines.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}