1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Evaluates `$code` and yields its value; when `$profile` is true, the
/// elapsed wall-clock time is printed to stderr tagged with `$name`.
///
/// `$code` is evaluated before `$profile` is inspected.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let outcome = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        outcome
    }};
}
21
// wasm32 variant: no timing is performed; the macro expands to `$code` alone
// and the `$name` / `$profile` arguments are ignored.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
26
/// Lazily compiled pattern for `[text](url "title")` and `[text][id]` forms.
///
/// Capture groups: 1 = bracketed text, 2 = angle-bracketed URL, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference id.
/// The `x` flag makes whitespace and `#` comments inside the pattern
/// insignificant; `s` lets `.` match newlines.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
 \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap()
});
40
/// Lazily compiled pattern for `![alt](url "title")` and `![alt][id]` forms.
///
/// Capture groups: 1 = alt text, 2 = angle-bracketed URL, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference id.
/// The `x` flag makes whitespace and `#` comments inside the pattern
/// insignificant; `s` lets `.` match newlines.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
 !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap()
});
54
/// Lazily compiled pattern for a reference definition line such as
/// `[id]: url "title"`, allowing up to three leading spaces.
///
/// Capture groups: 1 = id, 2 = URL (a run of non-whitespace),
/// 3 = double-quoted title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
/// Lazily compiled pattern for `http://`, `https://` and `ftp://` URLs that
/// appear directly in text. Group 1 captures the scheme; optional port, path,
/// query and fragment parts follow the host portion.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
65
/// Lazily compiled pattern for e-mail-like tokens: a local part, `@`, a
/// dotted domain, and an alphabetic top-level label of at least two letters.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
/// Lazily compiled pattern capturing (group 1) a leading blockquote prefix:
/// optional whitespace, one or more `>` characters, then optional whitespace.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
/// Per-line facts stored for every line of the linted document.
///
/// The line's text is not stored; `byte_offset` and `byte_len` locate it in
/// the source string.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset in the source at which this line starts.
    pub byte_offset: usize,
    /// Length of the line in bytes (the slice `byte_offset..byte_offset + byte_len`).
    pub byte_len: usize,
    /// Indentation of the line. NOTE(review): unit (bytes vs. columns) is set
    /// by code outside this view - confirm before relying on it.
    pub indent: usize,
    /// Whether the line is blank.
    pub is_blank: bool,
    /// Whether the line lies inside a code block.
    pub in_code_block: bool,
    /// Whether the line lies inside front matter.
    pub in_front_matter: bool,
    /// Whether the line lies inside an HTML block.
    pub in_html_block: bool,
    /// Whether the line lies inside an HTML comment.
    pub in_html_comment: bool,
    /// List-item details when the line is a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is a heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line is a blockquote line.
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether the line lies inside a mkdocstrings block.
    pub in_mkdocstrings: bool,
    /// Whether the line lies inside an ESM block.
    pub in_esm_block: bool,
}
103
104impl LineInfo {
105 pub fn content<'a>(&self, source: &'a str) -> &'a str {
107 &source[self.byte_offset..self.byte_offset + self.byte_len]
108 }
109}
110
/// Details about a list-item line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text as it appears in the line.
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item's number, when one applies (presumably ordered items only - TODO confirm).
    pub number: Option<usize>,
    /// Column at which the marker starts. NOTE(review): indexing base is not visible here.
    pub marker_column: usize,
    /// Column at which the item's content starts. NOTE(review): indexing base is not visible here.
    pub content_column: usize,
}
125
/// Syntactic style a heading was written in.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (presumably the `#`-prefixed form - TODO confirm against the detector).
    ATX,
    /// First Setext variant (presumably `=` underline, level 1 - TODO confirm).
    Setext1,
    /// Second Setext variant (presumably `-` underline, level 2 - TODO confirm).
    Setext2,
}
136
/// A link found in the document.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte distance from the start of that line to the link start.
    pub start_col: usize,
    /// Byte distance from the start of the line containing the link end to that end.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset of the link end within the document.
    pub byte_end: usize,
    /// Text between the link's brackets, taken from the source.
    pub text: Cow<'a, str>,
    /// Destination URL; empty for reference links recorded by the regex fallback.
    pub url: Cow<'a, str>,
    /// `true` for reference, collapsed and shortcut links.
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style links, otherwise `None`.
    pub reference_id: Option<Cow<'a, str>>,
    /// Link kind as classified by pulldown-cmark.
    pub link_type: LinkType,
}
161
/// A reference that pulldown-cmark reported through its broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text passed to the callback.
    pub reference: String,
    /// Byte span of the broken link as reported by the parser.
    pub span: std::ops::Range<usize>,
}
170
/// A footnote reference reported by pulldown-cmark.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote identifier.
    pub id: String,
    /// 1-based number of the line containing the reference start.
    pub line: usize,
    /// Start of the reference's byte range in the document.
    pub byte_offset: usize,
    /// End of the reference's byte range in the document.
    pub byte_end: usize,
}
183
/// An image found in the document.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte distance from the start of that line to the image start.
    pub start_col: usize,
    /// Byte distance from the start of the line containing the image end to that end.
    pub end_col: usize,
    /// Byte offset of the image start within the document.
    pub byte_offset: usize,
    /// Byte offset of the image end within the document.
    pub byte_end: usize,
    /// Alt text between the image's brackets, taken from the source.
    pub alt_text: Cow<'a, str>,
    /// Image URL; empty for reference images recorded by the regex fallback.
    pub url: Cow<'a, str>,
    /// `true` for reference, collapsed and shortcut images.
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style images, otherwise `None`.
    pub reference_id: Option<Cow<'a, str>>,
    /// Link kind as classified by pulldown-cmark.
    pub link_type: LinkType,
}
208
/// A reference definition line (`[id]: url "title"`).
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Definition id, lower-cased.
    pub id: String,
    /// The URL as written in the definition.
    pub url: String,
    /// Optional title (double- or single-quoted in the source).
    pub title: Option<String>,
    /// Byte offset in the document where the matched definition starts.
    pub byte_offset: usize,
    /// Byte offset in the document where the matched definition ends.
    pub byte_end: usize,
}
225
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-based line numbers by callers).
    pub line: usize,
    /// Start column; callers compare it against a 0-based column.
    pub start_col: usize,
    /// End column (treated as exclusive by callers).
    pub end_col: usize,
    /// Start of the span's byte range in the document.
    pub byte_offset: usize,
    /// End of the span's byte range (treated as exclusive by callers).
    pub byte_end: usize,
    /// Number of backticks in the span's delimiter.
    pub backtick_count: usize,
    /// The span's content. NOTE(review): whether delimiters are included is not visible here.
    pub content: String,
}
244
/// Details about a heading line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level.
    pub level: u8,
    /// Syntactic style of the heading.
    pub style: HeadingStyle,
    /// The heading marker text.
    pub marker: String,
    /// Column at which the marker starts. NOTE(review): indexing base not visible here.
    pub marker_column: usize,
    /// Column at which the heading text starts. NOTE(review): indexing base not visible here.
    pub content_column: usize,
    /// Heading text (presumably with any custom id removed - TODO confirm).
    pub text: String,
    /// Custom id attached to the heading, if any.
    pub custom_id: Option<String>,
    /// Heading text as written (presumably before any cleanup - TODO confirm).
    pub raw_text: String,
    /// Whether the heading has a closing sequence.
    pub has_closing_sequence: bool,
    /// The closing sequence text.
    pub closing_sequence: String,
}
269
/// Details about a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Blockquote nesting depth.
    pub nesting_level: usize,
    /// Whitespace preceding the first marker.
    pub indent: String,
    /// Column at which the marker starts. NOTE(review): indexing base not visible here.
    pub marker_column: usize,
    /// The blockquote prefix text.
    pub prefix: String,
    /// The line content after the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs the MD028 fix (criteria are decided outside this view).
    pub needs_md028_fix: bool,
}
290
/// A contiguous block of list lines.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive when testing membership).
    pub start_line: usize,
    /// Last line of the block (inclusive when testing membership).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// The block's marker, when one is recorded.
    pub marker: Option<String>,
    /// Blockquote prefix associated with the block.
    pub blockquote_prefix: String,
    /// Line numbers of the block's items.
    pub item_lines: Vec<usize>,
    /// Nesting level of the block.
    pub nesting_level: usize,
    /// Widest marker in the block.
    pub max_marker_width: usize,
}
311
312use std::sync::{Arc, Mutex};
313
/// Occurrence counts for characters that act as cheap "might contain X" hints.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[` (NOTE(review): may also include `]` - confirm in the counter).
    pub bracket_count: usize,
    /// Count reported for `` ` ``.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
342
/// An HTML tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag.
    pub line: usize,
    /// Start column of the tag. NOTE(review): indexing base not visible here.
    pub start_col: usize,
    /// End column of the tag.
    pub end_col: usize,
    /// Start of the tag's byte range in the document.
    pub byte_offset: usize,
    /// End of the tag's byte range in the document.
    pub byte_end: usize,
    /// The tag name.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// The tag's raw text.
    pub raw_content: String,
}
365
/// An emphasis span found in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span.
    pub line: usize,
    /// Start column of the span. NOTE(review): indexing base not visible here.
    pub start_col: usize,
    /// End column of the span.
    pub end_col: usize,
    /// Start of the span's byte range in the document.
    pub byte_offset: usize,
    /// End of the span's byte range in the document.
    pub byte_end: usize,
    /// The emphasis marker character.
    pub marker: char,
    /// How many marker characters delimit the span.
    pub marker_count: usize,
    /// The emphasized content.
    pub content: String,
}
386
/// A table row found in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row.
    pub line: usize,
    /// Whether the row is a separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Column alignments as strings (the vocabulary is defined by the parser, not shown here).
    pub column_alignments: Vec<String>,
}
399
/// A URL or e-mail address that appears directly in the text.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL.
    pub line: usize,
    /// Start column of the URL. NOTE(review): indexing base not visible here.
    pub start_col: usize,
    /// End column of the URL.
    pub end_col: usize,
    /// Start of the URL's byte range in the document.
    pub byte_offset: usize,
    /// End of the URL's byte range in the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL as a string (the vocabulary is defined by the parser, not shown here).
    pub url_type: String,
}
418
/// Pre-computed view of a Markdown document shared by lint rules.
///
/// Most fields are filled eagerly by `LintContext::new`; the `*_cache`
/// fields hold lazily computed data behind a mutex.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset at which each line starts: `0`, then one entry after every `\n`.
    pub line_offsets: Vec<usize>,
    /// Code block ranges from `CodeBlockUtils::detect_code_blocks`
    /// (presumably `(start, end)` byte offsets - TODO confirm).
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, one entry per line.
    pub lines: Vec<LineInfo>,
    /// Links found in the document.
    pub links: Vec<ParsedLink<'a>>,
    /// Images found in the document.
    pub images: Vec<ParsedImage<'a>>,
    /// References reported through pulldown-cmark's broken-link callback.
    pub broken_links: Vec<BrokenLinkInfo>,
    /// Footnote references reported by pulldown-cmark.
    pub footnote_refs: Vec<FootnoteRef>,
    /// Reference definitions found in the document.
    pub reference_defs: Vec<ReferenceDef>,
    /// Code spans; populated during construction.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks found in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Character counts used by the `likely_has_*` hints.
    pub char_frequency: CharFrequency,
    /// HTML tags; computed on first request.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Emphasis spans; computed on first request.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Table rows; computed on first request.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Bare URLs; computed on first request.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Byte ranges of HTML comments.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found in the document.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built over the document text.
    pub line_index: crate::utils::range_utils::LineIndex<'a>,
    /// Jinja ranges as `(start, end)` byte offsets; `end` is treated as exclusive.
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
442
/// The four consecutive pieces of a blockquote line, each borrowed from the line.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after that whitespace.
    content: &'a str,
}
450
451#[inline]
453fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
454 let bytes = line.as_bytes();
455 let mut pos = 0;
456
457 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
459 pos += 1;
460 }
461 let indent_end = pos;
462
463 if pos >= bytes.len() || bytes[pos] != b'>' {
465 return None;
466 }
467
468 while pos < bytes.len() && bytes[pos] == b'>' {
470 pos += 1;
471 }
472 let markers_end = pos;
473
474 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
476 pos += 1;
477 }
478 let spaces_end = pos;
479
480 Some(BlockquoteComponents {
481 indent: &line[0..indent_end],
482 markers: &line[indent_end..markers_end],
483 spaces_after: &line[markers_end..spaces_end],
484 content: &line[spaces_end..],
485 })
486}
487
488impl<'a> LintContext<'a> {
    /// Builds the lint context for `content`, interpreted as `flavor`.
    ///
    /// Runs every analysis stage once, in dependency order, and stores the
    /// results. When the `RUMDL_PROFILE_QUADRATIC` environment variable is set
    /// (non-wasm builds only), each stage's elapsed time is printed to stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
        // Profiling is switched on by the mere presence of the variable.
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Line starts: offset 0 plus the byte after every '\n'.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Autodoc ranges are only computed for the MkDocs flavor.
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // The next three stages update `lines` in place.
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Code spans are needed by link, image and table parsing below.
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Code spans were already computed above, so the cache starts filled.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches are filled on first access.
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
        }
    }
628
629 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
631 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
632
633 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
634 }
635
636 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
638 &self.html_comment_ranges
639 }
640
641 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
643 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
644
645 Arc::clone(cache.get_or_insert_with(|| {
646 Arc::new(Self::parse_html_tags(
647 self.content,
648 &self.lines,
649 &self.code_blocks,
650 self.flavor,
651 ))
652 }))
653 }
654
655 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
657 let mut cache = self
658 .emphasis_spans_cache
659 .lock()
660 .expect("Emphasis spans cache mutex poisoned");
661
662 Arc::clone(
663 cache.get_or_insert_with(|| {
664 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
665 }),
666 )
667 }
668
669 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
671 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
672
673 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
674 }
675
676 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
678 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
679
680 Arc::clone(
681 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
682 )
683 }
684
685 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
687 match self.line_offsets.binary_search(&offset) {
688 Ok(line) => (line + 1, 1),
689 Err(line) => {
690 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
691 (line, offset - line_start + 1)
692 }
693 }
694 }
695
696 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
698 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
700 return true;
701 }
702
703 self.code_spans()
705 .iter()
706 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
707 }
708
709 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
711 if line_num > 0 {
712 self.lines.get(line_num - 1)
713 } else {
714 None
715 }
716 }
717
718 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
720 self.line_info(line_num).map(|info| info.byte_offset)
721 }
722
723 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
725 let normalized_id = ref_id.to_lowercase();
726 self.reference_defs
727 .iter()
728 .find(|def| def.id == normalized_id)
729 .map(|def| def.url.as_str())
730 }
731
732 pub fn is_in_list_block(&self, line_num: usize) -> bool {
734 self.list_blocks
735 .iter()
736 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
737 }
738
739 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
741 self.list_blocks
742 .iter()
743 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
744 }
745
746 pub fn is_in_code_block(&self, line_num: usize) -> bool {
750 if line_num == 0 || line_num > self.lines.len() {
751 return false;
752 }
753 self.lines[line_num - 1].in_code_block
754 }
755
756 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
758 if line_num == 0 || line_num > self.lines.len() {
759 return false;
760 }
761 self.lines[line_num - 1].in_front_matter
762 }
763
764 pub fn is_in_html_block(&self, line_num: usize) -> bool {
766 if line_num == 0 || line_num > self.lines.len() {
767 return false;
768 }
769 self.lines[line_num - 1].in_html_block
770 }
771
772 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
774 if line_num == 0 || line_num > self.lines.len() {
775 return false;
776 }
777
778 let col_0indexed = if col > 0 { col - 1 } else { 0 };
782 let code_spans = self.code_spans();
783 code_spans
784 .iter()
785 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
786 }
787
788 #[inline]
791 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
792 self.reference_defs
793 .iter()
794 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
795 }
796
797 #[inline]
801 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
802 self.html_comment_ranges
803 .iter()
804 .any(|range| byte_pos >= range.start && byte_pos < range.end)
805 }
806
807 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
809 self.jinja_ranges
810 .iter()
811 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
812 }
813
814 pub fn has_char(&self, ch: char) -> bool {
816 match ch {
817 '#' => self.char_frequency.hash_count > 0,
818 '*' => self.char_frequency.asterisk_count > 0,
819 '_' => self.char_frequency.underscore_count > 0,
820 '-' => self.char_frequency.hyphen_count > 0,
821 '+' => self.char_frequency.plus_count > 0,
822 '>' => self.char_frequency.gt_count > 0,
823 '|' => self.char_frequency.pipe_count > 0,
824 '[' => self.char_frequency.bracket_count > 0,
825 '`' => self.char_frequency.backtick_count > 0,
826 '<' => self.char_frequency.lt_count > 0,
827 '!' => self.char_frequency.exclamation_count > 0,
828 '\n' => self.char_frequency.newline_count > 0,
829 _ => self.content.contains(ch), }
831 }
832
833 pub fn char_count(&self, ch: char) -> usize {
835 match ch {
836 '#' => self.char_frequency.hash_count,
837 '*' => self.char_frequency.asterisk_count,
838 '_' => self.char_frequency.underscore_count,
839 '-' => self.char_frequency.hyphen_count,
840 '+' => self.char_frequency.plus_count,
841 '>' => self.char_frequency.gt_count,
842 '|' => self.char_frequency.pipe_count,
843 '[' => self.char_frequency.bracket_count,
844 '`' => self.char_frequency.backtick_count,
845 '<' => self.char_frequency.lt_count,
846 '!' => self.char_frequency.exclamation_count,
847 '\n' => self.char_frequency.newline_count,
848 _ => self.content.matches(ch).count(), }
850 }
851
852 pub fn likely_has_headings(&self) -> bool {
854 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
856
857 pub fn likely_has_lists(&self) -> bool {
859 self.char_frequency.asterisk_count > 0
860 || self.char_frequency.hyphen_count > 0
861 || self.char_frequency.plus_count > 0
862 }
863
864 pub fn likely_has_emphasis(&self) -> bool {
866 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
867 }
868
869 pub fn likely_has_tables(&self) -> bool {
871 self.char_frequency.pipe_count > 2
872 }
873
874 pub fn likely_has_blockquotes(&self) -> bool {
876 self.char_frequency.gt_count > 0
877 }
878
879 pub fn likely_has_code(&self) -> bool {
881 self.char_frequency.backtick_count > 0
882 }
883
884 pub fn likely_has_links_or_images(&self) -> bool {
886 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
887 }
888
889 pub fn likely_has_html(&self) -> bool {
891 self.char_frequency.lt_count > 0
892 }
893
894 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
896 self.html_tags()
897 .iter()
898 .filter(|tag| tag.line == line_num)
899 .cloned()
900 .collect()
901 }
902
903 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
905 self.emphasis_spans()
906 .iter()
907 .filter(|span| span.line == line_num)
908 .cloned()
909 .collect()
910 }
911
912 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
914 self.table_rows()
915 .iter()
916 .filter(|row| row.line == line_num)
917 .cloned()
918 .collect()
919 }
920
921 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
923 self.bare_urls()
924 .iter()
925 .filter(|url| url.line == line_num)
926 .cloned()
927 .collect()
928 }
929
930 #[inline]
936 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
937 let idx = match lines.binary_search_by(|line| {
939 if byte_offset < line.byte_offset {
940 std::cmp::Ordering::Greater
941 } else if byte_offset > line.byte_offset + line.byte_len {
942 std::cmp::Ordering::Less
943 } else {
944 std::cmp::Ordering::Equal
945 }
946 }) {
947 Ok(idx) => idx,
948 Err(idx) => idx.saturating_sub(1),
949 };
950
951 let line = &lines[idx];
952 let line_num = idx + 1;
953 let col = byte_offset.saturating_sub(line.byte_offset);
954
955 (idx, line_num, col)
956 }
957
958 #[inline]
960 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
961 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
963
964 if idx > 0 {
966 let span = &code_spans[idx - 1];
967 if offset >= span.byte_offset && offset < span.byte_end {
968 return true;
969 }
970 }
971
972 false
973 }
974
975 fn parse_links(
977 content: &'a str,
978 lines: &[LineInfo],
979 code_blocks: &[(usize, usize)],
980 code_spans: &[CodeSpan],
981 flavor: MarkdownFlavor,
982 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
983 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
984 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
985 use std::collections::HashSet;
986
987 let mut links = Vec::with_capacity(content.len() / 500);
988 let mut broken_links = Vec::new();
989 let mut footnote_refs = Vec::new();
990
991 let mut found_positions = HashSet::new();
993
994 let mut options = Options::empty();
1004 options.insert(Options::ENABLE_WIKILINKS);
1005 options.insert(Options::ENABLE_FOOTNOTES);
1006
1007 let parser = Parser::new_with_broken_link_callback(
1008 content,
1009 options,
1010 Some(|link: BrokenLink<'_>| {
1011 broken_links.push(BrokenLinkInfo {
1012 reference: link.reference.to_string(),
1013 span: link.span.clone(),
1014 });
1015 None
1016 }),
1017 )
1018 .into_offset_iter();
1019
1020 let mut link_stack: Vec<(
1021 usize,
1022 usize,
1023 pulldown_cmark::CowStr<'a>,
1024 LinkType,
1025 pulldown_cmark::CowStr<'a>,
1026 )> = Vec::new();
1027 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1030 match event {
1031 Event::Start(Tag::Link {
1032 link_type,
1033 dest_url,
1034 id,
1035 ..
1036 }) => {
1037 link_stack.push((range.start, range.end, dest_url, link_type, id));
1039 text_chunks.clear();
1040 }
1041 Event::Text(text) if !link_stack.is_empty() => {
1042 text_chunks.push((text.to_string(), range.start, range.end));
1044 }
1045 Event::Code(code) if !link_stack.is_empty() => {
1046 let code_text = format!("`{code}`");
1048 text_chunks.push((code_text, range.start, range.end));
1049 }
1050 Event::End(TagEnd::Link) => {
1051 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1052 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1054 text_chunks.clear();
1055 continue;
1056 }
1057
1058 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1060
1061 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1063 text_chunks.clear();
1064 continue;
1065 }
1066
1067 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1068
1069 let is_reference = matches!(
1070 link_type,
1071 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1072 );
1073
1074 let link_text = if start_pos < content.len() {
1077 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1078
1079 let mut close_pos = None;
1083 let mut depth = 0;
1084 let mut in_code_span = false;
1085
1086 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1087 let mut backslash_count = 0;
1089 let mut j = i;
1090 while j > 0 && link_bytes[j - 1] == b'\\' {
1091 backslash_count += 1;
1092 j -= 1;
1093 }
1094 let is_escaped = backslash_count % 2 != 0;
1095
1096 if byte == b'`' && !is_escaped {
1098 in_code_span = !in_code_span;
1099 }
1100
1101 if !is_escaped && !in_code_span {
1103 if byte == b'[' {
1104 depth += 1;
1105 } else if byte == b']' {
1106 if depth == 0 {
1107 close_pos = Some(i);
1109 break;
1110 } else {
1111 depth -= 1;
1112 }
1113 }
1114 }
1115 }
1116
1117 if let Some(pos) = close_pos {
1118 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1119 } else {
1120 Cow::Borrowed("")
1121 }
1122 } else {
1123 Cow::Borrowed("")
1124 };
1125
1126 let reference_id = if is_reference && !ref_id.is_empty() {
1128 Some(Cow::Owned(ref_id.to_lowercase()))
1129 } else if is_reference {
1130 Some(Cow::Owned(link_text.to_lowercase()))
1132 } else {
1133 None
1134 };
1135
1136 let has_escaped_bang = start_pos >= 2
1140 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1141 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1142
1143 let has_escaped_bracket =
1146 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1147
1148 if has_escaped_bang || has_escaped_bracket {
1149 text_chunks.clear();
1150 continue; }
1152
1153 found_positions.insert(start_pos);
1155
1156 links.push(ParsedLink {
1157 line: line_num,
1158 start_col: col_start,
1159 end_col: col_end,
1160 byte_offset: start_pos,
1161 byte_end: range.end,
1162 text: link_text,
1163 url: Cow::Owned(url.to_string()),
1164 is_reference,
1165 reference_id,
1166 link_type,
1167 });
1168
1169 text_chunks.clear();
1170 }
1171 }
1172 Event::FootnoteReference(footnote_id) => {
1173 if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1176 continue;
1177 }
1178
1179 let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1180 footnote_refs.push(FootnoteRef {
1181 id: footnote_id.to_string(),
1182 line: line_num,
1183 byte_offset: range.start,
1184 byte_end: range.end,
1185 });
1186 }
1187 _ => {}
1188 }
1189 }
1190
1191 for cap in LINK_PATTERN.captures_iter(content) {
1195 let full_match = cap.get(0).unwrap();
1196 let match_start = full_match.start();
1197 let match_end = full_match.end();
1198
1199 if found_positions.contains(&match_start) {
1201 continue;
1202 }
1203
1204 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1206 continue;
1207 }
1208
1209 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1211 continue;
1212 }
1213
1214 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1216 continue;
1217 }
1218
1219 if Self::is_offset_in_code_span(code_spans, match_start) {
1221 continue;
1222 }
1223
1224 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1226 continue;
1227 }
1228
1229 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1231
1232 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1234 continue;
1235 }
1236
1237 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1238
1239 let text = cap.get(1).map_or("", |m| m.as_str());
1240
1241 if let Some(ref_id) = cap.get(6) {
1243 let ref_id_str = ref_id.as_str();
1244 let normalized_ref = if ref_id_str.is_empty() {
1245 Cow::Owned(text.to_lowercase()) } else {
1247 Cow::Owned(ref_id_str.to_lowercase())
1248 };
1249
1250 links.push(ParsedLink {
1252 line: line_num,
1253 start_col: col_start,
1254 end_col: col_end,
1255 byte_offset: match_start,
1256 byte_end: match_end,
1257 text: Cow::Borrowed(text),
1258 url: Cow::Borrowed(""), is_reference: true,
1260 reference_id: Some(normalized_ref),
1261 link_type: LinkType::Reference, });
1263 }
1264 }
1265
1266 (links, broken_links, footnote_refs)
1267 }
1268
1269 fn parse_images(
1271 content: &'a str,
1272 lines: &[LineInfo],
1273 code_blocks: &[(usize, usize)],
1274 code_spans: &[CodeSpan],
1275 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1276 ) -> Vec<ParsedImage<'a>> {
1277 use crate::utils::skip_context::is_in_html_comment_ranges;
1278 use std::collections::HashSet;
1279
1280 let mut images = Vec::with_capacity(content.len() / 1000);
1282 let mut found_positions = HashSet::new();
1283
1284 let parser = Parser::new(content).into_offset_iter();
1286 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1287 Vec::new();
1288 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1291 match event {
1292 Event::Start(Tag::Image {
1293 link_type,
1294 dest_url,
1295 id,
1296 ..
1297 }) => {
1298 image_stack.push((range.start, dest_url, link_type, id));
1299 text_chunks.clear();
1300 }
1301 Event::Text(text) if !image_stack.is_empty() => {
1302 text_chunks.push((text.to_string(), range.start, range.end));
1303 }
1304 Event::Code(code) if !image_stack.is_empty() => {
1305 let code_text = format!("`{code}`");
1306 text_chunks.push((code_text, range.start, range.end));
1307 }
1308 Event::End(TagEnd::Image) => {
1309 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1310 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1312 continue;
1313 }
1314
1315 if Self::is_offset_in_code_span(code_spans, start_pos) {
1317 continue;
1318 }
1319
1320 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1322 continue;
1323 }
1324
1325 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1327 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1328
1329 let is_reference = matches!(
1330 link_type,
1331 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1332 );
1333
1334 let alt_text = if start_pos < content.len() {
1337 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1338
1339 let mut close_pos = None;
1342 let mut depth = 0;
1343
1344 if image_bytes.len() > 2 {
1345 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1346 let mut backslash_count = 0;
1348 let mut j = i;
1349 while j > 0 && image_bytes[j - 1] == b'\\' {
1350 backslash_count += 1;
1351 j -= 1;
1352 }
1353 let is_escaped = backslash_count % 2 != 0;
1354
1355 if !is_escaped {
1356 if byte == b'[' {
1357 depth += 1;
1358 } else if byte == b']' {
1359 if depth == 0 {
1360 close_pos = Some(i);
1362 break;
1363 } else {
1364 depth -= 1;
1365 }
1366 }
1367 }
1368 }
1369 }
1370
1371 if let Some(pos) = close_pos {
1372 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1373 } else {
1374 Cow::Borrowed("")
1375 }
1376 } else {
1377 Cow::Borrowed("")
1378 };
1379
1380 let reference_id = if is_reference && !ref_id.is_empty() {
1381 Some(Cow::Owned(ref_id.to_lowercase()))
1382 } else if is_reference {
1383 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1385 None
1386 };
1387
1388 found_positions.insert(start_pos);
1389 images.push(ParsedImage {
1390 line: line_num,
1391 start_col: col_start,
1392 end_col: col_end,
1393 byte_offset: start_pos,
1394 byte_end: range.end,
1395 alt_text,
1396 url: Cow::Owned(url.to_string()),
1397 is_reference,
1398 reference_id,
1399 link_type,
1400 });
1401 }
1402 }
1403 _ => {}
1404 }
1405 }
1406
1407 for cap in IMAGE_PATTERN.captures_iter(content) {
1409 let full_match = cap.get(0).unwrap();
1410 let match_start = full_match.start();
1411 let match_end = full_match.end();
1412
1413 if found_positions.contains(&match_start) {
1415 continue;
1416 }
1417
1418 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1420 continue;
1421 }
1422
1423 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1425 || Self::is_offset_in_code_span(code_spans, match_start)
1426 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1427 {
1428 continue;
1429 }
1430
1431 if let Some(ref_id) = cap.get(6) {
1433 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1434 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1435 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1436 let ref_id_str = ref_id.as_str();
1437 let normalized_ref = if ref_id_str.is_empty() {
1438 Cow::Owned(alt_text.to_lowercase())
1439 } else {
1440 Cow::Owned(ref_id_str.to_lowercase())
1441 };
1442
1443 images.push(ParsedImage {
1444 line: line_num,
1445 start_col: col_start,
1446 end_col: col_end,
1447 byte_offset: match_start,
1448 byte_end: match_end,
1449 alt_text: Cow::Borrowed(alt_text),
1450 url: Cow::Borrowed(""),
1451 is_reference: true,
1452 reference_id: Some(normalized_ref),
1453 link_type: LinkType::Reference, });
1455 }
1456 }
1457
1458 images
1459 }
1460
1461 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1463 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1467 if line_info.in_code_block {
1469 continue;
1470 }
1471
1472 let line = line_info.content(content);
1473 let line_num = line_idx + 1;
1474
1475 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1476 let id = cap.get(1).unwrap().as_str().to_lowercase();
1477 let url = cap.get(2).unwrap().as_str().to_string();
1478 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1479
1480 let match_obj = cap.get(0).unwrap();
1483 let byte_offset = line_info.byte_offset + match_obj.start();
1484 let byte_end = line_info.byte_offset + match_obj.end();
1485
1486 refs.push(ReferenceDef {
1487 line: line_num,
1488 id,
1489 url,
1490 title,
1491 byte_offset,
1492 byte_end,
1493 });
1494 }
1495 }
1496
1497 refs
1498 }
1499
#[inline]
/// Splits a blockquote line into its prefix and the quoted remainder.
///
/// The prefix covers leading whitespace, the first `>` and any whitespace
/// directly after it. Only one `>` is consumed, so further markers stay in the
/// returned content. Returns `None` when the first non-whitespace character
/// is not `>`.
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let quoted = line.trim_start().strip_prefix('>')?.trim_start();

    // `quoted` is a suffix of `line`, so everything in front of it is the prefix.
    let prefix_len = line.len() - quoted.len();
    Some((&line[..prefix_len], quoted))
}
1518
#[inline]
/// Recognises an unordered list marker at the start of `line`.
///
/// On success returns `(leading whitespace, marker, spacing after the marker,
/// remaining content)`. The marker must be `-`, `*` or `+`; only spaces and
/// tabs count as whitespace here. The spacing may be empty — callers decide
/// whether that still counts as a list item.
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let marker = after_indent
        .chars()
        .next()
        .filter(|c| matches!(c, '-' | '*' | '+'))?;

    // The marker is a single ASCII byte, so slicing one byte further is safe.
    let after_marker = &after_indent[1..];
    let rest = after_marker.trim_start_matches(BLANKS);
    let spacing = &after_marker[..after_marker.len() - rest.len()];

    Some((indent, marker, spacing, rest))
}
1551
#[inline]
/// Recognises an ordered list marker at the start of `line`.
///
/// On success returns `(leading whitespace, digits, delimiter, spacing after
/// the delimiter, remaining content)`. The delimiter must be `.` or `)` and at
/// least one ASCII digit is required; only spaces and tabs count as
/// whitespace. The spacing may be empty — callers decide whether that still
/// counts as a list item.
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let digit_count = after_indent.bytes().take_while(u8::is_ascii_digit).count();
    if digit_count == 0 {
        return None;
    }
    let (number, tail) = after_indent.split_at(digit_count);

    let delimiter = tail.chars().next().filter(|c| matches!(c, '.' | ')'))?;

    // The delimiter is a single ASCII byte, so slicing one byte further is safe.
    let after_delimiter = &tail[1..];
    let rest = after_delimiter.trim_start_matches(BLANKS);
    let spacing = &after_delimiter[..after_delimiter.len() - rest.len()];

    Some((indent, number, delimiter, spacing, rest))
}
1599
/// Builds a per-line flag vector telling whether each line overlaps a code block.
///
/// `line_offsets` holds the starting byte offset of each line and `code_blocks`
/// holds `(start, end)` byte ranges. Range ends that fall inside a multi-byte
/// character are widened outwards to the nearest character boundary before the
/// lines they touch are looked up with a binary search.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(raw_start, raw_end) in code_blocks {
        // Move the start backwards until it sits on a character boundary.
        let mut block_start = raw_start;
        while block_start > 0 && !content.is_char_boundary(block_start) {
            block_start -= 1;
        }

        // Clamp the end to the document, then move it forwards to a boundary.
        let mut block_end = raw_end.min(total_len);
        while block_end < total_len && !content.is_char_boundary(block_end) {
            block_end += 1;
        }

        // First line: the last one starting at or before the block start.
        let first = line_offsets
            .partition_point(|&offset| offset <= block_start)
            .saturating_sub(1);
        // One past the last line: the first one starting at or after the block end.
        let past_last = line_offsets.partition_point(|&offset| offset < block_end);

        // `get_mut` yields `None` for an inverted range, which marks nothing.
        if let Some(covered) = flags.get_mut(first..past_last) {
            covered.fill(true);
        }
    }

    flags
}
1659
1660 fn compute_basic_line_info(
1662 content: &str,
1663 line_offsets: &[usize],
1664 code_blocks: &[(usize, usize)],
1665 flavor: MarkdownFlavor,
1666 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1667 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1668 ) -> Vec<LineInfo> {
1669 let content_lines: Vec<&str> = content.lines().collect();
1670 let mut lines = Vec::with_capacity(content_lines.len());
1671
1672 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1674
1675 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1678
1679 for (i, line) in content_lines.iter().enumerate() {
1680 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1681 let indent = line.len() - line.trim_start().len();
1682
1683 let blockquote_parse = Self::parse_blockquote_prefix(line);
1685
1686 let is_blank = if let Some((_, content)) = blockquote_parse {
1688 content.trim().is_empty()
1690 } else {
1691 line.trim().is_empty()
1692 };
1693
1694 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1696
1697 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1699 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1700 let in_html_comment =
1702 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1703 let list_item = if !(in_code_block
1704 || is_blank
1705 || in_mkdocstrings
1706 || in_html_comment
1707 || (front_matter_end > 0 && i < front_matter_end))
1708 {
1709 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1711 (content, prefix.len())
1712 } else {
1713 (&**line, 0)
1714 };
1715
1716 if let Some((leading_spaces, marker, spacing, _content)) =
1717 Self::parse_unordered_list(line_for_list_check)
1718 {
1719 let marker_column = blockquote_prefix_len + leading_spaces.len();
1720 let content_column = marker_column + 1 + spacing.len();
1721
1722 if spacing.is_empty() {
1729 None
1730 } else {
1731 Some(ListItemInfo {
1732 marker: marker.to_string(),
1733 is_ordered: false,
1734 number: None,
1735 marker_column,
1736 content_column,
1737 })
1738 }
1739 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1740 Self::parse_ordered_list(line_for_list_check)
1741 {
1742 let marker = format!("{number_str}{delimiter}");
1743 let marker_column = blockquote_prefix_len + leading_spaces.len();
1744 let content_column = marker_column + marker.len() + spacing.len();
1745
1746 if spacing.is_empty() {
1749 None
1750 } else {
1751 Some(ListItemInfo {
1752 marker,
1753 is_ordered: true,
1754 number: number_str.parse().ok(),
1755 marker_column,
1756 content_column,
1757 })
1758 }
1759 } else {
1760 None
1761 }
1762 } else {
1763 None
1764 };
1765
1766 lines.push(LineInfo {
1767 byte_offset,
1768 byte_len: line.len(),
1769 indent,
1770 is_blank,
1771 in_code_block,
1772 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1773 in_html_block: false, in_html_comment,
1775 list_item,
1776 heading: None, blockquote: None, in_mkdocstrings,
1779 in_esm_block: false, });
1781 }
1782
1783 lines
1784 }
1785
1786 fn detect_headings_and_blockquotes(
1788 content: &str,
1789 lines: &mut [LineInfo],
1790 flavor: MarkdownFlavor,
1791 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1792 ) {
1793 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1795 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1796 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1797 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1798
1799 let content_lines: Vec<&str> = content.lines().collect();
1800
1801 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1803
1804 for i in 0..lines.len() {
1806 if lines[i].in_code_block {
1807 continue;
1808 }
1809
1810 if front_matter_end > 0 && i < front_matter_end {
1812 continue;
1813 }
1814
1815 if lines[i].in_html_block {
1817 continue;
1818 }
1819
1820 let line = content_lines[i];
1821
1822 if let Some(bq) = parse_blockquote_detailed(line) {
1824 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1826
1827 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1829
1830 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1832 let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1835
1836 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1840
1841 lines[i].blockquote = Some(BlockquoteInfo {
1842 nesting_level,
1843 indent: bq.indent.to_string(),
1844 marker_column,
1845 prefix,
1846 content: bq.content.to_string(),
1847 has_no_space_after_marker: has_no_space,
1848 has_multiple_spaces_after_marker: has_multiple_spaces,
1849 needs_md028_fix,
1850 });
1851 }
1852
1853 if lines[i].is_blank {
1855 continue;
1856 }
1857
1858 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1861 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1862 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1863 } else {
1864 false
1865 };
1866
1867 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1868 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1870 continue;
1871 }
1872 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1873 let hashes = caps.get(2).map_or("", |m| m.as_str());
1874 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1875 let rest = caps.get(4).map_or("", |m| m.as_str());
1876
1877 let level = hashes.len() as u8;
1878 let marker_column = leading_spaces.len();
1879
1880 let (text, has_closing, closing_seq) = {
1882 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1884 if rest[id_start..].trim_end().ends_with('}') {
1886 (&rest[..id_start], &rest[id_start..])
1888 } else {
1889 (rest, "")
1890 }
1891 } else {
1892 (rest, "")
1893 };
1894
1895 let trimmed_rest = rest_without_id.trim_end();
1897 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1898 let mut start_of_hashes = last_hash_pos;
1900 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1901 start_of_hashes -= 1;
1902 }
1903
1904 let has_space_before = start_of_hashes == 0
1906 || trimmed_rest
1907 .chars()
1908 .nth(start_of_hashes - 1)
1909 .is_some_and(|c| c.is_whitespace());
1910
1911 let potential_closing = &trimmed_rest[start_of_hashes..];
1913 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1914
1915 if is_all_hashes && has_space_before {
1916 let closing_hashes = potential_closing.to_string();
1918 let text_part = if !custom_id_part.is_empty() {
1921 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1924 } else {
1925 rest_without_id[..start_of_hashes].trim_end().to_string()
1926 };
1927 (text_part, true, closing_hashes)
1928 } else {
1929 (rest.to_string(), false, String::new())
1931 }
1932 } else {
1933 (rest.to_string(), false, String::new())
1935 }
1936 };
1937
1938 let content_column = marker_column + hashes.len() + spaces_after.len();
1939
1940 let raw_text = text.trim().to_string();
1942 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1943
1944 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1946 let next_line = content_lines[i + 1];
1947 if !lines[i + 1].in_code_block
1948 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1949 && let Some(next_line_id) =
1950 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1951 {
1952 custom_id = Some(next_line_id);
1953 }
1954 }
1955
1956 lines[i].heading = Some(HeadingInfo {
1957 level,
1958 style: HeadingStyle::ATX,
1959 marker: hashes.to_string(),
1960 marker_column,
1961 content_column,
1962 text: clean_text,
1963 custom_id,
1964 raw_text,
1965 has_closing_sequence: has_closing,
1966 closing_sequence: closing_seq,
1967 });
1968 }
1969 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1971 let next_line = content_lines[i + 1];
1972 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1973 if front_matter_end > 0 && i < front_matter_end {
1975 continue;
1976 }
1977
1978 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1980 {
1981 continue;
1982 }
1983
1984 let underline = next_line.trim();
1985
1986 if underline == "---" {
1989 continue;
1990 }
1991
1992 let current_line_trimmed = line.trim();
1994 if current_line_trimmed.contains(':')
1995 && !current_line_trimmed.starts_with('#')
1996 && !current_line_trimmed.contains('[')
1997 && !current_line_trimmed.contains("](")
1998 {
1999 continue;
2001 }
2002
2003 let level = if underline.starts_with('=') { 1 } else { 2 };
2004 let style = if level == 1 {
2005 HeadingStyle::Setext1
2006 } else {
2007 HeadingStyle::Setext2
2008 };
2009
2010 let raw_text = line.trim().to_string();
2012 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2013
2014 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2016 let attr_line = content_lines[i + 2];
2017 if !lines[i + 2].in_code_block
2018 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2019 && let Some(attr_line_id) =
2020 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2021 {
2022 custom_id = Some(attr_line_id);
2023 }
2024 }
2025
2026 lines[i].heading = Some(HeadingInfo {
2027 level,
2028 style,
2029 marker: underline.to_string(),
2030 marker_column: next_line.len() - next_line.trim_start().len(),
2031 content_column: lines[i].indent,
2032 text: clean_text,
2033 custom_id,
2034 raw_text,
2035 has_closing_sequence: false,
2036 closing_sequence: String::new(),
2037 });
2038 }
2039 }
2040 }
2041 }
2042
2043 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2045 const BLOCK_ELEMENTS: &[&str] = &[
2047 "address",
2048 "article",
2049 "aside",
2050 "blockquote",
2051 "details",
2052 "dialog",
2053 "dd",
2054 "div",
2055 "dl",
2056 "dt",
2057 "fieldset",
2058 "figcaption",
2059 "figure",
2060 "footer",
2061 "form",
2062 "h1",
2063 "h2",
2064 "h3",
2065 "h4",
2066 "h5",
2067 "h6",
2068 "header",
2069 "hr",
2070 "li",
2071 "main",
2072 "nav",
2073 "ol",
2074 "p",
2075 "picture",
2076 "pre",
2077 "script",
2078 "section",
2079 "style",
2080 "table",
2081 "tbody",
2082 "td",
2083 "textarea",
2084 "tfoot",
2085 "th",
2086 "thead",
2087 "tr",
2088 "ul",
2089 ];
2090
2091 let mut i = 0;
2092 while i < lines.len() {
2093 if lines[i].in_code_block || lines[i].in_front_matter {
2095 i += 1;
2096 continue;
2097 }
2098
2099 let trimmed = lines[i].content(content).trim_start();
2100
2101 if trimmed.starts_with('<') && trimmed.len() > 1 {
2103 let after_bracket = &trimmed[1..];
2105 let is_closing = after_bracket.starts_with('/');
2106 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2107
2108 let tag_name = tag_start
2110 .chars()
2111 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2112 .collect::<String>()
2113 .to_lowercase();
2114
2115 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2117 lines[i].in_html_block = true;
2119
2120 if !is_closing {
2123 let closing_tag = format!("</{tag_name}>");
2124 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2126 let mut j = i + 1;
2127 while j < lines.len() && j < i + 100 {
2128 if !allow_blank_lines && lines[j].is_blank {
2131 break;
2132 }
2133
2134 lines[j].in_html_block = true;
2135
2136 if lines[j].content(content).contains(&closing_tag) {
2138 break;
2139 }
2140 j += 1;
2141 }
2142 }
2143 }
2144 }
2145
2146 i += 1;
2147 }
2148 }
2149
2150 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2153 if !flavor.supports_esm_blocks() {
2155 return;
2156 }
2157
2158 for line in lines.iter_mut() {
2159 if line.is_blank || line.in_html_comment {
2161 continue;
2162 }
2163
2164 let trimmed = line.content(content).trim_start();
2166 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2167 line.in_esm_block = true;
2168 } else {
2169 break;
2171 }
2172 }
2173 }
2174
2175 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2177 let mut code_spans = Vec::new();
2178
2179 if !content.contains('`') {
2181 return code_spans;
2182 }
2183
2184 let parser = Parser::new(content).into_offset_iter();
2186
2187 for (event, range) in parser {
2188 if let Event::Code(_) = event {
2189 let start_pos = range.start;
2190 let end_pos = range.end;
2191
2192 let full_span = &content[start_pos..end_pos];
2194 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2195
2196 let content_start = start_pos + backtick_count;
2198 let content_end = end_pos - backtick_count;
2199 let span_content = if content_start < content_end {
2200 content[content_start..content_end].to_string()
2201 } else {
2202 String::new()
2203 };
2204
2205 let line_idx = lines
2208 .partition_point(|line| line.byte_offset <= start_pos)
2209 .saturating_sub(1);
2210 let line_num = line_idx + 1;
2211 let col_start = start_pos - lines[line_idx].byte_offset;
2212
2213 let end_line_idx = lines
2215 .partition_point(|line| line.byte_offset <= end_pos)
2216 .saturating_sub(1);
2217 let col_end = end_pos - lines[end_line_idx].byte_offset;
2218
2219 code_spans.push(CodeSpan {
2220 line: line_num,
2221 start_col: col_start,
2222 end_col: col_end,
2223 byte_offset: start_pos,
2224 byte_end: end_pos,
2225 backtick_count,
2226 content: span_content,
2227 });
2228 }
2229 }
2230
2231 code_spans.sort_by_key(|span| span.byte_offset);
2233
2234 code_spans
2235 }
2236
2237 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2248 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2250
2251 #[inline]
2254 fn reset_tracking_state(
2255 list_item: &ListItemInfo,
2256 has_list_breaking_content: &mut bool,
2257 min_continuation: &mut usize,
2258 ) {
2259 *has_list_breaking_content = false;
2260 let marker_width = if list_item.is_ordered {
2261 list_item.marker.len() + 1 } else {
2263 list_item.marker.len()
2264 };
2265 *min_continuation = if list_item.is_ordered {
2266 marker_width
2267 } else {
2268 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2269 };
2270 }
2271
2272 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2275 let mut last_list_item_line = 0;
2276 let mut current_indent_level = 0;
2277 let mut last_marker_width = 0;
2278
2279 let mut has_list_breaking_content_since_last_item = false;
2281 let mut min_continuation_for_tracking = 0;
2282
2283 for (line_idx, line_info) in lines.iter().enumerate() {
2284 let line_num = line_idx + 1;
2285
2286 if line_info.in_code_block {
2288 if let Some(ref mut block) = current_block {
2289 let min_continuation_indent =
2291 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2292
2293 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2295
2296 match context {
2297 CodeBlockContext::Indented => {
2298 block.end_line = line_num;
2300 continue;
2301 }
2302 CodeBlockContext::Standalone => {
2303 let completed_block = current_block.take().unwrap();
2305 list_blocks.push(completed_block);
2306 continue;
2307 }
2308 CodeBlockContext::Adjacent => {
2309 block.end_line = line_num;
2311 continue;
2312 }
2313 }
2314 } else {
2315 continue;
2317 }
2318 }
2319
2320 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2322 caps.get(0).unwrap().as_str().to_string()
2323 } else {
2324 String::new()
2325 };
2326
2327 if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2329 let line_content = line_info.content(content).trim();
2330
2331 let breaks_list = line_info.heading.is_some()
2333 || line_content.starts_with("---")
2334 || line_content.starts_with("***")
2335 || line_content.starts_with("___")
2336 || (line_content.contains('|')
2337 && !line_content.contains("](")
2338 && !line_content.contains("http")
2339 && (line_content.matches('|').count() > 1
2340 || line_content.starts_with('|')
2341 || line_content.ends_with('|')))
2342 || line_content.starts_with(">")
2343 || (line_info.indent < min_continuation_for_tracking);
2344
2345 if breaks_list {
2346 has_list_breaking_content_since_last_item = true;
2347 }
2348 }
2349
2350 if let Some(list_item) = &line_info.list_item {
2352 let item_indent = list_item.marker_column;
2354 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2357 let is_nested = nesting > block.nesting_level;
2361 let same_type =
2362 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2363 let same_context = block.blockquote_prefix == blockquote_prefix;
2364 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2368 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2369
2370 let has_non_list_content = has_list_breaking_content_since_last_item;
2373
2374 let mut continues_list = if is_nested {
2378 same_context && reasonable_distance && !has_non_list_content
2380 } else {
2381 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2383 };
2384
2385 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2388 if block.item_lines.contains(&(line_num - 1)) {
2390 continues_list = true;
2392 }
2393 }
2394
2395 if continues_list {
2396 block.end_line = line_num;
2398 block.item_lines.push(line_num);
2399
2400 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2402 list_item.marker.len() + 1
2403 } else {
2404 list_item.marker.len()
2405 });
2406
2407 if !block.is_ordered
2409 && block.marker.is_some()
2410 && block.marker.as_ref() != Some(&list_item.marker)
2411 {
2412 block.marker = None;
2414 }
2415
2416 reset_tracking_state(
2418 list_item,
2419 &mut has_list_breaking_content_since_last_item,
2420 &mut min_continuation_for_tracking,
2421 );
2422 } else {
2423 list_blocks.push(block.clone());
2426
2427 *block = ListBlock {
2428 start_line: line_num,
2429 end_line: line_num,
2430 is_ordered: list_item.is_ordered,
2431 marker: if list_item.is_ordered {
2432 None
2433 } else {
2434 Some(list_item.marker.clone())
2435 },
2436 blockquote_prefix: blockquote_prefix.clone(),
2437 item_lines: vec![line_num],
2438 nesting_level: nesting,
2439 max_marker_width: if list_item.is_ordered {
2440 list_item.marker.len() + 1
2441 } else {
2442 list_item.marker.len()
2443 },
2444 };
2445
2446 reset_tracking_state(
2448 list_item,
2449 &mut has_list_breaking_content_since_last_item,
2450 &mut min_continuation_for_tracking,
2451 );
2452 }
2453 } else {
2454 current_block = Some(ListBlock {
2456 start_line: line_num,
2457 end_line: line_num,
2458 is_ordered: list_item.is_ordered,
2459 marker: if list_item.is_ordered {
2460 None
2461 } else {
2462 Some(list_item.marker.clone())
2463 },
2464 blockquote_prefix,
2465 item_lines: vec![line_num],
2466 nesting_level: nesting,
2467 max_marker_width: list_item.marker.len(),
2468 });
2469
2470 reset_tracking_state(
2472 list_item,
2473 &mut has_list_breaking_content_since_last_item,
2474 &mut min_continuation_for_tracking,
2475 );
2476 }
2477
2478 last_list_item_line = line_num;
2479 current_indent_level = item_indent;
2480 last_marker_width = if list_item.is_ordered {
2481 list_item.marker.len() + 1 } else {
2483 list_item.marker.len()
2484 };
2485 } else if let Some(ref mut block) = current_block {
2486 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2496 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2497 } else {
2498 false
2499 };
2500
2501 let min_continuation_indent = if block.is_ordered {
2505 current_indent_level + last_marker_width
2506 } else {
2507 current_indent_level + 2 };
2509
2510 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2511 block.end_line = line_num;
2513 } else if line_info.is_blank {
2514 let mut check_idx = line_idx + 1;
2517 let mut found_continuation = false;
2518
2519 while check_idx < lines.len() && lines[check_idx].is_blank {
2521 check_idx += 1;
2522 }
2523
2524 if check_idx < lines.len() {
2525 let next_line = &lines[check_idx];
2526 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2528 found_continuation = true;
2529 }
2530 else if !next_line.in_code_block
2532 && next_line.list_item.is_some()
2533 && let Some(item) = &next_line.list_item
2534 {
2535 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2536 .find(next_line.content(content))
2537 .map_or(String::new(), |m| m.as_str().to_string());
2538 if item.marker_column == current_indent_level
2539 && item.is_ordered == block.is_ordered
2540 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2541 {
2542 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2545 if let Some(between_line) = lines.get(idx) {
2546 let between_content = between_line.content(content);
2547 let trimmed = between_content.trim();
2548 if trimmed.is_empty() {
2550 return false;
2551 }
2552 let line_indent = between_content.len() - between_content.trim_start().len();
2554
2555 if trimmed.starts_with("```")
2557 || trimmed.starts_with("~~~")
2558 || trimmed.starts_with("---")
2559 || trimmed.starts_with("***")
2560 || trimmed.starts_with("___")
2561 || trimmed.starts_with(">")
2562 || trimmed.contains('|') || between_line.heading.is_some()
2564 {
2565 return true; }
2567
2568 line_indent >= min_continuation_indent
2570 } else {
2571 false
2572 }
2573 });
2574
2575 if block.is_ordered {
2576 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2579 if let Some(between_line) = lines.get(idx) {
2580 let trimmed = between_line.content(content).trim();
2581 if trimmed.is_empty() {
2582 return false;
2583 }
2584 trimmed.starts_with("```")
2586 || trimmed.starts_with("~~~")
2587 || trimmed.starts_with("---")
2588 || trimmed.starts_with("***")
2589 || trimmed.starts_with("___")
2590 || trimmed.starts_with(">")
2591 || trimmed.contains('|') || between_line.heading.is_some()
2593 } else {
2594 false
2595 }
2596 });
2597 found_continuation = !has_structural_separators;
2598 } else {
2599 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2601 if let Some(between_line) = lines.get(idx) {
2602 let trimmed = between_line.content(content).trim();
2603 if trimmed.is_empty() {
2604 return false;
2605 }
2606 trimmed.starts_with("```")
2608 || trimmed.starts_with("~~~")
2609 || trimmed.starts_with("---")
2610 || trimmed.starts_with("***")
2611 || trimmed.starts_with("___")
2612 || trimmed.starts_with(">")
2613 || trimmed.contains('|') || between_line.heading.is_some()
2615 } else {
2616 false
2617 }
2618 });
2619 found_continuation = !has_structural_separators;
2620 }
2621 }
2622 }
2623 }
2624
2625 if found_continuation {
2626 block.end_line = line_num;
2628 } else {
2629 list_blocks.push(block.clone());
2631 current_block = None;
2632 }
2633 } else {
2634 let min_required_indent = if block.is_ordered {
2637 current_indent_level + last_marker_width
2638 } else {
2639 current_indent_level + 2
2640 };
2641
2642 let line_content = line_info.content(content).trim();
2647 let is_structural_separator = line_info.heading.is_some()
2648 || line_content.starts_with("```")
2649 || line_content.starts_with("~~~")
2650 || line_content.starts_with("---")
2651 || line_content.starts_with("***")
2652 || line_content.starts_with("___")
2653 || line_content.starts_with(">")
2654 || (line_content.contains('|')
2655 && !line_content.contains("](")
2656 && !line_content.contains("http")
2657 && (line_content.matches('|').count() > 1
2658 || line_content.starts_with('|')
2659 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2664 && !line_info.is_blank
2665 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2666
2667 if is_lazy_continuation {
2668 let content_to_check = if !blockquote_prefix.is_empty() {
2671 line_info
2673 .content(content)
2674 .strip_prefix(&blockquote_prefix)
2675 .unwrap_or(line_info.content(content))
2676 .trim()
2677 } else {
2678 line_info.content(content).trim()
2679 };
2680
2681 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2682
2683 if starts_with_uppercase && last_list_item_line > 0 {
2686 list_blocks.push(block.clone());
2688 current_block = None;
2689 } else {
2690 block.end_line = line_num;
2692 }
2693 } else {
2694 list_blocks.push(block.clone());
2696 current_block = None;
2697 }
2698 }
2699 }
2700 }
2701
2702 if let Some(block) = current_block {
2704 list_blocks.push(block);
2705 }
2706
2707 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2709
2710 list_blocks
2711 }
2712
2713 fn compute_char_frequency(content: &str) -> CharFrequency {
2715 let mut frequency = CharFrequency::default();
2716
2717 for ch in content.chars() {
2718 match ch {
2719 '#' => frequency.hash_count += 1,
2720 '*' => frequency.asterisk_count += 1,
2721 '_' => frequency.underscore_count += 1,
2722 '-' => frequency.hyphen_count += 1,
2723 '+' => frequency.plus_count += 1,
2724 '>' => frequency.gt_count += 1,
2725 '|' => frequency.pipe_count += 1,
2726 '[' => frequency.bracket_count += 1,
2727 '`' => frequency.backtick_count += 1,
2728 '<' => frequency.lt_count += 1,
2729 '!' => frequency.exclamation_count += 1,
2730 '\n' => frequency.newline_count += 1,
2731 _ => {}
2732 }
2733 }
2734
2735 frequency
2736 }
2737
2738 fn parse_html_tags(
2740 content: &str,
2741 lines: &[LineInfo],
2742 code_blocks: &[(usize, usize)],
2743 flavor: MarkdownFlavor,
2744 ) -> Vec<HtmlTag> {
2745 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2746 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2747
2748 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2749
2750 for cap in HTML_TAG_REGEX.captures_iter(content) {
2751 let full_match = cap.get(0).unwrap();
2752 let match_start = full_match.start();
2753 let match_end = full_match.end();
2754
2755 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2757 continue;
2758 }
2759
2760 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2761 let tag_name_original = cap.get(2).unwrap().as_str();
2762 let tag_name = tag_name_original.to_lowercase();
2763 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2764
2765 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2768 continue;
2769 }
2770
2771 let mut line_num = 1;
2773 let mut col_start = match_start;
2774 let mut col_end = match_end;
2775 for (idx, line_info) in lines.iter().enumerate() {
2776 if match_start >= line_info.byte_offset {
2777 line_num = idx + 1;
2778 col_start = match_start - line_info.byte_offset;
2779 col_end = match_end - line_info.byte_offset;
2780 } else {
2781 break;
2782 }
2783 }
2784
2785 html_tags.push(HtmlTag {
2786 line: line_num,
2787 start_col: col_start,
2788 end_col: col_end,
2789 byte_offset: match_start,
2790 byte_end: match_end,
2791 tag_name,
2792 is_closing,
2793 is_self_closing,
2794 raw_content: full_match.as_str().to_string(),
2795 });
2796 }
2797
2798 html_tags
2799 }
2800
2801 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2803 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2804 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2805
2806 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2807
2808 for cap in EMPHASIS_REGEX.captures_iter(content) {
2809 let full_match = cap.get(0).unwrap();
2810 let match_start = full_match.start();
2811 let match_end = full_match.end();
2812
2813 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2815 continue;
2816 }
2817
2818 let opening_markers = cap.get(1).unwrap().as_str();
2819 let content_part = cap.get(2).unwrap().as_str();
2820 let closing_markers = cap.get(3).unwrap().as_str();
2821
2822 if opening_markers.chars().next() != closing_markers.chars().next()
2824 || opening_markers.len() != closing_markers.len()
2825 {
2826 continue;
2827 }
2828
2829 let marker = opening_markers.chars().next().unwrap();
2830 let marker_count = opening_markers.len();
2831
2832 let mut line_num = 1;
2834 let mut col_start = match_start;
2835 let mut col_end = match_end;
2836 for (idx, line_info) in lines.iter().enumerate() {
2837 if match_start >= line_info.byte_offset {
2838 line_num = idx + 1;
2839 col_start = match_start - line_info.byte_offset;
2840 col_end = match_end - line_info.byte_offset;
2841 } else {
2842 break;
2843 }
2844 }
2845
2846 emphasis_spans.push(EmphasisSpan {
2847 line: line_num,
2848 start_col: col_start,
2849 end_col: col_end,
2850 byte_offset: match_start,
2851 byte_end: match_end,
2852 marker,
2853 marker_count,
2854 content: content_part.to_string(),
2855 });
2856 }
2857
2858 emphasis_spans
2859 }
2860
2861 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2863 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2864
2865 for (line_idx, line_info) in lines.iter().enumerate() {
2866 if line_info.in_code_block || line_info.is_blank {
2868 continue;
2869 }
2870
2871 let line = line_info.content(content);
2872 let line_num = line_idx + 1;
2873
2874 if !line.contains('|') {
2876 continue;
2877 }
2878
2879 let parts: Vec<&str> = line.split('|').collect();
2881 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2882
2883 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2885 let mut column_alignments = Vec::new();
2886
2887 if is_separator {
2888 for part in &parts[1..parts.len() - 1] {
2889 let trimmed = part.trim();
2891 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2892 "center".to_string()
2893 } else if trimmed.ends_with(':') {
2894 "right".to_string()
2895 } else if trimmed.starts_with(':') {
2896 "left".to_string()
2897 } else {
2898 "none".to_string()
2899 };
2900 column_alignments.push(alignment);
2901 }
2902 }
2903
2904 table_rows.push(TableRow {
2905 line: line_num,
2906 is_separator,
2907 column_count,
2908 column_alignments,
2909 });
2910 }
2911
2912 table_rows
2913 }
2914
2915 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2917 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2918
2919 for cap in BARE_URL_PATTERN.captures_iter(content) {
2921 let full_match = cap.get(0).unwrap();
2922 let match_start = full_match.start();
2923 let match_end = full_match.end();
2924
2925 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2927 continue;
2928 }
2929
2930 let preceding_char = if match_start > 0 {
2932 content.chars().nth(match_start - 1)
2933 } else {
2934 None
2935 };
2936 let following_char = content.chars().nth(match_end);
2937
2938 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2939 continue;
2940 }
2941 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2942 continue;
2943 }
2944
2945 let url = full_match.as_str();
2946 let url_type = if url.starts_with("https://") {
2947 "https"
2948 } else if url.starts_with("http://") {
2949 "http"
2950 } else if url.starts_with("ftp://") {
2951 "ftp"
2952 } else {
2953 "other"
2954 };
2955
2956 let mut line_num = 1;
2958 let mut col_start = match_start;
2959 let mut col_end = match_end;
2960 for (idx, line_info) in lines.iter().enumerate() {
2961 if match_start >= line_info.byte_offset {
2962 line_num = idx + 1;
2963 col_start = match_start - line_info.byte_offset;
2964 col_end = match_end - line_info.byte_offset;
2965 } else {
2966 break;
2967 }
2968 }
2969
2970 bare_urls.push(BareUrl {
2971 line: line_num,
2972 start_col: col_start,
2973 end_col: col_end,
2974 byte_offset: match_start,
2975 byte_end: match_end,
2976 url: url.to_string(),
2977 url_type: url_type.to_string(),
2978 });
2979 }
2980
2981 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2983 let full_match = cap.get(0).unwrap();
2984 let match_start = full_match.start();
2985 let match_end = full_match.end();
2986
2987 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2989 continue;
2990 }
2991
2992 let preceding_char = if match_start > 0 {
2994 content.chars().nth(match_start - 1)
2995 } else {
2996 None
2997 };
2998 let following_char = content.chars().nth(match_end);
2999
3000 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3001 continue;
3002 }
3003 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3004 continue;
3005 }
3006
3007 let email = full_match.as_str();
3008
3009 let mut line_num = 1;
3011 let mut col_start = match_start;
3012 let mut col_end = match_end;
3013 for (idx, line_info) in lines.iter().enumerate() {
3014 if match_start >= line_info.byte_offset {
3015 line_num = idx + 1;
3016 col_start = match_start - line_info.byte_offset;
3017 col_end = match_end - line_info.byte_offset;
3018 } else {
3019 break;
3020 }
3021 }
3022
3023 bare_urls.push(BareUrl {
3024 line: line_num,
3025 start_col: col_start,
3026 end_col: col_end,
3027 byte_offset: match_start,
3028 byte_end: match_end,
3029 url: email.to_string(),
3030 url_type: "email".to_string(),
3031 });
3032 }
3033
3034 bare_urls
3035 }
3036}
3037
3038fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3040 if list_blocks.len() < 2 {
3041 return;
3042 }
3043
3044 let mut merger = ListBlockMerger::new(content, lines);
3045 *list_blocks = merger.merge(list_blocks);
3046}
3047
/// Borrowed view of the document used while deciding which list blocks to merge.
struct ListBlockMerger<'a> {
    /// Full document text; line text is obtained from it via `LineInfo::content`.
    content: &'a str,
    /// Per-line metadata, looked up by zero-based index (1-based line number minus one).
    lines: &'a [LineInfo],
}
3053
3054impl<'a> ListBlockMerger<'a> {
3055 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3056 Self { content, lines }
3057 }
3058
3059 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3060 let mut merged = Vec::with_capacity(list_blocks.len());
3061 let mut current = list_blocks[0].clone();
3062
3063 for next in list_blocks.iter().skip(1) {
3064 if self.should_merge_blocks(¤t, next) {
3065 current = self.merge_two_blocks(current, next);
3066 } else {
3067 merged.push(current);
3068 current = next.clone();
3069 }
3070 }
3071
3072 merged.push(current);
3073 merged
3074 }
3075
3076 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3078 if !self.blocks_are_compatible(current, next) {
3080 return false;
3081 }
3082
3083 let spacing = self.analyze_spacing_between(current, next);
3085 match spacing {
3086 BlockSpacing::Consecutive => true,
3087 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3088 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3089 self.can_merge_with_content_between(current, next)
3090 }
3091 }
3092 }
3093
3094 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3096 current.is_ordered == next.is_ordered
3097 && current.blockquote_prefix == next.blockquote_prefix
3098 && current.nesting_level == next.nesting_level
3099 }
3100
3101 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3103 let gap = next.start_line - current.end_line;
3104
3105 match gap {
3106 1 => BlockSpacing::Consecutive,
3107 2 => BlockSpacing::SingleBlank,
3108 _ if gap > 2 => {
3109 if self.has_only_blank_lines_between(current, next) {
3110 BlockSpacing::MultipleBlanks
3111 } else {
3112 BlockSpacing::ContentBetween
3113 }
3114 }
3115 _ => BlockSpacing::Consecutive, }
3117 }
3118
3119 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3121 if has_meaningful_content_between(self.content, current, next, self.lines) {
3124 return false; }
3126
3127 !current.is_ordered && current.marker == next.marker
3129 }
3130
3131 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3133 if has_meaningful_content_between(self.content, current, next, self.lines) {
3135 return false; }
3137
3138 current.is_ordered && next.is_ordered
3140 }
3141
3142 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3144 for line_num in (current.end_line + 1)..next.start_line {
3145 if let Some(line_info) = self.lines.get(line_num - 1)
3146 && !line_info.content(self.content).trim().is_empty()
3147 {
3148 return false;
3149 }
3150 }
3151 true
3152 }
3153
3154 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3156 current.end_line = next.end_line;
3157 current.item_lines.extend_from_slice(&next.item_lines);
3158
3159 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3161
3162 if !current.is_ordered && self.markers_differ(¤t, next) {
3164 current.marker = None; }
3166
3167 current
3168 }
3169
3170 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3172 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3173 }
3174}
3175
/// How two neighbouring list blocks are separated, derived from the difference between the next
/// block's start line and the current block's end line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts on the line right after the current block ends. Also used as the
    /// fallback when the difference is not 1, 2 or greater than 2.
    Consecutive,
    /// Exactly one line lies between the blocks. The classification itself does not inspect
    /// that line.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are empty after trimming.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one of them is not blank.
    ContentBetween,
}
3184
3185fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3187 for line_num in (current.end_line + 1)..next.start_line {
3189 if let Some(line_info) = lines.get(line_num - 1) {
3190 let trimmed = line_info.content(content).trim();
3192
3193 if trimmed.is_empty() {
3195 continue;
3196 }
3197
3198 if line_info.heading.is_some() {
3202 return true; }
3204
3205 if is_horizontal_rule(trimmed) {
3207 return true; }
3209
3210 if trimmed.contains('|') && trimmed.len() > 1 {
3213 if !trimmed.contains("](") && !trimmed.contains("http") {
3215 let pipe_count = trimmed.matches('|').count();
3217 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3218 return true; }
3220 }
3221 }
3222
3223 if trimmed.starts_with('>') {
3225 return true; }
3227
3228 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3230 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3231
3232 let min_continuation_indent = if current.is_ordered {
3234 current.nesting_level + current.max_marker_width + 1 } else {
3236 current.nesting_level + 2
3237 };
3238
3239 if line_indent < min_continuation_indent {
3240 return true; }
3243 }
3244
3245 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3247
3248 let min_indent = if current.is_ordered {
3250 current.nesting_level + current.max_marker_width
3251 } else {
3252 current.nesting_level + 2
3253 };
3254
3255 if line_indent < min_indent {
3257 return true; }
3259
3260 }
3263 }
3264
3265 false
3267}
3268
/// Returns true when `trimmed` looks like a horizontal rule.
///
/// The text must be at least three bytes long, start with `-`, `*` or `_`, consist only of that
/// same character plus spaces and tabs, and contain the character at least three times.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    let only_rule_chars = trimmed.chars().all(|ch| ch == rule_char || ch == ' ' || ch == '\t');

    only_rule_chars && trimmed.chars().filter(|&ch| ch == rule_char).count() >= 3
}
3292
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still has one line offset (0) but no `LineInfo` entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Byte offsets are 0-based; the reported (line, column) pair is 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Start of line 1.
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        // Start of the empty line 2.
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        // Start of line 3.
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        // Middle of line 3.
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        // Start of line 4.
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces so that `indent` must report 4.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: heading at column 0.
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: indented text.
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: blank.
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Lookup helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    #[test]
    fn test_list_item_detection() {
        // The nested unordered item is indented by two spaces so that `marker_column` must be 2.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item".
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item".
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item".
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list".
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // "a" and the newline that follows it are both on line 1.
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        // "b" and its newline are on line 2.
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        // "c" is on line 3; offset 5 is one past the end of the content.
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }

    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // The same import/export lines must not be flagged outside the MDX flavor.
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}