1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Evaluates `$code` and yields its value, optionally reporting how long it took.
///
/// A timer is started before `$code` runs. When `$profile` evaluates to `true`,
/// the elapsed time is written to stderr as `[PROFILE] <name>: <elapsed>`.
/// This variant is compiled for every target except `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let start = std::time::Instant::now();
        let result = $code;
        // The flag is only consulted after the work is done; the code always runs.
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}
21
/// `wasm32` variant of `profile_section!`: expands to `$code` alone.
///
/// `$name` and `$profile` are accepted so call sites are identical on every
/// target, but neither is evaluated and no timing is performed.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
26
/// Lazily compiled regex matching Markdown links written as `[text](url "title")`
/// or `[text][ref]`.
///
/// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference id.
/// The pattern is compiled in verbose (`x`) and dot-matches-newline (`s`) mode.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
        \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
        |
        \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
40
/// Lazily compiled regex matching Markdown images written as `![alt](url "title")`
/// or `![alt][ref]`.
///
/// Capture groups: 1 = alt text, 2 = URL in angle brackets, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference id.
/// The pattern is compiled in verbose (`x`) and dot-matches-newline (`s`) mode.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
        \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
        |
        \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
54
/// Lazily compiled regex matching a reference definition line such as
/// `[id]: url "title"`, allowing up to three leading spaces.
///
/// Capture groups: 1 = id, 2 = URL (a run of non-whitespace), 3 = double-quoted
/// title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
/// Lazily compiled regex matching `http`, `https` and `ftp` URLs that appear
/// directly in text.
///
/// Capture group 1 is the scheme. The match stops at whitespace and at any of
/// the characters `< > [ ] ( ) \ ' " `` ` ``.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
65
/// Lazily compiled regex matching e-mail-like tokens: a local part, `@`, a
/// domain, and a final alphabetic label of at least two letters.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
/// Lazily compiled regex capturing a blockquote prefix at the start of a string:
/// optional whitespace, one or more `>` and any whitespace that follows (group 1).
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
/// Metadata recorded for a single line of the document.
///
/// The line's text is not stored; it is recovered from the source with
/// [`LineInfo::content`] using `byte_offset` and `byte_len`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset of the start of the line within the source text.
    pub byte_offset: usize,
    /// Length of the line in bytes (the slice `byte_offset..byte_offset + byte_len`).
    pub byte_len: usize,
    /// Indentation of the line (presumably leading-whitespace width; TODO confirm unit).
    pub indent: usize,
    /// Whether the line is blank.
    pub is_blank: bool,
    /// Whether the line lies inside a code block.
    pub in_code_block: bool,
    /// Whether the line lies inside front matter.
    pub in_front_matter: bool,
    /// Whether the line lies inside an HTML block.
    pub in_html_block: bool,
    /// Whether the line lies inside an HTML comment.
    pub in_html_comment: bool,
    /// List item details when the line is a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is a heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line is part of a blockquote.
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether the line lies inside a mkdocstrings block (presumably MkDocs flavor only; confirm).
    pub in_mkdocstrings: bool,
    /// Whether the line lies inside an ESM block (presumably MDX import/export; confirm).
    pub in_esm_block: bool,
}
103
104impl LineInfo {
105 pub fn content<'a>(&self, source: &'a str) -> &'a str {
107 &source[self.byte_offset..self.byte_offset + self.byte_len]
108 }
109}
110
/// Details about a list item found on a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text as written (presumably e.g. `-`, `*`, `1.`; confirm).
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item's number, when one is present.
    pub number: Option<usize>,
    /// Column at which the marker starts (indexing base not visible here; confirm).
    pub marker_column: usize,
    /// Column at which the item's content starts (indexing base not visible here; confirm).
    pub content_column: usize,
}
125
/// Syntactic style used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (written with leading `#` characters).
    ATX,
    /// Setext heading, first variant (presumably level 1, `=` underline; confirm).
    Setext1,
    /// Setext heading, second variant (presumably level 2, `-` underline; confirm).
    Setext2,
}
136
/// A link found in the document.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the link within the document.
    pub byte_end: usize,
    /// The link text as written between the brackets in the source.
    pub text: Cow<'a, str>,
    /// The link destination; empty for reference links found only by the regex fallback.
    pub url: Cow<'a, str>,
    /// Whether the link is a reference-style link (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style links, `None` otherwise.
    pub reference_id: Option<Cow<'a, str>>,
    /// The link type reported by pulldown-cmark (`Reference` for regex-fallback entries).
    pub link_type: LinkType,
}
161
/// A reference that pulldown-cmark reported through its broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text reported by the callback.
    pub reference: String,
    /// The byte span in the source reported by the callback.
    pub span: std::ops::Range<usize>,
}
170
/// An image found in the document.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the image start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the image within the document.
    pub byte_end: usize,
    /// The alt text as written between the brackets in the source.
    pub alt_text: Cow<'a, str>,
    /// The image destination; empty for reference images found only by the regex fallback.
    pub url: Cow<'a, str>,
    /// Whether the image uses reference-style syntax (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style images, `None` otherwise.
    pub reference_id: Option<Cow<'a, str>>,
    /// The link type reported by pulldown-cmark (`Reference` for regex-fallback entries).
    pub link_type: LinkType,
}
195
/// A reference definition such as `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based number of the line holding the definition.
    pub line: usize,
    /// The reference id, lower-cased.
    pub id: String,
    /// The destination URL as written.
    pub url: String,
    /// The optional title (from either quoting style).
    pub title: Option<String>,
    /// Byte offset of the start of the definition within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the definition within the document.
    pub byte_end: usize,
}
212
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line on which the span is located (presumably 1-based like other items; confirm).
    pub line: usize,
    /// Start column of the span within its line (compared against 0-based columns by `is_in_code_span`).
    pub start_col: usize,
    /// End column of the span within its line (exclusive in `is_in_code_span`).
    pub end_col: usize,
    /// Byte offset of the start of the span within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the span (treated as exclusive by the lookups in this file).
    pub byte_end: usize,
    /// Number of backticks in the delimiter.
    pub backtick_count: usize,
    /// Content of the span.
    pub content: String,
}
231
/// Details about a heading found on a line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level.
    pub level: u8,
    /// Syntax style of the heading.
    pub style: HeadingStyle,
    /// The heading marker text as written.
    pub marker: String,
    /// Column at which the marker starts (indexing base not visible here; confirm).
    pub marker_column: usize,
    /// Column at which the heading content starts (indexing base not visible here; confirm).
    pub content_column: usize,
    /// Heading text (presumably with custom id / closing sequence removed; confirm).
    pub text: String,
    /// Custom id attached to the heading, if any.
    pub custom_id: Option<String>,
    /// Heading text as written (presumably before any stripping; confirm).
    pub raw_text: String,
    /// Whether the heading has a closing sequence.
    pub has_closing_sequence: bool,
    /// The closing sequence text (presumably empty when absent; confirm).
    pub closing_sequence: String,
}
256
/// Details about a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth of the quote (presumably the number of `>` markers; confirm).
    pub nesting_level: usize,
    /// Whitespace preceding the first marker.
    pub indent: String,
    /// Column at which the marker starts (indexing base not visible here; confirm).
    pub marker_column: usize,
    /// The blockquote prefix text of the line.
    pub prefix: String,
    /// The line content following the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs the MD028 fix (rule semantics are defined elsewhere).
    pub needs_md028_fix: bool,
}
277
/// A contiguous block of list lines.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive; compared directly against caller line numbers).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// The marker used by the block, when one is recorded.
    pub marker: Option<String>,
    /// Blockquote prefix shared by the block's lines (presumably empty outside blockquotes; confirm).
    pub blockquote_prefix: String,
    /// Line numbers of the list items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting level of the block.
    pub nesting_level: usize,
    /// Widest marker found in the block.
    pub max_marker_width: usize,
}
298
299use std::sync::{Arc, Mutex};
300
/// Occurrence counts for characters that matter to Markdown syntax.
///
/// `LintContext::has_char` and `LintContext::char_count` answer from these
/// counters for the characters listed below.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count reported for `` ` ``.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
329
/// An HTML tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line on which the tag is located (matched against caller line numbers by `html_tags_on_line`).
    pub line: usize,
    /// Start column of the tag within its line.
    pub start_col: usize,
    /// End column of the tag within its line.
    pub end_col: usize,
    /// Byte offset of the start of the tag within the document.
    pub byte_offset: usize,
    /// Byte offset of the end of the tag within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// The tag text as written.
    pub raw_content: String,
}
352
/// An emphasis span found in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line on which the span is located (matched against caller line numbers by `emphasis_spans_on_line`).
    pub line: usize,
    /// Start column of the span within its line.
    pub start_col: usize,
    /// End column of the span within its line.
    pub end_col: usize,
    /// Byte offset of the start of the span within the document.
    pub byte_offset: usize,
    /// Byte offset of the end of the span within the document.
    pub byte_end: usize,
    /// The emphasis marker character (presumably `*` or `_`; confirm).
    pub marker: char,
    /// Number of marker characters in the delimiter.
    pub marker_count: usize,
    /// Content of the span.
    pub content: String,
}
373
/// A table row found in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line on which the row is located (matched against caller line numbers by `table_rows_on_line`).
    pub line: usize,
    /// Whether the row is a separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Column alignments as strings (the set of values is not visible here; confirm).
    pub column_alignments: Vec<String>,
}
386
/// A bare URL (or similar address) found in the document text.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line on which the URL is located (matched against caller line numbers by `bare_urls_on_line`).
    pub line: usize,
    /// Start column of the URL within its line.
    pub start_col: usize,
    /// End column of the URL within its line.
    pub end_col: usize,
    /// Byte offset of the start of the URL within the document.
    pub byte_offset: usize,
    /// Byte offset of the end of the URL within the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL as a string (the set of values is not visible here; confirm).
    pub url_type: String,
}
405
/// Pre-computed information about one Markdown document, shared by lint rules.
///
/// Most fields are filled eagerly by `LintContext::new`. The `*_cache` fields
/// hold results behind a `Mutex` and are handed out as `Arc`s by their getters;
/// all but `code_spans_cache` start empty and are filled on first access.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset at which each line starts: `0`, then one past every `\n`.
    pub line_offsets: Vec<usize>,
    /// Code block ranges returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line metadata; index `n - 1` describes line `n`.
    pub lines: Vec<LineInfo>,
    /// Links found in the document.
    pub links: Vec<ParsedLink<'a>>,
    /// Images found in the document.
    pub images: Vec<ParsedImage<'a>>,
    /// References reported through pulldown-cmark's broken-link callback.
    pub broken_links: Vec<BrokenLinkInfo>,
    /// Reference definitions found in the document.
    pub reference_defs: Vec<ReferenceDef>,
    /// Code spans; populated by `new`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks found in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Character counts used by `has_char`, `char_count` and the `likely_has_*` checks.
    pub char_frequency: CharFrequency,
    /// HTML tags; filled on first call to `html_tags`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Emphasis spans; filled on first call to `emphasis_spans`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Table rows; filled on first call to `table_rows`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Bare URLs; filled on first call to `bare_urls`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Byte ranges of HTML comments.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found in the document.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built over `content`.
    pub line_index: crate::utils::range_utils::LineIndex<'a>,
    /// Jinja ranges as `(start, end)` byte offsets, `end` exclusive in `is_in_jinja_range`.
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is parsed as.
    pub flavor: MarkdownFlavor,
}
428
/// The four consecutive pieces of a blockquote line, all borrowed from the line.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`.
    indent: &'a str,
    /// The uninterrupted run of `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}

/// Splits a line into blockquote components.
///
/// Returns `None` unless the first character after any leading spaces/tabs is
/// `>`. Only a contiguous run of `>` counts as markers, so in `> > x` the
/// second `>` is part of `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_blank(byte: &&u8) -> bool {
        matches!(**byte, b' ' | b'\t')
    }

    let bytes = line.as_bytes();

    let indent_end = bytes.iter().take_while(is_blank).count();

    let marker_len = bytes[indent_end..].iter().take_while(|&&b| b == b'>').count();
    if marker_len == 0 {
        return None;
    }
    let markers_end = indent_end + marker_len;

    let spaces_end = markers_end + bytes[markers_end..].iter().take_while(is_blank).count();

    // Every boundary sits next to an ASCII byte, so slicing the `str` is safe.
    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content: &line[spaces_end..],
    })
}
473
474impl<'a> LintContext<'a> {
    /// Builds a `LintContext` for `content`, parsed as the given Markdown `flavor`.
    ///
    /// Every analysis step is wrapped in `profile_section!`; on non-wasm targets,
    /// setting the `RUMDL_PROFILE_QUADRATIC` environment variable (to any value)
    /// makes each step print its elapsed time to stderr.
    ///
    /// The steps run in a fixed order because later ones consume the results of
    /// earlier ones (for example, link parsing needs lines, code blocks and code spans).
    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
        // Profiling is opt-in through the environment; never enabled on wasm32.
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Start offset of every line: 0, then the byte after each '\n'.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Autodoc blocks are only looked for in the MkDocs flavor.
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Base per-line records; the three passes below refine them in place.
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Code spans are computed eagerly because links, images and tables need them.
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        let (links, broken_links) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            reference_defs,
            // Already computed above, so the cache starts populated.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches are filled on first use by their getters.
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
        }
    }
613
614 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
616 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
617
618 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
619 }
620
621 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
623 &self.html_comment_ranges
624 }
625
626 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
628 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
629
630 Arc::clone(cache.get_or_insert_with(|| {
631 Arc::new(Self::parse_html_tags(
632 self.content,
633 &self.lines,
634 &self.code_blocks,
635 self.flavor,
636 ))
637 }))
638 }
639
640 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
642 let mut cache = self
643 .emphasis_spans_cache
644 .lock()
645 .expect("Emphasis spans cache mutex poisoned");
646
647 Arc::clone(
648 cache.get_or_insert_with(|| {
649 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
650 }),
651 )
652 }
653
654 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
656 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
657
658 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
659 }
660
661 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
663 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
664
665 Arc::clone(
666 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
667 )
668 }
669
670 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
672 match self.line_offsets.binary_search(&offset) {
673 Ok(line) => (line + 1, 1),
674 Err(line) => {
675 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
676 (line, offset - line_start + 1)
677 }
678 }
679 }
680
681 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
683 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
685 return true;
686 }
687
688 self.code_spans()
690 .iter()
691 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
692 }
693
694 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
696 if line_num > 0 {
697 self.lines.get(line_num - 1)
698 } else {
699 None
700 }
701 }
702
703 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
705 self.line_info(line_num).map(|info| info.byte_offset)
706 }
707
708 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
710 let normalized_id = ref_id.to_lowercase();
711 self.reference_defs
712 .iter()
713 .find(|def| def.id == normalized_id)
714 .map(|def| def.url.as_str())
715 }
716
717 pub fn is_in_list_block(&self, line_num: usize) -> bool {
719 self.list_blocks
720 .iter()
721 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
722 }
723
724 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
726 self.list_blocks
727 .iter()
728 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
729 }
730
731 pub fn is_in_code_block(&self, line_num: usize) -> bool {
735 if line_num == 0 || line_num > self.lines.len() {
736 return false;
737 }
738 self.lines[line_num - 1].in_code_block
739 }
740
741 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
743 if line_num == 0 || line_num > self.lines.len() {
744 return false;
745 }
746 self.lines[line_num - 1].in_front_matter
747 }
748
749 pub fn is_in_html_block(&self, line_num: usize) -> bool {
751 if line_num == 0 || line_num > self.lines.len() {
752 return false;
753 }
754 self.lines[line_num - 1].in_html_block
755 }
756
757 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
759 if line_num == 0 || line_num > self.lines.len() {
760 return false;
761 }
762
763 let col_0indexed = if col > 0 { col - 1 } else { 0 };
767 let code_spans = self.code_spans();
768 code_spans
769 .iter()
770 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
771 }
772
773 #[inline]
776 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
777 self.reference_defs
778 .iter()
779 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
780 }
781
782 #[inline]
786 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
787 self.html_comment_ranges
788 .iter()
789 .any(|range| byte_pos >= range.start && byte_pos < range.end)
790 }
791
792 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
794 self.jinja_ranges
795 .iter()
796 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
797 }
798
799 pub fn has_char(&self, ch: char) -> bool {
801 match ch {
802 '#' => self.char_frequency.hash_count > 0,
803 '*' => self.char_frequency.asterisk_count > 0,
804 '_' => self.char_frequency.underscore_count > 0,
805 '-' => self.char_frequency.hyphen_count > 0,
806 '+' => self.char_frequency.plus_count > 0,
807 '>' => self.char_frequency.gt_count > 0,
808 '|' => self.char_frequency.pipe_count > 0,
809 '[' => self.char_frequency.bracket_count > 0,
810 '`' => self.char_frequency.backtick_count > 0,
811 '<' => self.char_frequency.lt_count > 0,
812 '!' => self.char_frequency.exclamation_count > 0,
813 '\n' => self.char_frequency.newline_count > 0,
814 _ => self.content.contains(ch), }
816 }
817
818 pub fn char_count(&self, ch: char) -> usize {
820 match ch {
821 '#' => self.char_frequency.hash_count,
822 '*' => self.char_frequency.asterisk_count,
823 '_' => self.char_frequency.underscore_count,
824 '-' => self.char_frequency.hyphen_count,
825 '+' => self.char_frequency.plus_count,
826 '>' => self.char_frequency.gt_count,
827 '|' => self.char_frequency.pipe_count,
828 '[' => self.char_frequency.bracket_count,
829 '`' => self.char_frequency.backtick_count,
830 '<' => self.char_frequency.lt_count,
831 '!' => self.char_frequency.exclamation_count,
832 '\n' => self.char_frequency.newline_count,
833 _ => self.content.matches(ch).count(), }
835 }
836
837 pub fn likely_has_headings(&self) -> bool {
839 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
841
842 pub fn likely_has_lists(&self) -> bool {
844 self.char_frequency.asterisk_count > 0
845 || self.char_frequency.hyphen_count > 0
846 || self.char_frequency.plus_count > 0
847 }
848
849 pub fn likely_has_emphasis(&self) -> bool {
851 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
852 }
853
854 pub fn likely_has_tables(&self) -> bool {
856 self.char_frequency.pipe_count > 2
857 }
858
859 pub fn likely_has_blockquotes(&self) -> bool {
861 self.char_frequency.gt_count > 0
862 }
863
864 pub fn likely_has_code(&self) -> bool {
866 self.char_frequency.backtick_count > 0
867 }
868
869 pub fn likely_has_links_or_images(&self) -> bool {
871 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
872 }
873
874 pub fn likely_has_html(&self) -> bool {
876 self.char_frequency.lt_count > 0
877 }
878
879 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
881 self.html_tags()
882 .iter()
883 .filter(|tag| tag.line == line_num)
884 .cloned()
885 .collect()
886 }
887
888 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
890 self.emphasis_spans()
891 .iter()
892 .filter(|span| span.line == line_num)
893 .cloned()
894 .collect()
895 }
896
897 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
899 self.table_rows()
900 .iter()
901 .filter(|row| row.line == line_num)
902 .cloned()
903 .collect()
904 }
905
906 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
908 self.bare_urls()
909 .iter()
910 .filter(|url| url.line == line_num)
911 .cloned()
912 .collect()
913 }
914
915 #[inline]
921 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
922 let idx = match lines.binary_search_by(|line| {
924 if byte_offset < line.byte_offset {
925 std::cmp::Ordering::Greater
926 } else if byte_offset > line.byte_offset + line.byte_len {
927 std::cmp::Ordering::Less
928 } else {
929 std::cmp::Ordering::Equal
930 }
931 }) {
932 Ok(idx) => idx,
933 Err(idx) => idx.saturating_sub(1),
934 };
935
936 let line = &lines[idx];
937 let line_num = idx + 1;
938 let col = byte_offset.saturating_sub(line.byte_offset);
939
940 (idx, line_num, col)
941 }
942
943 #[inline]
945 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
946 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
948
949 if idx > 0 {
951 let span = &code_spans[idx - 1];
952 if offset >= span.byte_offset && offset < span.byte_end {
953 return true;
954 }
955 }
956
957 false
958 }
959
960 fn parse_links(
962 content: &'a str,
963 lines: &[LineInfo],
964 code_blocks: &[(usize, usize)],
965 code_spans: &[CodeSpan],
966 flavor: MarkdownFlavor,
967 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
968 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
969 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
970 use std::collections::HashSet;
971
972 let mut links = Vec::with_capacity(content.len() / 500);
973 let mut broken_links = Vec::new();
974
975 let mut found_positions = HashSet::new();
977
978 let mut options = Options::empty();
988 options.insert(Options::ENABLE_WIKILINKS);
989
990 let parser = Parser::new_with_broken_link_callback(
991 content,
992 options,
993 Some(|link: BrokenLink<'_>| {
994 broken_links.push(BrokenLinkInfo {
995 reference: link.reference.to_string(),
996 span: link.span.clone(),
997 });
998 None
999 }),
1000 )
1001 .into_offset_iter();
1002
1003 let mut link_stack: Vec<(
1004 usize,
1005 usize,
1006 pulldown_cmark::CowStr<'a>,
1007 LinkType,
1008 pulldown_cmark::CowStr<'a>,
1009 )> = Vec::new();
1010 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1013 match event {
1014 Event::Start(Tag::Link {
1015 link_type,
1016 dest_url,
1017 id,
1018 ..
1019 }) => {
1020 link_stack.push((range.start, range.end, dest_url, link_type, id));
1022 text_chunks.clear();
1023 }
1024 Event::Text(text) if !link_stack.is_empty() => {
1025 text_chunks.push((text.to_string(), range.start, range.end));
1027 }
1028 Event::Code(code) if !link_stack.is_empty() => {
1029 let code_text = format!("`{code}`");
1031 text_chunks.push((code_text, range.start, range.end));
1032 }
1033 Event::End(TagEnd::Link) => {
1034 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1035 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1037 text_chunks.clear();
1038 continue;
1039 }
1040
1041 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1043
1044 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1046 text_chunks.clear();
1047 continue;
1048 }
1049
1050 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1051
1052 let is_reference = matches!(
1053 link_type,
1054 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1055 );
1056
1057 let link_text = if start_pos < content.len() {
1060 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1061
1062 let mut close_pos = None;
1066 let mut depth = 0;
1067 let mut in_code_span = false;
1068
1069 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1070 let mut backslash_count = 0;
1072 let mut j = i;
1073 while j > 0 && link_bytes[j - 1] == b'\\' {
1074 backslash_count += 1;
1075 j -= 1;
1076 }
1077 let is_escaped = backslash_count % 2 != 0;
1078
1079 if byte == b'`' && !is_escaped {
1081 in_code_span = !in_code_span;
1082 }
1083
1084 if !is_escaped && !in_code_span {
1086 if byte == b'[' {
1087 depth += 1;
1088 } else if byte == b']' {
1089 if depth == 0 {
1090 close_pos = Some(i);
1092 break;
1093 } else {
1094 depth -= 1;
1095 }
1096 }
1097 }
1098 }
1099
1100 if let Some(pos) = close_pos {
1101 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1102 } else {
1103 Cow::Borrowed("")
1104 }
1105 } else {
1106 Cow::Borrowed("")
1107 };
1108
1109 let reference_id = if is_reference && !ref_id.is_empty() {
1111 Some(Cow::Owned(ref_id.to_lowercase()))
1112 } else if is_reference {
1113 Some(Cow::Owned(link_text.to_lowercase()))
1115 } else {
1116 None
1117 };
1118
1119 let has_escaped_bang = start_pos >= 2
1123 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1124 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1125
1126 let has_escaped_bracket =
1129 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1130
1131 if has_escaped_bang || has_escaped_bracket {
1132 text_chunks.clear();
1133 continue; }
1135
1136 found_positions.insert(start_pos);
1138
1139 links.push(ParsedLink {
1140 line: line_num,
1141 start_col: col_start,
1142 end_col: col_end,
1143 byte_offset: start_pos,
1144 byte_end: range.end,
1145 text: link_text,
1146 url: Cow::Owned(url.to_string()),
1147 is_reference,
1148 reference_id,
1149 link_type,
1150 });
1151
1152 text_chunks.clear();
1153 }
1154 }
1155 _ => {}
1156 }
1157 }
1158
1159 for cap in LINK_PATTERN.captures_iter(content) {
1163 let full_match = cap.get(0).unwrap();
1164 let match_start = full_match.start();
1165 let match_end = full_match.end();
1166
1167 if found_positions.contains(&match_start) {
1169 continue;
1170 }
1171
1172 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1174 continue;
1175 }
1176
1177 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1179 continue;
1180 }
1181
1182 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1184 continue;
1185 }
1186
1187 if Self::is_offset_in_code_span(code_spans, match_start) {
1189 continue;
1190 }
1191
1192 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1194 continue;
1195 }
1196
1197 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1199
1200 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1202 continue;
1203 }
1204
1205 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1206
1207 let text = cap.get(1).map_or("", |m| m.as_str());
1208
1209 if let Some(ref_id) = cap.get(6) {
1211 let ref_id_str = ref_id.as_str();
1212 let normalized_ref = if ref_id_str.is_empty() {
1213 Cow::Owned(text.to_lowercase()) } else {
1215 Cow::Owned(ref_id_str.to_lowercase())
1216 };
1217
1218 links.push(ParsedLink {
1220 line: line_num,
1221 start_col: col_start,
1222 end_col: col_end,
1223 byte_offset: match_start,
1224 byte_end: match_end,
1225 text: Cow::Borrowed(text),
1226 url: Cow::Borrowed(""), is_reference: true,
1228 reference_id: Some(normalized_ref),
1229 link_type: LinkType::Reference, });
1231 }
1232 }
1233
1234 (links, broken_links)
1235 }
1236
1237 fn parse_images(
1239 content: &'a str,
1240 lines: &[LineInfo],
1241 code_blocks: &[(usize, usize)],
1242 code_spans: &[CodeSpan],
1243 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1244 ) -> Vec<ParsedImage<'a>> {
1245 use crate::utils::skip_context::is_in_html_comment_ranges;
1246 use std::collections::HashSet;
1247
1248 let mut images = Vec::with_capacity(content.len() / 1000);
1250 let mut found_positions = HashSet::new();
1251
1252 let parser = Parser::new(content).into_offset_iter();
1254 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1255 Vec::new();
1256 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1259 match event {
1260 Event::Start(Tag::Image {
1261 link_type,
1262 dest_url,
1263 id,
1264 ..
1265 }) => {
1266 image_stack.push((range.start, dest_url, link_type, id));
1267 text_chunks.clear();
1268 }
1269 Event::Text(text) if !image_stack.is_empty() => {
1270 text_chunks.push((text.to_string(), range.start, range.end));
1271 }
1272 Event::Code(code) if !image_stack.is_empty() => {
1273 let code_text = format!("`{code}`");
1274 text_chunks.push((code_text, range.start, range.end));
1275 }
1276 Event::End(TagEnd::Image) => {
1277 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1278 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1280 continue;
1281 }
1282
1283 if Self::is_offset_in_code_span(code_spans, start_pos) {
1285 continue;
1286 }
1287
1288 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1290 continue;
1291 }
1292
1293 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1295 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1296
1297 let is_reference = matches!(
1298 link_type,
1299 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1300 );
1301
1302 let alt_text = if start_pos < content.len() {
1305 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1306
1307 let mut close_pos = None;
1310 let mut depth = 0;
1311
1312 if image_bytes.len() > 2 {
1313 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1314 let mut backslash_count = 0;
1316 let mut j = i;
1317 while j > 0 && image_bytes[j - 1] == b'\\' {
1318 backslash_count += 1;
1319 j -= 1;
1320 }
1321 let is_escaped = backslash_count % 2 != 0;
1322
1323 if !is_escaped {
1324 if byte == b'[' {
1325 depth += 1;
1326 } else if byte == b']' {
1327 if depth == 0 {
1328 close_pos = Some(i);
1330 break;
1331 } else {
1332 depth -= 1;
1333 }
1334 }
1335 }
1336 }
1337 }
1338
1339 if let Some(pos) = close_pos {
1340 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1341 } else {
1342 Cow::Borrowed("")
1343 }
1344 } else {
1345 Cow::Borrowed("")
1346 };
1347
1348 let reference_id = if is_reference && !ref_id.is_empty() {
1349 Some(Cow::Owned(ref_id.to_lowercase()))
1350 } else if is_reference {
1351 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1353 None
1354 };
1355
1356 found_positions.insert(start_pos);
1357 images.push(ParsedImage {
1358 line: line_num,
1359 start_col: col_start,
1360 end_col: col_end,
1361 byte_offset: start_pos,
1362 byte_end: range.end,
1363 alt_text,
1364 url: Cow::Owned(url.to_string()),
1365 is_reference,
1366 reference_id,
1367 link_type,
1368 });
1369 }
1370 }
1371 _ => {}
1372 }
1373 }
1374
1375 for cap in IMAGE_PATTERN.captures_iter(content) {
1377 let full_match = cap.get(0).unwrap();
1378 let match_start = full_match.start();
1379 let match_end = full_match.end();
1380
1381 if found_positions.contains(&match_start) {
1383 continue;
1384 }
1385
1386 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1388 continue;
1389 }
1390
1391 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1393 || Self::is_offset_in_code_span(code_spans, match_start)
1394 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1395 {
1396 continue;
1397 }
1398
1399 if let Some(ref_id) = cap.get(6) {
1401 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1402 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1403 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1404 let ref_id_str = ref_id.as_str();
1405 let normalized_ref = if ref_id_str.is_empty() {
1406 Cow::Owned(alt_text.to_lowercase())
1407 } else {
1408 Cow::Owned(ref_id_str.to_lowercase())
1409 };
1410
1411 images.push(ParsedImage {
1412 line: line_num,
1413 start_col: col_start,
1414 end_col: col_end,
1415 byte_offset: match_start,
1416 byte_end: match_end,
1417 alt_text: Cow::Borrowed(alt_text),
1418 url: Cow::Borrowed(""),
1419 is_reference: true,
1420 reference_id: Some(normalized_ref),
1421 link_type: LinkType::Reference, });
1423 }
1424 }
1425
1426 images
1427 }
1428
1429 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1431 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1435 if line_info.in_code_block {
1437 continue;
1438 }
1439
1440 let line = line_info.content(content);
1441 let line_num = line_idx + 1;
1442
1443 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1444 let id = cap.get(1).unwrap().as_str().to_lowercase();
1445 let url = cap.get(2).unwrap().as_str().to_string();
1446 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1447
1448 let match_obj = cap.get(0).unwrap();
1451 let byte_offset = line_info.byte_offset + match_obj.start();
1452 let byte_end = line_info.byte_offset + match_obj.end();
1453
1454 refs.push(ReferenceDef {
1455 line: line_num,
1456 id,
1457 url,
1458 title,
1459 byte_offset,
1460 byte_end,
1461 });
1462 }
1463 }
1464
1465 refs
1466 }
1467
/// Splits a blockquote line into `(prefix, content)`.
///
/// The prefix is everything up to and including the first `>` plus the
/// whitespace that follows it; the content is the remainder of the line.
/// Returns `None` when the first non-whitespace character is not `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let after_marker = line.trim_start().strip_prefix('>')?;
    let body = after_marker.trim_start();

    // `body` is a suffix of `line`, so whatever precedes it is the prefix
    // (leading whitespace + '>' + whitespace after the marker).
    Some(line.split_at(line.len() - body.len()))
}
1486
/// Recognises an unordered list marker (`-`, `*` or `+`) at the start of `line`.
///
/// Returns `(leading_whitespace, marker, spacing_after_marker, content)`.
/// Only spaces and tabs count as whitespace on either side of the marker.
/// The spacing slice may be empty; callers decide whether that is acceptable.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    /// Length of the leading run of spaces/tabs in `bytes`.
    fn blank_run(bytes: &[u8]) -> usize {
        bytes.iter().take_while(|&&b| b == b' ' || b == b'\t').count()
    }

    let raw = line.as_bytes();
    let marker_pos = blank_run(raw);

    // A missing byte (blank/empty line) or any non-marker byte rejects the line.
    let marker = match *raw.get(marker_pos)? {
        b'-' => '-',
        b'*' => '*',
        b'+' => '+',
        _ => return None,
    };

    let spacing_start = marker_pos + 1;
    let content_start = spacing_start + blank_run(&raw[spacing_start..]);

    Some((
        &line[..marker_pos],
        marker,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1519
/// Recognises an ordered list marker (`<digits>.` or `<digits>)`) at the start of `line`.
///
/// Returns `(leading_whitespace, number, delimiter, spacing_after_marker, content)`.
/// Only spaces and tabs count as whitespace, and only ASCII digits form the
/// number. The spacing slice may be empty; callers decide whether that is
/// acceptable.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    /// Length of the leading run of bytes in `bytes` accepted by `accept`.
    fn run_len(bytes: &[u8], accept: impl Fn(u8) -> bool) -> usize {
        bytes.iter().take_while(|&&b| accept(b)).count()
    }
    let is_blank = |b: u8| b == b' ' || b == b'\t';

    let raw = line.as_bytes();
    let number_start = run_len(raw, is_blank);

    let digit_count = run_len(&raw[number_start..], |b| b.is_ascii_digit());
    if digit_count == 0 {
        return None;
    }

    // The byte right after the digits must be one of the two delimiters.
    let delimiter_pos = number_start + digit_count;
    let delimiter = match *raw.get(delimiter_pos)? {
        b'.' => '.',
        b')' => ')',
        _ => return None,
    };

    let spacing_start = delimiter_pos + 1;
    let content_start = spacing_start + run_len(&raw[spacing_start..], is_blank);

    Some((
        &line[..number_start],
        &line[number_start..delimiter_pos],
        delimiter,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1567
/// Returns one flag per entry of `line_offsets` telling whether that line
/// starts inside one of the `code_blocks` byte ranges.
///
/// Each `(start, end)` range is first snapped onto UTF-8 character boundaries
/// of `content` (start moves backwards, end moves forwards and is capped at
/// `content.len()`). A line is flagged when its start offset lies in the
/// half-open snapped range `[start, end)`.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let mut flags = vec![false; line_offsets.len()];

    for &(start, end) in code_blocks {
        // Offset 0 is always a boundary, so this cannot underflow.
        let mut lo = start;
        while !content.is_char_boundary(lo) {
            lo -= 1;
        }

        // `content.len()` is always a boundary, so this stops there at the latest.
        let mut hi = end.min(content.len());
        while !content.is_char_boundary(hi) {
            hi += 1;
        }

        let first = line_offsets.partition_point(|&offset| offset < lo);
        let last = line_offsets.partition_point(|&offset| offset < hi);

        // `get_mut` yields `None` for an inverted range, leaving the flags untouched.
        if let Some(covered) = flags.get_mut(first..last) {
            covered.fill(true);
        }
    }

    flags
}
1622
1623 fn compute_basic_line_info(
1625 content: &str,
1626 line_offsets: &[usize],
1627 code_blocks: &[(usize, usize)],
1628 flavor: MarkdownFlavor,
1629 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1630 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1631 ) -> Vec<LineInfo> {
1632 let content_lines: Vec<&str> = content.lines().collect();
1633 let mut lines = Vec::with_capacity(content_lines.len());
1634
1635 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1637
1638 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1641
1642 for (i, line) in content_lines.iter().enumerate() {
1643 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1644 let indent = line.len() - line.trim_start().len();
1645
1646 let blockquote_parse = Self::parse_blockquote_prefix(line);
1648
1649 let is_blank = if let Some((_, content)) = blockquote_parse {
1651 content.trim().is_empty()
1653 } else {
1654 line.trim().is_empty()
1655 };
1656
1657 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1659
1660 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1662 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1663 let in_html_comment =
1665 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1666 let list_item = if !(in_code_block
1667 || is_blank
1668 || in_mkdocstrings
1669 || in_html_comment
1670 || (front_matter_end > 0 && i < front_matter_end))
1671 {
1672 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1674 (content, prefix.len())
1675 } else {
1676 (&**line, 0)
1677 };
1678
1679 if let Some((leading_spaces, marker, spacing, _content)) =
1680 Self::parse_unordered_list(line_for_list_check)
1681 {
1682 let marker_column = blockquote_prefix_len + leading_spaces.len();
1683 let content_column = marker_column + 1 + spacing.len();
1684
1685 if spacing.is_empty() {
1692 None
1693 } else {
1694 Some(ListItemInfo {
1695 marker: marker.to_string(),
1696 is_ordered: false,
1697 number: None,
1698 marker_column,
1699 content_column,
1700 })
1701 }
1702 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1703 Self::parse_ordered_list(line_for_list_check)
1704 {
1705 let marker = format!("{number_str}{delimiter}");
1706 let marker_column = blockquote_prefix_len + leading_spaces.len();
1707 let content_column = marker_column + marker.len() + spacing.len();
1708
1709 if spacing.is_empty() {
1712 None
1713 } else {
1714 Some(ListItemInfo {
1715 marker,
1716 is_ordered: true,
1717 number: number_str.parse().ok(),
1718 marker_column,
1719 content_column,
1720 })
1721 }
1722 } else {
1723 None
1724 }
1725 } else {
1726 None
1727 };
1728
1729 lines.push(LineInfo {
1730 byte_offset,
1731 byte_len: line.len(),
1732 indent,
1733 is_blank,
1734 in_code_block,
1735 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1736 in_html_block: false, in_html_comment,
1738 list_item,
1739 heading: None, blockquote: None, in_mkdocstrings,
1742 in_esm_block: false, });
1744 }
1745
1746 lines
1747 }
1748
1749 fn detect_headings_and_blockquotes(
1751 content: &str,
1752 lines: &mut [LineInfo],
1753 flavor: MarkdownFlavor,
1754 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1755 ) {
1756 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1758 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1759 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1760 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1761
1762 let content_lines: Vec<&str> = content.lines().collect();
1763
1764 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1766
1767 for i in 0..lines.len() {
1769 if lines[i].in_code_block {
1770 continue;
1771 }
1772
1773 if front_matter_end > 0 && i < front_matter_end {
1775 continue;
1776 }
1777
1778 if lines[i].in_html_block {
1780 continue;
1781 }
1782
1783 let line = content_lines[i];
1784
1785 if let Some(bq) = parse_blockquote_detailed(line) {
1787 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1789
1790 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1792
1793 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1795 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1797
1798 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1802
1803 lines[i].blockquote = Some(BlockquoteInfo {
1804 nesting_level,
1805 indent: bq.indent.to_string(),
1806 marker_column,
1807 prefix,
1808 content: bq.content.to_string(),
1809 has_no_space_after_marker: has_no_space,
1810 has_multiple_spaces_after_marker: has_multiple_spaces,
1811 needs_md028_fix,
1812 });
1813 }
1814
1815 if lines[i].is_blank {
1817 continue;
1818 }
1819
1820 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1823 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1824 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1825 } else {
1826 false
1827 };
1828
1829 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1830 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1832 continue;
1833 }
1834 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1835 let hashes = caps.get(2).map_or("", |m| m.as_str());
1836 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1837 let rest = caps.get(4).map_or("", |m| m.as_str());
1838
1839 let level = hashes.len() as u8;
1840 let marker_column = leading_spaces.len();
1841
1842 let (text, has_closing, closing_seq) = {
1844 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1846 if rest[id_start..].trim_end().ends_with('}') {
1848 (&rest[..id_start], &rest[id_start..])
1850 } else {
1851 (rest, "")
1852 }
1853 } else {
1854 (rest, "")
1855 };
1856
1857 let trimmed_rest = rest_without_id.trim_end();
1859 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1860 let mut start_of_hashes = last_hash_pos;
1862 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1863 start_of_hashes -= 1;
1864 }
1865
1866 let has_space_before = start_of_hashes == 0
1868 || trimmed_rest
1869 .chars()
1870 .nth(start_of_hashes - 1)
1871 .is_some_and(|c| c.is_whitespace());
1872
1873 let potential_closing = &trimmed_rest[start_of_hashes..];
1875 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1876
1877 if is_all_hashes && has_space_before {
1878 let closing_hashes = potential_closing.to_string();
1880 let text_part = if !custom_id_part.is_empty() {
1883 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1886 } else {
1887 rest_without_id[..start_of_hashes].trim_end().to_string()
1888 };
1889 (text_part, true, closing_hashes)
1890 } else {
1891 (rest.to_string(), false, String::new())
1893 }
1894 } else {
1895 (rest.to_string(), false, String::new())
1897 }
1898 };
1899
1900 let content_column = marker_column + hashes.len() + spaces_after.len();
1901
1902 let raw_text = text.trim().to_string();
1904 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1905
1906 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1908 let next_line = content_lines[i + 1];
1909 if !lines[i + 1].in_code_block
1910 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1911 && let Some(next_line_id) =
1912 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1913 {
1914 custom_id = Some(next_line_id);
1915 }
1916 }
1917
1918 lines[i].heading = Some(HeadingInfo {
1919 level,
1920 style: HeadingStyle::ATX,
1921 marker: hashes.to_string(),
1922 marker_column,
1923 content_column,
1924 text: clean_text,
1925 custom_id,
1926 raw_text,
1927 has_closing_sequence: has_closing,
1928 closing_sequence: closing_seq,
1929 });
1930 }
1931 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1933 let next_line = content_lines[i + 1];
1934 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1935 if front_matter_end > 0 && i < front_matter_end {
1937 continue;
1938 }
1939
1940 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1942 {
1943 continue;
1944 }
1945
1946 let underline = next_line.trim();
1947
1948 if underline == "---" {
1951 continue;
1952 }
1953
1954 let current_line_trimmed = line.trim();
1956 if current_line_trimmed.contains(':')
1957 && !current_line_trimmed.starts_with('#')
1958 && !current_line_trimmed.contains('[')
1959 && !current_line_trimmed.contains("](")
1960 {
1961 continue;
1963 }
1964
1965 let level = if underline.starts_with('=') { 1 } else { 2 };
1966 let style = if level == 1 {
1967 HeadingStyle::Setext1
1968 } else {
1969 HeadingStyle::Setext2
1970 };
1971
1972 let raw_text = line.trim().to_string();
1974 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1975
1976 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1978 let attr_line = content_lines[i + 2];
1979 if !lines[i + 2].in_code_block
1980 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1981 && let Some(attr_line_id) =
1982 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1983 {
1984 custom_id = Some(attr_line_id);
1985 }
1986 }
1987
1988 lines[i].heading = Some(HeadingInfo {
1989 level,
1990 style,
1991 marker: underline.to_string(),
1992 marker_column: next_line.len() - next_line.trim_start().len(),
1993 content_column: lines[i].indent,
1994 text: clean_text,
1995 custom_id,
1996 raw_text,
1997 has_closing_sequence: false,
1998 closing_sequence: String::new(),
1999 });
2000 }
2001 }
2002 }
2003 }
2004
2005 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2007 const BLOCK_ELEMENTS: &[&str] = &[
2009 "address",
2010 "article",
2011 "aside",
2012 "blockquote",
2013 "details",
2014 "dialog",
2015 "dd",
2016 "div",
2017 "dl",
2018 "dt",
2019 "fieldset",
2020 "figcaption",
2021 "figure",
2022 "footer",
2023 "form",
2024 "h1",
2025 "h2",
2026 "h3",
2027 "h4",
2028 "h5",
2029 "h6",
2030 "header",
2031 "hr",
2032 "li",
2033 "main",
2034 "nav",
2035 "ol",
2036 "p",
2037 "picture",
2038 "pre",
2039 "script",
2040 "section",
2041 "style",
2042 "table",
2043 "tbody",
2044 "td",
2045 "textarea",
2046 "tfoot",
2047 "th",
2048 "thead",
2049 "tr",
2050 "ul",
2051 ];
2052
2053 let mut i = 0;
2054 while i < lines.len() {
2055 if lines[i].in_code_block || lines[i].in_front_matter {
2057 i += 1;
2058 continue;
2059 }
2060
2061 let trimmed = lines[i].content(content).trim_start();
2062
2063 if trimmed.starts_with('<') && trimmed.len() > 1 {
2065 let after_bracket = &trimmed[1..];
2067 let is_closing = after_bracket.starts_with('/');
2068 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2069
2070 let tag_name = tag_start
2072 .chars()
2073 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2074 .collect::<String>()
2075 .to_lowercase();
2076
2077 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2079 lines[i].in_html_block = true;
2081
2082 if !is_closing {
2085 let closing_tag = format!("</{tag_name}>");
2086 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2088 let mut j = i + 1;
2089 while j < lines.len() && j < i + 100 {
2090 if !allow_blank_lines && lines[j].is_blank {
2093 break;
2094 }
2095
2096 lines[j].in_html_block = true;
2097
2098 if lines[j].content(content).contains(&closing_tag) {
2100 break;
2101 }
2102 j += 1;
2103 }
2104 }
2105 }
2106 }
2107
2108 i += 1;
2109 }
2110 }
2111
2112 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2115 if !flavor.supports_esm_blocks() {
2117 return;
2118 }
2119
2120 for line in lines.iter_mut() {
2121 if line.is_blank || line.in_html_comment {
2123 continue;
2124 }
2125
2126 let trimmed = line.content(content).trim_start();
2128 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2129 line.in_esm_block = true;
2130 } else {
2131 break;
2133 }
2134 }
2135 }
2136
2137 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2139 let mut code_spans = Vec::new();
2140
2141 if !content.contains('`') {
2143 return code_spans;
2144 }
2145
2146 let parser = Parser::new(content).into_offset_iter();
2148
2149 for (event, range) in parser {
2150 if let Event::Code(_) = event {
2151 let start_pos = range.start;
2152 let end_pos = range.end;
2153
2154 let full_span = &content[start_pos..end_pos];
2156 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2157
2158 let content_start = start_pos + backtick_count;
2160 let content_end = end_pos - backtick_count;
2161 let span_content = if content_start < content_end {
2162 content[content_start..content_end].to_string()
2163 } else {
2164 String::new()
2165 };
2166
2167 let line_idx = lines
2170 .partition_point(|line| line.byte_offset <= start_pos)
2171 .saturating_sub(1);
2172 let line_num = line_idx + 1;
2173 let col_start = start_pos - lines[line_idx].byte_offset;
2174
2175 let end_line_idx = lines
2177 .partition_point(|line| line.byte_offset <= end_pos)
2178 .saturating_sub(1);
2179 let col_end = end_pos - lines[end_line_idx].byte_offset;
2180
2181 code_spans.push(CodeSpan {
2182 line: line_num,
2183 start_col: col_start,
2184 end_col: col_end,
2185 byte_offset: start_pos,
2186 byte_end: end_pos,
2187 backtick_count,
2188 content: span_content,
2189 });
2190 }
2191 }
2192
2193 code_spans.sort_by_key(|span| span.byte_offset);
2195
2196 code_spans
2197 }
2198
2199 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2210 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2212
2213 #[inline]
2216 fn reset_tracking_state(
2217 list_item: &ListItemInfo,
2218 has_list_breaking_content: &mut bool,
2219 min_continuation: &mut usize,
2220 ) {
2221 *has_list_breaking_content = false;
2222 let marker_width = if list_item.is_ordered {
2223 list_item.marker.len() + 1 } else {
2225 list_item.marker.len()
2226 };
2227 *min_continuation = if list_item.is_ordered {
2228 marker_width
2229 } else {
2230 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2231 };
2232 }
2233
2234 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2237 let mut last_list_item_line = 0;
2238 let mut current_indent_level = 0;
2239 let mut last_marker_width = 0;
2240
2241 let mut has_list_breaking_content_since_last_item = false;
2243 let mut min_continuation_for_tracking = 0;
2244
2245 for (line_idx, line_info) in lines.iter().enumerate() {
2246 let line_num = line_idx + 1;
2247
2248 if line_info.in_code_block {
2250 if let Some(ref mut block) = current_block {
2251 let min_continuation_indent =
2253 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2254
2255 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2257
2258 match context {
2259 CodeBlockContext::Indented => {
2260 block.end_line = line_num;
2262 continue;
2263 }
2264 CodeBlockContext::Standalone => {
2265 let completed_block = current_block.take().unwrap();
2267 list_blocks.push(completed_block);
2268 continue;
2269 }
2270 CodeBlockContext::Adjacent => {
2271 block.end_line = line_num;
2273 continue;
2274 }
2275 }
2276 } else {
2277 continue;
2279 }
2280 }
2281
2282 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2284 caps.get(0).unwrap().as_str().to_string()
2285 } else {
2286 String::new()
2287 };
2288
2289 if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2291 let line_content = line_info.content(content).trim();
2292
2293 let breaks_list = line_info.heading.is_some()
2295 || line_content.starts_with("---")
2296 || line_content.starts_with("***")
2297 || line_content.starts_with("___")
2298 || (line_content.contains('|')
2299 && !line_content.contains("](")
2300 && !line_content.contains("http")
2301 && (line_content.matches('|').count() > 1
2302 || line_content.starts_with('|')
2303 || line_content.ends_with('|')))
2304 || line_content.starts_with(">")
2305 || (line_info.indent < min_continuation_for_tracking);
2306
2307 if breaks_list {
2308 has_list_breaking_content_since_last_item = true;
2309 }
2310 }
2311
2312 if let Some(list_item) = &line_info.list_item {
2314 let item_indent = list_item.marker_column;
2316 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2319 let is_nested = nesting > block.nesting_level;
2323 let same_type =
2324 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2325 let same_context = block.blockquote_prefix == blockquote_prefix;
2326 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2330 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2331
2332 let has_non_list_content = has_list_breaking_content_since_last_item;
2335
2336 let mut continues_list = if is_nested {
2340 same_context && reasonable_distance && !has_non_list_content
2342 } else {
2343 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2345 };
2346
2347 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2350 if block.item_lines.contains(&(line_num - 1)) {
2352 continues_list = true;
2354 }
2355 }
2356
2357 if continues_list {
2358 block.end_line = line_num;
2360 block.item_lines.push(line_num);
2361
2362 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2364 list_item.marker.len() + 1
2365 } else {
2366 list_item.marker.len()
2367 });
2368
2369 if !block.is_ordered
2371 && block.marker.is_some()
2372 && block.marker.as_ref() != Some(&list_item.marker)
2373 {
2374 block.marker = None;
2376 }
2377
2378 reset_tracking_state(
2380 list_item,
2381 &mut has_list_breaking_content_since_last_item,
2382 &mut min_continuation_for_tracking,
2383 );
2384 } else {
2385 list_blocks.push(block.clone());
2388
2389 *block = ListBlock {
2390 start_line: line_num,
2391 end_line: line_num,
2392 is_ordered: list_item.is_ordered,
2393 marker: if list_item.is_ordered {
2394 None
2395 } else {
2396 Some(list_item.marker.clone())
2397 },
2398 blockquote_prefix: blockquote_prefix.clone(),
2399 item_lines: vec![line_num],
2400 nesting_level: nesting,
2401 max_marker_width: if list_item.is_ordered {
2402 list_item.marker.len() + 1
2403 } else {
2404 list_item.marker.len()
2405 },
2406 };
2407
2408 reset_tracking_state(
2410 list_item,
2411 &mut has_list_breaking_content_since_last_item,
2412 &mut min_continuation_for_tracking,
2413 );
2414 }
2415 } else {
2416 current_block = Some(ListBlock {
2418 start_line: line_num,
2419 end_line: line_num,
2420 is_ordered: list_item.is_ordered,
2421 marker: if list_item.is_ordered {
2422 None
2423 } else {
2424 Some(list_item.marker.clone())
2425 },
2426 blockquote_prefix,
2427 item_lines: vec![line_num],
2428 nesting_level: nesting,
2429 max_marker_width: list_item.marker.len(),
2430 });
2431
2432 reset_tracking_state(
2434 list_item,
2435 &mut has_list_breaking_content_since_last_item,
2436 &mut min_continuation_for_tracking,
2437 );
2438 }
2439
2440 last_list_item_line = line_num;
2441 current_indent_level = item_indent;
2442 last_marker_width = if list_item.is_ordered {
2443 list_item.marker.len() + 1 } else {
2445 list_item.marker.len()
2446 };
2447 } else if let Some(ref mut block) = current_block {
2448 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2458 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2459 } else {
2460 false
2461 };
2462
2463 let min_continuation_indent = if block.is_ordered {
2467 current_indent_level + last_marker_width
2468 } else {
2469 current_indent_level + 2 };
2471
2472 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2473 block.end_line = line_num;
2475 } else if line_info.is_blank {
2476 let mut check_idx = line_idx + 1;
2479 let mut found_continuation = false;
2480
2481 while check_idx < lines.len() && lines[check_idx].is_blank {
2483 check_idx += 1;
2484 }
2485
2486 if check_idx < lines.len() {
2487 let next_line = &lines[check_idx];
2488 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2490 found_continuation = true;
2491 }
2492 else if !next_line.in_code_block
2494 && next_line.list_item.is_some()
2495 && let Some(item) = &next_line.list_item
2496 {
2497 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2498 .find(next_line.content(content))
2499 .map_or(String::new(), |m| m.as_str().to_string());
2500 if item.marker_column == current_indent_level
2501 && item.is_ordered == block.is_ordered
2502 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2503 {
2504 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2507 if let Some(between_line) = lines.get(idx) {
2508 let between_content = between_line.content(content);
2509 let trimmed = between_content.trim();
2510 if trimmed.is_empty() {
2512 return false;
2513 }
2514 let line_indent = between_content.len() - between_content.trim_start().len();
2516
2517 if trimmed.starts_with("```")
2519 || trimmed.starts_with("~~~")
2520 || trimmed.starts_with("---")
2521 || trimmed.starts_with("***")
2522 || trimmed.starts_with("___")
2523 || trimmed.starts_with(">")
2524 || trimmed.contains('|') || between_line.heading.is_some()
2526 {
2527 return true; }
2529
2530 line_indent >= min_continuation_indent
2532 } else {
2533 false
2534 }
2535 });
2536
2537 if block.is_ordered {
2538 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2541 if let Some(between_line) = lines.get(idx) {
2542 let trimmed = between_line.content(content).trim();
2543 if trimmed.is_empty() {
2544 return false;
2545 }
2546 trimmed.starts_with("```")
2548 || trimmed.starts_with("~~~")
2549 || trimmed.starts_with("---")
2550 || trimmed.starts_with("***")
2551 || trimmed.starts_with("___")
2552 || trimmed.starts_with(">")
2553 || trimmed.contains('|') || between_line.heading.is_some()
2555 } else {
2556 false
2557 }
2558 });
2559 found_continuation = !has_structural_separators;
2560 } else {
2561 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2563 if let Some(between_line) = lines.get(idx) {
2564 let trimmed = between_line.content(content).trim();
2565 if trimmed.is_empty() {
2566 return false;
2567 }
2568 trimmed.starts_with("```")
2570 || trimmed.starts_with("~~~")
2571 || trimmed.starts_with("---")
2572 || trimmed.starts_with("***")
2573 || trimmed.starts_with("___")
2574 || trimmed.starts_with(">")
2575 || trimmed.contains('|') || between_line.heading.is_some()
2577 } else {
2578 false
2579 }
2580 });
2581 found_continuation = !has_structural_separators;
2582 }
2583 }
2584 }
2585 }
2586
2587 if found_continuation {
2588 block.end_line = line_num;
2590 } else {
2591 list_blocks.push(block.clone());
2593 current_block = None;
2594 }
2595 } else {
2596 let min_required_indent = if block.is_ordered {
2599 current_indent_level + last_marker_width
2600 } else {
2601 current_indent_level + 2
2602 };
2603
2604 let line_content = line_info.content(content).trim();
2609 let is_structural_separator = line_info.heading.is_some()
2610 || line_content.starts_with("```")
2611 || line_content.starts_with("~~~")
2612 || line_content.starts_with("---")
2613 || line_content.starts_with("***")
2614 || line_content.starts_with("___")
2615 || line_content.starts_with(">")
2616 || (line_content.contains('|')
2617 && !line_content.contains("](")
2618 && !line_content.contains("http")
2619 && (line_content.matches('|').count() > 1
2620 || line_content.starts_with('|')
2621 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2626 && !line_info.is_blank
2627 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2628
2629 if is_lazy_continuation {
2630 let content_to_check = if !blockquote_prefix.is_empty() {
2633 line_info
2635 .content(content)
2636 .strip_prefix(&blockquote_prefix)
2637 .unwrap_or(line_info.content(content))
2638 .trim()
2639 } else {
2640 line_info.content(content).trim()
2641 };
2642
2643 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2644
2645 if starts_with_uppercase && last_list_item_line > 0 {
2648 list_blocks.push(block.clone());
2650 current_block = None;
2651 } else {
2652 block.end_line = line_num;
2654 }
2655 } else {
2656 list_blocks.push(block.clone());
2658 current_block = None;
2659 }
2660 }
2661 }
2662 }
2663
2664 if let Some(block) = current_block {
2666 list_blocks.push(block);
2667 }
2668
2669 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2671
2672 list_blocks
2673 }
2674
2675 fn compute_char_frequency(content: &str) -> CharFrequency {
2677 let mut frequency = CharFrequency::default();
2678
2679 for ch in content.chars() {
2680 match ch {
2681 '#' => frequency.hash_count += 1,
2682 '*' => frequency.asterisk_count += 1,
2683 '_' => frequency.underscore_count += 1,
2684 '-' => frequency.hyphen_count += 1,
2685 '+' => frequency.plus_count += 1,
2686 '>' => frequency.gt_count += 1,
2687 '|' => frequency.pipe_count += 1,
2688 '[' => frequency.bracket_count += 1,
2689 '`' => frequency.backtick_count += 1,
2690 '<' => frequency.lt_count += 1,
2691 '!' => frequency.exclamation_count += 1,
2692 '\n' => frequency.newline_count += 1,
2693 _ => {}
2694 }
2695 }
2696
2697 frequency
2698 }
2699
2700 fn parse_html_tags(
2702 content: &str,
2703 lines: &[LineInfo],
2704 code_blocks: &[(usize, usize)],
2705 flavor: MarkdownFlavor,
2706 ) -> Vec<HtmlTag> {
2707 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2708 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2709
2710 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2711
2712 for cap in HTML_TAG_REGEX.captures_iter(content) {
2713 let full_match = cap.get(0).unwrap();
2714 let match_start = full_match.start();
2715 let match_end = full_match.end();
2716
2717 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2719 continue;
2720 }
2721
2722 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2723 let tag_name_original = cap.get(2).unwrap().as_str();
2724 let tag_name = tag_name_original.to_lowercase();
2725 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2726
2727 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2730 continue;
2731 }
2732
2733 let mut line_num = 1;
2735 let mut col_start = match_start;
2736 let mut col_end = match_end;
2737 for (idx, line_info) in lines.iter().enumerate() {
2738 if match_start >= line_info.byte_offset {
2739 line_num = idx + 1;
2740 col_start = match_start - line_info.byte_offset;
2741 col_end = match_end - line_info.byte_offset;
2742 } else {
2743 break;
2744 }
2745 }
2746
2747 html_tags.push(HtmlTag {
2748 line: line_num,
2749 start_col: col_start,
2750 end_col: col_end,
2751 byte_offset: match_start,
2752 byte_end: match_end,
2753 tag_name,
2754 is_closing,
2755 is_self_closing,
2756 raw_content: full_match.as_str().to_string(),
2757 });
2758 }
2759
2760 html_tags
2761 }
2762
2763 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2765 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2766 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2767
2768 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2769
2770 for cap in EMPHASIS_REGEX.captures_iter(content) {
2771 let full_match = cap.get(0).unwrap();
2772 let match_start = full_match.start();
2773 let match_end = full_match.end();
2774
2775 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2777 continue;
2778 }
2779
2780 let opening_markers = cap.get(1).unwrap().as_str();
2781 let content_part = cap.get(2).unwrap().as_str();
2782 let closing_markers = cap.get(3).unwrap().as_str();
2783
2784 if opening_markers.chars().next() != closing_markers.chars().next()
2786 || opening_markers.len() != closing_markers.len()
2787 {
2788 continue;
2789 }
2790
2791 let marker = opening_markers.chars().next().unwrap();
2792 let marker_count = opening_markers.len();
2793
2794 let mut line_num = 1;
2796 let mut col_start = match_start;
2797 let mut col_end = match_end;
2798 for (idx, line_info) in lines.iter().enumerate() {
2799 if match_start >= line_info.byte_offset {
2800 line_num = idx + 1;
2801 col_start = match_start - line_info.byte_offset;
2802 col_end = match_end - line_info.byte_offset;
2803 } else {
2804 break;
2805 }
2806 }
2807
2808 emphasis_spans.push(EmphasisSpan {
2809 line: line_num,
2810 start_col: col_start,
2811 end_col: col_end,
2812 byte_offset: match_start,
2813 byte_end: match_end,
2814 marker,
2815 marker_count,
2816 content: content_part.to_string(),
2817 });
2818 }
2819
2820 emphasis_spans
2821 }
2822
2823 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2825 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2826
2827 for (line_idx, line_info) in lines.iter().enumerate() {
2828 if line_info.in_code_block || line_info.is_blank {
2830 continue;
2831 }
2832
2833 let line = line_info.content(content);
2834 let line_num = line_idx + 1;
2835
2836 if !line.contains('|') {
2838 continue;
2839 }
2840
2841 let parts: Vec<&str> = line.split('|').collect();
2843 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2844
2845 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2847 let mut column_alignments = Vec::new();
2848
2849 if is_separator {
2850 for part in &parts[1..parts.len() - 1] {
2851 let trimmed = part.trim();
2853 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2854 "center".to_string()
2855 } else if trimmed.ends_with(':') {
2856 "right".to_string()
2857 } else if trimmed.starts_with(':') {
2858 "left".to_string()
2859 } else {
2860 "none".to_string()
2861 };
2862 column_alignments.push(alignment);
2863 }
2864 }
2865
2866 table_rows.push(TableRow {
2867 line: line_num,
2868 is_separator,
2869 column_count,
2870 column_alignments,
2871 });
2872 }
2873
2874 table_rows
2875 }
2876
2877 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2879 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2880
2881 for cap in BARE_URL_PATTERN.captures_iter(content) {
2883 let full_match = cap.get(0).unwrap();
2884 let match_start = full_match.start();
2885 let match_end = full_match.end();
2886
2887 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2889 continue;
2890 }
2891
2892 let preceding_char = if match_start > 0 {
2894 content.chars().nth(match_start - 1)
2895 } else {
2896 None
2897 };
2898 let following_char = content.chars().nth(match_end);
2899
2900 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2901 continue;
2902 }
2903 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2904 continue;
2905 }
2906
2907 let url = full_match.as_str();
2908 let url_type = if url.starts_with("https://") {
2909 "https"
2910 } else if url.starts_with("http://") {
2911 "http"
2912 } else if url.starts_with("ftp://") {
2913 "ftp"
2914 } else {
2915 "other"
2916 };
2917
2918 let mut line_num = 1;
2920 let mut col_start = match_start;
2921 let mut col_end = match_end;
2922 for (idx, line_info) in lines.iter().enumerate() {
2923 if match_start >= line_info.byte_offset {
2924 line_num = idx + 1;
2925 col_start = match_start - line_info.byte_offset;
2926 col_end = match_end - line_info.byte_offset;
2927 } else {
2928 break;
2929 }
2930 }
2931
2932 bare_urls.push(BareUrl {
2933 line: line_num,
2934 start_col: col_start,
2935 end_col: col_end,
2936 byte_offset: match_start,
2937 byte_end: match_end,
2938 url: url.to_string(),
2939 url_type: url_type.to_string(),
2940 });
2941 }
2942
2943 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2945 let full_match = cap.get(0).unwrap();
2946 let match_start = full_match.start();
2947 let match_end = full_match.end();
2948
2949 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2951 continue;
2952 }
2953
2954 let preceding_char = if match_start > 0 {
2956 content.chars().nth(match_start - 1)
2957 } else {
2958 None
2959 };
2960 let following_char = content.chars().nth(match_end);
2961
2962 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2963 continue;
2964 }
2965 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2966 continue;
2967 }
2968
2969 let email = full_match.as_str();
2970
2971 let mut line_num = 1;
2973 let mut col_start = match_start;
2974 let mut col_end = match_end;
2975 for (idx, line_info) in lines.iter().enumerate() {
2976 if match_start >= line_info.byte_offset {
2977 line_num = idx + 1;
2978 col_start = match_start - line_info.byte_offset;
2979 col_end = match_end - line_info.byte_offset;
2980 } else {
2981 break;
2982 }
2983 }
2984
2985 bare_urls.push(BareUrl {
2986 line: line_num,
2987 start_col: col_start,
2988 end_col: col_end,
2989 byte_offset: match_start,
2990 byte_end: match_end,
2991 url: email.to_string(),
2992 url_type: "email".to_string(),
2993 });
2994 }
2995
2996 bare_urls
2997 }
2998}
2999
3000fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3002 if list_blocks.len() < 2 {
3003 return;
3004 }
3005
3006 let mut merger = ListBlockMerger::new(content, lines);
3007 *list_blocks = merger.merge(list_blocks);
3008}
3009
3010struct ListBlockMerger<'a> {
3012 content: &'a str,
3013 lines: &'a [LineInfo],
3014}
3015
3016impl<'a> ListBlockMerger<'a> {
3017 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3018 Self { content, lines }
3019 }
3020
3021 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3022 let mut merged = Vec::with_capacity(list_blocks.len());
3023 let mut current = list_blocks[0].clone();
3024
3025 for next in list_blocks.iter().skip(1) {
3026 if self.should_merge_blocks(¤t, next) {
3027 current = self.merge_two_blocks(current, next);
3028 } else {
3029 merged.push(current);
3030 current = next.clone();
3031 }
3032 }
3033
3034 merged.push(current);
3035 merged
3036 }
3037
3038 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3040 if !self.blocks_are_compatible(current, next) {
3042 return false;
3043 }
3044
3045 let spacing = self.analyze_spacing_between(current, next);
3047 match spacing {
3048 BlockSpacing::Consecutive => true,
3049 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3050 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3051 self.can_merge_with_content_between(current, next)
3052 }
3053 }
3054 }
3055
3056 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3058 current.is_ordered == next.is_ordered
3059 && current.blockquote_prefix == next.blockquote_prefix
3060 && current.nesting_level == next.nesting_level
3061 }
3062
3063 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3065 let gap = next.start_line - current.end_line;
3066
3067 match gap {
3068 1 => BlockSpacing::Consecutive,
3069 2 => BlockSpacing::SingleBlank,
3070 _ if gap > 2 => {
3071 if self.has_only_blank_lines_between(current, next) {
3072 BlockSpacing::MultipleBlanks
3073 } else {
3074 BlockSpacing::ContentBetween
3075 }
3076 }
3077 _ => BlockSpacing::Consecutive, }
3079 }
3080
3081 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3083 if has_meaningful_content_between(self.content, current, next, self.lines) {
3086 return false; }
3088
3089 !current.is_ordered && current.marker == next.marker
3091 }
3092
3093 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3095 if has_meaningful_content_between(self.content, current, next, self.lines) {
3097 return false; }
3099
3100 current.is_ordered && next.is_ordered
3102 }
3103
3104 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3106 for line_num in (current.end_line + 1)..next.start_line {
3107 if let Some(line_info) = self.lines.get(line_num - 1)
3108 && !line_info.content(self.content).trim().is_empty()
3109 {
3110 return false;
3111 }
3112 }
3113 true
3114 }
3115
3116 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3118 current.end_line = next.end_line;
3119 current.item_lines.extend_from_slice(&next.item_lines);
3120
3121 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3123
3124 if !current.is_ordered && self.markers_differ(¤t, next) {
3126 current.marker = None; }
3128
3129 current
3130 }
3131
3132 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3134 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3135 }
3136}
3137
/// How two neighbouring list blocks are separated, as classified from the
/// difference between the second block's start line and the first block's end line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends.
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// Several lines lie between the blocks and all of them are blank.
    MultipleBlanks,
    /// Several lines lie between the blocks and at least one is not blank.
    ContentBetween,
}
3146
3147fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3149 for line_num in (current.end_line + 1)..next.start_line {
3151 if let Some(line_info) = lines.get(line_num - 1) {
3152 let trimmed = line_info.content(content).trim();
3154
3155 if trimmed.is_empty() {
3157 continue;
3158 }
3159
3160 if line_info.heading.is_some() {
3164 return true; }
3166
3167 if is_horizontal_rule(trimmed) {
3169 return true; }
3171
3172 if trimmed.contains('|') && trimmed.len() > 1 {
3175 if !trimmed.contains("](") && !trimmed.contains("http") {
3177 let pipe_count = trimmed.matches('|').count();
3179 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3180 return true; }
3182 }
3183 }
3184
3185 if trimmed.starts_with('>') {
3187 return true; }
3189
3190 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3192 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3193
3194 let min_continuation_indent = if current.is_ordered {
3196 current.nesting_level + current.max_marker_width + 1 } else {
3198 current.nesting_level + 2
3199 };
3200
3201 if line_indent < min_continuation_indent {
3202 return true; }
3205 }
3206
3207 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3209
3210 let min_indent = if current.is_ordered {
3212 current.nesting_level + current.max_marker_width
3213 } else {
3214 current.nesting_level + 2
3215 };
3216
3217 if line_indent < min_indent {
3219 return true; }
3221
3222 }
3225 }
3226
3227 false
3229}
3230
/// Returns `true` when `trimmed` looks like a Markdown horizontal rule.
///
/// The text must start with `-`, `*` or `_`, contain that same character at
/// least three times, and otherwise consist only of spaces and tabs. The
/// caller is expected to pass text with surrounding whitespace already removed;
/// a leading space or tab makes the function return `false`.
fn is_horizontal_rule(trimmed: &str) -> bool {
    let mut chars = trimmed.chars();

    let marker = match chars.next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Single pass over the remaining characters; no intermediate buffer is needed.
    let mut marker_count = 1usize;
    for ch in chars {
        match ch {
            c if c == marker => marker_count += 1,
            ' ' | '\t' => {}
            _ => return false,
        }
    }

    // Three markers imply at least three bytes, so no separate length check is required.
    marker_count >= 3
}
3254
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset (0) and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Offsets inside a one-line document map to line 1 with 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and line/column lookups across several lines, including a blank one.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indent, blank flag and code-block flag.
    #[test]
    fn test_line_info() {
        // Line 2 is indented by four spaces so that it matches the `indent == 4` assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers, their columns and ordered-ness are detected per line.
    #[test]
    fn test_list_item_detection() {
        // The `*` item is indented by two spaces to match the `marker_column == 2` assertion;
        // the `2)` item is indented to the content column of `1.` (not asserted on).
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that fall on newline bytes and at the very end of the document.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }

    /// In the MDX flavor, leading `import` / `export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import` / `export` lines are not flagged in the standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}