1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
/// Matches inline links (`[text](url "title")`) and reference links (`[text][id]`).
///
/// Compiled with `(?sx)`: `.` also matches newlines, and the pattern is written in
/// verbose mode (whitespace is ignored and `#` starts a comment inside the pattern).
///
/// Capture groups:
/// 1. link text (backslash escapes and one level of nested brackets allowed)
/// 2. URL written in angle brackets
/// 3. bare URL
/// 4. double-quoted title
/// 5. single-quoted title
/// 6. reference id (may be empty)
///
/// The pattern itself does not reject a preceding `!` or `\`; `parse_links`
/// filters those matches out.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
        (?:
        \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
        |
        \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
21
/// Matches inline images (`![alt](url "title")`) and reference images (`![alt][id]`).
///
/// Same shape as the link pattern but anchored on a leading `!`. Compiled with
/// `(?sx)`: `.` also matches newlines, and the pattern is in verbose mode.
///
/// Capture groups:
/// 1. alt text (backslash escapes and one level of nested brackets allowed)
/// 2. URL written in angle brackets
/// 3. bare URL
/// 4. double-quoted title
/// 5. single-quoted title
/// 6. reference id (may be empty)
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
        (?:
        \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
        |
        \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
35
/// Matches a reference definition line such as `[id]: https://example.com "Title"`.
///
/// Allows up to three leading spaces. Capture groups: 1 = id, 2 = URL (a run of
/// non-whitespace), 3 = double-quoted title, 4 = single-quoted title. `(?m)` makes
/// `^`/`$` match at line boundaries.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
/// Matches a bare `http://`, `https://` or `ftp://` URL.
///
/// The scheme is captured in group 1. After `://` the pattern accepts a host-like
/// run, then an optional `:port`, `/path`, `?query` and `#fragment`. Every part
/// stops at whitespace, angle brackets, square brackets, parentheses, backslashes,
/// quotes and backticks.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
46
/// Matches a bare e-mail address: a local part, `@`, a domain, and a final
/// alphabetic label of at least two letters. This is a pragmatic approximation,
/// not a full address grammar.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
/// Matches a blockquote prefix at the start of a line: optional whitespace, one or
/// more `>` markers, and any whitespace that follows. Group 1 is the whole prefix.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
/// Per-line facts computed once when a `LintContext` is built.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line.
    pub content: String,
    /// Byte offset of the start of this line within the document.
    pub byte_offset: usize,
    /// Indentation of the line (presumably the width of its leading whitespace — confirm unit).
    pub indent: usize,
    /// Whether the line is blank.
    pub is_blank: bool,
    /// Whether the line lies inside a code block; such lines are skipped when
    /// scanning for reference definitions.
    pub in_code_block: bool,
    /// Whether the line lies inside front matter.
    pub in_front_matter: bool,
    /// Whether the line lies inside an HTML block.
    pub in_html_block: bool,
    /// Whether the line lies inside an HTML comment.
    pub in_html_comment: bool,
    /// List-item details when the line starts a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is a heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line is part of a blockquote.
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether the line lies inside a mkdocstrings block (presumably only set for the MkDocs flavor).
    pub in_mkdocstrings: bool,
    /// Whether the line lies inside an ESM import/export block (presumably an MDX feature — confirm).
    pub in_esm_block: bool,
}
84
/// Details about the list marker found on a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text as written (e.g. `-`, `*`, `+`, or an ordered marker).
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item's number for ordered lists; presumably `None` for unordered items.
    pub number: Option<usize>,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the item's content starts.
    pub content_column: usize,
}
99
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading, introduced by `#` characters.
    ATX,
    /// Setext heading of the first kind (presumably level 1, underlined with `=`).
    Setext1,
    /// Setext heading of the second kind (presumably level 2, underlined with `-`).
    Setext2,
}
110
/// A link found in the document.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start, relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end, relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// Raw link text between the opening bracket and its matching `]`.
    pub text: String,
    /// Destination URL; empty for reference links found only by the regex fallback.
    pub url: String,
    /// Whether the link uses reference syntax (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference links, `None` otherwise.
    pub reference_id: Option<String>,
}
133
/// A reference link reported as unresolved by the Markdown parser's
/// broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text the parser could not resolve.
    pub reference: String,
    /// Byte range of the broken link within the document.
    pub span: std::ops::Range<usize>,
}
142
/// An image found in the document.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start, relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end, relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the image start (the `!`) within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// Raw alt text between `![` and its matching `]`.
    pub alt_text: String,
    /// Image URL; empty for reference images found only by the regex fallback.
    pub url: String,
    /// Whether the image uses reference syntax (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference images, `None` otherwise.
    pub reference_id: Option<String>,
}
165
/// A reference definition such as `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Reference id, lower-cased.
    pub id: String,
    /// Destination URL as written.
    pub url: String,
    /// Optional title (the text inside double or single quotes).
    pub title: Option<String>,
    /// Byte offset of the start of the definition within the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition within the document.
    pub byte_end: usize,
}
182
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line on which the span is located (1-based, as compared against by `is_in_code_span`).
    pub line: usize,
    /// Start column on that line (0-based, half-open with `end_col`).
    pub start_col: usize,
    /// End column on that line (exclusive).
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the span within the document.
    pub byte_end: usize,
    /// Number of backticks used as the delimiter.
    pub backtick_count: usize,
    /// Content of the span (presumably without the delimiting backticks — confirm).
    pub content: String,
}
201
/// Details about a heading line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level.
    pub level: u8,
    /// Syntax used to write the heading.
    pub style: HeadingStyle,
    /// The heading marker as written.
    pub marker: String,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the heading text starts.
    pub content_column: usize,
    /// Heading text (presumably cleaned of the custom id and closing sequence — confirm).
    pub text: String,
    /// Custom id attached to the heading, if any.
    pub custom_id: Option<String>,
    /// Heading text as written (presumably before any clean-up — confirm).
    pub raw_text: String,
    /// Whether the heading ends with a closing sequence.
    pub has_closing_sequence: bool,
    /// The closing sequence as written (presumably empty when there is none).
    pub closing_sequence: String,
}
226
/// Details about a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth of the quote (presumably the number of `>` markers).
    pub nesting_level: usize,
    /// Whitespace that precedes the first `>` marker.
    pub indent: String,
    /// Column at which the first marker starts.
    pub marker_column: usize,
    /// The blockquote prefix as written (presumably indent, markers and following spaces).
    pub prefix: String,
    /// Content of the line after the prefix.
    pub content: String,
    /// Whether the marker is immediately followed by content, with no space.
    pub has_no_space_after_marker: bool,
    /// Whether the marker is followed by more than one space.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs the fix applied by rule MD028.
    pub needs_md028_fix: bool,
}
247
/// A contiguous group of list items.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive; compared against 1-based line numbers).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// Marker shared by the block's items, if any (presumably `None` when markers differ).
    pub marker: Option<String>,
    /// Blockquote prefix in front of the list (presumably empty outside blockquotes).
    pub blockquote_prefix: String,
    /// Line numbers of the individual list items.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Width of the widest marker in the block.
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Occurrence counts of Markdown-significant characters in the document.
///
/// `LintContext::has_char` and `LintContext::char_count` answer from these
/// counters instead of rescanning the content.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of `#`.
    pub hash_count: usize,
    /// Count of `*`.
    pub asterisk_count: usize,
    /// Count of `_`.
    pub underscore_count: usize,
    /// Count of `-`.
    pub hyphen_count: usize,
    /// Count of `+`.
    pub plus_count: usize,
    /// Count of `>`.
    pub gt_count: usize,
    /// Count of `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count of `` ` ``.
    pub backtick_count: usize,
    /// Count of `<`.
    pub lt_count: usize,
    /// Count of `!`.
    pub exclamation_count: usize,
    /// Count of newline characters.
    pub newline_count: usize,
}
299
/// An HTML tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line on which the tag is located.
    pub line: usize,
    /// Start column of the tag on that line.
    pub start_col: usize,
    /// End column of the tag on that line.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag (`</name>`).
    pub is_closing: bool,
    /// Whether the tag is self-closing (`<name />`).
    pub is_self_closing: bool,
    /// The tag text as written.
    pub raw_content: String,
}
322
/// An emphasis span (text wrapped in `*` or `_` markers).
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line on which the span is located.
    pub line: usize,
    /// Start column of the span on that line.
    pub start_col: usize,
    /// End column of the span on that line.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// The marker character used.
    pub marker: char,
    /// How many marker characters delimit the span.
    pub marker_count: usize,
    /// Text between the markers.
    pub content: String,
}
343
/// A table row.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line on which the row is located.
    pub line: usize,
    /// Whether the row is the header/body separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column (string encoding not visible here — confirm against the parser).
    pub column_alignments: Vec<String>,
}
356
/// A URL or e-mail address that appears as plain text rather than inside link syntax.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line on which the URL is located.
    pub line: usize,
    /// Start column of the URL on that line.
    pub start_col: usize,
    /// End column of the URL on that line.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The URL text as written.
    pub url: String,
    /// Kind of URL (presumably the scheme, or a marker for e-mail addresses — confirm).
    pub url_type: String,
}
375
/// Pre-computed view of a Markdown document shared by lint rules.
///
/// Most data is computed eagerly in `new`; HTML tags, emphasis spans, table rows
/// and bare URLs are computed on first access and cached behind a `Mutex`.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset at which each line starts (first entry is always 0).
    pub line_offsets: Vec<usize>,
    /// Byte ranges of code blocks.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, indexed by 0-based line index.
    pub lines: Vec<LineInfo>,
    /// Links found in the document.
    pub links: Vec<ParsedLink>,
    /// Images found in the document.
    pub images: Vec<ParsedImage>,
    /// Reference links the Markdown parser could not resolve.
    pub broken_links: Vec<BrokenLinkInfo>,
    /// Reference definitions found in the document.
    pub reference_defs: Vec<ReferenceDef>,
    /// Cached code spans; filled in `new`, read through `code_spans()`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks found in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Occurrence counts of Markdown-significant characters.
    pub char_frequency: CharFrequency,
    /// Lazily filled cache behind `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazily filled cache behind `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazily filled cache behind `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazily filled cache behind `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Byte ranges of HTML comments.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found in the document.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built from an owned copy of the content.
    pub line_index: crate::utils::range_utils::LineIndex,
    /// Byte ranges of Jinja constructs (half-open, as tested by `is_in_jinja_range`).
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
398
/// The four consecutive slices of a blockquote line, as split by
/// `parse_blockquote_detailed`. Concatenated in order they reproduce the line.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces/tabs that directly follow the markers.
    spaces_after: &'a str,
    /// Everything after those spaces.
    content: &'a str,
}
406
407#[inline]
409fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
410 let bytes = line.as_bytes();
411 let mut pos = 0;
412
413 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
415 pos += 1;
416 }
417 let indent_end = pos;
418
419 if pos >= bytes.len() || bytes[pos] != b'>' {
421 return None;
422 }
423
424 while pos < bytes.len() && bytes[pos] == b'>' {
426 pos += 1;
427 }
428 let markers_end = pos;
429
430 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
432 pos += 1;
433 }
434 let spaces_end = pos;
435
436 Some(BlockquoteComponents {
437 indent: &line[0..indent_end],
438 markers: &line[indent_end..markers_end],
439 spaces_after: &line[markers_end..spaces_end],
440 content: &line[spaces_end..],
441 })
442}
443
444impl<'a> LintContext<'a> {
445 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
446 use std::time::Instant;
447 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
448
449 let start = Instant::now();
450 let mut line_offsets = vec![0];
451 for (i, c) in content.char_indices() {
452 if c == '\n' {
453 line_offsets.push(i + 1);
454 }
455 }
456 if profile {
457 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
458 }
459
460 let start = Instant::now();
462 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
463 if profile {
464 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
465 }
466
467 let start = Instant::now();
469 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
470 if profile {
471 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
472 }
473
474 let start = Instant::now();
476 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
477 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
478 } else {
479 Vec::new()
480 };
481 if profile {
482 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
483 }
484
485 let start = Instant::now();
487 let mut lines = Self::compute_basic_line_info(
488 content,
489 &line_offsets,
490 &code_blocks,
491 flavor,
492 &html_comment_ranges,
493 &autodoc_ranges,
494 );
495 if profile {
496 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
497 }
498
499 let start = Instant::now();
501 Self::detect_html_blocks(&mut lines);
502 if profile {
503 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
504 }
505
506 let start = Instant::now();
508 Self::detect_esm_blocks(&mut lines, flavor);
509 if profile {
510 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
511 }
512
513 let start = Instant::now();
515 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
516 if profile {
517 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
518 }
519
520 let start = Instant::now();
522 let code_spans = Self::parse_code_spans(content, &lines);
523 if profile {
524 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
525 }
526
527 let start = Instant::now();
529 let (links, broken_links) =
530 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
531 if profile {
532 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
533 }
534
535 let start = Instant::now();
536 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
537 if profile {
538 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
539 }
540
541 let start = Instant::now();
542 let reference_defs = Self::parse_reference_defs(content, &lines);
543 if profile {
544 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
545 }
546
547 let start = Instant::now();
548 let list_blocks = Self::parse_list_blocks(&lines);
549 if profile {
550 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
551 }
552
553 let start = Instant::now();
555 let char_frequency = Self::compute_char_frequency(content);
556 if profile {
557 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
558 }
559
560 let start = Instant::now();
562 let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
563 content,
564 &code_blocks,
565 &code_spans,
566 &html_comment_ranges,
567 );
568 if profile {
569 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
570 }
571
572 let start = Instant::now();
574 let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
575 if profile {
576 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
577 }
578
579 let start = Instant::now();
581 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
582 if profile {
583 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
584 }
585
586 Self {
587 content,
588 line_offsets,
589 code_blocks,
590 lines,
591 links,
592 images,
593 broken_links,
594 reference_defs,
595 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
596 list_blocks,
597 char_frequency,
598 html_tags_cache: Mutex::new(None),
599 emphasis_spans_cache: Mutex::new(None),
600 table_rows_cache: Mutex::new(None),
601 bare_urls_cache: Mutex::new(None),
602 html_comment_ranges,
603 table_blocks,
604 line_index,
605 jinja_ranges,
606 flavor,
607 }
608 }
609
610 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
612 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
613
614 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
615 }
616
617 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
619 &self.html_comment_ranges
620 }
621
622 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
624 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
625
626 Arc::clone(cache.get_or_insert_with(|| {
627 Arc::new(Self::parse_html_tags(
628 self.content,
629 &self.lines,
630 &self.code_blocks,
631 self.flavor,
632 ))
633 }))
634 }
635
636 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
638 let mut cache = self
639 .emphasis_spans_cache
640 .lock()
641 .expect("Emphasis spans cache mutex poisoned");
642
643 Arc::clone(
644 cache.get_or_insert_with(|| {
645 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
646 }),
647 )
648 }
649
650 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
652 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
653
654 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
655 }
656
657 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
659 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
660
661 Arc::clone(
662 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
663 )
664 }
665
666 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
668 match self.line_offsets.binary_search(&offset) {
669 Ok(line) => (line + 1, 1),
670 Err(line) => {
671 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
672 (line, offset - line_start + 1)
673 }
674 }
675 }
676
677 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
679 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
681 return true;
682 }
683
684 self.code_spans()
686 .iter()
687 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
688 }
689
690 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
692 if line_num > 0 {
693 self.lines.get(line_num - 1)
694 } else {
695 None
696 }
697 }
698
699 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
701 self.line_info(line_num).map(|info| info.byte_offset)
702 }
703
704 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
706 let normalized_id = ref_id.to_lowercase();
707 self.reference_defs
708 .iter()
709 .find(|def| def.id == normalized_id)
710 .map(|def| def.url.as_str())
711 }
712
713 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
715 self.links.iter().filter(|link| link.line == line_num).collect()
716 }
717
718 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
720 self.images.iter().filter(|img| img.line == line_num).collect()
721 }
722
723 pub fn is_in_list_block(&self, line_num: usize) -> bool {
725 self.list_blocks
726 .iter()
727 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
728 }
729
730 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
732 self.list_blocks
733 .iter()
734 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
735 }
736
737 pub fn is_in_code_block(&self, line_num: usize) -> bool {
741 if line_num == 0 || line_num > self.lines.len() {
742 return false;
743 }
744 self.lines[line_num - 1].in_code_block
745 }
746
747 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
749 if line_num == 0 || line_num > self.lines.len() {
750 return false;
751 }
752 self.lines[line_num - 1].in_front_matter
753 }
754
755 pub fn is_in_html_block(&self, line_num: usize) -> bool {
757 if line_num == 0 || line_num > self.lines.len() {
758 return false;
759 }
760 self.lines[line_num - 1].in_html_block
761 }
762
763 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
765 if line_num == 0 || line_num > self.lines.len() {
766 return false;
767 }
768
769 let col_0indexed = if col > 0 { col - 1 } else { 0 };
773 let code_spans = self.code_spans();
774 code_spans
775 .iter()
776 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
777 }
778
779 #[inline]
782 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
783 self.reference_defs
784 .iter()
785 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
786 }
787
788 #[inline]
792 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
793 self.html_comment_ranges
794 .iter()
795 .any(|range| byte_pos >= range.start && byte_pos < range.end)
796 }
797
798 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
800 self.jinja_ranges
801 .iter()
802 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
803 }
804
805 pub fn has_char(&self, ch: char) -> bool {
807 match ch {
808 '#' => self.char_frequency.hash_count > 0,
809 '*' => self.char_frequency.asterisk_count > 0,
810 '_' => self.char_frequency.underscore_count > 0,
811 '-' => self.char_frequency.hyphen_count > 0,
812 '+' => self.char_frequency.plus_count > 0,
813 '>' => self.char_frequency.gt_count > 0,
814 '|' => self.char_frequency.pipe_count > 0,
815 '[' => self.char_frequency.bracket_count > 0,
816 '`' => self.char_frequency.backtick_count > 0,
817 '<' => self.char_frequency.lt_count > 0,
818 '!' => self.char_frequency.exclamation_count > 0,
819 '\n' => self.char_frequency.newline_count > 0,
820 _ => self.content.contains(ch), }
822 }
823
824 pub fn char_count(&self, ch: char) -> usize {
826 match ch {
827 '#' => self.char_frequency.hash_count,
828 '*' => self.char_frequency.asterisk_count,
829 '_' => self.char_frequency.underscore_count,
830 '-' => self.char_frequency.hyphen_count,
831 '+' => self.char_frequency.plus_count,
832 '>' => self.char_frequency.gt_count,
833 '|' => self.char_frequency.pipe_count,
834 '[' => self.char_frequency.bracket_count,
835 '`' => self.char_frequency.backtick_count,
836 '<' => self.char_frequency.lt_count,
837 '!' => self.char_frequency.exclamation_count,
838 '\n' => self.char_frequency.newline_count,
839 _ => self.content.matches(ch).count(), }
841 }
842
843 pub fn likely_has_headings(&self) -> bool {
845 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
847
848 pub fn likely_has_lists(&self) -> bool {
850 self.char_frequency.asterisk_count > 0
851 || self.char_frequency.hyphen_count > 0
852 || self.char_frequency.plus_count > 0
853 }
854
855 pub fn likely_has_emphasis(&self) -> bool {
857 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
858 }
859
860 pub fn likely_has_tables(&self) -> bool {
862 self.char_frequency.pipe_count > 2
863 }
864
865 pub fn likely_has_blockquotes(&self) -> bool {
867 self.char_frequency.gt_count > 0
868 }
869
870 pub fn likely_has_code(&self) -> bool {
872 self.char_frequency.backtick_count > 0
873 }
874
875 pub fn likely_has_links_or_images(&self) -> bool {
877 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
878 }
879
880 pub fn likely_has_html(&self) -> bool {
882 self.char_frequency.lt_count > 0
883 }
884
885 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
887 self.html_tags()
888 .iter()
889 .filter(|tag| tag.line == line_num)
890 .cloned()
891 .collect()
892 }
893
894 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
896 self.emphasis_spans()
897 .iter()
898 .filter(|span| span.line == line_num)
899 .cloned()
900 .collect()
901 }
902
903 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
905 self.table_rows()
906 .iter()
907 .filter(|row| row.line == line_num)
908 .cloned()
909 .collect()
910 }
911
912 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
914 self.bare_urls()
915 .iter()
916 .filter(|url| url.line == line_num)
917 .cloned()
918 .collect()
919 }
920
921 #[inline]
927 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
928 let idx = match lines.binary_search_by(|line| {
930 if byte_offset < line.byte_offset {
931 std::cmp::Ordering::Greater
932 } else if byte_offset > line.byte_offset + line.content.len() {
933 std::cmp::Ordering::Less
934 } else {
935 std::cmp::Ordering::Equal
936 }
937 }) {
938 Ok(idx) => idx,
939 Err(idx) => idx.saturating_sub(1),
940 };
941
942 let line = &lines[idx];
943 let line_num = idx + 1;
944 let col = byte_offset.saturating_sub(line.byte_offset);
945
946 (idx, line_num, col)
947 }
948
949 #[inline]
951 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
952 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
954
955 if idx > 0 {
957 let span = &code_spans[idx - 1];
958 if offset >= span.byte_offset && offset < span.byte_end {
959 return true;
960 }
961 }
962
963 false
964 }
965
966 fn parse_links(
968 content: &str,
969 lines: &[LineInfo],
970 code_blocks: &[(usize, usize)],
971 code_spans: &[CodeSpan],
972 flavor: MarkdownFlavor,
973 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
974 ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
975 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
976 use std::collections::HashSet;
977
978 let mut links = Vec::with_capacity(content.len() / 500);
979 let mut broken_links = Vec::new();
980
981 let mut found_positions = HashSet::new();
983
984 let parser = Parser::new_with_broken_link_callback(
993 content,
994 pulldown_cmark::Options::empty(),
995 Some(|link: BrokenLink<'_>| {
996 broken_links.push(BrokenLinkInfo {
997 reference: link.reference.to_string(),
998 span: link.span.clone(),
999 });
1000 None
1001 }),
1002 )
1003 .into_offset_iter();
1004
1005 let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
1006 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1009 match event {
1010 Event::Start(Tag::Link {
1011 link_type,
1012 dest_url,
1013 id,
1014 ..
1015 }) => {
1016 link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1018 text_chunks.clear();
1019 }
1020 Event::Text(text) if !link_stack.is_empty() => {
1021 text_chunks.push((text.to_string(), range.start, range.end));
1023 }
1024 Event::Code(code) if !link_stack.is_empty() => {
1025 let code_text = format!("`{code}`");
1027 text_chunks.push((code_text, range.start, range.end));
1028 }
1029 Event::End(TagEnd::Link) => {
1030 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1031 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1033 text_chunks.clear();
1034 continue;
1035 }
1036
1037 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1039
1040 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1042 text_chunks.clear();
1043 continue;
1044 }
1045
1046 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1047
1048 let is_reference = matches!(
1049 link_type,
1050 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1051 );
1052
1053 let link_text = if start_pos < content.len() {
1056 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1057
1058 let mut close_pos = None;
1061 let mut depth = 0;
1062
1063 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1064 let mut backslash_count = 0;
1066 let mut j = i;
1067 while j > 0 && link_bytes[j - 1] == b'\\' {
1068 backslash_count += 1;
1069 j -= 1;
1070 }
1071 let is_escaped = backslash_count % 2 != 0;
1072
1073 if !is_escaped {
1074 if byte == b'[' {
1075 depth += 1;
1076 } else if byte == b']' {
1077 if depth == 0 {
1078 close_pos = Some(i);
1080 break;
1081 } else {
1082 depth -= 1;
1083 }
1084 }
1085 }
1086 }
1087
1088 if let Some(pos) = close_pos {
1089 std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1090 } else {
1091 String::new()
1092 }
1093 } else {
1094 String::new()
1095 };
1096
1097 let reference_id = if is_reference && !ref_id.is_empty() {
1099 Some(ref_id.to_lowercase())
1100 } else if is_reference {
1101 Some(link_text.to_lowercase())
1103 } else {
1104 None
1105 };
1106
1107 let has_escaped_bang = start_pos >= 2
1111 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1112 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1113
1114 let has_escaped_bracket =
1117 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1118
1119 if has_escaped_bang || has_escaped_bracket {
1120 text_chunks.clear();
1121 continue; }
1123
1124 found_positions.insert(start_pos);
1126
1127 links.push(ParsedLink {
1128 line: line_num,
1129 start_col: col_start,
1130 end_col: col_end,
1131 byte_offset: start_pos,
1132 byte_end: range.end,
1133 text: link_text,
1134 url,
1135 is_reference,
1136 reference_id,
1137 });
1138
1139 text_chunks.clear();
1140 }
1141 }
1142 _ => {}
1143 }
1144 }
1145
1146 for cap in LINK_PATTERN.captures_iter(content) {
1150 let full_match = cap.get(0).unwrap();
1151 let match_start = full_match.start();
1152 let match_end = full_match.end();
1153
1154 if found_positions.contains(&match_start) {
1156 continue;
1157 }
1158
1159 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1161 continue;
1162 }
1163
1164 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1166 continue;
1167 }
1168
1169 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1171 continue;
1172 }
1173
1174 if Self::is_offset_in_code_span(code_spans, match_start) {
1176 continue;
1177 }
1178
1179 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1181 continue;
1182 }
1183
1184 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1186
1187 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1189 continue;
1190 }
1191
1192 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1193
1194 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1195
1196 if let Some(ref_id) = cap.get(6) {
1198 let ref_id_str = ref_id.as_str();
1199 let normalized_ref = if ref_id_str.is_empty() {
1200 text.to_lowercase() } else {
1202 ref_id_str.to_lowercase()
1203 };
1204
1205 links.push(ParsedLink {
1207 line: line_num,
1208 start_col: col_start,
1209 end_col: col_end,
1210 byte_offset: match_start,
1211 byte_end: match_end,
1212 text,
1213 url: String::new(), is_reference: true,
1215 reference_id: Some(normalized_ref),
1216 });
1217 }
1218 }
1219
1220 (links, broken_links)
1221 }
1222
1223 fn parse_images(
1225 content: &str,
1226 lines: &[LineInfo],
1227 code_blocks: &[(usize, usize)],
1228 code_spans: &[CodeSpan],
1229 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1230 ) -> Vec<ParsedImage> {
1231 use crate::utils::skip_context::is_in_html_comment_ranges;
1232 use std::collections::HashSet;
1233
1234 let mut images = Vec::with_capacity(content.len() / 1000);
1236 let mut found_positions = HashSet::new();
1237
1238 let parser = Parser::new(content).into_offset_iter();
1240 let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1241 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1244 match event {
1245 Event::Start(Tag::Image {
1246 link_type,
1247 dest_url,
1248 id,
1249 ..
1250 }) => {
1251 image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1252 text_chunks.clear();
1253 }
1254 Event::Text(text) if !image_stack.is_empty() => {
1255 text_chunks.push((text.to_string(), range.start, range.end));
1256 }
1257 Event::Code(code) if !image_stack.is_empty() => {
1258 let code_text = format!("`{code}`");
1259 text_chunks.push((code_text, range.start, range.end));
1260 }
1261 Event::End(TagEnd::Image) => {
1262 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1263 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1265 continue;
1266 }
1267
1268 if Self::is_offset_in_code_span(code_spans, start_pos) {
1270 continue;
1271 }
1272
1273 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1275 continue;
1276 }
1277
1278 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1280 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1281
1282 let is_reference = matches!(
1283 link_type,
1284 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1285 );
1286
1287 let alt_text = if start_pos < content.len() {
1290 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1291
1292 let mut close_pos = None;
1295 let mut depth = 0;
1296
1297 if image_bytes.len() > 2 {
1298 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1299 let mut backslash_count = 0;
1301 let mut j = i;
1302 while j > 0 && image_bytes[j - 1] == b'\\' {
1303 backslash_count += 1;
1304 j -= 1;
1305 }
1306 let is_escaped = backslash_count % 2 != 0;
1307
1308 if !is_escaped {
1309 if byte == b'[' {
1310 depth += 1;
1311 } else if byte == b']' {
1312 if depth == 0 {
1313 close_pos = Some(i);
1315 break;
1316 } else {
1317 depth -= 1;
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 if let Some(pos) = close_pos {
1325 std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1326 } else {
1327 String::new()
1328 }
1329 } else {
1330 String::new()
1331 };
1332
1333 let reference_id = if is_reference && !ref_id.is_empty() {
1334 Some(ref_id.to_lowercase())
1335 } else if is_reference {
1336 Some(alt_text.to_lowercase()) } else {
1338 None
1339 };
1340
1341 found_positions.insert(start_pos);
1342 images.push(ParsedImage {
1343 line: line_num,
1344 start_col: col_start,
1345 end_col: col_end,
1346 byte_offset: start_pos,
1347 byte_end: range.end,
1348 alt_text,
1349 url,
1350 is_reference,
1351 reference_id,
1352 });
1353 }
1354 }
1355 _ => {}
1356 }
1357 }
1358
1359 for cap in IMAGE_PATTERN.captures_iter(content) {
1361 let full_match = cap.get(0).unwrap();
1362 let match_start = full_match.start();
1363 let match_end = full_match.end();
1364
1365 if found_positions.contains(&match_start) {
1367 continue;
1368 }
1369
1370 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1372 continue;
1373 }
1374
1375 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1377 || Self::is_offset_in_code_span(code_spans, match_start)
1378 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1379 {
1380 continue;
1381 }
1382
1383 if let Some(ref_id) = cap.get(6) {
1385 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1386 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1387 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1388 let ref_id_str = ref_id.as_str();
1389 let normalized_ref = if ref_id_str.is_empty() {
1390 alt_text.to_lowercase()
1391 } else {
1392 ref_id_str.to_lowercase()
1393 };
1394
1395 images.push(ParsedImage {
1396 line: line_num,
1397 start_col: col_start,
1398 end_col: col_end,
1399 byte_offset: match_start,
1400 byte_end: match_end,
1401 alt_text,
1402 url: String::new(),
1403 is_reference: true,
1404 reference_id: Some(normalized_ref),
1405 });
1406 }
1407 }
1408
1409 images
1410 }
1411
1412 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1414 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1418 if line_info.in_code_block {
1420 continue;
1421 }
1422
1423 let line = &line_info.content;
1424 let line_num = line_idx + 1;
1425
1426 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1427 let id = cap.get(1).unwrap().as_str().to_lowercase();
1428 let url = cap.get(2).unwrap().as_str().to_string();
1429 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1430
1431 let match_obj = cap.get(0).unwrap();
1434 let byte_offset = line_info.byte_offset + match_obj.start();
1435 let byte_end = line_info.byte_offset + match_obj.end();
1436
1437 refs.push(ReferenceDef {
1438 line: line_num,
1439 id,
1440 url,
1441 title,
1442 byte_offset,
1443 byte_end,
1444 });
1445 }
1446 }
1447
1448 refs
1449 }
1450
1451 #[inline]
1455 fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1456 let trimmed_start = line.trim_start();
1457 if !trimmed_start.starts_with('>') {
1458 return None;
1459 }
1460
1461 let leading_ws_len = line.len() - trimmed_start.len();
1462 let after_gt = &trimmed_start[1..];
1463 let content = after_gt.trim_start();
1464 let ws_after_gt_len = after_gt.len() - content.len();
1465 let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1466
1467 Some((&line[..prefix_len], content))
1468 }
1469
1470 #[inline]
1474 fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1475 let bytes = line.as_bytes();
1476 let mut i = 0;
1477
1478 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1480 i += 1;
1481 }
1482
1483 if i >= bytes.len() {
1485 return None;
1486 }
1487 let marker = bytes[i] as char;
1488 if marker != '-' && marker != '*' && marker != '+' {
1489 return None;
1490 }
1491 let marker_pos = i;
1492 i += 1;
1493
1494 let spacing_start = i;
1496 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1497 i += 1;
1498 }
1499
1500 Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1501 }
1502
1503 #[inline]
1507 fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1508 let bytes = line.as_bytes();
1509 let mut i = 0;
1510
1511 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1513 i += 1;
1514 }
1515
1516 let number_start = i;
1518 while i < bytes.len() && bytes[i].is_ascii_digit() {
1519 i += 1;
1520 }
1521 if i == number_start {
1522 return None; }
1524
1525 if i >= bytes.len() {
1527 return None;
1528 }
1529 let delimiter = bytes[i] as char;
1530 if delimiter != '.' && delimiter != ')' {
1531 return None;
1532 }
1533 let delimiter_pos = i;
1534 i += 1;
1535
1536 let spacing_start = i;
1538 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1539 i += 1;
1540 }
1541
1542 Some((
1543 &line[..number_start],
1544 &line[number_start..delimiter_pos],
1545 delimiter,
1546 &line[spacing_start..i],
1547 &line[i..],
1548 ))
1549 }
1550
1551 fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1554 let num_lines = line_offsets.len();
1555 let mut in_code_block = vec![false; num_lines];
1556
1557 for &(start, end) in code_blocks {
1559 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1561 let mut boundary = start;
1562 while boundary > 0 && !content.is_char_boundary(boundary) {
1563 boundary -= 1;
1564 }
1565 boundary
1566 } else {
1567 start
1568 };
1569
1570 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1571 let mut boundary = end;
1572 while boundary < content.len() && !content.is_char_boundary(boundary) {
1573 boundary += 1;
1574 }
1575 boundary
1576 } else {
1577 end.min(content.len())
1578 };
1579
1580 let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1595 let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1596
1597 for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1599 *flag = true;
1600 }
1601 }
1602
1603 in_code_block
1604 }
1605
1606 fn compute_basic_line_info(
1608 content: &str,
1609 line_offsets: &[usize],
1610 code_blocks: &[(usize, usize)],
1611 flavor: MarkdownFlavor,
1612 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1613 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1614 ) -> Vec<LineInfo> {
1615 let content_lines: Vec<&str> = content.lines().collect();
1616 let mut lines = Vec::with_capacity(content_lines.len());
1617
1618 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1620
1621 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1624
1625 for (i, line) in content_lines.iter().enumerate() {
1626 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1627 let indent = line.len() - line.trim_start().len();
1628
1629 let blockquote_parse = Self::parse_blockquote_prefix(line);
1631
1632 let is_blank = if let Some((_, content)) = blockquote_parse {
1634 content.trim().is_empty()
1636 } else {
1637 line.trim().is_empty()
1638 };
1639
1640 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1642
1643 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1645 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1646 let in_html_comment =
1648 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1649 let list_item = if !(in_code_block
1650 || is_blank
1651 || in_mkdocstrings
1652 || in_html_comment
1653 || (front_matter_end > 0 && i < front_matter_end))
1654 {
1655 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1657 (content, prefix.len())
1658 } else {
1659 (&**line, 0)
1660 };
1661
1662 if let Some((leading_spaces, marker, spacing, _content)) =
1663 Self::parse_unordered_list(line_for_list_check)
1664 {
1665 let marker_column = blockquote_prefix_len + leading_spaces.len();
1666 let content_column = marker_column + 1 + spacing.len();
1667
1668 if spacing.is_empty() {
1675 None
1676 } else {
1677 Some(ListItemInfo {
1678 marker: marker.to_string(),
1679 is_ordered: false,
1680 number: None,
1681 marker_column,
1682 content_column,
1683 })
1684 }
1685 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1686 Self::parse_ordered_list(line_for_list_check)
1687 {
1688 let marker = format!("{number_str}{delimiter}");
1689 let marker_column = blockquote_prefix_len + leading_spaces.len();
1690 let content_column = marker_column + marker.len() + spacing.len();
1691
1692 if spacing.is_empty() {
1695 None
1696 } else {
1697 Some(ListItemInfo {
1698 marker,
1699 is_ordered: true,
1700 number: number_str.parse().ok(),
1701 marker_column,
1702 content_column,
1703 })
1704 }
1705 } else {
1706 None
1707 }
1708 } else {
1709 None
1710 };
1711
1712 lines.push(LineInfo {
1713 content: line.to_string(),
1714 byte_offset,
1715 indent,
1716 is_blank,
1717 in_code_block,
1718 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1719 in_html_block: false, in_html_comment,
1721 list_item,
1722 heading: None, blockquote: None, in_mkdocstrings,
1725 in_esm_block: false, });
1727 }
1728
1729 lines
1730 }
1731
1732 fn detect_headings_and_blockquotes(
1734 content: &str,
1735 lines: &mut [LineInfo],
1736 flavor: MarkdownFlavor,
1737 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1738 ) {
1739 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1741 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1742 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1743 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1744
1745 let content_lines: Vec<&str> = content.lines().collect();
1746
1747 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1749
1750 for i in 0..lines.len() {
1752 if lines[i].in_code_block {
1753 continue;
1754 }
1755
1756 if front_matter_end > 0 && i < front_matter_end {
1758 continue;
1759 }
1760
1761 if lines[i].in_html_block {
1763 continue;
1764 }
1765
1766 let line = content_lines[i];
1767
1768 if let Some(bq) = parse_blockquote_detailed(line) {
1770 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1772
1773 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1775
1776 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1778 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1780
1781 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1785
1786 lines[i].blockquote = Some(BlockquoteInfo {
1787 nesting_level,
1788 indent: bq.indent.to_string(),
1789 marker_column,
1790 prefix,
1791 content: bq.content.to_string(),
1792 has_no_space_after_marker: has_no_space,
1793 has_multiple_spaces_after_marker: has_multiple_spaces,
1794 needs_md028_fix,
1795 });
1796 }
1797
1798 if lines[i].is_blank {
1800 continue;
1801 }
1802
1803 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1806 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1807 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1808 } else {
1809 false
1810 };
1811
1812 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1813 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1815 continue;
1816 }
1817 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1818 let hashes = caps.get(2).map_or("", |m| m.as_str());
1819 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1820 let rest = caps.get(4).map_or("", |m| m.as_str());
1821
1822 let level = hashes.len() as u8;
1823 let marker_column = leading_spaces.len();
1824
1825 let (text, has_closing, closing_seq) = {
1827 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1829 if rest[id_start..].trim_end().ends_with('}') {
1831 (&rest[..id_start], &rest[id_start..])
1833 } else {
1834 (rest, "")
1835 }
1836 } else {
1837 (rest, "")
1838 };
1839
1840 let trimmed_rest = rest_without_id.trim_end();
1842 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1843 let mut start_of_hashes = last_hash_pos;
1845 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1846 start_of_hashes -= 1;
1847 }
1848
1849 let has_space_before = start_of_hashes == 0
1851 || trimmed_rest
1852 .chars()
1853 .nth(start_of_hashes - 1)
1854 .is_some_and(|c| c.is_whitespace());
1855
1856 let potential_closing = &trimmed_rest[start_of_hashes..];
1858 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1859
1860 if is_all_hashes && has_space_before {
1861 let closing_hashes = potential_closing.to_string();
1863 let text_part = if !custom_id_part.is_empty() {
1866 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1869 } else {
1870 rest_without_id[..start_of_hashes].trim_end().to_string()
1871 };
1872 (text_part, true, closing_hashes)
1873 } else {
1874 (rest.to_string(), false, String::new())
1876 }
1877 } else {
1878 (rest.to_string(), false, String::new())
1880 }
1881 };
1882
1883 let content_column = marker_column + hashes.len() + spaces_after.len();
1884
1885 let raw_text = text.trim().to_string();
1887 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1888
1889 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1891 let next_line = content_lines[i + 1];
1892 if !lines[i + 1].in_code_block
1893 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1894 && let Some(next_line_id) =
1895 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1896 {
1897 custom_id = Some(next_line_id);
1898 }
1899 }
1900
1901 lines[i].heading = Some(HeadingInfo {
1902 level,
1903 style: HeadingStyle::ATX,
1904 marker: hashes.to_string(),
1905 marker_column,
1906 content_column,
1907 text: clean_text,
1908 custom_id,
1909 raw_text,
1910 has_closing_sequence: has_closing,
1911 closing_sequence: closing_seq,
1912 });
1913 }
1914 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1916 let next_line = content_lines[i + 1];
1917 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1918 if front_matter_end > 0 && i < front_matter_end {
1920 continue;
1921 }
1922
1923 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1925 {
1926 continue;
1927 }
1928
1929 let underline = next_line.trim();
1930
1931 if underline == "---" {
1934 continue;
1935 }
1936
1937 let current_line_trimmed = line.trim();
1939 if current_line_trimmed.contains(':')
1940 && !current_line_trimmed.starts_with('#')
1941 && !current_line_trimmed.contains('[')
1942 && !current_line_trimmed.contains("](")
1943 {
1944 continue;
1946 }
1947
1948 let level = if underline.starts_with('=') { 1 } else { 2 };
1949 let style = if level == 1 {
1950 HeadingStyle::Setext1
1951 } else {
1952 HeadingStyle::Setext2
1953 };
1954
1955 let raw_text = line.trim().to_string();
1957 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1958
1959 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1961 let attr_line = content_lines[i + 2];
1962 if !lines[i + 2].in_code_block
1963 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1964 && let Some(attr_line_id) =
1965 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1966 {
1967 custom_id = Some(attr_line_id);
1968 }
1969 }
1970
1971 lines[i].heading = Some(HeadingInfo {
1972 level,
1973 style,
1974 marker: underline.to_string(),
1975 marker_column: next_line.len() - next_line.trim_start().len(),
1976 content_column: lines[i].indent,
1977 text: clean_text,
1978 custom_id,
1979 raw_text,
1980 has_closing_sequence: false,
1981 closing_sequence: String::new(),
1982 });
1983 }
1984 }
1985 }
1986 }
1987
1988 fn detect_html_blocks(lines: &mut [LineInfo]) {
1990 const BLOCK_ELEMENTS: &[&str] = &[
1992 "address",
1993 "article",
1994 "aside",
1995 "blockquote",
1996 "details",
1997 "dialog",
1998 "dd",
1999 "div",
2000 "dl",
2001 "dt",
2002 "fieldset",
2003 "figcaption",
2004 "figure",
2005 "footer",
2006 "form",
2007 "h1",
2008 "h2",
2009 "h3",
2010 "h4",
2011 "h5",
2012 "h6",
2013 "header",
2014 "hr",
2015 "li",
2016 "main",
2017 "nav",
2018 "ol",
2019 "p",
2020 "pre",
2021 "script",
2022 "section",
2023 "style",
2024 "table",
2025 "tbody",
2026 "td",
2027 "tfoot",
2028 "th",
2029 "thead",
2030 "tr",
2031 "ul",
2032 ];
2033
2034 let mut i = 0;
2035 while i < lines.len() {
2036 if lines[i].in_code_block || lines[i].in_front_matter {
2038 i += 1;
2039 continue;
2040 }
2041
2042 let trimmed = lines[i].content.trim_start();
2043
2044 if trimmed.starts_with('<') && trimmed.len() > 1 {
2046 let after_bracket = &trimmed[1..];
2048 let is_closing = after_bracket.starts_with('/');
2049 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2050
2051 let tag_name = tag_start
2053 .chars()
2054 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2055 .collect::<String>()
2056 .to_lowercase();
2057
2058 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2060 lines[i].in_html_block = true;
2062
2063 if !is_closing {
2066 let closing_tag = format!("</{tag_name}>");
2067 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2069 let mut j = i + 1;
2070 while j < lines.len() && j < i + 100 {
2071 if !allow_blank_lines && lines[j].is_blank {
2074 break;
2075 }
2076
2077 lines[j].in_html_block = true;
2078
2079 if lines[j].content.contains(&closing_tag) {
2081 break;
2082 }
2083 j += 1;
2084 }
2085 }
2086 }
2087 }
2088
2089 i += 1;
2090 }
2091 }
2092
2093 fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2096 if !flavor.supports_esm_blocks() {
2098 return;
2099 }
2100
2101 for line in lines.iter_mut() {
2102 if line.is_blank || line.in_html_comment {
2104 continue;
2105 }
2106
2107 let trimmed = line.content.trim_start();
2109 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2110 line.in_esm_block = true;
2111 } else {
2112 break;
2114 }
2115 }
2116 }
2117
2118 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2120 let mut code_spans = Vec::new();
2121
2122 if !content.contains('`') {
2124 return code_spans;
2125 }
2126
2127 let parser = Parser::new(content).into_offset_iter();
2129
2130 for (event, range) in parser {
2131 if let Event::Code(_) = event {
2132 let start_pos = range.start;
2133 let end_pos = range.end;
2134
2135 let full_span = &content[start_pos..end_pos];
2137 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2138
2139 let content_start = start_pos + backtick_count;
2141 let content_end = end_pos - backtick_count;
2142 let span_content = if content_start < content_end {
2143 content[content_start..content_end].to_string()
2144 } else {
2145 String::new()
2146 };
2147
2148 let line_idx = lines
2151 .partition_point(|line| line.byte_offset <= start_pos)
2152 .saturating_sub(1);
2153 let line_num = line_idx + 1;
2154 let col_start = start_pos - lines[line_idx].byte_offset;
2155
2156 let end_line_idx = lines
2158 .partition_point(|line| line.byte_offset <= end_pos)
2159 .saturating_sub(1);
2160 let col_end = end_pos - lines[end_line_idx].byte_offset;
2161
2162 code_spans.push(CodeSpan {
2163 line: line_num,
2164 start_col: col_start,
2165 end_col: col_end,
2166 byte_offset: start_pos,
2167 byte_end: end_pos,
2168 backtick_count,
2169 content: span_content,
2170 });
2171 }
2172 }
2173
2174 code_spans.sort_by_key(|span| span.byte_offset);
2176
2177 code_spans
2178 }
2179
2180 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
2182 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2185 let mut last_list_item_line = 0;
2186 let mut current_indent_level = 0;
2187 let mut last_marker_width = 0;
2188
2189 for (line_idx, line_info) in lines.iter().enumerate() {
2190 let line_num = line_idx + 1;
2191
2192 if line_info.in_code_block {
2194 if let Some(ref mut block) = current_block {
2195 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
2197
2198 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2200
2201 match context {
2202 CodeBlockContext::Indented => {
2203 block.end_line = line_num;
2205 continue;
2206 }
2207 CodeBlockContext::Standalone => {
2208 let completed_block = current_block.take().unwrap();
2210 list_blocks.push(completed_block);
2211 continue;
2212 }
2213 CodeBlockContext::Adjacent => {
2214 block.end_line = line_num;
2216 continue;
2217 }
2218 }
2219 } else {
2220 continue;
2222 }
2223 }
2224
2225 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2227 caps.get(0).unwrap().as_str().to_string()
2228 } else {
2229 String::new()
2230 };
2231
2232 if let Some(list_item) = &line_info.list_item {
2234 let item_indent = list_item.marker_column;
2236 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2239 let is_nested = nesting > block.nesting_level;
2243 let same_type =
2244 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2245 let same_context = block.blockquote_prefix == blockquote_prefix;
2246 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2250 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2251
2252 let has_non_list_content = {
2254 let mut found_non_list = false;
2255 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2257
2258 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2260 let last_line = &lines[block_last_item_line - 1];
2261 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2262 log::debug!(
2263 "After problematic line {}: checking lines {} to {} for non-list content",
2264 block_last_item_line,
2265 block_last_item_line + 1,
2266 line_num
2267 );
2268 if line_num == block_last_item_line + 1 {
2270 log::debug!("Lines are consecutive, no content between");
2271 }
2272 }
2273 }
2274
2275 for check_line in (block_last_item_line + 1)..line_num {
2276 let check_idx = check_line - 1;
2277 if check_idx < lines.len() {
2278 let check_info = &lines[check_idx];
2279 let is_list_breaking_content = if check_info.in_code_block {
2281 let last_item_marker_width =
2283 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2284 lines[block_last_item_line - 1]
2285 .list_item
2286 .as_ref()
2287 .map(|li| {
2288 if li.is_ordered {
2289 li.marker.len() + 1 } else {
2291 li.marker.len()
2292 }
2293 })
2294 .unwrap_or(3) } else {
2296 3 };
2298
2299 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2300
2301 let context = CodeBlockUtils::analyze_code_block_context(
2303 lines,
2304 check_line - 1,
2305 min_continuation,
2306 );
2307
2308 matches!(context, CodeBlockContext::Standalone)
2310 } else if !check_info.is_blank && check_info.list_item.is_none() {
2311 let line_content = check_info.content.trim();
2313
2314 if check_info.heading.is_some()
2316 || line_content.starts_with("---")
2317 || line_content.starts_with("***")
2318 || line_content.starts_with("___")
2319 || (line_content.contains('|')
2320 && !line_content.contains("](")
2321 && !line_content.contains("http")
2322 && (line_content.matches('|').count() > 1
2323 || line_content.starts_with('|')
2324 || line_content.ends_with('|')))
2325 || line_content.starts_with(">")
2326 {
2327 true
2328 }
2329 else {
2331 let last_item_marker_width =
2332 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2333 lines[block_last_item_line - 1]
2334 .list_item
2335 .as_ref()
2336 .map(|li| {
2337 if li.is_ordered {
2338 li.marker.len() + 1 } else {
2340 li.marker.len()
2341 }
2342 })
2343 .unwrap_or(3) } else {
2345 3 };
2347
2348 let min_continuation =
2349 if block.is_ordered { last_item_marker_width } else { 2 };
2350 check_info.indent < min_continuation
2351 }
2352 } else {
2353 false
2354 };
2355
2356 if is_list_breaking_content {
2357 found_non_list = true;
2359 break;
2360 }
2361 }
2362 }
2363 found_non_list
2364 };
2365
2366 let mut continues_list = if is_nested {
2370 same_context && reasonable_distance && !has_non_list_content
2372 } else {
2373 let result = same_type
2375 && same_context
2376 && reasonable_distance
2377 && marker_compatible
2378 && !has_non_list_content;
2379
2380 if block.item_lines.last().is_some_and(|&last_line| {
2382 last_line > 0
2383 && last_line <= lines.len()
2384 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2385 && lines[last_line - 1].content.contains(r"\`")
2386 }) {
2387 log::debug!(
2388 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2389 );
2390 if line_num > 0 && line_num <= lines.len() {
2391 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2392 }
2393 }
2394
2395 result
2396 };
2397
2398 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2401 if block.item_lines.contains(&(line_num - 1)) {
2403 continues_list = true;
2405 }
2406 }
2407
2408 if continues_list {
2409 block.end_line = line_num;
2411 block.item_lines.push(line_num);
2412
2413 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2415 list_item.marker.len() + 1
2416 } else {
2417 list_item.marker.len()
2418 });
2419
2420 if !block.is_ordered
2422 && block.marker.is_some()
2423 && block.marker.as_ref() != Some(&list_item.marker)
2424 {
2425 block.marker = None;
2427 }
2428 } else {
2429 list_blocks.push(block.clone());
2432
2433 *block = ListBlock {
2434 start_line: line_num,
2435 end_line: line_num,
2436 is_ordered: list_item.is_ordered,
2437 marker: if list_item.is_ordered {
2438 None
2439 } else {
2440 Some(list_item.marker.clone())
2441 },
2442 blockquote_prefix: blockquote_prefix.clone(),
2443 item_lines: vec![line_num],
2444 nesting_level: nesting,
2445 max_marker_width: if list_item.is_ordered {
2446 list_item.marker.len() + 1
2447 } else {
2448 list_item.marker.len()
2449 },
2450 };
2451 }
2452 } else {
2453 current_block = Some(ListBlock {
2455 start_line: line_num,
2456 end_line: line_num,
2457 is_ordered: list_item.is_ordered,
2458 marker: if list_item.is_ordered {
2459 None
2460 } else {
2461 Some(list_item.marker.clone())
2462 },
2463 blockquote_prefix,
2464 item_lines: vec![line_num],
2465 nesting_level: nesting,
2466 max_marker_width: list_item.marker.len(),
2467 });
2468 }
2469
2470 last_list_item_line = line_num;
2471 current_indent_level = item_indent;
2472 last_marker_width = if list_item.is_ordered {
2473 list_item.marker.len() + 1 } else {
2475 list_item.marker.len()
2476 };
2477 } else if let Some(ref mut block) = current_block {
2478 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2488 lines[block.end_line - 1].content.trim_end().ends_with('\\')
2489 } else {
2490 false
2491 };
2492
2493 let min_continuation_indent = if block.is_ordered {
2497 current_indent_level + last_marker_width
2498 } else {
2499 current_indent_level + 2 };
2501
2502 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2503 block.end_line = line_num;
2505 } else if line_info.is_blank {
2506 let mut check_idx = line_idx + 1;
2509 let mut found_continuation = false;
2510
2511 while check_idx < lines.len() && lines[check_idx].is_blank {
2513 check_idx += 1;
2514 }
2515
2516 if check_idx < lines.len() {
2517 let next_line = &lines[check_idx];
2518 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2520 found_continuation = true;
2521 }
2522 else if !next_line.in_code_block
2524 && next_line.list_item.is_some()
2525 && let Some(item) = &next_line.list_item
2526 {
2527 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2528 .find(&next_line.content)
2529 .map_or(String::new(), |m| m.as_str().to_string());
2530 if item.marker_column == current_indent_level
2531 && item.is_ordered == block.is_ordered
2532 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2533 {
2534 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2537 if let Some(between_line) = lines.get(idx) {
2538 let trimmed = between_line.content.trim();
2539 if trimmed.is_empty() {
2541 return false;
2542 }
2543 let line_indent =
2545 between_line.content.len() - between_line.content.trim_start().len();
2546
2547 if trimmed.starts_with("```")
2549 || trimmed.starts_with("~~~")
2550 || trimmed.starts_with("---")
2551 || trimmed.starts_with("***")
2552 || trimmed.starts_with("___")
2553 || trimmed.starts_with(">")
2554 || trimmed.contains('|') || between_line.heading.is_some()
2556 {
2557 return true; }
2559
2560 line_indent >= min_continuation_indent
2562 } else {
2563 false
2564 }
2565 });
2566
2567 if block.is_ordered {
2568 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2571 if let Some(between_line) = lines.get(idx) {
2572 let trimmed = between_line.content.trim();
2573 if trimmed.is_empty() {
2574 return false;
2575 }
2576 trimmed.starts_with("```")
2578 || trimmed.starts_with("~~~")
2579 || trimmed.starts_with("---")
2580 || trimmed.starts_with("***")
2581 || trimmed.starts_with("___")
2582 || trimmed.starts_with(">")
2583 || trimmed.contains('|') || between_line.heading.is_some()
2585 } else {
2586 false
2587 }
2588 });
2589 found_continuation = !has_structural_separators;
2590 } else {
2591 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2593 if let Some(between_line) = lines.get(idx) {
2594 let trimmed = between_line.content.trim();
2595 if trimmed.is_empty() {
2596 return false;
2597 }
2598 trimmed.starts_with("```")
2600 || trimmed.starts_with("~~~")
2601 || trimmed.starts_with("---")
2602 || trimmed.starts_with("***")
2603 || trimmed.starts_with("___")
2604 || trimmed.starts_with(">")
2605 || trimmed.contains('|') || between_line.heading.is_some()
2607 } else {
2608 false
2609 }
2610 });
2611 found_continuation = !has_structural_separators;
2612 }
2613 }
2614 }
2615 }
2616
2617 if found_continuation {
2618 block.end_line = line_num;
2620 } else {
2621 list_blocks.push(block.clone());
2623 current_block = None;
2624 }
2625 } else {
2626 let min_required_indent = if block.is_ordered {
2629 current_indent_level + last_marker_width
2630 } else {
2631 current_indent_level + 2
2632 };
2633
2634 let line_content = line_info.content.trim();
2639 let is_structural_separator = line_info.heading.is_some()
2640 || line_content.starts_with("```")
2641 || line_content.starts_with("~~~")
2642 || line_content.starts_with("---")
2643 || line_content.starts_with("***")
2644 || line_content.starts_with("___")
2645 || line_content.starts_with(">")
2646 || (line_content.contains('|')
2647 && !line_content.contains("](")
2648 && !line_content.contains("http")
2649 && (line_content.matches('|').count() > 1
2650 || line_content.starts_with('|')
2651 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2656 && !line_info.is_blank
2657 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2658
2659 if is_lazy_continuation {
2660 let content_to_check = if !blockquote_prefix.is_empty() {
2663 line_info
2665 .content
2666 .strip_prefix(&blockquote_prefix)
2667 .unwrap_or(&line_info.content)
2668 .trim()
2669 } else {
2670 line_info.content.trim()
2671 };
2672
2673 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2674
2675 if starts_with_uppercase && last_list_item_line > 0 {
2678 list_blocks.push(block.clone());
2680 current_block = None;
2681 } else {
2682 block.end_line = line_num;
2684 }
2685 } else {
2686 list_blocks.push(block.clone());
2688 current_block = None;
2689 }
2690 }
2691 }
2692 }
2693
2694 if let Some(block) = current_block {
2696 list_blocks.push(block);
2697 }
2698
2699 merge_adjacent_list_blocks(&mut list_blocks, lines);
2701
2702 list_blocks
2703 }
2704
2705 fn compute_char_frequency(content: &str) -> CharFrequency {
2707 let mut frequency = CharFrequency::default();
2708
2709 for ch in content.chars() {
2710 match ch {
2711 '#' => frequency.hash_count += 1,
2712 '*' => frequency.asterisk_count += 1,
2713 '_' => frequency.underscore_count += 1,
2714 '-' => frequency.hyphen_count += 1,
2715 '+' => frequency.plus_count += 1,
2716 '>' => frequency.gt_count += 1,
2717 '|' => frequency.pipe_count += 1,
2718 '[' => frequency.bracket_count += 1,
2719 '`' => frequency.backtick_count += 1,
2720 '<' => frequency.lt_count += 1,
2721 '!' => frequency.exclamation_count += 1,
2722 '\n' => frequency.newline_count += 1,
2723 _ => {}
2724 }
2725 }
2726
2727 frequency
2728 }
2729
2730 fn parse_html_tags(
2732 content: &str,
2733 lines: &[LineInfo],
2734 code_blocks: &[(usize, usize)],
2735 flavor: MarkdownFlavor,
2736 ) -> Vec<HtmlTag> {
2737 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2738 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2739
2740 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2741
2742 for cap in HTML_TAG_REGEX.captures_iter(content) {
2743 let full_match = cap.get(0).unwrap();
2744 let match_start = full_match.start();
2745 let match_end = full_match.end();
2746
2747 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2749 continue;
2750 }
2751
2752 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2753 let tag_name_original = cap.get(2).unwrap().as_str();
2754 let tag_name = tag_name_original.to_lowercase();
2755 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2756
2757 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2760 continue;
2761 }
2762
2763 let mut line_num = 1;
2765 let mut col_start = match_start;
2766 let mut col_end = match_end;
2767 for (idx, line_info) in lines.iter().enumerate() {
2768 if match_start >= line_info.byte_offset {
2769 line_num = idx + 1;
2770 col_start = match_start - line_info.byte_offset;
2771 col_end = match_end - line_info.byte_offset;
2772 } else {
2773 break;
2774 }
2775 }
2776
2777 html_tags.push(HtmlTag {
2778 line: line_num,
2779 start_col: col_start,
2780 end_col: col_end,
2781 byte_offset: match_start,
2782 byte_end: match_end,
2783 tag_name,
2784 is_closing,
2785 is_self_closing,
2786 raw_content: full_match.as_str().to_string(),
2787 });
2788 }
2789
2790 html_tags
2791 }
2792
2793 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2795 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2796 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2797
2798 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2799
2800 for cap in EMPHASIS_REGEX.captures_iter(content) {
2801 let full_match = cap.get(0).unwrap();
2802 let match_start = full_match.start();
2803 let match_end = full_match.end();
2804
2805 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2807 continue;
2808 }
2809
2810 let opening_markers = cap.get(1).unwrap().as_str();
2811 let content_part = cap.get(2).unwrap().as_str();
2812 let closing_markers = cap.get(3).unwrap().as_str();
2813
2814 if opening_markers.chars().next() != closing_markers.chars().next()
2816 || opening_markers.len() != closing_markers.len()
2817 {
2818 continue;
2819 }
2820
2821 let marker = opening_markers.chars().next().unwrap();
2822 let marker_count = opening_markers.len();
2823
2824 let mut line_num = 1;
2826 let mut col_start = match_start;
2827 let mut col_end = match_end;
2828 for (idx, line_info) in lines.iter().enumerate() {
2829 if match_start >= line_info.byte_offset {
2830 line_num = idx + 1;
2831 col_start = match_start - line_info.byte_offset;
2832 col_end = match_end - line_info.byte_offset;
2833 } else {
2834 break;
2835 }
2836 }
2837
2838 emphasis_spans.push(EmphasisSpan {
2839 line: line_num,
2840 start_col: col_start,
2841 end_col: col_end,
2842 byte_offset: match_start,
2843 byte_end: match_end,
2844 marker,
2845 marker_count,
2846 content: content_part.to_string(),
2847 });
2848 }
2849
2850 emphasis_spans
2851 }
2852
2853 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2855 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2856
2857 for (line_idx, line_info) in lines.iter().enumerate() {
2858 if line_info.in_code_block || line_info.is_blank {
2860 continue;
2861 }
2862
2863 let line = &line_info.content;
2864 let line_num = line_idx + 1;
2865
2866 if !line.contains('|') {
2868 continue;
2869 }
2870
2871 let parts: Vec<&str> = line.split('|').collect();
2873 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2874
2875 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2877 let mut column_alignments = Vec::new();
2878
2879 if is_separator {
2880 for part in &parts[1..parts.len() - 1] {
2881 let trimmed = part.trim();
2883 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2884 "center".to_string()
2885 } else if trimmed.ends_with(':') {
2886 "right".to_string()
2887 } else if trimmed.starts_with(':') {
2888 "left".to_string()
2889 } else {
2890 "none".to_string()
2891 };
2892 column_alignments.push(alignment);
2893 }
2894 }
2895
2896 table_rows.push(TableRow {
2897 line: line_num,
2898 is_separator,
2899 column_count,
2900 column_alignments,
2901 });
2902 }
2903
2904 table_rows
2905 }
2906
2907 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2909 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2910
2911 for cap in BARE_URL_PATTERN.captures_iter(content) {
2913 let full_match = cap.get(0).unwrap();
2914 let match_start = full_match.start();
2915 let match_end = full_match.end();
2916
2917 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2919 continue;
2920 }
2921
2922 let preceding_char = if match_start > 0 {
2924 content.chars().nth(match_start - 1)
2925 } else {
2926 None
2927 };
2928 let following_char = content.chars().nth(match_end);
2929
2930 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2931 continue;
2932 }
2933 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2934 continue;
2935 }
2936
2937 let url = full_match.as_str();
2938 let url_type = if url.starts_with("https://") {
2939 "https"
2940 } else if url.starts_with("http://") {
2941 "http"
2942 } else if url.starts_with("ftp://") {
2943 "ftp"
2944 } else {
2945 "other"
2946 };
2947
2948 let mut line_num = 1;
2950 let mut col_start = match_start;
2951 let mut col_end = match_end;
2952 for (idx, line_info) in lines.iter().enumerate() {
2953 if match_start >= line_info.byte_offset {
2954 line_num = idx + 1;
2955 col_start = match_start - line_info.byte_offset;
2956 col_end = match_end - line_info.byte_offset;
2957 } else {
2958 break;
2959 }
2960 }
2961
2962 bare_urls.push(BareUrl {
2963 line: line_num,
2964 start_col: col_start,
2965 end_col: col_end,
2966 byte_offset: match_start,
2967 byte_end: match_end,
2968 url: url.to_string(),
2969 url_type: url_type.to_string(),
2970 });
2971 }
2972
2973 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2975 let full_match = cap.get(0).unwrap();
2976 let match_start = full_match.start();
2977 let match_end = full_match.end();
2978
2979 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2981 continue;
2982 }
2983
2984 let preceding_char = if match_start > 0 {
2986 content.chars().nth(match_start - 1)
2987 } else {
2988 None
2989 };
2990 let following_char = content.chars().nth(match_end);
2991
2992 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2993 continue;
2994 }
2995 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2996 continue;
2997 }
2998
2999 let email = full_match.as_str();
3000
3001 let mut line_num = 1;
3003 let mut col_start = match_start;
3004 let mut col_end = match_end;
3005 for (idx, line_info) in lines.iter().enumerate() {
3006 if match_start >= line_info.byte_offset {
3007 line_num = idx + 1;
3008 col_start = match_start - line_info.byte_offset;
3009 col_end = match_end - line_info.byte_offset;
3010 } else {
3011 break;
3012 }
3013 }
3014
3015 bare_urls.push(BareUrl {
3016 line: line_num,
3017 start_col: col_start,
3018 end_col: col_end,
3019 byte_offset: match_start,
3020 byte_end: match_end,
3021 url: email.to_string(),
3022 url_type: "email".to_string(),
3023 });
3024 }
3025
3026 bare_urls
3027 }
3028}
3029
3030fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3032 if list_blocks.len() < 2 {
3033 return;
3034 }
3035
3036 let mut merger = ListBlockMerger::new(lines);
3037 *list_blocks = merger.merge(list_blocks);
3038}
3039
3040struct ListBlockMerger<'a> {
3042 lines: &'a [LineInfo],
3043}
3044
3045impl<'a> ListBlockMerger<'a> {
3046 fn new(lines: &'a [LineInfo]) -> Self {
3047 Self { lines }
3048 }
3049
3050 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3051 let mut merged = Vec::with_capacity(list_blocks.len());
3052 let mut current = list_blocks[0].clone();
3053
3054 for next in list_blocks.iter().skip(1) {
3055 if self.should_merge_blocks(¤t, next) {
3056 current = self.merge_two_blocks(current, next);
3057 } else {
3058 merged.push(current);
3059 current = next.clone();
3060 }
3061 }
3062
3063 merged.push(current);
3064 merged
3065 }
3066
3067 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3069 if !self.blocks_are_compatible(current, next) {
3071 return false;
3072 }
3073
3074 let spacing = self.analyze_spacing_between(current, next);
3076 match spacing {
3077 BlockSpacing::Consecutive => true,
3078 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3079 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3080 self.can_merge_with_content_between(current, next)
3081 }
3082 }
3083 }
3084
3085 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3087 current.is_ordered == next.is_ordered
3088 && current.blockquote_prefix == next.blockquote_prefix
3089 && current.nesting_level == next.nesting_level
3090 }
3091
3092 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3094 let gap = next.start_line - current.end_line;
3095
3096 match gap {
3097 1 => BlockSpacing::Consecutive,
3098 2 => BlockSpacing::SingleBlank,
3099 _ if gap > 2 => {
3100 if self.has_only_blank_lines_between(current, next) {
3101 BlockSpacing::MultipleBlanks
3102 } else {
3103 BlockSpacing::ContentBetween
3104 }
3105 }
3106 _ => BlockSpacing::Consecutive, }
3108 }
3109
3110 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3112 if has_meaningful_content_between(current, next, self.lines) {
3115 return false; }
3117
3118 !current.is_ordered && current.marker == next.marker
3120 }
3121
3122 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3124 if has_meaningful_content_between(current, next, self.lines) {
3126 return false; }
3128
3129 current.is_ordered && next.is_ordered
3131 }
3132
3133 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3135 for line_num in (current.end_line + 1)..next.start_line {
3136 if let Some(line_info) = self.lines.get(line_num - 1)
3137 && !line_info.content.trim().is_empty()
3138 {
3139 return false;
3140 }
3141 }
3142 true
3143 }
3144
3145 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3147 current.end_line = next.end_line;
3148 current.item_lines.extend_from_slice(&next.item_lines);
3149
3150 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3152
3153 if !current.is_ordered && self.markers_differ(¤t, next) {
3155 current.marker = None; }
3157
3158 current
3159 }
3160
3161 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3163 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3164 }
3165}
3166
/// How two list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The blocks touch: the line gap is at most one.
    Consecutive,
    /// Exactly one line sits between the blocks.
    SingleBlank,
    /// Several lines sit between the blocks and all of them are blank.
    MultipleBlanks,
    /// Several lines sit between the blocks and at least one is not blank.
    ContentBetween,
}
3175
3176fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3178 for line_num in (current.end_line + 1)..next.start_line {
3180 if let Some(line_info) = lines.get(line_num - 1) {
3181 let trimmed = line_info.content.trim();
3183
3184 if trimmed.is_empty() {
3186 continue;
3187 }
3188
3189 if line_info.heading.is_some() {
3193 return true; }
3195
3196 if is_horizontal_rule(trimmed) {
3198 return true; }
3200
3201 if trimmed.contains('|') && trimmed.len() > 1 {
3204 if !trimmed.contains("](") && !trimmed.contains("http") {
3206 let pipe_count = trimmed.matches('|').count();
3208 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3209 return true; }
3211 }
3212 }
3213
3214 if trimmed.starts_with('>') {
3216 return true; }
3218
3219 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3221 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3222
3223 let min_continuation_indent = if current.is_ordered {
3225 current.nesting_level + current.max_marker_width + 1 } else {
3227 current.nesting_level + 2
3228 };
3229
3230 if line_indent < min_continuation_indent {
3231 return true; }
3234 }
3235
3236 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3238
3239 let min_indent = if current.is_ordered {
3241 current.nesting_level + current.max_marker_width
3242 } else {
3243 current.nesting_level + 2
3244 };
3245
3246 if line_indent < min_indent {
3248 return true; }
3250
3251 }
3254 }
3255
3256 false
3258}
3259
/// Reports whether `trimmed` is a horizontal rule.
///
/// The text must be at least three bytes long, start with `-`, `*` or `_`,
/// contain nothing but that marker plus spaces and tabs, and use the marker at
/// least three times. The caller is expected to pass already-trimmed text, so
/// leading whitespace makes the check fail.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut marker_total = 0;
    for c in trimmed.chars() {
        if c == marker {
            marker_total += 1;
        } else if !(c == ' ' || c == '\t') {
            // Any other character disqualifies the line.
            return false;
        }
    }

    marker_total >= 3
}
3283
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset (0) and no line records.
    #[test]
    fn test_empty_content() {
        let empty = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(empty.content, "");
        assert_eq!(empty.line_offsets, vec![0]);
        assert_eq!(empty.offset_to_line_col(0), (1, 1));
        assert_eq!(empty.lines.len(), 0);
    }

    /// A document without a newline is one line; columns are 1-based.
    #[test]
    fn test_single_line() {
        let heading = LintContext::new("# Hello", MarkdownFlavor::Standard);

        assert_eq!(heading.content, "# Hello");
        assert_eq!(heading.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(heading.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets point at the first byte after each newline.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indent, blank and code-block flags.
    #[test]
    fn test_line_info() {
        // NOTE(review): the fixture shows one space before `indented`, yet the test
        // expects an indent of 4 — confirm the literal's whitespace against the
        // repository copy.
        let content = "# Title\n indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let [title, indented, blank] = [&ctx.lines[0], &ctx.lines[1], &ctx.lines[2]];

        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        assert_eq!(indented.content, " indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // The 1-based accessors must agree with direct indexing.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered and ordered list markers are detected; plain text is not a list item.
    #[test]
    fn test_list_item_detection() {
        // NOTE(review): the nested items show one leading space, yet the test expects
        // marker column 2 for `*` — confirm the literal's whitespace against the
        // repository copy.
        let content = "- Unordered item\n * Nested item\n1. Ordered item\n 2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let dash_line = &ctx.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        let star_line = &ctx.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        let ordered_line = &ctx.lines[2];
        assert!(ordered_line.list_item.is_some());
        let ordered_item = ordered_line.list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        let plain_line = &ctx.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets on, just before, and just after newlines map to the expected positions.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let expectations = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);
        let lines = &ctx.lines;

        assert_eq!(lines.len(), 10);
        assert!(lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(!lines[3].in_esm_block, "Line 4 (heading) should NOT be in_esm_block");
        assert!(!lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// Outside the MDX flavor the same lines are not flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        let lines = &ctx.lines;

        assert!(
            !lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}