1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
8static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
11 Regex::new(
12 r#"(?sx)
13 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
14 (?:
15 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
16 |
17 \[([^\]]*)\] # Reference ID in group 6
18 )"#
19 ).unwrap()
20});
21
22static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
25 Regex::new(
26 r#"(?sx)
27 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
28 (?:
29 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
30 |
31 \[([^\]]*)\] # Reference ID in group 6
32 )"#
33 ).unwrap()
34});
35
36static REF_DEF_PATTERN: LazyLock<Regex> =
38 LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
40static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
42 Regex::new(
43 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
44 ).unwrap()
45});
46
47static BARE_EMAIL_PATTERN: LazyLock<Regex> =
49 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
51static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
54#[derive(Debug, Clone)]
56pub struct LineInfo {
57 pub content: String,
59 pub byte_offset: usize,
61 pub indent: usize,
63 pub is_blank: bool,
65 pub in_code_block: bool,
67 pub in_front_matter: bool,
69 pub in_html_block: bool,
71 pub in_html_comment: bool,
73 pub list_item: Option<ListItemInfo>,
75 pub heading: Option<HeadingInfo>,
77 pub blockquote: Option<BlockquoteInfo>,
79 pub in_mkdocstrings: bool,
81 pub in_esm_block: bool,
83}
84
/// Details of a list item found on a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text as written (presumably e.g. `-`, `*`, `1.`).
    pub marker: String,
    /// `true` for ordered (numbered) items.
    pub is_ordered: bool,
    /// Item number for ordered items, if any.
    pub number: Option<usize>,
    /// Column at which the marker starts (indexing base not visible here).
    pub marker_column: usize,
    /// Column at which the item content starts (indexing base not visible here).
    pub content_column: usize,
}
99
/// Syntactic form used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (`#`-prefixed).
    ATX,
    /// Setext heading, first variant (presumably the `=` underline).
    Setext1,
    /// Setext heading, second variant (presumably the `-` underline).
    Setext2,
}
110
/// A link located in the document by `parse_links`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset in the document where the link starts.
    pub byte_offset: usize,
    /// Byte offset in the document just past the end of the link.
    pub byte_end: usize,
    /// Raw source text between the link's brackets.
    pub text: String,
    /// Destination URL; empty for reference links recovered by the regex fallback.
    pub url: String,
    /// `true` for reference-style links (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style links, `None` otherwise.
    pub reference_id: Option<String>,
}
133
/// A reference the Markdown parser reported as unresolved.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text as reported by the parser's broken-link callback.
    pub reference: String,
    /// Byte span of the broken link as reported by the parser.
    pub span: std::ops::Range<usize>,
}
142
/// An image located in the document by `parse_images`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset in the document where the image starts (at the `!`).
    pub byte_offset: usize,
    /// Byte offset in the document just past the end of the image.
    pub byte_end: usize,
    /// Raw source text between the image's brackets.
    pub alt_text: String,
    /// Destination URL; empty for reference images recovered by the regex fallback.
    pub url: String,
    /// `true` for reference-style images (full, collapsed or shortcut).
    pub is_reference: bool,
    /// Lower-cased reference id for reference-style images, `None` otherwise.
    pub reference_id: Option<String>,
}
165
/// A reference definition (`[id]: url "title"`) found by `parse_reference_defs`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Reference id, lower-cased.
    pub id: String,
    /// Destination URL as written.
    pub url: String,
    /// Optional title (double- or single-quoted in the source).
    pub title: Option<String>,
    /// Byte offset in the document where the definition match starts.
    pub byte_offset: usize,
    /// Byte offset in the document just past the end of the definition match.
    pub byte_end: usize,
}
182
/// An inline code span.
///
/// Instances are produced by `parse_code_spans`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-based line numbers by callers).
    pub line: usize,
    /// Start column; `is_in_code_span` compares it against a 0-based column.
    pub start_col: usize,
    /// End column (exclusive in `is_in_code_span`).
    pub end_col: usize,
    /// Byte offset in the document where the span starts.
    pub byte_offset: usize,
    /// Byte offset in the document just past the end of the span.
    pub byte_end: usize,
    /// Number of backticks in the delimiter (presumably of the opening run).
    pub backtick_count: usize,
    /// Text of the span (presumably without the delimiters; TODO confirm).
    pub content: String,
}
201
202#[derive(Debug, Clone)]
204pub struct HeadingInfo {
205 pub level: u8,
207 pub style: HeadingStyle,
209 pub marker: String,
211 pub marker_column: usize,
213 pub content_column: usize,
215 pub text: String,
217 pub custom_id: Option<String>,
219 pub raw_text: String,
221 pub has_closing_sequence: bool,
223 pub closing_sequence: String,
225}
226
/// Details of a blockquote line.
///
/// Populated by blockquote detection outside this chunk; notes are inferred
/// from field names.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth (presumably the number of `>` markers).
    pub nesting_level: usize,
    /// Whitespace before the first marker.
    pub indent: String,
    /// Column at which the first marker sits.
    pub marker_column: usize,
    /// The full blockquote prefix (presumably indent, markers and following spaces).
    pub prefix: String,
    /// Line content after the prefix.
    pub content: String,
    /// Set when no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Set when more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Set when the line needs an MD028 fix (criteria decided outside this chunk).
    pub needs_md028_fix: bool,
}
247
/// A contiguous list in the document.
///
/// Produced by `parse_list_blocks`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block; lookups treat the range as inclusive.
    pub start_line: usize,
    /// Last line of the block (inclusive in lookups).
    pub end_line: usize,
    /// `true` when the block is an ordered list.
    pub is_ordered: bool,
    /// Marker associated with the block, if any (presumably for consistent unordered lists).
    pub marker: Option<String>,
    /// Blockquote prefix the list sits under (presumably empty outside blockquotes).
    pub blockquote_prefix: String,
    /// Line numbers of the list items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Widest marker in the block (unit not visible here).
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Precomputed occurrence counts for characters that matter to Markdown syntax.
///
/// Used by `has_char`, `char_count` and the `likely_has_*` checks to answer
/// without rescanning the document. Counts are filled in by
/// `compute_char_frequency`, which is outside this chunk.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[` (whether `]` is also counted is not visible here).
    pub bracket_count: usize,
    /// Count reported for `` ` ``.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
299
/// An HTML tag found in the document.
///
/// Produced by `parse_html_tags`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (matched against caller-supplied line numbers).
    pub line: usize,
    /// Start column within the line.
    pub start_col: usize,
    /// End column within the line.
    pub end_col: usize,
    /// Byte offset in the document where the tag starts.
    pub byte_offset: usize,
    /// Byte offset in the document where the tag ends.
    pub byte_end: usize,
    /// Name of the tag (case handling not visible here).
    pub tag_name: String,
    /// `true` for closing tags such as `</div>`.
    pub is_closing: bool,
    /// `true` for self-closing tags such as `<br />`.
    pub is_self_closing: bool,
    /// The tag text as it appears in the source.
    pub raw_content: String,
}
322
/// An emphasis span found in the document.
///
/// Produced by `parse_emphasis_spans`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (matched against caller-supplied line numbers).
    pub line: usize,
    /// Start column within the line.
    pub start_col: usize,
    /// End column within the line.
    pub end_col: usize,
    /// Byte offset in the document where the span starts.
    pub byte_offset: usize,
    /// Byte offset in the document where the span ends.
    pub byte_end: usize,
    /// Marker character (presumably `*` or `_`).
    pub marker: char,
    /// Number of marker characters in the delimiter.
    pub marker_count: usize,
    /// Text of the span (presumably between the delimiters).
    pub content: String,
}
343
/// A table row found in the document.
///
/// Produced by `parse_table_rows`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (matched against caller-supplied line numbers).
    pub line: usize,
    /// `true` when the row is a header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Per-column alignment labels (exact vocabulary not visible here).
    pub column_alignments: Vec<String>,
}
356
/// A bare (unwrapped) URL or e-mail address found in the document.
///
/// Produced by `parse_bare_urls`, which is outside this chunk.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (matched against caller-supplied line numbers).
    pub line: usize,
    /// Start column within the line.
    pub start_col: usize,
    /// End column within the line.
    pub end_col: usize,
    /// Byte offset in the document where the URL starts.
    pub byte_offset: usize,
    /// Byte offset in the document where the URL ends.
    pub byte_end: usize,
    /// The matched URL text.
    pub url: String,
    /// Kind label for the match (exact vocabulary not visible here; TODO confirm).
    pub url_type: String,
}
375
376pub struct LintContext<'a> {
377 pub content: &'a str,
378 pub line_offsets: Vec<usize>,
379 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink>, pub images: Vec<ParsedImage>, pub broken_links: Vec<BrokenLinkInfo>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, pub line_index: crate::utils::range_utils::LineIndex, jinja_ranges: Vec<(usize, usize)>, pub flavor: MarkdownFlavor, }
398
/// The four consecutive pieces of a blockquote line, borrowed from that line.
struct BlockquoteComponents<'a> {
    /// Spaces and tabs before the first `>`.
    indent: &'a str,
    /// The run of `>` markers.
    markers: &'a str,
    /// Spaces and tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}
406
407#[inline]
409fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
410 let bytes = line.as_bytes();
411 let mut pos = 0;
412
413 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
415 pos += 1;
416 }
417 let indent_end = pos;
418
419 if pos >= bytes.len() || bytes[pos] != b'>' {
421 return None;
422 }
423
424 while pos < bytes.len() && bytes[pos] == b'>' {
426 pos += 1;
427 }
428 let markers_end = pos;
429
430 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
432 pos += 1;
433 }
434 let spaces_end = pos;
435
436 Some(BlockquoteComponents {
437 indent: &line[0..indent_end],
438 markers: &line[indent_end..markers_end],
439 spaces_after: &line[markers_end..spaces_end],
440 content: &line[spaces_end..],
441 })
442}
443
444impl<'a> LintContext<'a> {
445 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
446 use std::time::Instant;
447 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
448
449 let start = Instant::now();
450 let mut line_offsets = vec![0];
451 for (i, c) in content.char_indices() {
452 if c == '\n' {
453 line_offsets.push(i + 1);
454 }
455 }
456 if profile {
457 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
458 }
459
460 let start = Instant::now();
462 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
463 if profile {
464 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
465 }
466
467 let start = Instant::now();
469 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
470 if profile {
471 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
472 }
473
474 let start = Instant::now();
476 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
477 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
478 } else {
479 Vec::new()
480 };
481 if profile {
482 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
483 }
484
485 let start = Instant::now();
487 let mut lines = Self::compute_basic_line_info(
488 content,
489 &line_offsets,
490 &code_blocks,
491 flavor,
492 &html_comment_ranges,
493 &autodoc_ranges,
494 );
495 if profile {
496 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
497 }
498
499 let start = Instant::now();
501 Self::detect_html_blocks(&mut lines);
502 if profile {
503 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
504 }
505
506 let start = Instant::now();
508 Self::detect_esm_blocks(&mut lines, flavor);
509 if profile {
510 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
511 }
512
513 let start = Instant::now();
515 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
516 if profile {
517 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
518 }
519
520 let start = Instant::now();
522 let code_spans = Self::parse_code_spans(content, &lines);
523 if profile {
524 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
525 }
526
527 let start = Instant::now();
529 let (links, broken_links) =
530 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
531 if profile {
532 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
533 }
534
535 let start = Instant::now();
536 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
537 if profile {
538 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
539 }
540
541 let start = Instant::now();
542 let reference_defs = Self::parse_reference_defs(content, &lines);
543 if profile {
544 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
545 }
546
547 let start = Instant::now();
548 let list_blocks = Self::parse_list_blocks(&lines);
549 if profile {
550 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
551 }
552
553 let start = Instant::now();
555 let char_frequency = Self::compute_char_frequency(content);
556 if profile {
557 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
558 }
559
560 let start = Instant::now();
562 let table_blocks =
563 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(content, &code_blocks, &code_spans);
564 if profile {
565 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
566 }
567
568 let start = Instant::now();
570 let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
571 if profile {
572 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
573 }
574
575 let start = Instant::now();
577 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
578 if profile {
579 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
580 }
581
582 Self {
583 content,
584 line_offsets,
585 code_blocks,
586 lines,
587 links,
588 images,
589 broken_links,
590 reference_defs,
591 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
592 list_blocks,
593 char_frequency,
594 html_tags_cache: Mutex::new(None),
595 emphasis_spans_cache: Mutex::new(None),
596 table_rows_cache: Mutex::new(None),
597 bare_urls_cache: Mutex::new(None),
598 html_comment_ranges,
599 table_blocks,
600 line_index,
601 jinja_ranges,
602 flavor,
603 }
604 }
605
606 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
608 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
609
610 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
611 }
612
613 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
615 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
616
617 Arc::clone(cache.get_or_insert_with(|| {
618 Arc::new(Self::parse_html_tags(
619 self.content,
620 &self.lines,
621 &self.code_blocks,
622 self.flavor,
623 ))
624 }))
625 }
626
627 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
629 let mut cache = self
630 .emphasis_spans_cache
631 .lock()
632 .expect("Emphasis spans cache mutex poisoned");
633
634 Arc::clone(
635 cache.get_or_insert_with(|| {
636 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
637 }),
638 )
639 }
640
641 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
643 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
644
645 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
646 }
647
648 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
650 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
651
652 Arc::clone(
653 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
654 )
655 }
656
657 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
659 match self.line_offsets.binary_search(&offset) {
660 Ok(line) => (line + 1, 1),
661 Err(line) => {
662 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
663 (line, offset - line_start + 1)
664 }
665 }
666 }
667
668 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
670 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
672 return true;
673 }
674
675 self.code_spans()
677 .iter()
678 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
679 }
680
681 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
683 if line_num > 0 {
684 self.lines.get(line_num - 1)
685 } else {
686 None
687 }
688 }
689
690 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
692 self.line_info(line_num).map(|info| info.byte_offset)
693 }
694
695 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
697 let normalized_id = ref_id.to_lowercase();
698 self.reference_defs
699 .iter()
700 .find(|def| def.id == normalized_id)
701 .map(|def| def.url.as_str())
702 }
703
704 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
706 self.links.iter().filter(|link| link.line == line_num).collect()
707 }
708
709 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
711 self.images.iter().filter(|img| img.line == line_num).collect()
712 }
713
714 pub fn is_in_list_block(&self, line_num: usize) -> bool {
716 self.list_blocks
717 .iter()
718 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
719 }
720
721 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
723 self.list_blocks
724 .iter()
725 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
726 }
727
728 pub fn is_in_code_block(&self, line_num: usize) -> bool {
732 if line_num == 0 || line_num > self.lines.len() {
733 return false;
734 }
735 self.lines[line_num - 1].in_code_block
736 }
737
738 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
740 if line_num == 0 || line_num > self.lines.len() {
741 return false;
742 }
743 self.lines[line_num - 1].in_front_matter
744 }
745
746 pub fn is_in_html_block(&self, line_num: usize) -> bool {
748 if line_num == 0 || line_num > self.lines.len() {
749 return false;
750 }
751 self.lines[line_num - 1].in_html_block
752 }
753
754 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
756 if line_num == 0 || line_num > self.lines.len() {
757 return false;
758 }
759
760 let col_0indexed = if col > 0 { col - 1 } else { 0 };
764 let code_spans = self.code_spans();
765 code_spans
766 .iter()
767 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
768 }
769
770 #[inline]
773 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
774 self.reference_defs
775 .iter()
776 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
777 }
778
779 #[inline]
783 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
784 self.html_comment_ranges
785 .iter()
786 .any(|range| byte_pos >= range.start && byte_pos < range.end)
787 }
788
789 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
791 self.jinja_ranges
792 .iter()
793 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
794 }
795
796 pub fn has_char(&self, ch: char) -> bool {
798 match ch {
799 '#' => self.char_frequency.hash_count > 0,
800 '*' => self.char_frequency.asterisk_count > 0,
801 '_' => self.char_frequency.underscore_count > 0,
802 '-' => self.char_frequency.hyphen_count > 0,
803 '+' => self.char_frequency.plus_count > 0,
804 '>' => self.char_frequency.gt_count > 0,
805 '|' => self.char_frequency.pipe_count > 0,
806 '[' => self.char_frequency.bracket_count > 0,
807 '`' => self.char_frequency.backtick_count > 0,
808 '<' => self.char_frequency.lt_count > 0,
809 '!' => self.char_frequency.exclamation_count > 0,
810 '\n' => self.char_frequency.newline_count > 0,
811 _ => self.content.contains(ch), }
813 }
814
815 pub fn char_count(&self, ch: char) -> usize {
817 match ch {
818 '#' => self.char_frequency.hash_count,
819 '*' => self.char_frequency.asterisk_count,
820 '_' => self.char_frequency.underscore_count,
821 '-' => self.char_frequency.hyphen_count,
822 '+' => self.char_frequency.plus_count,
823 '>' => self.char_frequency.gt_count,
824 '|' => self.char_frequency.pipe_count,
825 '[' => self.char_frequency.bracket_count,
826 '`' => self.char_frequency.backtick_count,
827 '<' => self.char_frequency.lt_count,
828 '!' => self.char_frequency.exclamation_count,
829 '\n' => self.char_frequency.newline_count,
830 _ => self.content.matches(ch).count(), }
832 }
833
834 pub fn likely_has_headings(&self) -> bool {
836 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
838
839 pub fn likely_has_lists(&self) -> bool {
841 self.char_frequency.asterisk_count > 0
842 || self.char_frequency.hyphen_count > 0
843 || self.char_frequency.plus_count > 0
844 }
845
846 pub fn likely_has_emphasis(&self) -> bool {
848 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
849 }
850
851 pub fn likely_has_tables(&self) -> bool {
853 self.char_frequency.pipe_count > 2
854 }
855
856 pub fn likely_has_blockquotes(&self) -> bool {
858 self.char_frequency.gt_count > 0
859 }
860
861 pub fn likely_has_code(&self) -> bool {
863 self.char_frequency.backtick_count > 0
864 }
865
866 pub fn likely_has_links_or_images(&self) -> bool {
868 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
869 }
870
871 pub fn likely_has_html(&self) -> bool {
873 self.char_frequency.lt_count > 0
874 }
875
876 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
878 self.html_tags()
879 .iter()
880 .filter(|tag| tag.line == line_num)
881 .cloned()
882 .collect()
883 }
884
885 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
887 self.emphasis_spans()
888 .iter()
889 .filter(|span| span.line == line_num)
890 .cloned()
891 .collect()
892 }
893
894 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
896 self.table_rows()
897 .iter()
898 .filter(|row| row.line == line_num)
899 .cloned()
900 .collect()
901 }
902
903 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
905 self.bare_urls()
906 .iter()
907 .filter(|url| url.line == line_num)
908 .cloned()
909 .collect()
910 }
911
912 #[inline]
918 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
919 let idx = match lines.binary_search_by(|line| {
921 if byte_offset < line.byte_offset {
922 std::cmp::Ordering::Greater
923 } else if byte_offset > line.byte_offset + line.content.len() {
924 std::cmp::Ordering::Less
925 } else {
926 std::cmp::Ordering::Equal
927 }
928 }) {
929 Ok(idx) => idx,
930 Err(idx) => idx.saturating_sub(1),
931 };
932
933 let line = &lines[idx];
934 let line_num = idx + 1;
935 let col = byte_offset.saturating_sub(line.byte_offset);
936
937 (idx, line_num, col)
938 }
939
940 #[inline]
942 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
943 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
945
946 if idx > 0 {
948 let span = &code_spans[idx - 1];
949 if offset >= span.byte_offset && offset < span.byte_end {
950 return true;
951 }
952 }
953
954 false
955 }
956
957 fn parse_links(
959 content: &str,
960 lines: &[LineInfo],
961 code_blocks: &[(usize, usize)],
962 code_spans: &[CodeSpan],
963 flavor: MarkdownFlavor,
964 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
965 ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
966 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
967 use std::collections::HashSet;
968
969 let mut links = Vec::with_capacity(content.len() / 500);
970 let mut broken_links = Vec::new();
971
972 let mut found_positions = HashSet::new();
974
975 let parser = Parser::new_with_broken_link_callback(
984 content,
985 pulldown_cmark::Options::empty(),
986 Some(|link: BrokenLink<'_>| {
987 broken_links.push(BrokenLinkInfo {
988 reference: link.reference.to_string(),
989 span: link.span.clone(),
990 });
991 None
992 }),
993 )
994 .into_offset_iter();
995
996 let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
997 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1000 match event {
1001 Event::Start(Tag::Link {
1002 link_type,
1003 dest_url,
1004 id,
1005 ..
1006 }) => {
1007 link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1009 text_chunks.clear();
1010 }
1011 Event::Text(text) if !link_stack.is_empty() => {
1012 text_chunks.push((text.to_string(), range.start, range.end));
1014 }
1015 Event::Code(code) if !link_stack.is_empty() => {
1016 let code_text = format!("`{code}`");
1018 text_chunks.push((code_text, range.start, range.end));
1019 }
1020 Event::End(TagEnd::Link) => {
1021 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1022 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1024 text_chunks.clear();
1025 continue;
1026 }
1027
1028 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1030
1031 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1033 text_chunks.clear();
1034 continue;
1035 }
1036
1037 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1038
1039 let is_reference = matches!(
1040 link_type,
1041 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1042 );
1043
1044 let link_text = if start_pos < content.len() {
1047 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1048
1049 let mut close_pos = None;
1052 let mut depth = 0;
1053
1054 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1055 let mut backslash_count = 0;
1057 let mut j = i;
1058 while j > 0 && link_bytes[j - 1] == b'\\' {
1059 backslash_count += 1;
1060 j -= 1;
1061 }
1062 let is_escaped = backslash_count % 2 != 0;
1063
1064 if !is_escaped {
1065 if byte == b'[' {
1066 depth += 1;
1067 } else if byte == b']' {
1068 if depth == 0 {
1069 close_pos = Some(i);
1071 break;
1072 } else {
1073 depth -= 1;
1074 }
1075 }
1076 }
1077 }
1078
1079 if let Some(pos) = close_pos {
1080 std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1081 } else {
1082 String::new()
1083 }
1084 } else {
1085 String::new()
1086 };
1087
1088 let reference_id = if is_reference && !ref_id.is_empty() {
1090 Some(ref_id.to_lowercase())
1091 } else if is_reference {
1092 Some(link_text.to_lowercase())
1094 } else {
1095 None
1096 };
1097
1098 let has_escaped_bang = start_pos >= 2
1102 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1103 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1104
1105 let has_escaped_bracket =
1108 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1109
1110 if has_escaped_bang || has_escaped_bracket {
1111 text_chunks.clear();
1112 continue; }
1114
1115 found_positions.insert(start_pos);
1117
1118 links.push(ParsedLink {
1119 line: line_num,
1120 start_col: col_start,
1121 end_col: col_end,
1122 byte_offset: start_pos,
1123 byte_end: range.end,
1124 text: link_text,
1125 url,
1126 is_reference,
1127 reference_id,
1128 });
1129
1130 text_chunks.clear();
1131 }
1132 }
1133 _ => {}
1134 }
1135 }
1136
1137 for cap in LINK_PATTERN.captures_iter(content) {
1141 let full_match = cap.get(0).unwrap();
1142 let match_start = full_match.start();
1143 let match_end = full_match.end();
1144
1145 if found_positions.contains(&match_start) {
1147 continue;
1148 }
1149
1150 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1152 continue;
1153 }
1154
1155 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1157 continue;
1158 }
1159
1160 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1162 continue;
1163 }
1164
1165 if Self::is_offset_in_code_span(code_spans, match_start) {
1167 continue;
1168 }
1169
1170 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1172 continue;
1173 }
1174
1175 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1177
1178 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1180 continue;
1181 }
1182
1183 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1184
1185 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1186
1187 if let Some(ref_id) = cap.get(6) {
1189 let ref_id_str = ref_id.as_str();
1190 let normalized_ref = if ref_id_str.is_empty() {
1191 text.to_lowercase() } else {
1193 ref_id_str.to_lowercase()
1194 };
1195
1196 links.push(ParsedLink {
1198 line: line_num,
1199 start_col: col_start,
1200 end_col: col_end,
1201 byte_offset: match_start,
1202 byte_end: match_end,
1203 text,
1204 url: String::new(), is_reference: true,
1206 reference_id: Some(normalized_ref),
1207 });
1208 }
1209 }
1210
1211 (links, broken_links)
1212 }
1213
1214 fn parse_images(
1216 content: &str,
1217 lines: &[LineInfo],
1218 code_blocks: &[(usize, usize)],
1219 code_spans: &[CodeSpan],
1220 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1221 ) -> Vec<ParsedImage> {
1222 use crate::utils::skip_context::is_in_html_comment_ranges;
1223 use std::collections::HashSet;
1224
1225 let mut images = Vec::with_capacity(content.len() / 1000);
1227 let mut found_positions = HashSet::new();
1228
1229 let parser = Parser::new(content).into_offset_iter();
1231 let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1232 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1235 match event {
1236 Event::Start(Tag::Image {
1237 link_type,
1238 dest_url,
1239 id,
1240 ..
1241 }) => {
1242 image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1243 text_chunks.clear();
1244 }
1245 Event::Text(text) if !image_stack.is_empty() => {
1246 text_chunks.push((text.to_string(), range.start, range.end));
1247 }
1248 Event::Code(code) if !image_stack.is_empty() => {
1249 let code_text = format!("`{code}`");
1250 text_chunks.push((code_text, range.start, range.end));
1251 }
1252 Event::End(TagEnd::Image) => {
1253 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1254 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1256 continue;
1257 }
1258
1259 if Self::is_offset_in_code_span(code_spans, start_pos) {
1261 continue;
1262 }
1263
1264 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1266 continue;
1267 }
1268
1269 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1271 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1272
1273 let is_reference = matches!(
1274 link_type,
1275 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1276 );
1277
1278 let alt_text = if start_pos < content.len() {
1281 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1282
1283 let mut close_pos = None;
1286 let mut depth = 0;
1287
1288 if image_bytes.len() > 2 {
1289 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1290 let mut backslash_count = 0;
1292 let mut j = i;
1293 while j > 0 && image_bytes[j - 1] == b'\\' {
1294 backslash_count += 1;
1295 j -= 1;
1296 }
1297 let is_escaped = backslash_count % 2 != 0;
1298
1299 if !is_escaped {
1300 if byte == b'[' {
1301 depth += 1;
1302 } else if byte == b']' {
1303 if depth == 0 {
1304 close_pos = Some(i);
1306 break;
1307 } else {
1308 depth -= 1;
1309 }
1310 }
1311 }
1312 }
1313 }
1314
1315 if let Some(pos) = close_pos {
1316 std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1317 } else {
1318 String::new()
1319 }
1320 } else {
1321 String::new()
1322 };
1323
1324 let reference_id = if is_reference && !ref_id.is_empty() {
1325 Some(ref_id.to_lowercase())
1326 } else if is_reference {
1327 Some(alt_text.to_lowercase()) } else {
1329 None
1330 };
1331
1332 found_positions.insert(start_pos);
1333 images.push(ParsedImage {
1334 line: line_num,
1335 start_col: col_start,
1336 end_col: col_end,
1337 byte_offset: start_pos,
1338 byte_end: range.end,
1339 alt_text,
1340 url,
1341 is_reference,
1342 reference_id,
1343 });
1344 }
1345 }
1346 _ => {}
1347 }
1348 }
1349
1350 for cap in IMAGE_PATTERN.captures_iter(content) {
1352 let full_match = cap.get(0).unwrap();
1353 let match_start = full_match.start();
1354 let match_end = full_match.end();
1355
1356 if found_positions.contains(&match_start) {
1358 continue;
1359 }
1360
1361 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1363 continue;
1364 }
1365
1366 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1368 || Self::is_offset_in_code_span(code_spans, match_start)
1369 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1370 {
1371 continue;
1372 }
1373
1374 if let Some(ref_id) = cap.get(6) {
1376 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1377 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1378 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1379 let ref_id_str = ref_id.as_str();
1380 let normalized_ref = if ref_id_str.is_empty() {
1381 alt_text.to_lowercase()
1382 } else {
1383 ref_id_str.to_lowercase()
1384 };
1385
1386 images.push(ParsedImage {
1387 line: line_num,
1388 start_col: col_start,
1389 end_col: col_end,
1390 byte_offset: match_start,
1391 byte_end: match_end,
1392 alt_text,
1393 url: String::new(),
1394 is_reference: true,
1395 reference_id: Some(normalized_ref),
1396 });
1397 }
1398 }
1399
1400 images
1401 }
1402
1403 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1405 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1409 if line_info.in_code_block {
1411 continue;
1412 }
1413
1414 let line = &line_info.content;
1415 let line_num = line_idx + 1;
1416
1417 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1418 let id = cap.get(1).unwrap().as_str().to_lowercase();
1419 let url = cap.get(2).unwrap().as_str().to_string();
1420 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1421
1422 let match_obj = cap.get(0).unwrap();
1425 let byte_offset = line_info.byte_offset + match_obj.start();
1426 let byte_end = line_info.byte_offset + match_obj.end();
1427
1428 refs.push(ReferenceDef {
1429 line: line_num,
1430 id,
1431 url,
1432 title,
1433 byte_offset,
1434 byte_end,
1435 });
1436 }
1437 }
1438
1439 refs
1440 }
1441
#[inline]
/// Splits a line that starts with a blockquote marker into `(prefix, content)`.
///
/// The prefix covers any leading whitespace, exactly one `>` and the whitespace
/// that follows it; the content is everything after that. Only the first `>` is
/// consumed, so for `>> x` the content is `> x`. Returns `None` when the first
/// non-whitespace character is not `>`.
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let after_marker = line.trim_start().strip_prefix('>')?;
    let content = after_marker.trim_start();

    // `content` is a suffix of `line`, so whatever precedes it is the prefix.
    let (prefix, _) = line.split_at(line.len() - content.len());
    Some((prefix, content))
}
1460
#[inline]
/// Recognises a bullet-list marker at the start of `line`.
///
/// On success returns `(leading_whitespace, marker, spacing, content)` where
/// `marker` is one of `-`, `*` or `+`, `spacing` is the run of spaces/tabs
/// right after the marker (possibly empty) and `content` is the remainder.
/// Only spaces and tabs count as whitespace here. Returns `None` when the
/// first non-blank character is not a bullet marker or the line has none.
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    let is_space_or_tab = |c: char| c == ' ' || c == '\t';

    let after_indent = line.trim_start_matches(is_space_or_tab);
    let leading = &line[..line.len() - after_indent.len()];

    let mut rest = after_indent.chars();
    let marker = match rest.next() {
        Some(c) if c == '-' || c == '*' || c == '+' => c,
        _ => return None,
    };

    // The marker is a single ASCII byte, so the iterator remainder starts right after it.
    let after_marker = rest.as_str();
    let content = after_marker.trim_start_matches(is_space_or_tab);
    let spacing = &after_marker[..after_marker.len() - content.len()];

    Some((leading, marker, spacing, content))
}
1493
#[inline]
/// Recognises an ordered-list marker at the start of `line`.
///
/// On success returns `(leading_whitespace, number, delimiter, spacing, content)`:
/// `number` is the run of ASCII digits, `delimiter` is `.` or `)`, `spacing` is
/// the run of spaces/tabs after the delimiter (possibly empty) and `content` is
/// the remainder. Only spaces and tabs count as whitespace. Returns `None` when
/// there are no digits or the digits are not followed by a delimiter.
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    let is_space_or_tab = |c: char| c == ' ' || c == '\t';

    let after_indent = line.trim_start_matches(is_space_or_tab);
    let leading = &line[..line.len() - after_indent.len()];

    let digit_count = after_indent.bytes().take_while(|b| b.is_ascii_digit()).count();
    if digit_count == 0 {
        return None;
    }
    // Digits are single-byte, so `digit_count` is a valid split position.
    let (number, after_number) = after_indent.split_at(digit_count);

    let mut rest = after_number.chars();
    let delimiter = match rest.next() {
        Some(c) if c == '.' || c == ')' => c,
        _ => return None,
    };

    let after_delimiter = rest.as_str();
    let content = after_delimiter.trim_start_matches(is_space_or_tab);
    let spacing = &after_delimiter[..after_delimiter.len() - content.len()];

    Some((leading, number, delimiter, spacing, content))
}
1541
/// Builds a per-line flag vector telling which lines start inside a code block.
///
/// `line_offsets[i]` is the byte offset at which line `i` starts and
/// `code_blocks` holds `(start, end)` byte ranges. A range start that is not on
/// a UTF-8 character boundary is moved back to the previous boundary; a range
/// end is moved forward to the next boundary and clamped to `content.len()`.
/// Line `i` is flagged when `start <= line_offsets[i] < end` for some adjusted
/// range. The lookup uses `partition_point`, so `line_offsets` is expected to
/// be in ascending order.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(raw_start, raw_end) in code_blocks {
        // Walk the start backwards until it sits on a character boundary.
        let mut block_start = raw_start;
        while block_start > 0 && !content.is_char_boundary(block_start) {
            block_start -= 1;
        }

        // Walk the end forwards to a boundary, never going past the content.
        let mut block_end = raw_end;
        while block_end < total_len && !content.is_char_boundary(block_end) {
            block_end += 1;
        }
        let block_end = block_end.min(total_len);

        let first = line_offsets.partition_point(|&offset| offset < block_start);
        let past_last = line_offsets.partition_point(|&offset| offset < block_end);

        // `get_mut` yields `None` for an inverted range, which then marks nothing.
        if let Some(covered) = flags.get_mut(first..past_last) {
            for flag in covered {
                *flag = true;
            }
        }
    }

    flags
}
1596
1597 fn compute_basic_line_info(
1599 content: &str,
1600 line_offsets: &[usize],
1601 code_blocks: &[(usize, usize)],
1602 flavor: MarkdownFlavor,
1603 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1604 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1605 ) -> Vec<LineInfo> {
1606 let content_lines: Vec<&str> = content.lines().collect();
1607 let mut lines = Vec::with_capacity(content_lines.len());
1608
1609 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1611
1612 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1615
1616 for (i, line) in content_lines.iter().enumerate() {
1617 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1618 let indent = line.len() - line.trim_start().len();
1619
1620 let blockquote_parse = Self::parse_blockquote_prefix(line);
1622
1623 let is_blank = if let Some((_, content)) = blockquote_parse {
1625 content.trim().is_empty()
1627 } else {
1628 line.trim().is_empty()
1629 };
1630
1631 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1633
1634 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1636 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1637 let in_html_comment =
1639 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1640 let list_item = if !(in_code_block
1641 || is_blank
1642 || in_mkdocstrings
1643 || in_html_comment
1644 || (front_matter_end > 0 && i < front_matter_end))
1645 {
1646 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1648 (content, prefix.len())
1649 } else {
1650 (&**line, 0)
1651 };
1652
1653 if let Some((leading_spaces, marker, spacing, _content)) =
1654 Self::parse_unordered_list(line_for_list_check)
1655 {
1656 let marker_column = blockquote_prefix_len + leading_spaces.len();
1657 let content_column = marker_column + 1 + spacing.len();
1658
1659 if spacing.is_empty() {
1666 None
1667 } else {
1668 Some(ListItemInfo {
1669 marker: marker.to_string(),
1670 is_ordered: false,
1671 number: None,
1672 marker_column,
1673 content_column,
1674 })
1675 }
1676 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1677 Self::parse_ordered_list(line_for_list_check)
1678 {
1679 let marker = format!("{number_str}{delimiter}");
1680 let marker_column = blockquote_prefix_len + leading_spaces.len();
1681 let content_column = marker_column + marker.len() + spacing.len();
1682
1683 if spacing.is_empty() {
1686 None
1687 } else {
1688 Some(ListItemInfo {
1689 marker,
1690 is_ordered: true,
1691 number: number_str.parse().ok(),
1692 marker_column,
1693 content_column,
1694 })
1695 }
1696 } else {
1697 None
1698 }
1699 } else {
1700 None
1701 };
1702
1703 lines.push(LineInfo {
1704 content: line.to_string(),
1705 byte_offset,
1706 indent,
1707 is_blank,
1708 in_code_block,
1709 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1710 in_html_block: false, in_html_comment,
1712 list_item,
1713 heading: None, blockquote: None, in_mkdocstrings,
1716 in_esm_block: false, });
1718 }
1719
1720 lines
1721 }
1722
1723 fn detect_headings_and_blockquotes(
1725 content: &str,
1726 lines: &mut [LineInfo],
1727 flavor: MarkdownFlavor,
1728 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1729 ) {
1730 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1732 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1733 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1734 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1735
1736 let content_lines: Vec<&str> = content.lines().collect();
1737
1738 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1740
1741 for i in 0..lines.len() {
1743 if lines[i].in_code_block {
1744 continue;
1745 }
1746
1747 if front_matter_end > 0 && i < front_matter_end {
1749 continue;
1750 }
1751
1752 if lines[i].in_html_block {
1754 continue;
1755 }
1756
1757 let line = content_lines[i];
1758
1759 if let Some(bq) = parse_blockquote_detailed(line) {
1761 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1763
1764 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1766
1767 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1769 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1771
1772 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1776
1777 lines[i].blockquote = Some(BlockquoteInfo {
1778 nesting_level,
1779 indent: bq.indent.to_string(),
1780 marker_column,
1781 prefix,
1782 content: bq.content.to_string(),
1783 has_no_space_after_marker: has_no_space,
1784 has_multiple_spaces_after_marker: has_multiple_spaces,
1785 needs_md028_fix,
1786 });
1787 }
1788
1789 if lines[i].is_blank {
1791 continue;
1792 }
1793
1794 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1797 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1798 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1799 } else {
1800 false
1801 };
1802
1803 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1804 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1806 continue;
1807 }
1808 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1809 let hashes = caps.get(2).map_or("", |m| m.as_str());
1810 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1811 let rest = caps.get(4).map_or("", |m| m.as_str());
1812
1813 let level = hashes.len() as u8;
1814 let marker_column = leading_spaces.len();
1815
1816 let (text, has_closing, closing_seq) = {
1818 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1820 if rest[id_start..].trim_end().ends_with('}') {
1822 (&rest[..id_start], &rest[id_start..])
1824 } else {
1825 (rest, "")
1826 }
1827 } else {
1828 (rest, "")
1829 };
1830
1831 let trimmed_rest = rest_without_id.trim_end();
1833 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1834 let mut start_of_hashes = last_hash_pos;
1836 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1837 start_of_hashes -= 1;
1838 }
1839
1840 let has_space_before = start_of_hashes == 0
1842 || trimmed_rest
1843 .chars()
1844 .nth(start_of_hashes - 1)
1845 .is_some_and(|c| c.is_whitespace());
1846
1847 let potential_closing = &trimmed_rest[start_of_hashes..];
1849 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1850
1851 if is_all_hashes && has_space_before {
1852 let closing_hashes = potential_closing.to_string();
1854 let text_part = if !custom_id_part.is_empty() {
1857 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1860 } else {
1861 rest_without_id[..start_of_hashes].trim_end().to_string()
1862 };
1863 (text_part, true, closing_hashes)
1864 } else {
1865 (rest.to_string(), false, String::new())
1867 }
1868 } else {
1869 (rest.to_string(), false, String::new())
1871 }
1872 };
1873
1874 let content_column = marker_column + hashes.len() + spaces_after.len();
1875
1876 let raw_text = text.trim().to_string();
1878 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1879
1880 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1882 let next_line = content_lines[i + 1];
1883 if !lines[i + 1].in_code_block
1884 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1885 && let Some(next_line_id) =
1886 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1887 {
1888 custom_id = Some(next_line_id);
1889 }
1890 }
1891
1892 lines[i].heading = Some(HeadingInfo {
1893 level,
1894 style: HeadingStyle::ATX,
1895 marker: hashes.to_string(),
1896 marker_column,
1897 content_column,
1898 text: clean_text,
1899 custom_id,
1900 raw_text,
1901 has_closing_sequence: has_closing,
1902 closing_sequence: closing_seq,
1903 });
1904 }
1905 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1907 let next_line = content_lines[i + 1];
1908 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1909 if front_matter_end > 0 && i < front_matter_end {
1911 continue;
1912 }
1913
1914 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1916 {
1917 continue;
1918 }
1919
1920 let underline = next_line.trim();
1921
1922 if underline == "---" {
1925 continue;
1926 }
1927
1928 let current_line_trimmed = line.trim();
1930 if current_line_trimmed.contains(':')
1931 && !current_line_trimmed.starts_with('#')
1932 && !current_line_trimmed.contains('[')
1933 && !current_line_trimmed.contains("](")
1934 {
1935 continue;
1937 }
1938
1939 let level = if underline.starts_with('=') { 1 } else { 2 };
1940 let style = if level == 1 {
1941 HeadingStyle::Setext1
1942 } else {
1943 HeadingStyle::Setext2
1944 };
1945
1946 let raw_text = line.trim().to_string();
1948 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1949
1950 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1952 let attr_line = content_lines[i + 2];
1953 if !lines[i + 2].in_code_block
1954 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1955 && let Some(attr_line_id) =
1956 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1957 {
1958 custom_id = Some(attr_line_id);
1959 }
1960 }
1961
1962 lines[i].heading = Some(HeadingInfo {
1963 level,
1964 style,
1965 marker: underline.to_string(),
1966 marker_column: next_line.len() - next_line.trim_start().len(),
1967 content_column: lines[i].indent,
1968 text: clean_text,
1969 custom_id,
1970 raw_text,
1971 has_closing_sequence: false,
1972 closing_sequence: String::new(),
1973 });
1974 }
1975 }
1976 }
1977 }
1978
1979 fn detect_html_blocks(lines: &mut [LineInfo]) {
1981 const BLOCK_ELEMENTS: &[&str] = &[
1983 "address",
1984 "article",
1985 "aside",
1986 "blockquote",
1987 "details",
1988 "dialog",
1989 "dd",
1990 "div",
1991 "dl",
1992 "dt",
1993 "fieldset",
1994 "figcaption",
1995 "figure",
1996 "footer",
1997 "form",
1998 "h1",
1999 "h2",
2000 "h3",
2001 "h4",
2002 "h5",
2003 "h6",
2004 "header",
2005 "hr",
2006 "li",
2007 "main",
2008 "nav",
2009 "ol",
2010 "p",
2011 "pre",
2012 "script",
2013 "section",
2014 "style",
2015 "table",
2016 "tbody",
2017 "td",
2018 "tfoot",
2019 "th",
2020 "thead",
2021 "tr",
2022 "ul",
2023 ];
2024
2025 let mut i = 0;
2026 while i < lines.len() {
2027 if lines[i].in_code_block || lines[i].in_front_matter {
2029 i += 1;
2030 continue;
2031 }
2032
2033 let trimmed = lines[i].content.trim_start();
2034
2035 if trimmed.starts_with('<') && trimmed.len() > 1 {
2037 let after_bracket = &trimmed[1..];
2039 let is_closing = after_bracket.starts_with('/');
2040 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2041
2042 let tag_name = tag_start
2044 .chars()
2045 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2046 .collect::<String>()
2047 .to_lowercase();
2048
2049 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2051 lines[i].in_html_block = true;
2053
2054 if !is_closing {
2057 let closing_tag = format!("</{tag_name}>");
2058 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2060 let mut j = i + 1;
2061 while j < lines.len() && j < i + 100 {
2062 if !allow_blank_lines && lines[j].is_blank {
2065 break;
2066 }
2067
2068 lines[j].in_html_block = true;
2069
2070 if lines[j].content.contains(&closing_tag) {
2072 break;
2073 }
2074 j += 1;
2075 }
2076 }
2077 }
2078 }
2079
2080 i += 1;
2081 }
2082 }
2083
2084 fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2087 if !flavor.supports_esm_blocks() {
2089 return;
2090 }
2091
2092 for line in lines.iter_mut() {
2093 if line.is_blank || line.in_html_comment {
2095 continue;
2096 }
2097
2098 let trimmed = line.content.trim_start();
2100 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2101 line.in_esm_block = true;
2102 } else {
2103 break;
2105 }
2106 }
2107 }
2108
2109 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2111 let mut code_spans = Vec::new();
2112
2113 if !content.contains('`') {
2115 return code_spans;
2116 }
2117
2118 let parser = Parser::new(content).into_offset_iter();
2120
2121 for (event, range) in parser {
2122 if let Event::Code(_) = event {
2123 let start_pos = range.start;
2124 let end_pos = range.end;
2125
2126 let full_span = &content[start_pos..end_pos];
2128 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2129
2130 let content_start = start_pos + backtick_count;
2132 let content_end = end_pos - backtick_count;
2133 let span_content = if content_start < content_end {
2134 content[content_start..content_end].to_string()
2135 } else {
2136 String::new()
2137 };
2138
2139 let line_idx = lines
2142 .partition_point(|line| line.byte_offset <= start_pos)
2143 .saturating_sub(1);
2144 let line_num = line_idx + 1;
2145 let col_start = start_pos - lines[line_idx].byte_offset;
2146
2147 let end_line_idx = lines
2149 .partition_point(|line| line.byte_offset <= end_pos)
2150 .saturating_sub(1);
2151 let col_end = end_pos - lines[end_line_idx].byte_offset;
2152
2153 code_spans.push(CodeSpan {
2154 line: line_num,
2155 start_col: col_start,
2156 end_col: col_end,
2157 byte_offset: start_pos,
2158 byte_end: end_pos,
2159 backtick_count,
2160 content: span_content,
2161 });
2162 }
2163 }
2164
2165 code_spans.sort_by_key(|span| span.byte_offset);
2167
2168 code_spans
2169 }
2170
2171 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
2173 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2176 let mut last_list_item_line = 0;
2177 let mut current_indent_level = 0;
2178 let mut last_marker_width = 0;
2179
2180 for (line_idx, line_info) in lines.iter().enumerate() {
2181 let line_num = line_idx + 1;
2182
2183 if line_info.in_code_block {
2185 if let Some(ref mut block) = current_block {
2186 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
2188
2189 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2191
2192 match context {
2193 CodeBlockContext::Indented => {
2194 block.end_line = line_num;
2196 continue;
2197 }
2198 CodeBlockContext::Standalone => {
2199 let completed_block = current_block.take().unwrap();
2201 list_blocks.push(completed_block);
2202 continue;
2203 }
2204 CodeBlockContext::Adjacent => {
2205 block.end_line = line_num;
2207 continue;
2208 }
2209 }
2210 } else {
2211 continue;
2213 }
2214 }
2215
2216 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2218 caps.get(0).unwrap().as_str().to_string()
2219 } else {
2220 String::new()
2221 };
2222
2223 if let Some(list_item) = &line_info.list_item {
2225 let item_indent = list_item.marker_column;
2227 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2230 let is_nested = nesting > block.nesting_level;
2234 let same_type =
2235 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2236 let same_context = block.blockquote_prefix == blockquote_prefix;
2237 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2241 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2242
2243 let has_non_list_content = {
2245 let mut found_non_list = false;
2246 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2248
2249 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2251 let last_line = &lines[block_last_item_line - 1];
2252 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2253 log::debug!(
2254 "After problematic line {}: checking lines {} to {} for non-list content",
2255 block_last_item_line,
2256 block_last_item_line + 1,
2257 line_num
2258 );
2259 if line_num == block_last_item_line + 1 {
2261 log::debug!("Lines are consecutive, no content between");
2262 }
2263 }
2264 }
2265
2266 for check_line in (block_last_item_line + 1)..line_num {
2267 let check_idx = check_line - 1;
2268 if check_idx < lines.len() {
2269 let check_info = &lines[check_idx];
2270 let is_list_breaking_content = if check_info.in_code_block {
2272 let last_item_marker_width =
2274 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2275 lines[block_last_item_line - 1]
2276 .list_item
2277 .as_ref()
2278 .map(|li| {
2279 if li.is_ordered {
2280 li.marker.len() + 1 } else {
2282 li.marker.len()
2283 }
2284 })
2285 .unwrap_or(3) } else {
2287 3 };
2289
2290 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2291
2292 let context = CodeBlockUtils::analyze_code_block_context(
2294 lines,
2295 check_line - 1,
2296 min_continuation,
2297 );
2298
2299 matches!(context, CodeBlockContext::Standalone)
2301 } else if !check_info.is_blank && check_info.list_item.is_none() {
2302 let line_content = check_info.content.trim();
2304
2305 if check_info.heading.is_some()
2307 || line_content.starts_with("---")
2308 || line_content.starts_with("***")
2309 || line_content.starts_with("___")
2310 || (line_content.contains('|')
2311 && !line_content.contains("](")
2312 && !line_content.contains("http")
2313 && (line_content.matches('|').count() > 1
2314 || line_content.starts_with('|')
2315 || line_content.ends_with('|')))
2316 || line_content.starts_with(">")
2317 {
2318 true
2319 }
2320 else {
2322 let last_item_marker_width =
2323 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2324 lines[block_last_item_line - 1]
2325 .list_item
2326 .as_ref()
2327 .map(|li| {
2328 if li.is_ordered {
2329 li.marker.len() + 1 } else {
2331 li.marker.len()
2332 }
2333 })
2334 .unwrap_or(3) } else {
2336 3 };
2338
2339 let min_continuation =
2340 if block.is_ordered { last_item_marker_width } else { 2 };
2341 check_info.indent < min_continuation
2342 }
2343 } else {
2344 false
2345 };
2346
2347 if is_list_breaking_content {
2348 found_non_list = true;
2350 break;
2351 }
2352 }
2353 }
2354 found_non_list
2355 };
2356
2357 let mut continues_list = if is_nested {
2361 same_context && reasonable_distance && !has_non_list_content
2363 } else {
2364 let result = same_type
2366 && same_context
2367 && reasonable_distance
2368 && marker_compatible
2369 && !has_non_list_content;
2370
2371 if block.item_lines.last().is_some_and(|&last_line| {
2373 last_line > 0
2374 && last_line <= lines.len()
2375 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2376 && lines[last_line - 1].content.contains(r"\`")
2377 }) {
2378 log::debug!(
2379 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2380 );
2381 if line_num > 0 && line_num <= lines.len() {
2382 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2383 }
2384 }
2385
2386 result
2387 };
2388
2389 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2392 if block.item_lines.contains(&(line_num - 1)) {
2394 continues_list = true;
2396 }
2397 }
2398
2399 if continues_list {
2400 block.end_line = line_num;
2402 block.item_lines.push(line_num);
2403
2404 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2406 list_item.marker.len() + 1
2407 } else {
2408 list_item.marker.len()
2409 });
2410
2411 if !block.is_ordered
2413 && block.marker.is_some()
2414 && block.marker.as_ref() != Some(&list_item.marker)
2415 {
2416 block.marker = None;
2418 }
2419 } else {
2420 list_blocks.push(block.clone());
2423
2424 *block = ListBlock {
2425 start_line: line_num,
2426 end_line: line_num,
2427 is_ordered: list_item.is_ordered,
2428 marker: if list_item.is_ordered {
2429 None
2430 } else {
2431 Some(list_item.marker.clone())
2432 },
2433 blockquote_prefix: blockquote_prefix.clone(),
2434 item_lines: vec![line_num],
2435 nesting_level: nesting,
2436 max_marker_width: if list_item.is_ordered {
2437 list_item.marker.len() + 1
2438 } else {
2439 list_item.marker.len()
2440 },
2441 };
2442 }
2443 } else {
2444 current_block = Some(ListBlock {
2446 start_line: line_num,
2447 end_line: line_num,
2448 is_ordered: list_item.is_ordered,
2449 marker: if list_item.is_ordered {
2450 None
2451 } else {
2452 Some(list_item.marker.clone())
2453 },
2454 blockquote_prefix,
2455 item_lines: vec![line_num],
2456 nesting_level: nesting,
2457 max_marker_width: list_item.marker.len(),
2458 });
2459 }
2460
2461 last_list_item_line = line_num;
2462 current_indent_level = item_indent;
2463 last_marker_width = if list_item.is_ordered {
2464 list_item.marker.len() + 1 } else {
2466 list_item.marker.len()
2467 };
2468 } else if let Some(ref mut block) = current_block {
2469 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2479 lines[block.end_line - 1].content.trim_end().ends_with('\\')
2480 } else {
2481 false
2482 };
2483
2484 let min_continuation_indent = if block.is_ordered {
2488 current_indent_level + last_marker_width
2489 } else {
2490 current_indent_level + 2 };
2492
2493 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2494 block.end_line = line_num;
2496 } else if line_info.is_blank {
2497 let mut check_idx = line_idx + 1;
2500 let mut found_continuation = false;
2501
2502 while check_idx < lines.len() && lines[check_idx].is_blank {
2504 check_idx += 1;
2505 }
2506
2507 if check_idx < lines.len() {
2508 let next_line = &lines[check_idx];
2509 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2511 found_continuation = true;
2512 }
2513 else if !next_line.in_code_block
2515 && next_line.list_item.is_some()
2516 && let Some(item) = &next_line.list_item
2517 {
2518 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2519 .find(&next_line.content)
2520 .map_or(String::new(), |m| m.as_str().to_string());
2521 if item.marker_column == current_indent_level
2522 && item.is_ordered == block.is_ordered
2523 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2524 {
2525 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2528 if let Some(between_line) = lines.get(idx) {
2529 let trimmed = between_line.content.trim();
2530 if trimmed.is_empty() {
2532 return false;
2533 }
2534 let line_indent =
2536 between_line.content.len() - between_line.content.trim_start().len();
2537
2538 if trimmed.starts_with("```")
2540 || trimmed.starts_with("~~~")
2541 || trimmed.starts_with("---")
2542 || trimmed.starts_with("***")
2543 || trimmed.starts_with("___")
2544 || trimmed.starts_with(">")
2545 || trimmed.contains('|') || between_line.heading.is_some()
2547 {
2548 return true; }
2550
2551 line_indent >= min_continuation_indent
2553 } else {
2554 false
2555 }
2556 });
2557
2558 if block.is_ordered {
2559 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2562 if let Some(between_line) = lines.get(idx) {
2563 let trimmed = between_line.content.trim();
2564 if trimmed.is_empty() {
2565 return false;
2566 }
2567 trimmed.starts_with("```")
2569 || trimmed.starts_with("~~~")
2570 || trimmed.starts_with("---")
2571 || trimmed.starts_with("***")
2572 || trimmed.starts_with("___")
2573 || trimmed.starts_with(">")
2574 || trimmed.contains('|') || between_line.heading.is_some()
2576 } else {
2577 false
2578 }
2579 });
2580 found_continuation = !has_structural_separators;
2581 } else {
2582 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2584 if let Some(between_line) = lines.get(idx) {
2585 let trimmed = between_line.content.trim();
2586 if trimmed.is_empty() {
2587 return false;
2588 }
2589 trimmed.starts_with("```")
2591 || trimmed.starts_with("~~~")
2592 || trimmed.starts_with("---")
2593 || trimmed.starts_with("***")
2594 || trimmed.starts_with("___")
2595 || trimmed.starts_with(">")
2596 || trimmed.contains('|') || between_line.heading.is_some()
2598 } else {
2599 false
2600 }
2601 });
2602 found_continuation = !has_structural_separators;
2603 }
2604 }
2605 }
2606 }
2607
2608 if found_continuation {
2609 block.end_line = line_num;
2611 } else {
2612 list_blocks.push(block.clone());
2614 current_block = None;
2615 }
2616 } else {
2617 let min_required_indent = if block.is_ordered {
2620 current_indent_level + last_marker_width
2621 } else {
2622 current_indent_level + 2
2623 };
2624
2625 let line_content = line_info.content.trim();
2630 let is_structural_separator = line_info.heading.is_some()
2631 || line_content.starts_with("```")
2632 || line_content.starts_with("~~~")
2633 || line_content.starts_with("---")
2634 || line_content.starts_with("***")
2635 || line_content.starts_with("___")
2636 || line_content.starts_with(">")
2637 || (line_content.contains('|')
2638 && !line_content.contains("](")
2639 && !line_content.contains("http")
2640 && (line_content.matches('|').count() > 1
2641 || line_content.starts_with('|')
2642 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2647 && !line_info.is_blank
2648 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2649
2650 if is_lazy_continuation {
2651 let content_to_check = if !blockquote_prefix.is_empty() {
2654 line_info
2656 .content
2657 .strip_prefix(&blockquote_prefix)
2658 .unwrap_or(&line_info.content)
2659 .trim()
2660 } else {
2661 line_info.content.trim()
2662 };
2663
2664 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2665
2666 if starts_with_uppercase && last_list_item_line > 0 {
2669 list_blocks.push(block.clone());
2671 current_block = None;
2672 } else {
2673 block.end_line = line_num;
2675 }
2676 } else {
2677 list_blocks.push(block.clone());
2679 current_block = None;
2680 }
2681 }
2682 }
2683 }
2684
2685 if let Some(block) = current_block {
2687 list_blocks.push(block);
2688 }
2689
2690 merge_adjacent_list_blocks(&mut list_blocks, lines);
2692
2693 list_blocks
2694 }
2695
2696 fn compute_char_frequency(content: &str) -> CharFrequency {
2698 let mut frequency = CharFrequency::default();
2699
2700 for ch in content.chars() {
2701 match ch {
2702 '#' => frequency.hash_count += 1,
2703 '*' => frequency.asterisk_count += 1,
2704 '_' => frequency.underscore_count += 1,
2705 '-' => frequency.hyphen_count += 1,
2706 '+' => frequency.plus_count += 1,
2707 '>' => frequency.gt_count += 1,
2708 '|' => frequency.pipe_count += 1,
2709 '[' => frequency.bracket_count += 1,
2710 '`' => frequency.backtick_count += 1,
2711 '<' => frequency.lt_count += 1,
2712 '!' => frequency.exclamation_count += 1,
2713 '\n' => frequency.newline_count += 1,
2714 _ => {}
2715 }
2716 }
2717
2718 frequency
2719 }
2720
2721 fn parse_html_tags(
2723 content: &str,
2724 lines: &[LineInfo],
2725 code_blocks: &[(usize, usize)],
2726 flavor: MarkdownFlavor,
2727 ) -> Vec<HtmlTag> {
2728 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2729 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2730
2731 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2732
2733 for cap in HTML_TAG_REGEX.captures_iter(content) {
2734 let full_match = cap.get(0).unwrap();
2735 let match_start = full_match.start();
2736 let match_end = full_match.end();
2737
2738 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2740 continue;
2741 }
2742
2743 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2744 let tag_name_original = cap.get(2).unwrap().as_str();
2745 let tag_name = tag_name_original.to_lowercase();
2746 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2747
2748 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2751 continue;
2752 }
2753
2754 let mut line_num = 1;
2756 let mut col_start = match_start;
2757 let mut col_end = match_end;
2758 for (idx, line_info) in lines.iter().enumerate() {
2759 if match_start >= line_info.byte_offset {
2760 line_num = idx + 1;
2761 col_start = match_start - line_info.byte_offset;
2762 col_end = match_end - line_info.byte_offset;
2763 } else {
2764 break;
2765 }
2766 }
2767
2768 html_tags.push(HtmlTag {
2769 line: line_num,
2770 start_col: col_start,
2771 end_col: col_end,
2772 byte_offset: match_start,
2773 byte_end: match_end,
2774 tag_name,
2775 is_closing,
2776 is_self_closing,
2777 raw_content: full_match.as_str().to_string(),
2778 });
2779 }
2780
2781 html_tags
2782 }
2783
2784 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2786 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2787 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2788
2789 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2790
2791 for cap in EMPHASIS_REGEX.captures_iter(content) {
2792 let full_match = cap.get(0).unwrap();
2793 let match_start = full_match.start();
2794 let match_end = full_match.end();
2795
2796 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2798 continue;
2799 }
2800
2801 let opening_markers = cap.get(1).unwrap().as_str();
2802 let content_part = cap.get(2).unwrap().as_str();
2803 let closing_markers = cap.get(3).unwrap().as_str();
2804
2805 if opening_markers.chars().next() != closing_markers.chars().next()
2807 || opening_markers.len() != closing_markers.len()
2808 {
2809 continue;
2810 }
2811
2812 let marker = opening_markers.chars().next().unwrap();
2813 let marker_count = opening_markers.len();
2814
2815 let mut line_num = 1;
2817 let mut col_start = match_start;
2818 let mut col_end = match_end;
2819 for (idx, line_info) in lines.iter().enumerate() {
2820 if match_start >= line_info.byte_offset {
2821 line_num = idx + 1;
2822 col_start = match_start - line_info.byte_offset;
2823 col_end = match_end - line_info.byte_offset;
2824 } else {
2825 break;
2826 }
2827 }
2828
2829 emphasis_spans.push(EmphasisSpan {
2830 line: line_num,
2831 start_col: col_start,
2832 end_col: col_end,
2833 byte_offset: match_start,
2834 byte_end: match_end,
2835 marker,
2836 marker_count,
2837 content: content_part.to_string(),
2838 });
2839 }
2840
2841 emphasis_spans
2842 }
2843
2844 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2846 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2847
2848 for (line_idx, line_info) in lines.iter().enumerate() {
2849 if line_info.in_code_block || line_info.is_blank {
2851 continue;
2852 }
2853
2854 let line = &line_info.content;
2855 let line_num = line_idx + 1;
2856
2857 if !line.contains('|') {
2859 continue;
2860 }
2861
2862 let parts: Vec<&str> = line.split('|').collect();
2864 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2865
2866 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2868 let mut column_alignments = Vec::new();
2869
2870 if is_separator {
2871 for part in &parts[1..parts.len() - 1] {
2872 let trimmed = part.trim();
2874 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2875 "center".to_string()
2876 } else if trimmed.ends_with(':') {
2877 "right".to_string()
2878 } else if trimmed.starts_with(':') {
2879 "left".to_string()
2880 } else {
2881 "none".to_string()
2882 };
2883 column_alignments.push(alignment);
2884 }
2885 }
2886
2887 table_rows.push(TableRow {
2888 line: line_num,
2889 is_separator,
2890 column_count,
2891 column_alignments,
2892 });
2893 }
2894
2895 table_rows
2896 }
2897
2898 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2900 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2901
2902 for cap in BARE_URL_PATTERN.captures_iter(content) {
2904 let full_match = cap.get(0).unwrap();
2905 let match_start = full_match.start();
2906 let match_end = full_match.end();
2907
2908 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2910 continue;
2911 }
2912
2913 let preceding_char = if match_start > 0 {
2915 content.chars().nth(match_start - 1)
2916 } else {
2917 None
2918 };
2919 let following_char = content.chars().nth(match_end);
2920
2921 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2922 continue;
2923 }
2924 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2925 continue;
2926 }
2927
2928 let url = full_match.as_str();
2929 let url_type = if url.starts_with("https://") {
2930 "https"
2931 } else if url.starts_with("http://") {
2932 "http"
2933 } else if url.starts_with("ftp://") {
2934 "ftp"
2935 } else {
2936 "other"
2937 };
2938
2939 let mut line_num = 1;
2941 let mut col_start = match_start;
2942 let mut col_end = match_end;
2943 for (idx, line_info) in lines.iter().enumerate() {
2944 if match_start >= line_info.byte_offset {
2945 line_num = idx + 1;
2946 col_start = match_start - line_info.byte_offset;
2947 col_end = match_end - line_info.byte_offset;
2948 } else {
2949 break;
2950 }
2951 }
2952
2953 bare_urls.push(BareUrl {
2954 line: line_num,
2955 start_col: col_start,
2956 end_col: col_end,
2957 byte_offset: match_start,
2958 byte_end: match_end,
2959 url: url.to_string(),
2960 url_type: url_type.to_string(),
2961 });
2962 }
2963
2964 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2966 let full_match = cap.get(0).unwrap();
2967 let match_start = full_match.start();
2968 let match_end = full_match.end();
2969
2970 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2972 continue;
2973 }
2974
2975 let preceding_char = if match_start > 0 {
2977 content.chars().nth(match_start - 1)
2978 } else {
2979 None
2980 };
2981 let following_char = content.chars().nth(match_end);
2982
2983 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2984 continue;
2985 }
2986 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2987 continue;
2988 }
2989
2990 let email = full_match.as_str();
2991
2992 let mut line_num = 1;
2994 let mut col_start = match_start;
2995 let mut col_end = match_end;
2996 for (idx, line_info) in lines.iter().enumerate() {
2997 if match_start >= line_info.byte_offset {
2998 line_num = idx + 1;
2999 col_start = match_start - line_info.byte_offset;
3000 col_end = match_end - line_info.byte_offset;
3001 } else {
3002 break;
3003 }
3004 }
3005
3006 bare_urls.push(BareUrl {
3007 line: line_num,
3008 start_col: col_start,
3009 end_col: col_end,
3010 byte_offset: match_start,
3011 byte_end: match_end,
3012 url: email.to_string(),
3013 url_type: "email".to_string(),
3014 });
3015 }
3016
3017 bare_urls
3018 }
3019}
3020
3021fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3023 if list_blocks.len() < 2 {
3024 return;
3025 }
3026
3027 let mut merger = ListBlockMerger::new(lines);
3028 *list_blocks = merger.merge(list_blocks);
3029}
3030
3031struct ListBlockMerger<'a> {
3033 lines: &'a [LineInfo],
3034}
3035
3036impl<'a> ListBlockMerger<'a> {
3037 fn new(lines: &'a [LineInfo]) -> Self {
3038 Self { lines }
3039 }
3040
3041 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3042 let mut merged = Vec::with_capacity(list_blocks.len());
3043 let mut current = list_blocks[0].clone();
3044
3045 for next in list_blocks.iter().skip(1) {
3046 if self.should_merge_blocks(¤t, next) {
3047 current = self.merge_two_blocks(current, next);
3048 } else {
3049 merged.push(current);
3050 current = next.clone();
3051 }
3052 }
3053
3054 merged.push(current);
3055 merged
3056 }
3057
3058 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3060 if !self.blocks_are_compatible(current, next) {
3062 return false;
3063 }
3064
3065 let spacing = self.analyze_spacing_between(current, next);
3067 match spacing {
3068 BlockSpacing::Consecutive => true,
3069 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3070 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3071 self.can_merge_with_content_between(current, next)
3072 }
3073 }
3074 }
3075
3076 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3078 current.is_ordered == next.is_ordered
3079 && current.blockquote_prefix == next.blockquote_prefix
3080 && current.nesting_level == next.nesting_level
3081 }
3082
3083 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3085 let gap = next.start_line - current.end_line;
3086
3087 match gap {
3088 1 => BlockSpacing::Consecutive,
3089 2 => BlockSpacing::SingleBlank,
3090 _ if gap > 2 => {
3091 if self.has_only_blank_lines_between(current, next) {
3092 BlockSpacing::MultipleBlanks
3093 } else {
3094 BlockSpacing::ContentBetween
3095 }
3096 }
3097 _ => BlockSpacing::Consecutive, }
3099 }
3100
3101 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3103 if has_meaningful_content_between(current, next, self.lines) {
3106 return false; }
3108
3109 !current.is_ordered && current.marker == next.marker
3111 }
3112
3113 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3115 if has_meaningful_content_between(current, next, self.lines) {
3117 return false; }
3119
3120 current.is_ordered && next.is_ordered
3122 }
3123
3124 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3126 for line_num in (current.end_line + 1)..next.start_line {
3127 if let Some(line_info) = self.lines.get(line_num - 1)
3128 && !line_info.content.trim().is_empty()
3129 {
3130 return false;
3131 }
3132 }
3133 true
3134 }
3135
3136 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3138 current.end_line = next.end_line;
3139 current.item_lines.extend_from_slice(&next.item_lines);
3140
3141 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3143
3144 if !current.is_ordered && self.markers_differ(¤t, next) {
3146 current.marker = None; }
3148
3149 current
3150 }
3151
3152 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3154 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3155 }
3156}
3157
/// How two neighbouring list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts at most one line after the current one ends.
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// Several lines lie between the blocks and all of them are blank.
    MultipleBlanks,
    /// Several lines lie between the blocks and at least one is not blank.
    ContentBetween,
}
3166
3167fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3169 for line_num in (current.end_line + 1)..next.start_line {
3171 if let Some(line_info) = lines.get(line_num - 1) {
3172 let trimmed = line_info.content.trim();
3174
3175 if trimmed.is_empty() {
3177 continue;
3178 }
3179
3180 if line_info.heading.is_some() {
3184 return true; }
3186
3187 if is_horizontal_rule(trimmed) {
3189 return true; }
3191
3192 if trimmed.contains('|') && trimmed.len() > 1 {
3195 if !trimmed.contains("](") && !trimmed.contains("http") {
3197 let pipe_count = trimmed.matches('|').count();
3199 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3200 return true; }
3202 }
3203 }
3204
3205 if trimmed.starts_with('>') {
3207 return true; }
3209
3210 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3212 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3213
3214 let min_continuation_indent = if current.is_ordered {
3216 current.nesting_level + current.max_marker_width + 1 } else {
3218 current.nesting_level + 2
3219 };
3220
3221 if line_indent < min_continuation_indent {
3222 return true; }
3225 }
3226
3227 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3229
3230 let min_indent = if current.is_ordered {
3232 current.nesting_level + current.max_marker_width
3233 } else {
3234 current.nesting_level + 2
3235 };
3236
3237 if line_indent < min_indent {
3239 return true; }
3241
3242 }
3245 }
3246
3247 false
3249}
3250
/// Returns true when `trimmed` is a horizontal rule.
///
/// The text must be at least three bytes long, start with `-`, `*` or `_`,
/// contain that same character at least three times, and contain nothing else
/// besides spaces and tabs. The caller is expected to pass already-trimmed text.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    let mut repeats = 0;
    for ch in trimmed.chars() {
        match ch {
            c if c == rule_char => repeats += 1,
            ' ' | '\t' => {}
            // Any other character disqualifies the line.
            _ => return false,
        }
    }

    repeats >= 3
}
3274
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still has one line offset but no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline maps offsets to 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        for (offset, expected) in [(0, (1, 1)), (8, (2, 1)), (9, (3, 1)), (15, (3, 7)), (21, (4, 1))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Per-line records: content, byte offset, indentation and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces so that it agrees with the
        // `indent == 4` assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers are detected for unordered, nested and ordered items.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet is indented by two spaces so that it agrees with the
        // `marker_column == 2` assertion below.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n 2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // (byte offset, expected (line, column))
        for (offset, expected) in [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ] {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import`/`export` lines are not flagged in the Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}