1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
/// Verbose-mode regex for Markdown links: `[text](url "title")`, `[text](<url>)`
/// and `[text][id]` / `[text][]`.
///
/// Capture groups: 1 = link text, 2 = angle-bracketed URL, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID
/// (may be empty). The `s` flag lets `.` match newlines.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
21
/// Verbose-mode regex for Markdown images: the same shape as a link pattern
/// with a leading `!` (`![alt](url "title")`, `![alt][id]`, `![alt][]`).
///
/// Capture groups: 1 = alt text, 2 = angle-bracketed URL, 3 = bare URL,
/// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID
/// (may be empty).
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\] # Reference ID in group 6
        )"#
    ).unwrap()
});
35
/// Regex for a single-line reference definition: up to three leading spaces,
/// `[id]:`, a whitespace-free destination, and an optional quoted title.
///
/// Capture groups: 1 = ID, 2 = destination, 3 = double-quoted title,
/// 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
/// Regex for `http://`, `https://` and `ftp://` URLs written directly in text.
///
/// Group 1 is the scheme. The URL body stops at whitespace, angle brackets,
/// square brackets, parentheses, backslashes, quotes and backticks.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
46
/// Regex for a plain e-mail address: a local part, `@`, a dotted domain, and
/// an alphabetic top-level label of at least two characters.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
/// Regex capturing a leading blockquote prefix in group 1: optional
/// whitespace, one or more `>`, then optional whitespace.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
/// Per-line facts computed once when the lint context is built.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line.
    /// NOTE(review): presumably without the trailing newline — confirm in `compute_basic_line_info`.
    pub content: String,
    /// Byte offset of the start of this line within the whole document.
    pub byte_offset: usize,
    /// Indentation of the line. NOTE(review): unit (bytes vs. columns) is not visible here — confirm.
    pub indent: usize,
    /// Whether the line is considered blank.
    pub is_blank: bool,
    /// Whether the line lies inside a code block (reported by `is_in_code_block`).
    pub in_code_block: bool,
    /// Whether the line lies inside front matter (reported by `is_in_front_matter`).
    pub in_front_matter: bool,
    /// Whether the line lies inside an HTML block (reported by `is_in_html_block`).
    pub in_html_block: bool,
    /// Whether the line lies inside an HTML comment.
    pub in_html_comment: bool,
    /// List-item details when the line is a list item, otherwise `None`.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is a heading, otherwise `None`.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line is a blockquote line, otherwise `None`.
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether the line lies inside a mkdocstrings block.
    /// NOTE(review): presumably only set for the MkDocs flavor — confirm.
    pub in_mkdocstrings: bool,
    /// Whether the line lies inside an ESM block.
    /// NOTE(review): presumably MDX import/export — confirm in `detect_esm_blocks`.
    pub in_esm_block: bool,
}
84
/// Details of a list item found on a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text as written in the source.
    pub marker: String,
    /// `true` for ordered (numbered) items, `false` for unordered ones.
    pub is_ordered: bool,
    /// The item number, when there is one.
    pub number: Option<usize>,
    /// Column at which the marker starts. NOTE(review): byte vs. char column not visible here — confirm.
    pub marker_column: usize,
    /// Column at which the item content starts (same unit as `marker_column`).
    pub content_column: usize,
}
99
/// Syntactic form a heading was written in.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (written with leading `#` characters).
    ATX,
    /// Setext heading, first kind. NOTE(review): presumably the `=` underline (level 1) — confirm.
    Setext1,
    /// Setext heading, second kind. NOTE(review): presumably the `-` underline (level 2) — confirm.
    Setext2,
}
110
/// A link located in the document by `parse_links`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// Raw source text between the link's square brackets.
    pub text: String,
    /// Destination URL; empty for reference links found only by the regex fallback.
    pub url: String,
    /// Whether this is a reference-style link (full, collapsed, or shortcut).
    pub is_reference: bool,
    /// Lowercased reference ID for reference-style links, `None` otherwise.
    pub reference_id: Option<String>,
    /// Link kind as classified by pulldown-cmark.
    pub link_type: LinkType,
}
135
/// A reference that pulldown-cmark could not resolve, as reported through its
/// broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The unresolved reference text.
    pub reference: String,
    /// Byte range of the broken link in the document.
    pub span: std::ops::Range<usize>,
}
144
/// An image located in the document by `parse_images`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the image start (the `!`) within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// Raw source text between the image's square brackets.
    pub alt_text: String,
    /// Destination URL; empty for reference images found only by the regex fallback.
    pub url: String,
    /// Whether this is a reference-style image (full, collapsed, or shortcut).
    pub is_reference: bool,
    /// Lowercased reference ID for reference-style images, `None` otherwise.
    pub reference_id: Option<String>,
    /// Link kind as classified by pulldown-cmark.
    pub link_type: LinkType,
}
169
/// A reference definition (`[id]: url "title"`) found by `parse_reference_defs`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based number of the line holding the definition.
    pub line: usize,
    /// Reference ID, lowercased.
    pub id: String,
    /// Destination exactly as written.
    pub url: String,
    /// Title without its surrounding quotes, when one is present.
    pub title: Option<String>,
    /// Byte offset of the start of the definition within the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition within the document.
    pub byte_end: usize,
}
186
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based line number (compared against 1-based line numbers by `is_in_code_span`).
    pub line: usize,
    /// 0-based start column on that line (inclusive).
    pub start_col: usize,
    /// 0-based end column on that line (exclusive).
    pub end_col: usize,
    /// Byte offset of the span start within the document (inclusive).
    pub byte_offset: usize,
    /// Byte offset of the span end within the document (exclusive).
    pub byte_end: usize,
    /// Number of backticks in the delimiter.
    pub backtick_count: usize,
    /// Text of the span. NOTE(review): presumably excludes the backtick delimiters — confirm in `parse_code_spans`.
    pub content: String,
}
205
/// Details of a heading found on a line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level. NOTE(review): presumably 1 through 6 — confirm.
    pub level: u8,
    /// Syntax the heading was written in.
    pub style: HeadingStyle,
    /// The heading marker text as written in the source.
    pub marker: String,
    /// Column at which the marker starts.
    pub marker_column: usize,
    /// Column at which the heading content starts.
    pub content_column: usize,
    /// Heading text. NOTE(review): presumably with any custom ID and closing sequence removed — confirm.
    pub text: String,
    /// Custom ID attached to the heading, when one is present.
    pub custom_id: Option<String>,
    /// Heading text before clean-up. NOTE(review): exact difference from `text` is not visible here.
    pub raw_text: String,
    /// Whether the heading has a closing marker sequence.
    pub has_closing_sequence: bool,
    /// The closing marker sequence. NOTE(review): presumably empty when there is none — confirm.
    pub closing_sequence: String,
}
230
/// Details of a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Blockquote nesting depth. NOTE(review): presumably the number of `>` markers — confirm.
    pub nesting_level: usize,
    /// Whitespace before the first marker.
    pub indent: String,
    /// Column at which the first marker starts.
    pub marker_column: usize,
    /// The blockquote prefix as written in the source.
    pub prefix: String,
    /// Line content following the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Flag consumed by rule MD028. NOTE(review): exact condition is set elsewhere — confirm.
    pub needs_md028_fix: bool,
}
251
/// A contiguous list spanning `start_line..=end_line`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive; compared against 1-based line numbers).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the list is ordered.
    pub is_ordered: bool,
    /// Marker for the list, when one applies.
    /// NOTE(review): presumably `None` when items use mixed markers — confirm.
    pub marker: Option<String>,
    /// Blockquote prefix the list is nested under. NOTE(review): presumably empty when not quoted — confirm.
    pub blockquote_prefix: String,
    /// Line numbers of the individual list items.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the list.
    pub nesting_level: usize,
    /// Widest marker seen among the block's items.
    pub max_marker_width: usize,
}
272
273use std::sync::{Arc, Mutex};
274
/// Document-wide counters for characters that rules check for frequently.
///
/// `has_char` and `char_count` answer from these counters for the characters
/// noted on each field.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Counter reported for `#`.
    pub hash_count: usize,
    /// Counter reported for `*`.
    pub asterisk_count: usize,
    /// Counter reported for `_`.
    pub underscore_count: usize,
    /// Counter reported for `-`.
    pub hyphen_count: usize,
    /// Counter reported for `+`.
    pub plus_count: usize,
    /// Counter reported for `>`.
    pub gt_count: usize,
    /// Counter reported for `|`.
    pub pipe_count: usize,
    /// Counter reported for `[`. NOTE(review): whether `]` is also counted is decided in `compute_char_frequency`.
    pub bracket_count: usize,
    /// Counter reported for a backtick.
    pub backtick_count: usize,
    /// Counter reported for `<`.
    pub lt_count: usize,
    /// Counter reported for `!`.
    pub exclamation_count: usize,
    /// Counter reported for the newline character.
    pub newline_count: usize,
}
303
/// An HTML tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (matched against the argument of `html_tags_on_line`).
    pub line: usize,
    /// Start column of the tag on its line.
    pub start_col: usize,
    /// End column of the tag on its line.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag. NOTE(review): case normalisation, if any, happens in `parse_html_tags` — confirm.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether this is a self-closing tag.
    pub is_self_closing: bool,
    /// The tag text as written in the source.
    pub raw_content: String,
}
326
/// An emphasis span found in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (matched against the argument of `emphasis_spans_on_line`).
    pub line: usize,
    /// Start column of the span on its line.
    pub start_col: usize,
    /// End column of the span on its line.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Emphasis marker character. NOTE(review): presumably `*` or `_` — confirm.
    pub marker: char,
    /// Number of marker characters in the delimiter.
    pub marker_count: usize,
    /// Text enclosed by the markers.
    pub content: String,
}
347
/// A table row found in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (matched against the argument of `table_rows_on_line`).
    pub line: usize,
    /// Whether this row is the header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column. NOTE(review): the string values used are defined in `parse_table_rows` — confirm.
    pub column_alignments: Vec<String>,
}
360
/// A URL or e-mail address written directly in text rather than as a link.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (matched against the argument of `bare_urls_on_line`).
    pub line: usize,
    /// Start column of the URL on its line.
    pub start_col: usize,
    /// End column of the URL on its line.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched URL text.
    pub url: String,
    /// Kind of URL. NOTE(review): the set of values is defined in `parse_bare_urls` — confirm.
    pub url_type: String,
}
379
/// Pre-computed view of one Markdown document, shared by all lint rules.
///
/// Most data is computed eagerly by `LintContext::new`; the `*_cache` fields
/// are filled on first use by the accessor of the same name.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset of the start of every line (first entry is 0).
    pub line_offsets: Vec<usize>,
    /// Code block ranges as returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, indexed by 0-based line index.
    pub lines: Vec<LineInfo>,
    /// Links found in the document.
    pub links: Vec<ParsedLink>,
    /// Images found in the document.
    pub images: Vec<ParsedImage>,
    /// References pulldown-cmark reported as unresolved while parsing links.
    pub broken_links: Vec<BrokenLinkInfo>,
    /// Reference definitions found in the document.
    pub reference_defs: Vec<ReferenceDef>,
    /// Code spans; pre-filled by `new`, read through `code_spans()`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks found in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Counters for frequently queried characters.
    pub char_frequency: CharFrequency,
    /// Lazily computed HTML tags, read through `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazily computed emphasis spans, read through `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazily computed table rows, read through `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazily computed bare URLs, read through `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Byte ranges covered by HTML comments.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found in the document.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built from a copy of the document text.
    pub line_index: crate::utils::range_utils::LineIndex,
    /// Half-open byte ranges as returned by `jinja_utils::find_jinja_ranges`.
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
402
/// Borrowed pieces of a blockquote line, in source order.
struct BlockquoteComponents<'a> {
    /// Spaces and tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces and tabs directly after the marker run.
    spaces_after: &'a str,
    /// The rest of the line.
    content: &'a str,
}

/// Splits `line` into indent, `>` markers, the whitespace after them, and the
/// remaining content.
///
/// Returns `None` when the first character after the leading spaces/tabs is
/// not `>`. Only an unbroken run of `>` counts as markers, so in `> > x` the
/// second `>` is part of `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    let after_indent = line.trim_start_matches(is_blank);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // No `>` directly after the indent: not a blockquote line.
        return None;
    }
    let content = after_markers.trim_start_matches(is_blank);

    // All cut points follow ASCII characters, so they are valid char boundaries.
    let indent_end = line.len() - after_indent.len();
    let markers_end = line.len() - after_markers.len();
    let spaces_end = line.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content,
    })
}
447
448impl<'a> LintContext<'a> {
449 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
450 use std::time::Instant;
451 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
452
453 let start = Instant::now();
454 let mut line_offsets = vec![0];
455 for (i, c) in content.char_indices() {
456 if c == '\n' {
457 line_offsets.push(i + 1);
458 }
459 }
460 if profile {
461 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
462 }
463
464 let start = Instant::now();
466 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
467 if profile {
468 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
469 }
470
471 let start = Instant::now();
473 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
474 if profile {
475 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
476 }
477
478 let start = Instant::now();
480 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
481 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
482 } else {
483 Vec::new()
484 };
485 if profile {
486 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
487 }
488
489 let start = Instant::now();
491 let mut lines = Self::compute_basic_line_info(
492 content,
493 &line_offsets,
494 &code_blocks,
495 flavor,
496 &html_comment_ranges,
497 &autodoc_ranges,
498 );
499 if profile {
500 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
501 }
502
503 let start = Instant::now();
505 Self::detect_html_blocks(&mut lines);
506 if profile {
507 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
508 }
509
510 let start = Instant::now();
512 Self::detect_esm_blocks(&mut lines, flavor);
513 if profile {
514 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
515 }
516
517 let start = Instant::now();
519 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
520 if profile {
521 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
522 }
523
524 let start = Instant::now();
526 let code_spans = Self::parse_code_spans(content, &lines);
527 if profile {
528 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
529 }
530
531 let start = Instant::now();
533 let (links, broken_links) =
534 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
535 if profile {
536 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
537 }
538
539 let start = Instant::now();
540 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
541 if profile {
542 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
543 }
544
545 let start = Instant::now();
546 let reference_defs = Self::parse_reference_defs(content, &lines);
547 if profile {
548 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
549 }
550
551 let start = Instant::now();
552 let list_blocks = Self::parse_list_blocks(&lines);
553 if profile {
554 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
555 }
556
557 let start = Instant::now();
559 let char_frequency = Self::compute_char_frequency(content);
560 if profile {
561 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
562 }
563
564 let start = Instant::now();
566 let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
567 content,
568 &code_blocks,
569 &code_spans,
570 &html_comment_ranges,
571 );
572 if profile {
573 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
574 }
575
576 let start = Instant::now();
578 let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
579 if profile {
580 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
581 }
582
583 let start = Instant::now();
585 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
586 if profile {
587 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
588 }
589
590 Self {
591 content,
592 line_offsets,
593 code_blocks,
594 lines,
595 links,
596 images,
597 broken_links,
598 reference_defs,
599 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
600 list_blocks,
601 char_frequency,
602 html_tags_cache: Mutex::new(None),
603 emphasis_spans_cache: Mutex::new(None),
604 table_rows_cache: Mutex::new(None),
605 bare_urls_cache: Mutex::new(None),
606 html_comment_ranges,
607 table_blocks,
608 line_index,
609 jinja_ranges,
610 flavor,
611 }
612 }
613
614 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
616 let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
617
618 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
619 }
620
621 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
623 &self.html_comment_ranges
624 }
625
626 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
628 let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
629
630 Arc::clone(cache.get_or_insert_with(|| {
631 Arc::new(Self::parse_html_tags(
632 self.content,
633 &self.lines,
634 &self.code_blocks,
635 self.flavor,
636 ))
637 }))
638 }
639
640 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
642 let mut cache = self
643 .emphasis_spans_cache
644 .lock()
645 .expect("Emphasis spans cache mutex poisoned");
646
647 Arc::clone(
648 cache.get_or_insert_with(|| {
649 Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
650 }),
651 )
652 }
653
654 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
656 let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
657
658 Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
659 }
660
661 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
663 let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
664
665 Arc::clone(
666 cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
667 )
668 }
669
670 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
672 match self.line_offsets.binary_search(&offset) {
673 Ok(line) => (line + 1, 1),
674 Err(line) => {
675 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
676 (line, offset - line_start + 1)
677 }
678 }
679 }
680
681 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
683 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
685 return true;
686 }
687
688 self.code_spans()
690 .iter()
691 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
692 }
693
694 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
696 if line_num > 0 {
697 self.lines.get(line_num - 1)
698 } else {
699 None
700 }
701 }
702
703 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
705 self.line_info(line_num).map(|info| info.byte_offset)
706 }
707
708 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
710 let normalized_id = ref_id.to_lowercase();
711 self.reference_defs
712 .iter()
713 .find(|def| def.id == normalized_id)
714 .map(|def| def.url.as_str())
715 }
716
717 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
719 self.links.iter().filter(|link| link.line == line_num).collect()
720 }
721
722 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
724 self.images.iter().filter(|img| img.line == line_num).collect()
725 }
726
727 pub fn is_in_list_block(&self, line_num: usize) -> bool {
729 self.list_blocks
730 .iter()
731 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
732 }
733
734 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
736 self.list_blocks
737 .iter()
738 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
739 }
740
741 pub fn is_in_code_block(&self, line_num: usize) -> bool {
745 if line_num == 0 || line_num > self.lines.len() {
746 return false;
747 }
748 self.lines[line_num - 1].in_code_block
749 }
750
751 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
753 if line_num == 0 || line_num > self.lines.len() {
754 return false;
755 }
756 self.lines[line_num - 1].in_front_matter
757 }
758
759 pub fn is_in_html_block(&self, line_num: usize) -> bool {
761 if line_num == 0 || line_num > self.lines.len() {
762 return false;
763 }
764 self.lines[line_num - 1].in_html_block
765 }
766
767 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
769 if line_num == 0 || line_num > self.lines.len() {
770 return false;
771 }
772
773 let col_0indexed = if col > 0 { col - 1 } else { 0 };
777 let code_spans = self.code_spans();
778 code_spans
779 .iter()
780 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
781 }
782
783 #[inline]
786 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
787 self.reference_defs
788 .iter()
789 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
790 }
791
792 #[inline]
796 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
797 self.html_comment_ranges
798 .iter()
799 .any(|range| byte_pos >= range.start && byte_pos < range.end)
800 }
801
802 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
804 self.jinja_ranges
805 .iter()
806 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
807 }
808
809 pub fn has_char(&self, ch: char) -> bool {
811 match ch {
812 '#' => self.char_frequency.hash_count > 0,
813 '*' => self.char_frequency.asterisk_count > 0,
814 '_' => self.char_frequency.underscore_count > 0,
815 '-' => self.char_frequency.hyphen_count > 0,
816 '+' => self.char_frequency.plus_count > 0,
817 '>' => self.char_frequency.gt_count > 0,
818 '|' => self.char_frequency.pipe_count > 0,
819 '[' => self.char_frequency.bracket_count > 0,
820 '`' => self.char_frequency.backtick_count > 0,
821 '<' => self.char_frequency.lt_count > 0,
822 '!' => self.char_frequency.exclamation_count > 0,
823 '\n' => self.char_frequency.newline_count > 0,
824 _ => self.content.contains(ch), }
826 }
827
828 pub fn char_count(&self, ch: char) -> usize {
830 match ch {
831 '#' => self.char_frequency.hash_count,
832 '*' => self.char_frequency.asterisk_count,
833 '_' => self.char_frequency.underscore_count,
834 '-' => self.char_frequency.hyphen_count,
835 '+' => self.char_frequency.plus_count,
836 '>' => self.char_frequency.gt_count,
837 '|' => self.char_frequency.pipe_count,
838 '[' => self.char_frequency.bracket_count,
839 '`' => self.char_frequency.backtick_count,
840 '<' => self.char_frequency.lt_count,
841 '!' => self.char_frequency.exclamation_count,
842 '\n' => self.char_frequency.newline_count,
843 _ => self.content.matches(ch).count(), }
845 }
846
847 pub fn likely_has_headings(&self) -> bool {
849 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
851
852 pub fn likely_has_lists(&self) -> bool {
854 self.char_frequency.asterisk_count > 0
855 || self.char_frequency.hyphen_count > 0
856 || self.char_frequency.plus_count > 0
857 }
858
859 pub fn likely_has_emphasis(&self) -> bool {
861 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
862 }
863
864 pub fn likely_has_tables(&self) -> bool {
866 self.char_frequency.pipe_count > 2
867 }
868
869 pub fn likely_has_blockquotes(&self) -> bool {
871 self.char_frequency.gt_count > 0
872 }
873
874 pub fn likely_has_code(&self) -> bool {
876 self.char_frequency.backtick_count > 0
877 }
878
879 pub fn likely_has_links_or_images(&self) -> bool {
881 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
882 }
883
884 pub fn likely_has_html(&self) -> bool {
886 self.char_frequency.lt_count > 0
887 }
888
889 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
891 self.html_tags()
892 .iter()
893 .filter(|tag| tag.line == line_num)
894 .cloned()
895 .collect()
896 }
897
898 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
900 self.emphasis_spans()
901 .iter()
902 .filter(|span| span.line == line_num)
903 .cloned()
904 .collect()
905 }
906
907 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
909 self.table_rows()
910 .iter()
911 .filter(|row| row.line == line_num)
912 .cloned()
913 .collect()
914 }
915
916 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
918 self.bare_urls()
919 .iter()
920 .filter(|url| url.line == line_num)
921 .cloned()
922 .collect()
923 }
924
925 #[inline]
931 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
932 let idx = match lines.binary_search_by(|line| {
934 if byte_offset < line.byte_offset {
935 std::cmp::Ordering::Greater
936 } else if byte_offset > line.byte_offset + line.content.len() {
937 std::cmp::Ordering::Less
938 } else {
939 std::cmp::Ordering::Equal
940 }
941 }) {
942 Ok(idx) => idx,
943 Err(idx) => idx.saturating_sub(1),
944 };
945
946 let line = &lines[idx];
947 let line_num = idx + 1;
948 let col = byte_offset.saturating_sub(line.byte_offset);
949
950 (idx, line_num, col)
951 }
952
953 #[inline]
955 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
956 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
958
959 if idx > 0 {
961 let span = &code_spans[idx - 1];
962 if offset >= span.byte_offset && offset < span.byte_end {
963 return true;
964 }
965 }
966
967 false
968 }
969
970 fn parse_links(
972 content: &str,
973 lines: &[LineInfo],
974 code_blocks: &[(usize, usize)],
975 code_spans: &[CodeSpan],
976 flavor: MarkdownFlavor,
977 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
978 ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
979 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
980 use std::collections::HashSet;
981
982 let mut links = Vec::with_capacity(content.len() / 500);
983 let mut broken_links = Vec::new();
984
985 let mut found_positions = HashSet::new();
987
988 let mut options = Options::empty();
998 options.insert(Options::ENABLE_WIKILINKS);
999
1000 let parser = Parser::new_with_broken_link_callback(
1001 content,
1002 options,
1003 Some(|link: BrokenLink<'_>| {
1004 broken_links.push(BrokenLinkInfo {
1005 reference: link.reference.to_string(),
1006 span: link.span.clone(),
1007 });
1008 None
1009 }),
1010 )
1011 .into_offset_iter();
1012
1013 let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
1014 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1017 match event {
1018 Event::Start(Tag::Link {
1019 link_type,
1020 dest_url,
1021 id,
1022 ..
1023 }) => {
1024 link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1026 text_chunks.clear();
1027 }
1028 Event::Text(text) if !link_stack.is_empty() => {
1029 text_chunks.push((text.to_string(), range.start, range.end));
1031 }
1032 Event::Code(code) if !link_stack.is_empty() => {
1033 let code_text = format!("`{code}`");
1035 text_chunks.push((code_text, range.start, range.end));
1036 }
1037 Event::End(TagEnd::Link) => {
1038 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1039 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1041 text_chunks.clear();
1042 continue;
1043 }
1044
1045 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1047
1048 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1050 text_chunks.clear();
1051 continue;
1052 }
1053
1054 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1055
1056 let is_reference = matches!(
1057 link_type,
1058 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1059 );
1060
1061 let link_text = if start_pos < content.len() {
1064 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1065
1066 let mut close_pos = None;
1070 let mut depth = 0;
1071 let mut in_code_span = false;
1072
1073 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1074 let mut backslash_count = 0;
1076 let mut j = i;
1077 while j > 0 && link_bytes[j - 1] == b'\\' {
1078 backslash_count += 1;
1079 j -= 1;
1080 }
1081 let is_escaped = backslash_count % 2 != 0;
1082
1083 if byte == b'`' && !is_escaped {
1085 in_code_span = !in_code_span;
1086 }
1087
1088 if !is_escaped && !in_code_span {
1090 if byte == b'[' {
1091 depth += 1;
1092 } else if byte == b']' {
1093 if depth == 0 {
1094 close_pos = Some(i);
1096 break;
1097 } else {
1098 depth -= 1;
1099 }
1100 }
1101 }
1102 }
1103
1104 if let Some(pos) = close_pos {
1105 std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1106 } else {
1107 String::new()
1108 }
1109 } else {
1110 String::new()
1111 };
1112
1113 let reference_id = if is_reference && !ref_id.is_empty() {
1115 Some(ref_id.to_lowercase())
1116 } else if is_reference {
1117 Some(link_text.to_lowercase())
1119 } else {
1120 None
1121 };
1122
1123 let has_escaped_bang = start_pos >= 2
1127 && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1128 && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1129
1130 let has_escaped_bracket =
1133 start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1134
1135 if has_escaped_bang || has_escaped_bracket {
1136 text_chunks.clear();
1137 continue; }
1139
1140 found_positions.insert(start_pos);
1142
1143 links.push(ParsedLink {
1144 line: line_num,
1145 start_col: col_start,
1146 end_col: col_end,
1147 byte_offset: start_pos,
1148 byte_end: range.end,
1149 text: link_text,
1150 url,
1151 is_reference,
1152 reference_id,
1153 link_type,
1154 });
1155
1156 text_chunks.clear();
1157 }
1158 }
1159 _ => {}
1160 }
1161 }
1162
1163 for cap in LINK_PATTERN.captures_iter(content) {
1167 let full_match = cap.get(0).unwrap();
1168 let match_start = full_match.start();
1169 let match_end = full_match.end();
1170
1171 if found_positions.contains(&match_start) {
1173 continue;
1174 }
1175
1176 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1178 continue;
1179 }
1180
1181 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1183 continue;
1184 }
1185
1186 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1188 continue;
1189 }
1190
1191 if Self::is_offset_in_code_span(code_spans, match_start) {
1193 continue;
1194 }
1195
1196 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1198 continue;
1199 }
1200
1201 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1203
1204 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1206 continue;
1207 }
1208
1209 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1210
1211 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1212
1213 if let Some(ref_id) = cap.get(6) {
1215 let ref_id_str = ref_id.as_str();
1216 let normalized_ref = if ref_id_str.is_empty() {
1217 text.to_lowercase() } else {
1219 ref_id_str.to_lowercase()
1220 };
1221
1222 links.push(ParsedLink {
1224 line: line_num,
1225 start_col: col_start,
1226 end_col: col_end,
1227 byte_offset: match_start,
1228 byte_end: match_end,
1229 text,
1230 url: String::new(), is_reference: true,
1232 reference_id: Some(normalized_ref),
1233 link_type: LinkType::Reference, });
1235 }
1236 }
1237
1238 (links, broken_links)
1239 }
1240
1241 fn parse_images(
1243 content: &str,
1244 lines: &[LineInfo],
1245 code_blocks: &[(usize, usize)],
1246 code_spans: &[CodeSpan],
1247 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1248 ) -> Vec<ParsedImage> {
1249 use crate::utils::skip_context::is_in_html_comment_ranges;
1250 use std::collections::HashSet;
1251
1252 let mut images = Vec::with_capacity(content.len() / 1000);
1254 let mut found_positions = HashSet::new();
1255
1256 let parser = Parser::new(content).into_offset_iter();
1258 let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1259 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1262 match event {
1263 Event::Start(Tag::Image {
1264 link_type,
1265 dest_url,
1266 id,
1267 ..
1268 }) => {
1269 image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1270 text_chunks.clear();
1271 }
1272 Event::Text(text) if !image_stack.is_empty() => {
1273 text_chunks.push((text.to_string(), range.start, range.end));
1274 }
1275 Event::Code(code) if !image_stack.is_empty() => {
1276 let code_text = format!("`{code}`");
1277 text_chunks.push((code_text, range.start, range.end));
1278 }
1279 Event::End(TagEnd::Image) => {
1280 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1281 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1283 continue;
1284 }
1285
1286 if Self::is_offset_in_code_span(code_spans, start_pos) {
1288 continue;
1289 }
1290
1291 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1293 continue;
1294 }
1295
1296 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1298 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1299
1300 let is_reference = matches!(
1301 link_type,
1302 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1303 );
1304
1305 let alt_text = if start_pos < content.len() {
1308 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1309
1310 let mut close_pos = None;
1313 let mut depth = 0;
1314
1315 if image_bytes.len() > 2 {
1316 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1317 let mut backslash_count = 0;
1319 let mut j = i;
1320 while j > 0 && image_bytes[j - 1] == b'\\' {
1321 backslash_count += 1;
1322 j -= 1;
1323 }
1324 let is_escaped = backslash_count % 2 != 0;
1325
1326 if !is_escaped {
1327 if byte == b'[' {
1328 depth += 1;
1329 } else if byte == b']' {
1330 if depth == 0 {
1331 close_pos = Some(i);
1333 break;
1334 } else {
1335 depth -= 1;
1336 }
1337 }
1338 }
1339 }
1340 }
1341
1342 if let Some(pos) = close_pos {
1343 std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1344 } else {
1345 String::new()
1346 }
1347 } else {
1348 String::new()
1349 };
1350
1351 let reference_id = if is_reference && !ref_id.is_empty() {
1352 Some(ref_id.to_lowercase())
1353 } else if is_reference {
1354 Some(alt_text.to_lowercase()) } else {
1356 None
1357 };
1358
1359 found_positions.insert(start_pos);
1360 images.push(ParsedImage {
1361 line: line_num,
1362 start_col: col_start,
1363 end_col: col_end,
1364 byte_offset: start_pos,
1365 byte_end: range.end,
1366 alt_text,
1367 url,
1368 is_reference,
1369 reference_id,
1370 link_type,
1371 });
1372 }
1373 }
1374 _ => {}
1375 }
1376 }
1377
1378 for cap in IMAGE_PATTERN.captures_iter(content) {
1380 let full_match = cap.get(0).unwrap();
1381 let match_start = full_match.start();
1382 let match_end = full_match.end();
1383
1384 if found_positions.contains(&match_start) {
1386 continue;
1387 }
1388
1389 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1391 continue;
1392 }
1393
1394 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1396 || Self::is_offset_in_code_span(code_spans, match_start)
1397 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1398 {
1399 continue;
1400 }
1401
1402 if let Some(ref_id) = cap.get(6) {
1404 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1405 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1406 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1407 let ref_id_str = ref_id.as_str();
1408 let normalized_ref = if ref_id_str.is_empty() {
1409 alt_text.to_lowercase()
1410 } else {
1411 ref_id_str.to_lowercase()
1412 };
1413
1414 images.push(ParsedImage {
1415 line: line_num,
1416 start_col: col_start,
1417 end_col: col_end,
1418 byte_offset: match_start,
1419 byte_end: match_end,
1420 alt_text,
1421 url: String::new(),
1422 is_reference: true,
1423 reference_id: Some(normalized_ref),
1424 link_type: LinkType::Reference, });
1426 }
1427 }
1428
1429 images
1430 }
1431
1432 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1434 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1438 if line_info.in_code_block {
1440 continue;
1441 }
1442
1443 let line = &line_info.content;
1444 let line_num = line_idx + 1;
1445
1446 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1447 let id = cap.get(1).unwrap().as_str().to_lowercase();
1448 let url = cap.get(2).unwrap().as_str().to_string();
1449 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1450
1451 let match_obj = cap.get(0).unwrap();
1454 let byte_offset = line_info.byte_offset + match_obj.start();
1455 let byte_end = line_info.byte_offset + match_obj.end();
1456
1457 refs.push(ReferenceDef {
1458 line: line_num,
1459 id,
1460 url,
1461 title,
1462 byte_offset,
1463 byte_end,
1464 });
1465 }
1466 }
1467
1468 refs
1469 }
1470
#[inline]
/// Splits a line that begins (after optional whitespace) with `>` into
/// `(prefix, content)`.
///
/// `prefix` covers the leading whitespace, the first `>` and any whitespace that follows
/// it; `content` is the remainder of the line. Only a single `>` is consumed, so further
/// markers of a nested quote remain in `content`. Returns `None` when the first
/// non-whitespace character is not `>`.
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let content = line.trim_start().strip_prefix('>')?.trim_start();
    // `content` is a suffix of `line`, so everything before it is the prefix.
    Some(line.split_at(line.len() - content.len()))
}
1489
#[inline]
/// Recognises an unordered list marker at the start of `line`.
///
/// On success returns `(leading_whitespace, marker, spacing, content)` where `marker` is
/// one of `-`, `*`, `+`, `spacing` is the run of spaces/tabs after the marker (possibly
/// empty) and `content` is the rest of the line. Returns `None` when the first character
/// after the leading spaces/tabs is not a marker, or when the line has nothing else.
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let marker = after_indent
        .chars()
        .next()
        .filter(|c| matches!(c, '-' | '*' | '+'))?;

    // Every marker is a single ASCII byte, so slicing at 1 is always on a char boundary.
    let after_marker = &after_indent[1..];
    let content = after_marker.trim_start_matches(BLANKS);
    let spacing = &after_marker[..after_marker.len() - content.len()];

    Some((indent, marker, spacing, content))
}
1522
#[inline]
/// Recognises an ordered list marker (`<digits>.` or `<digits>)`) at the start of `line`.
///
/// On success returns `(leading_whitespace, number, delimiter, spacing, content)`:
/// `number` is the run of ASCII digits, `delimiter` is `.` or `)`, `spacing` is the run of
/// spaces/tabs after the delimiter (possibly empty) and `content` is the rest of the line.
/// Returns `None` when there are no digits or the digits are not followed by a delimiter.
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let after_digits = after_indent.trim_start_matches(|c: char| c.is_ascii_digit());
    let number = &after_indent[..after_indent.len() - after_digits.len()];
    if number.is_empty() {
        return None;
    }

    let delimiter = after_digits
        .chars()
        .next()
        .filter(|c| matches!(c, '.' | ')'))?;

    // Both delimiters are single ASCII bytes, so slicing at 1 is on a char boundary.
    let after_delimiter = &after_digits[1..];
    let content = after_delimiter.trim_start_matches(BLANKS);
    let spacing = &after_delimiter[..after_delimiter.len() - content.len()];

    Some((indent, number, delimiter, spacing, content))
}
1570
/// Produces one flag per entry of `line_offsets` telling whether that line lies inside
/// any of the `code_blocks` byte ranges.
///
/// A line is flagged when its starting offset `o` satisfies `start <= o < end` for some
/// range, after `start` has been moved back and `end` moved forward to the nearest UTF-8
/// character boundary of `content` (and `end` clamped to `content.len()`).
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(raw_start, raw_end) in code_blocks {
        // Offset 0 is always a boundary, so the backward search cannot fail.
        let block_start = (0..=raw_start)
            .rev()
            .find(|&pos| content.is_char_boundary(pos))
            .unwrap_or(0);
        // An empty search range (raw_end >= total_len) falls back to the end of the content.
        let block_end = (raw_end..total_len)
            .find(|&pos| content.is_char_boundary(pos))
            .unwrap_or(total_len);

        let first = line_offsets.partition_point(|&offset| offset < block_start);
        let last = line_offsets.partition_point(|&offset| offset < block_end);

        // `get_mut` yields `None` for an inverted range, in which case nothing is marked.
        if let Some(covered) = flags.get_mut(first..last) {
            covered.fill(true);
        }
    }

    flags
}
1625
1626 fn compute_basic_line_info(
1628 content: &str,
1629 line_offsets: &[usize],
1630 code_blocks: &[(usize, usize)],
1631 flavor: MarkdownFlavor,
1632 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1633 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1634 ) -> Vec<LineInfo> {
1635 let content_lines: Vec<&str> = content.lines().collect();
1636 let mut lines = Vec::with_capacity(content_lines.len());
1637
1638 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1640
1641 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1644
1645 for (i, line) in content_lines.iter().enumerate() {
1646 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1647 let indent = line.len() - line.trim_start().len();
1648
1649 let blockquote_parse = Self::parse_blockquote_prefix(line);
1651
1652 let is_blank = if let Some((_, content)) = blockquote_parse {
1654 content.trim().is_empty()
1656 } else {
1657 line.trim().is_empty()
1658 };
1659
1660 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1662
1663 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1665 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1666 let in_html_comment =
1668 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1669 let list_item = if !(in_code_block
1670 || is_blank
1671 || in_mkdocstrings
1672 || in_html_comment
1673 || (front_matter_end > 0 && i < front_matter_end))
1674 {
1675 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1677 (content, prefix.len())
1678 } else {
1679 (&**line, 0)
1680 };
1681
1682 if let Some((leading_spaces, marker, spacing, _content)) =
1683 Self::parse_unordered_list(line_for_list_check)
1684 {
1685 let marker_column = blockquote_prefix_len + leading_spaces.len();
1686 let content_column = marker_column + 1 + spacing.len();
1687
1688 if spacing.is_empty() {
1695 None
1696 } else {
1697 Some(ListItemInfo {
1698 marker: marker.to_string(),
1699 is_ordered: false,
1700 number: None,
1701 marker_column,
1702 content_column,
1703 })
1704 }
1705 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1706 Self::parse_ordered_list(line_for_list_check)
1707 {
1708 let marker = format!("{number_str}{delimiter}");
1709 let marker_column = blockquote_prefix_len + leading_spaces.len();
1710 let content_column = marker_column + marker.len() + spacing.len();
1711
1712 if spacing.is_empty() {
1715 None
1716 } else {
1717 Some(ListItemInfo {
1718 marker,
1719 is_ordered: true,
1720 number: number_str.parse().ok(),
1721 marker_column,
1722 content_column,
1723 })
1724 }
1725 } else {
1726 None
1727 }
1728 } else {
1729 None
1730 };
1731
1732 lines.push(LineInfo {
1733 content: line.to_string(),
1734 byte_offset,
1735 indent,
1736 is_blank,
1737 in_code_block,
1738 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1739 in_html_block: false, in_html_comment,
1741 list_item,
1742 heading: None, blockquote: None, in_mkdocstrings,
1745 in_esm_block: false, });
1747 }
1748
1749 lines
1750 }
1751
1752 fn detect_headings_and_blockquotes(
1754 content: &str,
1755 lines: &mut [LineInfo],
1756 flavor: MarkdownFlavor,
1757 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1758 ) {
1759 static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1761 LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1762 static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1763 LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1764
1765 let content_lines: Vec<&str> = content.lines().collect();
1766
1767 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1769
1770 for i in 0..lines.len() {
1772 if lines[i].in_code_block {
1773 continue;
1774 }
1775
1776 if front_matter_end > 0 && i < front_matter_end {
1778 continue;
1779 }
1780
1781 if lines[i].in_html_block {
1783 continue;
1784 }
1785
1786 let line = content_lines[i];
1787
1788 if let Some(bq) = parse_blockquote_detailed(line) {
1790 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1792
1793 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1795
1796 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1798 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1800
1801 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1805
1806 lines[i].blockquote = Some(BlockquoteInfo {
1807 nesting_level,
1808 indent: bq.indent.to_string(),
1809 marker_column,
1810 prefix,
1811 content: bq.content.to_string(),
1812 has_no_space_after_marker: has_no_space,
1813 has_multiple_spaces_after_marker: has_multiple_spaces,
1814 needs_md028_fix,
1815 });
1816 }
1817
1818 if lines[i].is_blank {
1820 continue;
1821 }
1822
1823 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1826 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1827 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1828 } else {
1829 false
1830 };
1831
1832 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1833 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1835 continue;
1836 }
1837 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1838 let hashes = caps.get(2).map_or("", |m| m.as_str());
1839 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1840 let rest = caps.get(4).map_or("", |m| m.as_str());
1841
1842 let level = hashes.len() as u8;
1843 let marker_column = leading_spaces.len();
1844
1845 let (text, has_closing, closing_seq) = {
1847 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1849 if rest[id_start..].trim_end().ends_with('}') {
1851 (&rest[..id_start], &rest[id_start..])
1853 } else {
1854 (rest, "")
1855 }
1856 } else {
1857 (rest, "")
1858 };
1859
1860 let trimmed_rest = rest_without_id.trim_end();
1862 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1863 let mut start_of_hashes = last_hash_pos;
1865 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1866 start_of_hashes -= 1;
1867 }
1868
1869 let has_space_before = start_of_hashes == 0
1871 || trimmed_rest
1872 .chars()
1873 .nth(start_of_hashes - 1)
1874 .is_some_and(|c| c.is_whitespace());
1875
1876 let potential_closing = &trimmed_rest[start_of_hashes..];
1878 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1879
1880 if is_all_hashes && has_space_before {
1881 let closing_hashes = potential_closing.to_string();
1883 let text_part = if !custom_id_part.is_empty() {
1886 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1889 } else {
1890 rest_without_id[..start_of_hashes].trim_end().to_string()
1891 };
1892 (text_part, true, closing_hashes)
1893 } else {
1894 (rest.to_string(), false, String::new())
1896 }
1897 } else {
1898 (rest.to_string(), false, String::new())
1900 }
1901 };
1902
1903 let content_column = marker_column + hashes.len() + spaces_after.len();
1904
1905 let raw_text = text.trim().to_string();
1907 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1908
1909 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1911 let next_line = content_lines[i + 1];
1912 if !lines[i + 1].in_code_block
1913 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1914 && let Some(next_line_id) =
1915 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1916 {
1917 custom_id = Some(next_line_id);
1918 }
1919 }
1920
1921 lines[i].heading = Some(HeadingInfo {
1922 level,
1923 style: HeadingStyle::ATX,
1924 marker: hashes.to_string(),
1925 marker_column,
1926 content_column,
1927 text: clean_text,
1928 custom_id,
1929 raw_text,
1930 has_closing_sequence: has_closing,
1931 closing_sequence: closing_seq,
1932 });
1933 }
1934 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1936 let next_line = content_lines[i + 1];
1937 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1938 if front_matter_end > 0 && i < front_matter_end {
1940 continue;
1941 }
1942
1943 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1945 {
1946 continue;
1947 }
1948
1949 let underline = next_line.trim();
1950
1951 if underline == "---" {
1954 continue;
1955 }
1956
1957 let current_line_trimmed = line.trim();
1959 if current_line_trimmed.contains(':')
1960 && !current_line_trimmed.starts_with('#')
1961 && !current_line_trimmed.contains('[')
1962 && !current_line_trimmed.contains("](")
1963 {
1964 continue;
1966 }
1967
1968 let level = if underline.starts_with('=') { 1 } else { 2 };
1969 let style = if level == 1 {
1970 HeadingStyle::Setext1
1971 } else {
1972 HeadingStyle::Setext2
1973 };
1974
1975 let raw_text = line.trim().to_string();
1977 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1978
1979 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1981 let attr_line = content_lines[i + 2];
1982 if !lines[i + 2].in_code_block
1983 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1984 && let Some(attr_line_id) =
1985 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1986 {
1987 custom_id = Some(attr_line_id);
1988 }
1989 }
1990
1991 lines[i].heading = Some(HeadingInfo {
1992 level,
1993 style,
1994 marker: underline.to_string(),
1995 marker_column: next_line.len() - next_line.trim_start().len(),
1996 content_column: lines[i].indent,
1997 text: clean_text,
1998 custom_id,
1999 raw_text,
2000 has_closing_sequence: false,
2001 closing_sequence: String::new(),
2002 });
2003 }
2004 }
2005 }
2006 }
2007
2008 fn detect_html_blocks(lines: &mut [LineInfo]) {
2010 const BLOCK_ELEMENTS: &[&str] = &[
2012 "address",
2013 "article",
2014 "aside",
2015 "blockquote",
2016 "details",
2017 "dialog",
2018 "dd",
2019 "div",
2020 "dl",
2021 "dt",
2022 "fieldset",
2023 "figcaption",
2024 "figure",
2025 "footer",
2026 "form",
2027 "h1",
2028 "h2",
2029 "h3",
2030 "h4",
2031 "h5",
2032 "h6",
2033 "header",
2034 "hr",
2035 "li",
2036 "main",
2037 "nav",
2038 "ol",
2039 "p",
2040 "pre",
2041 "script",
2042 "section",
2043 "style",
2044 "table",
2045 "tbody",
2046 "td",
2047 "tfoot",
2048 "th",
2049 "thead",
2050 "tr",
2051 "ul",
2052 ];
2053
2054 let mut i = 0;
2055 while i < lines.len() {
2056 if lines[i].in_code_block || lines[i].in_front_matter {
2058 i += 1;
2059 continue;
2060 }
2061
2062 let trimmed = lines[i].content.trim_start();
2063
2064 if trimmed.starts_with('<') && trimmed.len() > 1 {
2066 let after_bracket = &trimmed[1..];
2068 let is_closing = after_bracket.starts_with('/');
2069 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2070
2071 let tag_name = tag_start
2073 .chars()
2074 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2075 .collect::<String>()
2076 .to_lowercase();
2077
2078 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2080 lines[i].in_html_block = true;
2082
2083 if !is_closing {
2086 let closing_tag = format!("</{tag_name}>");
2087 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2089 let mut j = i + 1;
2090 while j < lines.len() && j < i + 100 {
2091 if !allow_blank_lines && lines[j].is_blank {
2094 break;
2095 }
2096
2097 lines[j].in_html_block = true;
2098
2099 if lines[j].content.contains(&closing_tag) {
2101 break;
2102 }
2103 j += 1;
2104 }
2105 }
2106 }
2107 }
2108
2109 i += 1;
2110 }
2111 }
2112
2113 fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2116 if !flavor.supports_esm_blocks() {
2118 return;
2119 }
2120
2121 for line in lines.iter_mut() {
2122 if line.is_blank || line.in_html_comment {
2124 continue;
2125 }
2126
2127 let trimmed = line.content.trim_start();
2129 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2130 line.in_esm_block = true;
2131 } else {
2132 break;
2134 }
2135 }
2136 }
2137
2138 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2140 let mut code_spans = Vec::new();
2141
2142 if !content.contains('`') {
2144 return code_spans;
2145 }
2146
2147 let parser = Parser::new(content).into_offset_iter();
2149
2150 for (event, range) in parser {
2151 if let Event::Code(_) = event {
2152 let start_pos = range.start;
2153 let end_pos = range.end;
2154
2155 let full_span = &content[start_pos..end_pos];
2157 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2158
2159 let content_start = start_pos + backtick_count;
2161 let content_end = end_pos - backtick_count;
2162 let span_content = if content_start < content_end {
2163 content[content_start..content_end].to_string()
2164 } else {
2165 String::new()
2166 };
2167
2168 let line_idx = lines
2171 .partition_point(|line| line.byte_offset <= start_pos)
2172 .saturating_sub(1);
2173 let line_num = line_idx + 1;
2174 let col_start = start_pos - lines[line_idx].byte_offset;
2175
2176 let end_line_idx = lines
2178 .partition_point(|line| line.byte_offset <= end_pos)
2179 .saturating_sub(1);
2180 let col_end = end_pos - lines[end_line_idx].byte_offset;
2181
2182 code_spans.push(CodeSpan {
2183 line: line_num,
2184 start_col: col_start,
2185 end_col: col_end,
2186 byte_offset: start_pos,
2187 byte_end: end_pos,
2188 backtick_count,
2189 content: span_content,
2190 });
2191 }
2192 }
2193
2194 code_spans.sort_by_key(|span| span.byte_offset);
2196
2197 code_spans
2198 }
2199
/// Groups list items and their continuation lines into `ListBlock`s.
///
/// Walks `lines` once while tracking a single "current" block. A list item either extends
/// the current block or closes it and starts a new one; a non-item line either extends
/// the current block (continuation) or closes it. All line numbers stored in the blocks
/// are 1-based. The collected blocks are finally handed to `merge_adjacent_list_blocks`.
fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
    let mut list_blocks = Vec::with_capacity(lines.len() / 10);
    // The block currently being extended, if any.
    let mut current_block: Option<ListBlock> = None;
    // State about the most recent list item: its 1-based line number, its marker column,
    // and its marker width (ordered markers count one extra column).
    let mut last_list_item_line = 0;
    let mut current_indent_level = 0;
    let mut last_marker_width = 0;

    for (line_idx, line_info) in lines.iter().enumerate() {
        let line_num = line_idx + 1;

        // Code-block lines never start or continue an item by themselves; they either
        // extend the current block or terminate it, depending on how the code block
        // relates to the list.
        if line_info.in_code_block {
            if let Some(ref mut block) = current_block {
                let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);

                let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);

                match context {
                    CodeBlockContext::Indented => {
                        block.end_line = line_num;
                        continue;
                    }
                    CodeBlockContext::Standalone => {
                        // A standalone code block ends the list.
                        let completed_block = current_block.take().unwrap();
                        list_blocks.push(completed_block);
                        continue;
                    }
                    CodeBlockContext::Adjacent => {
                        block.end_line = line_num;
                        continue;
                    }
                }
            } else {
                continue;
            }
        }

        // Leading blockquote prefix of this line (empty when the line is not quoted).
        let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
            caps.get(0).unwrap().as_str().to_string()
        } else {
            String::new()
        };

        if let Some(list_item) = &line_info.list_item {
            let item_indent = list_item.marker_column;
            // Nesting level is approximated as two columns of marker indent per level.
            let nesting = item_indent / 2;
            if let Some(ref mut block) = current_block {
                let is_nested = nesting > block.nesting_level;
                let same_type =
                    (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
                let same_context = block.blockquote_prefix == blockquote_prefix;
                // At most one line may sit between the previous item and this one.
                let reasonable_distance = line_num <= last_list_item_line + 2;
                // Ordered blocks, and blocks whose marker was already cleared, accept any marker.
                let marker_compatible =
                    block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);

                // True when some line between the block's last item and this item breaks the list.
                let has_non_list_content = {
                    let mut found_non_list = false;
                    let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);

                    // Diagnostic logging only: fires for one specific input shape (a line
                    // containing both "`sqlalchemy`" and an escaped backtick) and has no
                    // effect on the result.
                    if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                        let last_line = &lines[block_last_item_line - 1];
                        if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
                            log::debug!(
                                "After problematic line {}: checking lines {} to {} for non-list content",
                                block_last_item_line,
                                block_last_item_line + 1,
                                line_num
                            );
                            if line_num == block_last_item_line + 1 {
                                log::debug!("Lines are consecutive, no content between");
                            }
                        }
                    }

                    for check_line in (block_last_item_line + 1)..line_num {
                        let check_idx = check_line - 1;
                        if check_idx < lines.len() {
                            let check_info = &lines[check_idx];
                            let is_list_breaking_content = if check_info.in_code_block {
                                // Marker width of the block's last item; 3 is the fallback
                                // when that line carries no list item.
                                let last_item_marker_width =
                                    if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                        lines[block_last_item_line - 1]
                                            .list_item
                                            .as_ref()
                                            .map(|li| {
                                                if li.is_ordered {
                                                    li.marker.len() + 1
                                                } else {
                                                    li.marker.len()
                                                }
                                            })
                                            .unwrap_or(3)
                                    } else {
                                        3
                                    };

                                let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };

                                let context = CodeBlockUtils::analyze_code_block_context(
                                    lines,
                                    check_line - 1,
                                    min_continuation,
                                );

                                // Only a standalone code block breaks the list.
                                matches!(context, CodeBlockContext::Standalone)
                            } else if !check_info.is_blank && check_info.list_item.is_none() {
                                let line_content = check_info.content.trim();

                                // Headings, rule-like lines, table-like rows and blockquotes
                                // always break the list.
                                if check_info.heading.is_some()
                                    || line_content.starts_with("---")
                                    || line_content.starts_with("***")
                                    || line_content.starts_with("___")
                                    || (line_content.contains('|')
                                        && !line_content.contains("](")
                                        && !line_content.contains("http")
                                        && (line_content.matches('|').count() > 1
                                            || line_content.starts_with('|')
                                            || line_content.ends_with('|')))
                                    || line_content.starts_with(">")
                                {
                                    true
                                }
                                else {
                                    // Any other text breaks the list only when it is indented
                                    // less than the continuation indent.
                                    let last_item_marker_width =
                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                            lines[block_last_item_line - 1]
                                                .list_item
                                                .as_ref()
                                                .map(|li| {
                                                    if li.is_ordered {
                                                        li.marker.len() + 1
                                                    } else {
                                                        li.marker.len()
                                                    }
                                                })
                                                .unwrap_or(3)
                                        } else {
                                            3
                                        };

                                    let min_continuation =
                                        if block.is_ordered { last_item_marker_width } else { 2 };
                                    check_info.indent < min_continuation
                                }
                            } else {
                                // Blank lines and list-item lines never break the list here.
                                false
                            };

                            if is_list_breaking_content {
                                found_non_list = true;
                                break;
                            }
                        }
                    }
                    found_non_list
                };

                // Nested items may differ in type and marker from the block; same-level
                // items must additionally match both.
                let mut continues_list = if is_nested {
                    same_context && reasonable_distance && !has_non_list_content
                } else {
                    let result = same_type
                        && same_context
                        && reasonable_distance
                        && marker_compatible
                        && !has_non_list_content;

                    // Diagnostic logging only (same trigger as above).
                    if block.item_lines.last().is_some_and(|&last_line| {
                        last_line > 0
                            && last_line <= lines.len()
                            && lines[last_line - 1].content.contains(r"`sqlalchemy`")
                            && lines[last_line - 1].content.contains(r"\`")
                    }) {
                        log::debug!(
                            "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
                        );
                        if line_num > 0 && line_num <= lines.len() {
                            log::debug!("Current line content: {:?}", lines[line_num - 1].content);
                        }
                    }

                    result
                };

                // Fallback: when the block ended on the immediately preceding line and that
                // line was itself a list item, keep this item in the block even though the
                // checks above rejected it.
                if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
                    if block.item_lines.contains(&(line_num - 1)) {
                        continues_list = true;
                    }
                }

                if continues_list {
                    block.end_line = line_num;
                    block.item_lines.push(line_num);

                    block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
                        list_item.marker.len() + 1
                    } else {
                        list_item.marker.len()
                    });

                    // An unordered block that mixes markers no longer has a single marker.
                    if !block.is_ordered
                        && block.marker.is_some()
                        && block.marker.as_ref() != Some(&list_item.marker)
                    {
                        block.marker = None;
                    }
                } else {
                    // Close the current block and start a fresh one at this item.
                    list_blocks.push(block.clone());

                    *block = ListBlock {
                        start_line: line_num,
                        end_line: line_num,
                        is_ordered: list_item.is_ordered,
                        marker: if list_item.is_ordered {
                            None
                        } else {
                            Some(list_item.marker.clone())
                        },
                        blockquote_prefix: blockquote_prefix.clone(),
                        item_lines: vec![line_num],
                        nesting_level: nesting,
                        max_marker_width: if list_item.is_ordered {
                            list_item.marker.len() + 1
                        } else {
                            list_item.marker.len()
                        },
                    };
                }
            } else {
                // First item seen while no block is open.
                // NOTE(review): unlike the replacement block built above, this does not add 1
                // to `max_marker_width` for ordered items — confirm whether that is intended.
                current_block = Some(ListBlock {
                    start_line: line_num,
                    end_line: line_num,
                    is_ordered: list_item.is_ordered,
                    marker: if list_item.is_ordered {
                        None
                    } else {
                        Some(list_item.marker.clone())
                    },
                    blockquote_prefix,
                    item_lines: vec![line_num],
                    nesting_level: nesting,
                    max_marker_width: list_item.marker.len(),
                });
            }

            last_list_item_line = line_num;
            current_indent_level = item_indent;
            last_marker_width = if list_item.is_ordered {
                list_item.marker.len() + 1
            } else {
                list_item.marker.len()
            };
        } else if let Some(ref mut block) = current_block {
            // Non-item line while a block is open: decide whether it continues the block.

            // A trailing backslash on the block's last line keeps the next line in the block.
            let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
                lines[block.end_line - 1].content.trim_end().ends_with('\\')
            } else {
                false
            };

            // Indent a line needs in order to count as item content.
            let min_continuation_indent = if block.is_ordered {
                current_indent_level + last_marker_width
            } else {
                current_indent_level + 2
            };

            if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
                block.end_line = line_num;
            } else if line_info.is_blank {
                // Blank line: look ahead past the run of blank lines to see whether the
                // list resumes.
                let mut check_idx = line_idx + 1;
                let mut found_continuation = false;

                while check_idx < lines.len() && lines[check_idx].is_blank {
                    check_idx += 1;
                }

                if check_idx < lines.len() {
                    let next_line = &lines[check_idx];
                    if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
                        // Sufficiently indented content follows the blank run.
                        found_continuation = true;
                    }
                    else if !next_line.in_code_block
                        && next_line.list_item.is_some()
                        && let Some(item) = &next_line.list_item
                    {
                        let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
                            .find(&next_line.content)
                            .map_or(String::new(), |m| m.as_str().to_string());
                        // A sibling item: same column, same list type, same quote context.
                        if item.marker_column == current_indent_level
                            && item.is_ordered == block.is_ordered
                            && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
                        {
                            // NOTE(review): this value is computed but never read.
                            let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
                                if let Some(between_line) = lines.get(idx) {
                                    let trimmed = between_line.content.trim();
                                    if trimmed.is_empty() {
                                        return false;
                                    }
                                    let line_indent =
                                        between_line.content.len() - between_line.content.trim_start().len();

                                    if trimmed.starts_with("```")
                                        || trimmed.starts_with("~~~")
                                        || trimmed.starts_with("---")
                                        || trimmed.starts_with("***")
                                        || trimmed.starts_with("___")
                                        || trimmed.starts_with(">")
                                        || trimmed.contains('|')
                                        || between_line.heading.is_some()
                                    {
                                        return true;
                                    }

                                    line_indent >= min_continuation_indent
                                } else {
                                    false
                                }
                            });

                            // NOTE(review): both branches below run the same check; every line
                            // in the scanned range was flagged `is_blank` by the loop above.
                            if block.is_ordered {
                                let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                    if let Some(between_line) = lines.get(idx) {
                                        let trimmed = between_line.content.trim();
                                        if trimmed.is_empty() {
                                            return false;
                                        }
                                        trimmed.starts_with("```")
                                            || trimmed.starts_with("~~~")
                                            || trimmed.starts_with("---")
                                            || trimmed.starts_with("***")
                                            || trimmed.starts_with("___")
                                            || trimmed.starts_with(">")
                                            || trimmed.contains('|')
                                            || between_line.heading.is_some()
                                    } else {
                                        false
                                    }
                                });
                                found_continuation = !has_structural_separators;
                            } else {
                                let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                    if let Some(between_line) = lines.get(idx) {
                                        let trimmed = between_line.content.trim();
                                        if trimmed.is_empty() {
                                            return false;
                                        }
                                        trimmed.starts_with("```")
                                            || trimmed.starts_with("~~~")
                                            || trimmed.starts_with("---")
                                            || trimmed.starts_with("***")
                                            || trimmed.starts_with("___")
                                            || trimmed.starts_with(">")
                                            || trimmed.contains('|')
                                            || between_line.heading.is_some()
                                    } else {
                                        false
                                    }
                                });
                                found_continuation = !has_structural_separators;
                            }
                        }
                    }
                }

                if found_continuation {
                    block.end_line = line_num;
                } else {
                    list_blocks.push(block.clone());
                    current_block = None;
                }
            } else {
                // Non-blank, under-indented line: possibly a lazy continuation of the item.
                let min_required_indent = if block.is_ordered {
                    current_indent_level + last_marker_width
                } else {
                    current_indent_level + 2
                };

                let line_content = line_info.content.trim();
                // Headings, fences, rule-like lines, blockquotes and table-like rows always
                // end the block.
                let is_structural_separator = line_info.heading.is_some()
                    || line_content.starts_with("```")
                    || line_content.starts_with("~~~")
                    || line_content.starts_with("---")
                    || line_content.starts_with("***")
                    || line_content.starts_with("___")
                    || line_content.starts_with(">")
                    || (line_content.contains('|')
                        && !line_content.contains("](")
                        && !line_content.contains("http")
                        && (line_content.matches('|').count() > 1
                            || line_content.starts_with('|')
                            || line_content.ends_with('|')));
                // Candidate lazy continuation: unindented text, or text indented at least
                // as far as item content.
                let is_lazy_continuation = !is_structural_separator
                    && !line_info.is_blank
                    && (line_info.indent == 0 || line_info.indent >= min_required_indent);

                if is_lazy_continuation {
                    let content_to_check = if !blockquote_prefix.is_empty() {
                        line_info
                            .content
                            .strip_prefix(&blockquote_prefix)
                            .unwrap_or(&line_info.content)
                            .trim()
                    } else {
                        line_info.content.trim()
                    };

                    let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());

                    // Heuristic: text beginning with an uppercase letter is taken as a new
                    // paragraph and ends the block instead of continuing it.
                    if starts_with_uppercase && last_list_item_line > 0 {
                        list_blocks.push(block.clone());
                        current_block = None;
                    } else {
                        block.end_line = line_num;
                    }
                } else {
                    list_blocks.push(block.clone());
                    current_block = None;
                }
            }
        }
    }

    // Flush the block that was still open at the end of the document.
    if let Some(block) = current_block {
        list_blocks.push(block);
    }

    merge_adjacent_list_blocks(&mut list_blocks, lines);

    list_blocks
}
2724
2725 fn compute_char_frequency(content: &str) -> CharFrequency {
2727 let mut frequency = CharFrequency::default();
2728
2729 for ch in content.chars() {
2730 match ch {
2731 '#' => frequency.hash_count += 1,
2732 '*' => frequency.asterisk_count += 1,
2733 '_' => frequency.underscore_count += 1,
2734 '-' => frequency.hyphen_count += 1,
2735 '+' => frequency.plus_count += 1,
2736 '>' => frequency.gt_count += 1,
2737 '|' => frequency.pipe_count += 1,
2738 '[' => frequency.bracket_count += 1,
2739 '`' => frequency.backtick_count += 1,
2740 '<' => frequency.lt_count += 1,
2741 '!' => frequency.exclamation_count += 1,
2742 '\n' => frequency.newline_count += 1,
2743 _ => {}
2744 }
2745 }
2746
2747 frequency
2748 }
2749
2750 fn parse_html_tags(
2752 content: &str,
2753 lines: &[LineInfo],
2754 code_blocks: &[(usize, usize)],
2755 flavor: MarkdownFlavor,
2756 ) -> Vec<HtmlTag> {
2757 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2758 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2759
2760 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2761
2762 for cap in HTML_TAG_REGEX.captures_iter(content) {
2763 let full_match = cap.get(0).unwrap();
2764 let match_start = full_match.start();
2765 let match_end = full_match.end();
2766
2767 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2769 continue;
2770 }
2771
2772 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2773 let tag_name_original = cap.get(2).unwrap().as_str();
2774 let tag_name = tag_name_original.to_lowercase();
2775 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2776
2777 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2780 continue;
2781 }
2782
2783 let mut line_num = 1;
2785 let mut col_start = match_start;
2786 let mut col_end = match_end;
2787 for (idx, line_info) in lines.iter().enumerate() {
2788 if match_start >= line_info.byte_offset {
2789 line_num = idx + 1;
2790 col_start = match_start - line_info.byte_offset;
2791 col_end = match_end - line_info.byte_offset;
2792 } else {
2793 break;
2794 }
2795 }
2796
2797 html_tags.push(HtmlTag {
2798 line: line_num,
2799 start_col: col_start,
2800 end_col: col_end,
2801 byte_offset: match_start,
2802 byte_end: match_end,
2803 tag_name,
2804 is_closing,
2805 is_self_closing,
2806 raw_content: full_match.as_str().to_string(),
2807 });
2808 }
2809
2810 html_tags
2811 }
2812
2813 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2815 static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2816 LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2817
2818 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2819
2820 for cap in EMPHASIS_REGEX.captures_iter(content) {
2821 let full_match = cap.get(0).unwrap();
2822 let match_start = full_match.start();
2823 let match_end = full_match.end();
2824
2825 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2827 continue;
2828 }
2829
2830 let opening_markers = cap.get(1).unwrap().as_str();
2831 let content_part = cap.get(2).unwrap().as_str();
2832 let closing_markers = cap.get(3).unwrap().as_str();
2833
2834 if opening_markers.chars().next() != closing_markers.chars().next()
2836 || opening_markers.len() != closing_markers.len()
2837 {
2838 continue;
2839 }
2840
2841 let marker = opening_markers.chars().next().unwrap();
2842 let marker_count = opening_markers.len();
2843
2844 let mut line_num = 1;
2846 let mut col_start = match_start;
2847 let mut col_end = match_end;
2848 for (idx, line_info) in lines.iter().enumerate() {
2849 if match_start >= line_info.byte_offset {
2850 line_num = idx + 1;
2851 col_start = match_start - line_info.byte_offset;
2852 col_end = match_end - line_info.byte_offset;
2853 } else {
2854 break;
2855 }
2856 }
2857
2858 emphasis_spans.push(EmphasisSpan {
2859 line: line_num,
2860 start_col: col_start,
2861 end_col: col_end,
2862 byte_offset: match_start,
2863 byte_end: match_end,
2864 marker,
2865 marker_count,
2866 content: content_part.to_string(),
2867 });
2868 }
2869
2870 emphasis_spans
2871 }
2872
2873 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2875 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2876
2877 for (line_idx, line_info) in lines.iter().enumerate() {
2878 if line_info.in_code_block || line_info.is_blank {
2880 continue;
2881 }
2882
2883 let line = &line_info.content;
2884 let line_num = line_idx + 1;
2885
2886 if !line.contains('|') {
2888 continue;
2889 }
2890
2891 let parts: Vec<&str> = line.split('|').collect();
2893 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2894
2895 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2897 let mut column_alignments = Vec::new();
2898
2899 if is_separator {
2900 for part in &parts[1..parts.len() - 1] {
2901 let trimmed = part.trim();
2903 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2904 "center".to_string()
2905 } else if trimmed.ends_with(':') {
2906 "right".to_string()
2907 } else if trimmed.starts_with(':') {
2908 "left".to_string()
2909 } else {
2910 "none".to_string()
2911 };
2912 column_alignments.push(alignment);
2913 }
2914 }
2915
2916 table_rows.push(TableRow {
2917 line: line_num,
2918 is_separator,
2919 column_count,
2920 column_alignments,
2921 });
2922 }
2923
2924 table_rows
2925 }
2926
2927 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2929 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2930
2931 for cap in BARE_URL_PATTERN.captures_iter(content) {
2933 let full_match = cap.get(0).unwrap();
2934 let match_start = full_match.start();
2935 let match_end = full_match.end();
2936
2937 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2939 continue;
2940 }
2941
2942 let preceding_char = if match_start > 0 {
2944 content.chars().nth(match_start - 1)
2945 } else {
2946 None
2947 };
2948 let following_char = content.chars().nth(match_end);
2949
2950 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2951 continue;
2952 }
2953 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2954 continue;
2955 }
2956
2957 let url = full_match.as_str();
2958 let url_type = if url.starts_with("https://") {
2959 "https"
2960 } else if url.starts_with("http://") {
2961 "http"
2962 } else if url.starts_with("ftp://") {
2963 "ftp"
2964 } else {
2965 "other"
2966 };
2967
2968 let mut line_num = 1;
2970 let mut col_start = match_start;
2971 let mut col_end = match_end;
2972 for (idx, line_info) in lines.iter().enumerate() {
2973 if match_start >= line_info.byte_offset {
2974 line_num = idx + 1;
2975 col_start = match_start - line_info.byte_offset;
2976 col_end = match_end - line_info.byte_offset;
2977 } else {
2978 break;
2979 }
2980 }
2981
2982 bare_urls.push(BareUrl {
2983 line: line_num,
2984 start_col: col_start,
2985 end_col: col_end,
2986 byte_offset: match_start,
2987 byte_end: match_end,
2988 url: url.to_string(),
2989 url_type: url_type.to_string(),
2990 });
2991 }
2992
2993 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2995 let full_match = cap.get(0).unwrap();
2996 let match_start = full_match.start();
2997 let match_end = full_match.end();
2998
2999 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3001 continue;
3002 }
3003
3004 let preceding_char = if match_start > 0 {
3006 content.chars().nth(match_start - 1)
3007 } else {
3008 None
3009 };
3010 let following_char = content.chars().nth(match_end);
3011
3012 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3013 continue;
3014 }
3015 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3016 continue;
3017 }
3018
3019 let email = full_match.as_str();
3020
3021 let mut line_num = 1;
3023 let mut col_start = match_start;
3024 let mut col_end = match_end;
3025 for (idx, line_info) in lines.iter().enumerate() {
3026 if match_start >= line_info.byte_offset {
3027 line_num = idx + 1;
3028 col_start = match_start - line_info.byte_offset;
3029 col_end = match_end - line_info.byte_offset;
3030 } else {
3031 break;
3032 }
3033 }
3034
3035 bare_urls.push(BareUrl {
3036 line: line_num,
3037 start_col: col_start,
3038 end_col: col_end,
3039 byte_offset: match_start,
3040 byte_end: match_end,
3041 url: email.to_string(),
3042 url_type: "email".to_string(),
3043 });
3044 }
3045
3046 bare_urls
3047 }
3048}
3049
3050fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3052 if list_blocks.len() < 2 {
3053 return;
3054 }
3055
3056 let mut merger = ListBlockMerger::new(lines);
3057 *list_blocks = merger.merge(list_blocks);
3058}
3059
3060struct ListBlockMerger<'a> {
3062 lines: &'a [LineInfo],
3063}
3064
3065impl<'a> ListBlockMerger<'a> {
3066 fn new(lines: &'a [LineInfo]) -> Self {
3067 Self { lines }
3068 }
3069
3070 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3071 let mut merged = Vec::with_capacity(list_blocks.len());
3072 let mut current = list_blocks[0].clone();
3073
3074 for next in list_blocks.iter().skip(1) {
3075 if self.should_merge_blocks(¤t, next) {
3076 current = self.merge_two_blocks(current, next);
3077 } else {
3078 merged.push(current);
3079 current = next.clone();
3080 }
3081 }
3082
3083 merged.push(current);
3084 merged
3085 }
3086
3087 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3089 if !self.blocks_are_compatible(current, next) {
3091 return false;
3092 }
3093
3094 let spacing = self.analyze_spacing_between(current, next);
3096 match spacing {
3097 BlockSpacing::Consecutive => true,
3098 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3099 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3100 self.can_merge_with_content_between(current, next)
3101 }
3102 }
3103 }
3104
3105 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3107 current.is_ordered == next.is_ordered
3108 && current.blockquote_prefix == next.blockquote_prefix
3109 && current.nesting_level == next.nesting_level
3110 }
3111
3112 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3114 let gap = next.start_line - current.end_line;
3115
3116 match gap {
3117 1 => BlockSpacing::Consecutive,
3118 2 => BlockSpacing::SingleBlank,
3119 _ if gap > 2 => {
3120 if self.has_only_blank_lines_between(current, next) {
3121 BlockSpacing::MultipleBlanks
3122 } else {
3123 BlockSpacing::ContentBetween
3124 }
3125 }
3126 _ => BlockSpacing::Consecutive, }
3128 }
3129
3130 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3132 if has_meaningful_content_between(current, next, self.lines) {
3135 return false; }
3137
3138 !current.is_ordered && current.marker == next.marker
3140 }
3141
3142 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3144 if has_meaningful_content_between(current, next, self.lines) {
3146 return false; }
3148
3149 current.is_ordered && next.is_ordered
3151 }
3152
3153 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3155 for line_num in (current.end_line + 1)..next.start_line {
3156 if let Some(line_info) = self.lines.get(line_num - 1)
3157 && !line_info.content.trim().is_empty()
3158 {
3159 return false;
3160 }
3161 }
3162 true
3163 }
3164
3165 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3167 current.end_line = next.end_line;
3168 current.item_lines.extend_from_slice(&next.item_lines);
3169
3170 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3172
3173 if !current.is_ordered && self.markers_differ(¤t, next) {
3175 current.marker = None; }
3177
3178 current
3179 }
3180
3181 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3183 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3184 }
3185}
3186
/// How two neighbouring list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts right after the current one ends.
    Consecutive,
    /// Exactly one line sits between the two blocks.
    SingleBlank,
    /// More than one line sits between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line sits between the blocks and at least one is not blank.
    ContentBetween,
}
3195
3196fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3198 for line_num in (current.end_line + 1)..next.start_line {
3200 if let Some(line_info) = lines.get(line_num - 1) {
3201 let trimmed = line_info.content.trim();
3203
3204 if trimmed.is_empty() {
3206 continue;
3207 }
3208
3209 if line_info.heading.is_some() {
3213 return true; }
3215
3216 if is_horizontal_rule(trimmed) {
3218 return true; }
3220
3221 if trimmed.contains('|') && trimmed.len() > 1 {
3224 if !trimmed.contains("](") && !trimmed.contains("http") {
3226 let pipe_count = trimmed.matches('|').count();
3228 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3229 return true; }
3231 }
3232 }
3233
3234 if trimmed.starts_with('>') {
3236 return true; }
3238
3239 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3241 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3242
3243 let min_continuation_indent = if current.is_ordered {
3245 current.nesting_level + current.max_marker_width + 1 } else {
3247 current.nesting_level + 2
3248 };
3249
3250 if line_indent < min_continuation_indent {
3251 return true; }
3254 }
3255
3256 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3258
3259 let min_indent = if current.is_ordered {
3261 current.nesting_level + current.max_marker_width
3262 } else {
3263 current.nesting_level + 2
3264 };
3265
3266 if line_indent < min_indent {
3268 return true; }
3270
3271 }
3274 }
3275
3276 false
3278}
3279
/// Returns `true` when `trimmed` is a Markdown horizontal rule.
///
/// The text must start with `-`, `*` or `_`, contain that same character at
/// least three times, and otherwise consist only of spaces and tabs. Input
/// shorter than three bytes is rejected immediately. The caller is expected to
/// pass text with surrounding whitespace already removed.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    // The first character fixes which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Any other character disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
3303
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document still has a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Offsets inside a single line map to 1-based (line, column) pairs.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines,
    /// including a blank one.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indent, blank and code-block flags.
    #[test]
    fn test_line_info() {
        // The second line carries four leading spaces so that the `indent == 4`
        // assertions below describe the fixture.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers, ordered-ness, numbering and marker/content columns.
    #[test]
    fn test_list_item_detection() {
        // Nested items are indented to their parent's content column: two
        // spaces under "- " and three under "1. ". The two-space indent is what
        // makes `marker_column == 2` hold for the nested `*` item.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM
    /// and the lines after them are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The Standard flavor never flags `import`/`export` lines as ESM.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}