1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use pulldown_cmark::{Event, Parser};
6use regex::Regex;
7
lazy_static! {
    // Inline links `[text](url "title")` / `[text](<url>)` and reference links `[text][id]`.
    // Verbose (`x`) mode: whitespace and `#` comments inside the pattern are ignored.
    static ref LINK_PATTERN: Regex = Regex::new(
        r#"(?sx)
 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap();

    // Same shape as LINK_PATTERN but anchored on a leading `!`, i.e. images.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r#"(?sx)
 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap();

    // Reference definition `[id]: url "title"` with at most three leading spaces.
    // Groups: 1 = id, 2 = url, 3 = double-quoted title, 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks (a code-span delimiter candidate).
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http://`, `https://` or `ftp://` URL with optional port, path, query and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address of the form `local@domain.tld` (TLD of at least two ASCII letters).
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Autolink in angle brackets: `<scheme://...>` or `<user@host.tld>`; group 1 is the inner text.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Leading whitespace, one or more `>` markers and the whitespace after them (group 1).
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
63#[derive(Debug, Clone)]
65pub struct LineInfo {
66 pub content: String,
68 pub byte_offset: usize,
70 pub indent: usize,
72 pub is_blank: bool,
74 pub in_code_block: bool,
76 pub in_front_matter: bool,
78 pub in_html_block: bool,
80 pub in_html_comment: bool,
82 pub list_item: Option<ListItemInfo>,
84 pub heading: Option<HeadingInfo>,
86 pub blockquote: Option<BlockquoteInfo>,
88 pub in_mkdocstrings: bool,
90 pub in_esm_block: bool,
92}
93
/// Marker and column data for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// Marker text: `-`, `*` or `+` for bullets, digits plus `.`/`)` for ordered items.
    pub marker: String,
    /// True for ordered (numbered) items.
    pub is_ordered: bool,
    /// Parsed number of an ordered item; `None` for bullets or if the digits do not parse.
    pub number: Option<usize>,
    /// Byte column of the marker: blockquote prefix length plus leading whitespace.
    pub marker_column: usize,
    /// Byte column of the item text: marker column + marker length + following spacing.
    pub content_column: usize,
}
108
/// Syntactic form in which a heading was written.
///
/// NOTE(review): the variant meanings below follow common Markdown terminology;
/// the pass that assigns them is not visible here — confirm against it.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// Presumably a `#`-prefixed heading.
    ATX,
    /// Presumably an underlined heading of level 1.
    Setext1,
    /// Presumably an underlined heading of level 2.
    Setext2,
}
119
/// A link occurrence found by the link regex (inline or reference style).
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start within its line.
    pub start_col: usize,
    /// Byte offset of the link end within the line that contains the end.
    pub end_col: usize,
    /// Byte offset of the link start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the link in the document.
    pub byte_end: usize,
    /// Text between the first pair of brackets.
    pub text: String,
    /// Destination of an inline link; empty for reference links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lower-cased reference id (the lower-cased text when the id is empty); `None` for inline links.
    pub reference_id: Option<String>,
}
142
/// An image occurrence found by the image regex (inline or reference style).
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start within its line.
    pub start_col: usize,
    /// Byte offset of the image end within the line that contains the end.
    pub end_col: usize,
    /// Byte offset of the leading `!` in the document.
    pub byte_offset: usize,
    /// Byte offset just past the image in the document.
    pub byte_end: usize,
    /// Text between the first pair of brackets.
    pub alt_text: String,
    /// Source of an inline image; empty for reference images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lower-cased reference id (the lower-cased alt text when the id is empty); `None` for inline images.
    pub reference_id: Option<String>,
}
165
/// A reference definition line such as `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line of the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Destination exactly as written.
    pub url: String,
    /// Quoted title, when one is present.
    pub title: Option<String>,
    /// Byte offset of the definition start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the definition in the document.
    pub byte_end: usize,
}
182
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based line the span is attributed to.
    pub line: usize,
    /// Start column; compared against a 0-based column as the inclusive lower bound.
    pub start_col: usize,
    /// End column; compared against a 0-based column as the exclusive upper bound.
    pub end_col: usize,
    /// Byte offset of the span start in the document (inclusive).
    pub byte_offset: usize,
    /// Byte offset of the span end in the document (exclusive).
    pub byte_end: usize,
    /// Presumably the length of the delimiting backtick run — set by the span parser (not visible here).
    pub backtick_count: usize,
    /// Presumably the text between the delimiters — set by the span parser (not visible here).
    pub content: String,
}
201
202#[derive(Debug, Clone)]
204pub struct HeadingInfo {
205 pub level: u8,
207 pub style: HeadingStyle,
209 pub marker: String,
211 pub marker_column: usize,
213 pub content_column: usize,
215 pub text: String,
217 pub custom_id: Option<String>,
219 pub raw_text: String,
221 pub has_closing_sequence: bool,
223 pub closing_sequence: String,
225}
226
/// Details of a blockquote line.
///
/// NOTE(review): populated by the blockquote detection pass, which is not visible
/// here; the per-field notes are read from the names and should be confirmed.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Quote depth.
    pub nesting_level: usize,
    /// Whitespace in front of the first marker.
    pub indent: String,
    /// Column of the first marker.
    pub marker_column: usize,
    /// The quote prefix text.
    pub prefix: String,
    /// Line text after the prefix.
    pub content: String,
    /// Flags a marker that is directly followed by content.
    pub has_no_space_after_marker: bool,
    /// Flags a marker that is followed by more than one space.
    pub has_multiple_spaces_after_marker: bool,
    /// Flag consumed by rule MD028.
    pub needs_md028_fix: bool,
}
247
/// A contiguous group of list lines.
///
/// NOTE(review): built by the list block parser, which is not visible here; only the
/// inclusive `start_line..=end_line` range is confirmed by the lookups in this file.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive).
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// Marker shared by the items, when there is one.
    pub marker: Option<String>,
    /// Blockquote prefix in front of the list lines.
    pub blockquote_prefix: String,
    /// Line numbers of the items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Widest marker seen in the block.
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Occurrence counts for characters that matter to Markdown syntax.
///
/// Each field backs one arm of `LintContext::has_char` / `LintContext::char_count`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of `#`.
    pub hash_count: usize,
    /// Count of `*`.
    pub asterisk_count: usize,
    /// Count of `_`.
    pub underscore_count: usize,
    /// Count of `-`.
    pub hyphen_count: usize,
    /// Count of `+`.
    pub plus_count: usize,
    /// Count of `>`.
    pub gt_count: usize,
    /// Count of `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count of `` ` ``.
    pub backtick_count: usize,
    /// Count of `<`.
    pub lt_count: usize,
    /// Count of `!`.
    pub exclamation_count: usize,
    /// Count of `\n`.
    pub newline_count: usize,
}
299
/// An HTML tag occurrence.
///
/// NOTE(review): produced by the HTML tag parser, which is not visible here; only
/// `line` (matched against 1-based line numbers) is confirmed by this file.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line the tag is attributed to.
    pub line: usize,
    /// Column at which the tag starts.
    pub start_col: usize,
    /// Column at which the tag ends.
    pub end_col: usize,
    /// Byte offset of the tag start.
    pub byte_offset: usize,
    /// Byte offset of the tag end.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag closes itself.
    pub is_self_closing: bool,
    /// Tag text as written.
    pub raw_content: String,
}
322
/// An emphasis span.
///
/// NOTE(review): produced by the emphasis parser, which is not visible here; only
/// `line` (matched against 1-based line numbers) is confirmed by this file.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line the span is attributed to.
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start.
    pub byte_offset: usize,
    /// Byte offset of the span end.
    pub byte_end: usize,
    /// Delimiter character.
    pub marker: char,
    /// Number of delimiter characters.
    pub marker_count: usize,
    /// Text of the span.
    pub content: String,
}
343
/// A table row.
///
/// NOTE(review): produced by the table row parser, which is not visible here; only
/// `line` (matched against 1-based line numbers) is confirmed by this file.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line of the row.
    pub line: usize,
    /// Whether the row is the header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column, as strings.
    pub column_alignments: Vec<String>,
}
356
/// A URL or e-mail address written without link syntax.
///
/// NOTE(review): produced by the bare URL parser, which is not visible here; only
/// `line` (matched against 1-based line numbers) is confirmed by this file.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line the URL is attributed to.
    pub line: usize,
    /// Column at which the URL starts.
    pub start_col: usize,
    /// Column at which the URL ends.
    pub end_col: usize,
    /// Byte offset of the URL start.
    pub byte_offset: usize,
    /// Byte offset of the URL end.
    pub byte_end: usize,
    /// The matched text.
    pub url: String,
    /// Kind of match, as a string tag.
    pub url_type: String,
}
375
376pub struct LintContext<'a> {
377 pub content: &'a str,
378 pub line_offsets: Vec<usize>,
379 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink>, pub images: Vec<ParsedImage>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, pub line_index: crate::utils::range_utils::LineIndex, jinja_ranges: Vec<(usize, usize)>, pub flavor: MarkdownFlavor, }
397
/// Borrowed pieces of a blockquote line, in source order.
struct BlockquoteComponents<'a> {
    /// Spaces and tabs in front of the first `>`.
    indent: &'a str,
    /// The uninterrupted run of `>` characters.
    markers: &'a str,
    /// Spaces and tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything that follows.
    content: &'a str,
}
405
406#[inline]
408fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
409 let bytes = line.as_bytes();
410 let mut pos = 0;
411
412 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
414 pos += 1;
415 }
416 let indent_end = pos;
417
418 if pos >= bytes.len() || bytes[pos] != b'>' {
420 return None;
421 }
422
423 while pos < bytes.len() && bytes[pos] == b'>' {
425 pos += 1;
426 }
427 let markers_end = pos;
428
429 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
431 pos += 1;
432 }
433 let spaces_end = pos;
434
435 Some(BlockquoteComponents {
436 indent: &line[0..indent_end],
437 markers: &line[indent_end..markers_end],
438 spaces_after: &line[markers_end..spaces_end],
439 content: &line[spaces_end..],
440 })
441}
442
443impl<'a> LintContext<'a> {
444 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
445 use std::time::Instant;
446 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
447
448 let start = Instant::now();
449 let mut line_offsets = vec![0];
450 for (i, c) in content.char_indices() {
451 if c == '\n' {
452 line_offsets.push(i + 1);
453 }
454 }
455 if profile {
456 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
457 }
458
459 let start = Instant::now();
461 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
462 if profile {
463 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
464 }
465
466 let start = Instant::now();
468 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
469 if profile {
470 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
471 }
472
473 let start = Instant::now();
475 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
476 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
477 } else {
478 Vec::new()
479 };
480 if profile {
481 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
482 }
483
484 let start = Instant::now();
486 let mut lines = Self::compute_basic_line_info(
487 content,
488 &line_offsets,
489 &code_blocks,
490 flavor,
491 &html_comment_ranges,
492 &autodoc_ranges,
493 );
494 if profile {
495 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
496 }
497
498 let start = Instant::now();
500 Self::detect_html_blocks(&mut lines);
501 if profile {
502 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
503 }
504
505 let start = Instant::now();
507 Self::detect_esm_blocks(&mut lines, flavor);
508 if profile {
509 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
510 }
511
512 let start = Instant::now();
514 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
515 if profile {
516 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
517 }
518
519 let start = Instant::now();
521 let code_spans = Self::parse_code_spans(content, &lines);
522 if profile {
523 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
524 }
525
526 let start = Instant::now();
528 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
529 if profile {
530 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
531 }
532
533 let start = Instant::now();
534 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
535 if profile {
536 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
537 }
538
539 let start = Instant::now();
540 let reference_defs = Self::parse_reference_defs(content, &lines);
541 if profile {
542 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
543 }
544
545 let start = Instant::now();
546 let list_blocks = Self::parse_list_blocks(&lines);
547 if profile {
548 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
549 }
550
551 let start = Instant::now();
553 let char_frequency = Self::compute_char_frequency(content);
554 if profile {
555 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
556 }
557
558 let start = Instant::now();
560 let table_blocks =
561 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(content, &code_blocks, &code_spans);
562 if profile {
563 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
564 }
565
566 let start = Instant::now();
568 let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
569 if profile {
570 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
571 }
572
573 let start = Instant::now();
575 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
576 if profile {
577 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
578 }
579
580 Self {
581 content,
582 line_offsets,
583 code_blocks,
584 lines,
585 links,
586 images,
587 reference_defs,
588 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
589 list_blocks,
590 char_frequency,
591 html_tags_cache: Mutex::new(None),
592 emphasis_spans_cache: Mutex::new(None),
593 table_rows_cache: Mutex::new(None),
594 bare_urls_cache: Mutex::new(None),
595 html_comment_ranges,
596 table_blocks,
597 line_index,
598 jinja_ranges,
599 flavor,
600 }
601 }
602
603 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
605 let mut cache = self.code_spans_cache.lock().unwrap();
606
607 if cache.is_none() {
609 let code_spans = Self::parse_code_spans(self.content, &self.lines);
610 *cache = Some(Arc::new(code_spans));
611 }
612
613 cache.as_ref().unwrap().clone()
615 }
616
617 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
619 let mut cache = self.html_tags_cache.lock().unwrap();
620
621 if cache.is_none() {
622 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks, self.flavor);
623 *cache = Some(Arc::new(html_tags));
624 }
625
626 cache.as_ref().unwrap().clone()
627 }
628
629 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
631 let mut cache = self.emphasis_spans_cache.lock().unwrap();
632
633 if cache.is_none() {
634 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
635 *cache = Some(Arc::new(emphasis_spans));
636 }
637
638 cache.as_ref().unwrap().clone()
639 }
640
641 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
643 let mut cache = self.table_rows_cache.lock().unwrap();
644
645 if cache.is_none() {
646 let table_rows = Self::parse_table_rows(&self.lines);
647 *cache = Some(Arc::new(table_rows));
648 }
649
650 cache.as_ref().unwrap().clone()
651 }
652
653 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
655 let mut cache = self.bare_urls_cache.lock().unwrap();
656
657 if cache.is_none() {
658 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
659 *cache = Some(Arc::new(bare_urls));
660 }
661
662 cache.as_ref().unwrap().clone()
663 }
664
665 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
667 match self.line_offsets.binary_search(&offset) {
668 Ok(line) => (line + 1, 1),
669 Err(line) => {
670 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
671 (line, offset - line_start + 1)
672 }
673 }
674 }
675
676 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
678 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
680 return true;
681 }
682
683 self.code_spans()
685 .iter()
686 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
687 }
688
689 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
691 if line_num > 0 {
692 self.lines.get(line_num - 1)
693 } else {
694 None
695 }
696 }
697
698 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
700 self.line_info(line_num).map(|info| info.byte_offset)
701 }
702
703 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
705 let normalized_id = ref_id.to_lowercase();
706 self.reference_defs
707 .iter()
708 .find(|def| def.id == normalized_id)
709 .map(|def| def.url.as_str())
710 }
711
712 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
714 self.links.iter().filter(|link| link.line == line_num).collect()
715 }
716
717 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
719 self.images.iter().filter(|img| img.line == line_num).collect()
720 }
721
722 pub fn is_in_list_block(&self, line_num: usize) -> bool {
724 self.list_blocks
725 .iter()
726 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
727 }
728
729 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
731 self.list_blocks
732 .iter()
733 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
734 }
735
736 pub fn is_in_code_block(&self, line_num: usize) -> bool {
740 if line_num == 0 || line_num > self.lines.len() {
741 return false;
742 }
743 self.lines[line_num - 1].in_code_block
744 }
745
746 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
748 if line_num == 0 || line_num > self.lines.len() {
749 return false;
750 }
751 self.lines[line_num - 1].in_front_matter
752 }
753
754 pub fn is_in_html_block(&self, line_num: usize) -> bool {
756 if line_num == 0 || line_num > self.lines.len() {
757 return false;
758 }
759 self.lines[line_num - 1].in_html_block
760 }
761
762 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
764 if line_num == 0 || line_num > self.lines.len() {
765 return false;
766 }
767
768 let col_0indexed = if col > 0 { col - 1 } else { 0 };
772 let code_spans = self.code_spans();
773 code_spans
774 .iter()
775 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
776 }
777
778 #[inline]
781 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
782 self.reference_defs
783 .iter()
784 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
785 }
786
787 #[inline]
791 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
792 self.html_comment_ranges
793 .iter()
794 .any(|range| byte_pos >= range.start && byte_pos < range.end)
795 }
796
797 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
799 self.jinja_ranges
800 .iter()
801 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
802 }
803
804 pub fn has_char(&self, ch: char) -> bool {
806 match ch {
807 '#' => self.char_frequency.hash_count > 0,
808 '*' => self.char_frequency.asterisk_count > 0,
809 '_' => self.char_frequency.underscore_count > 0,
810 '-' => self.char_frequency.hyphen_count > 0,
811 '+' => self.char_frequency.plus_count > 0,
812 '>' => self.char_frequency.gt_count > 0,
813 '|' => self.char_frequency.pipe_count > 0,
814 '[' => self.char_frequency.bracket_count > 0,
815 '`' => self.char_frequency.backtick_count > 0,
816 '<' => self.char_frequency.lt_count > 0,
817 '!' => self.char_frequency.exclamation_count > 0,
818 '\n' => self.char_frequency.newline_count > 0,
819 _ => self.content.contains(ch), }
821 }
822
823 pub fn char_count(&self, ch: char) -> usize {
825 match ch {
826 '#' => self.char_frequency.hash_count,
827 '*' => self.char_frequency.asterisk_count,
828 '_' => self.char_frequency.underscore_count,
829 '-' => self.char_frequency.hyphen_count,
830 '+' => self.char_frequency.plus_count,
831 '>' => self.char_frequency.gt_count,
832 '|' => self.char_frequency.pipe_count,
833 '[' => self.char_frequency.bracket_count,
834 '`' => self.char_frequency.backtick_count,
835 '<' => self.char_frequency.lt_count,
836 '!' => self.char_frequency.exclamation_count,
837 '\n' => self.char_frequency.newline_count,
838 _ => self.content.matches(ch).count(), }
840 }
841
842 pub fn likely_has_headings(&self) -> bool {
844 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
846
847 pub fn likely_has_lists(&self) -> bool {
849 self.char_frequency.asterisk_count > 0
850 || self.char_frequency.hyphen_count > 0
851 || self.char_frequency.plus_count > 0
852 }
853
854 pub fn likely_has_emphasis(&self) -> bool {
856 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
857 }
858
859 pub fn likely_has_tables(&self) -> bool {
861 self.char_frequency.pipe_count > 2
862 }
863
864 pub fn likely_has_blockquotes(&self) -> bool {
866 self.char_frequency.gt_count > 0
867 }
868
869 pub fn likely_has_code(&self) -> bool {
871 self.char_frequency.backtick_count > 0
872 }
873
874 pub fn likely_has_links_or_images(&self) -> bool {
876 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
877 }
878
879 pub fn likely_has_html(&self) -> bool {
881 self.char_frequency.lt_count > 0
882 }
883
884 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
886 self.html_tags()
887 .iter()
888 .filter(|tag| tag.line == line_num)
889 .cloned()
890 .collect()
891 }
892
893 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
895 self.emphasis_spans()
896 .iter()
897 .filter(|span| span.line == line_num)
898 .cloned()
899 .collect()
900 }
901
902 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
904 self.table_rows()
905 .iter()
906 .filter(|row| row.line == line_num)
907 .cloned()
908 .collect()
909 }
910
911 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
913 self.bare_urls()
914 .iter()
915 .filter(|url| url.line == line_num)
916 .cloned()
917 .collect()
918 }
919
920 #[inline]
926 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
927 let idx = match lines.binary_search_by(|line| {
929 if byte_offset < line.byte_offset {
930 std::cmp::Ordering::Greater
931 } else if byte_offset > line.byte_offset + line.content.len() {
932 std::cmp::Ordering::Less
933 } else {
934 std::cmp::Ordering::Equal
935 }
936 }) {
937 Ok(idx) => idx,
938 Err(idx) => idx.saturating_sub(1),
939 };
940
941 let line = &lines[idx];
942 let line_num = idx + 1;
943 let col = byte_offset.saturating_sub(line.byte_offset);
944
945 (idx, line_num, col)
946 }
947
948 #[inline]
950 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
951 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
953
954 if idx > 0 {
956 let span = &code_spans[idx - 1];
957 if offset >= span.byte_offset && offset < span.byte_end {
958 return true;
959 }
960 }
961
962 false
963 }
964
965 fn parse_links(
967 content: &str,
968 lines: &[LineInfo],
969 code_blocks: &[(usize, usize)],
970 code_spans: &[CodeSpan],
971 flavor: MarkdownFlavor,
972 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
973 ) -> Vec<ParsedLink> {
974 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
975
976 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
981 let full_match = cap.get(0).unwrap();
982 let match_start = full_match.start();
983 let match_end = full_match.end();
984
985 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
987 continue;
988 }
989
990 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
992 continue;
993 }
994
995 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
997 continue;
998 }
999
1000 if Self::is_offset_in_code_span(code_spans, match_start) {
1002 continue;
1003 }
1004
1005 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1007 continue;
1008 }
1009
1010 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1012
1013 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1015 continue;
1016 }
1017
1018 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1020
1021 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1022
1023 let inline_url = cap.get(2).or_else(|| cap.get(3));
1025
1026 if let Some(url_match) = inline_url {
1027 links.push(ParsedLink {
1029 line: line_num,
1030 start_col: col_start,
1031 end_col: col_end,
1032 byte_offset: match_start,
1033 byte_end: match_end,
1034 text,
1035 url: url_match.as_str().to_string(),
1036 is_reference: false,
1037 reference_id: None,
1038 });
1039 } else if let Some(ref_id) = cap.get(6) {
1040 let ref_id_str = ref_id.as_str();
1042 let normalized_ref = if ref_id_str.is_empty() {
1043 text.to_lowercase() } else {
1045 ref_id_str.to_lowercase()
1046 };
1047
1048 links.push(ParsedLink {
1049 line: line_num,
1050 start_col: col_start,
1051 end_col: col_end,
1052 byte_offset: match_start,
1053 byte_end: match_end,
1054 text,
1055 url: String::new(), is_reference: true,
1057 reference_id: Some(normalized_ref),
1058 });
1059 }
1060 }
1061
1062 links
1063 }
1064
1065 fn parse_images(
1067 content: &str,
1068 lines: &[LineInfo],
1069 code_blocks: &[(usize, usize)],
1070 code_spans: &[CodeSpan],
1071 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1072 ) -> Vec<ParsedImage> {
1073 use crate::utils::skip_context::is_in_html_comment_ranges;
1074
1075 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
1080 let full_match = cap.get(0).unwrap();
1081 let match_start = full_match.start();
1082 let match_end = full_match.end();
1083
1084 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1086 continue;
1087 }
1088
1089 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1091 continue;
1092 }
1093
1094 if Self::is_offset_in_code_span(code_spans, match_start) {
1096 continue;
1097 }
1098
1099 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1101 continue;
1102 }
1103
1104 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1106
1107 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1109
1110 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1111
1112 let inline_url = cap.get(2).or_else(|| cap.get(3));
1114
1115 if let Some(url_match) = inline_url {
1116 images.push(ParsedImage {
1118 line: line_num,
1119 start_col: col_start,
1120 end_col: col_end,
1121 byte_offset: match_start,
1122 byte_end: match_end,
1123 alt_text,
1124 url: url_match.as_str().to_string(),
1125 is_reference: false,
1126 reference_id: None,
1127 });
1128 } else if let Some(ref_id) = cap.get(6) {
1129 let ref_id_str = ref_id.as_str();
1131 let normalized_ref = if ref_id_str.is_empty() {
1132 alt_text.to_lowercase() } else {
1134 ref_id_str.to_lowercase()
1135 };
1136
1137 images.push(ParsedImage {
1138 line: line_num,
1139 start_col: col_start,
1140 end_col: col_end,
1141 byte_offset: match_start,
1142 byte_end: match_end,
1143 alt_text,
1144 url: String::new(), is_reference: true,
1146 reference_id: Some(normalized_ref),
1147 });
1148 }
1149 }
1150
1151 images
1152 }
1153
1154 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1156 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1160 if line_info.in_code_block {
1162 continue;
1163 }
1164
1165 let line = &line_info.content;
1166 let line_num = line_idx + 1;
1167
1168 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1169 let id = cap.get(1).unwrap().as_str().to_lowercase();
1170 let url = cap.get(2).unwrap().as_str().to_string();
1171 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1172
1173 let match_obj = cap.get(0).unwrap();
1176 let byte_offset = line_info.byte_offset + match_obj.start();
1177 let byte_end = line_info.byte_offset + match_obj.end();
1178
1179 refs.push(ReferenceDef {
1180 line: line_num,
1181 id,
1182 url,
1183 title,
1184 byte_offset,
1185 byte_end,
1186 });
1187 }
1188 }
1189
1190 refs
1191 }
1192
1193 #[inline]
1197 fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1198 let trimmed_start = line.trim_start();
1199 if !trimmed_start.starts_with('>') {
1200 return None;
1201 }
1202
1203 let leading_ws_len = line.len() - trimmed_start.len();
1204 let after_gt = &trimmed_start[1..];
1205 let content = after_gt.trim_start();
1206 let ws_after_gt_len = after_gt.len() - content.len();
1207 let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1208
1209 Some((&line[..prefix_len], content))
1210 }
1211
1212 #[inline]
1216 fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1217 let bytes = line.as_bytes();
1218 let mut i = 0;
1219
1220 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1222 i += 1;
1223 }
1224
1225 if i >= bytes.len() {
1227 return None;
1228 }
1229 let marker = bytes[i] as char;
1230 if marker != '-' && marker != '*' && marker != '+' {
1231 return None;
1232 }
1233 let marker_pos = i;
1234 i += 1;
1235
1236 let spacing_start = i;
1238 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1239 i += 1;
1240 }
1241
1242 Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1243 }
1244
1245 #[inline]
1249 fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1250 let bytes = line.as_bytes();
1251 let mut i = 0;
1252
1253 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1255 i += 1;
1256 }
1257
1258 let number_start = i;
1260 while i < bytes.len() && bytes[i].is_ascii_digit() {
1261 i += 1;
1262 }
1263 if i == number_start {
1264 return None; }
1266
1267 if i >= bytes.len() {
1269 return None;
1270 }
1271 let delimiter = bytes[i] as char;
1272 if delimiter != '.' && delimiter != ')' {
1273 return None;
1274 }
1275 let delimiter_pos = i;
1276 i += 1;
1277
1278 let spacing_start = i;
1280 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1281 i += 1;
1282 }
1283
1284 Some((
1285 &line[..number_start],
1286 &line[number_start..delimiter_pos],
1287 delimiter,
1288 &line[spacing_start..i],
1289 &line[i..],
1290 ))
1291 }
1292
1293 fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1296 let num_lines = line_offsets.len();
1297 let mut in_code_block = vec![false; num_lines];
1298
1299 for &(start, end) in code_blocks {
1301 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1303 let mut boundary = start;
1304 while boundary > 0 && !content.is_char_boundary(boundary) {
1305 boundary -= 1;
1306 }
1307 boundary
1308 } else {
1309 start
1310 };
1311
1312 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1313 let mut boundary = end;
1314 while boundary < content.len() && !content.is_char_boundary(boundary) {
1315 boundary += 1;
1316 }
1317 boundary
1318 } else {
1319 end.min(content.len())
1320 };
1321
1322 let block_content = &content[safe_start..safe_end];
1323
1324 let content_to_check = block_content
1327 .lines()
1328 .map(|line| {
1329 let mut stripped = line.to_string();
1330 while crate::rules::blockquote_utils::BlockquoteUtils::is_blockquote(&stripped) {
1331 stripped = crate::rules::blockquote_utils::BlockquoteUtils::extract_content(&stripped);
1332 }
1333 stripped
1334 })
1335 .collect::<Vec<_>>()
1336 .join("\n");
1337
1338 let is_fenced =
1340 content_to_check.trim_start().starts_with("```") || content_to_check.trim_start().starts_with("~~~");
1341
1342 let should_mark = if is_fenced {
1345 true
1346 } else {
1347 let bytes = content_to_check.as_bytes();
1350 let mut i = 0;
1351 let mut valid_indented = true;
1352
1353 while i < bytes.len() {
1354 let line_start = i;
1355 while i < bytes.len() && bytes[i] != b'\n' {
1357 i += 1;
1358 }
1359
1360 let mut j = line_start;
1362 while j < i && (bytes[j] == b' ' || bytes[j] == b'\t') {
1364 j += 1;
1365 }
1366
1367 if j < i {
1369 let indent_len = j - line_start;
1371 let starts_with_tab = line_start < bytes.len() && bytes[line_start] == b'\t';
1372 if indent_len < 4 && !starts_with_tab {
1373 valid_indented = false;
1374 break;
1375 }
1376 }
1377
1378 i += 1; }
1380
1381 valid_indented
1382 };
1383
1384 if should_mark {
1385 let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1389 let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1390
1391 for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1393 *flag = true;
1394 }
1395 }
1396 }
1397
1398 in_code_block
1399 }
1400
1401 fn compute_basic_line_info(
1403 content: &str,
1404 line_offsets: &[usize],
1405 code_blocks: &[(usize, usize)],
1406 flavor: MarkdownFlavor,
1407 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1408 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1409 ) -> Vec<LineInfo> {
1410 let content_lines: Vec<&str> = content.lines().collect();
1411 let mut lines = Vec::with_capacity(content_lines.len());
1412
1413 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1415
1416 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1419
1420 for (i, line) in content_lines.iter().enumerate() {
1421 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1422 let indent = line.len() - line.trim_start().len();
1423
1424 let blockquote_parse = Self::parse_blockquote_prefix(line);
1426
1427 let is_blank = if let Some((_, content)) = blockquote_parse {
1429 content.trim().is_empty()
1431 } else {
1432 line.trim().is_empty()
1433 };
1434
1435 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1437
1438 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1440 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1441 let in_html_comment =
1443 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1444 let list_item = if !(in_code_block
1445 || is_blank
1446 || in_mkdocstrings
1447 || in_html_comment
1448 || (front_matter_end > 0 && i < front_matter_end))
1449 {
1450 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1452 (content, prefix.len())
1453 } else {
1454 (&**line, 0)
1455 };
1456
1457 if let Some((leading_spaces, marker, spacing, _content)) =
1458 Self::parse_unordered_list(line_for_list_check)
1459 {
1460 let marker_column = blockquote_prefix_len + leading_spaces.len();
1461 let content_column = marker_column + 1 + spacing.len();
1462
1463 if spacing.is_empty() {
1470 None
1471 } else {
1472 Some(ListItemInfo {
1473 marker: marker.to_string(),
1474 is_ordered: false,
1475 number: None,
1476 marker_column,
1477 content_column,
1478 })
1479 }
1480 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1481 Self::parse_ordered_list(line_for_list_check)
1482 {
1483 let marker = format!("{number_str}{delimiter}");
1484 let marker_column = blockquote_prefix_len + leading_spaces.len();
1485 let content_column = marker_column + marker.len() + spacing.len();
1486
1487 if spacing.is_empty() {
1490 None
1491 } else {
1492 Some(ListItemInfo {
1493 marker,
1494 is_ordered: true,
1495 number: number_str.parse().ok(),
1496 marker_column,
1497 content_column,
1498 })
1499 }
1500 } else {
1501 None
1502 }
1503 } else {
1504 None
1505 };
1506
1507 lines.push(LineInfo {
1508 content: line.to_string(),
1509 byte_offset,
1510 indent,
1511 is_blank,
1512 in_code_block,
1513 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1514 in_html_block: false, in_html_comment,
1516 list_item,
1517 heading: None, blockquote: None, in_mkdocstrings,
1520 in_esm_block: false, });
1522 }
1523
1524 lines
1525 }
1526
1527 fn detect_headings_and_blockquotes(
1529 content: &str,
1530 lines: &mut [LineInfo],
1531 flavor: MarkdownFlavor,
1532 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1533 ) {
1534 lazy_static! {
1535
1536 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1538 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1539 }
1540
1541 let content_lines: Vec<&str> = content.lines().collect();
1542
1543 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1545
1546 for i in 0..lines.len() {
1548 if lines[i].in_code_block {
1549 continue;
1550 }
1551
1552 if front_matter_end > 0 && i < front_matter_end {
1554 continue;
1555 }
1556
1557 if lines[i].in_html_block {
1559 continue;
1560 }
1561
1562 let line = content_lines[i];
1563
1564 if let Some(bq) = parse_blockquote_detailed(line) {
1566 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1568
1569 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1571
1572 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1574 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1576
1577 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1581
1582 lines[i].blockquote = Some(BlockquoteInfo {
1583 nesting_level,
1584 indent: bq.indent.to_string(),
1585 marker_column,
1586 prefix,
1587 content: bq.content.to_string(),
1588 has_no_space_after_marker: has_no_space,
1589 has_multiple_spaces_after_marker: has_multiple_spaces,
1590 needs_md028_fix,
1591 });
1592 }
1593
1594 if lines[i].is_blank {
1596 continue;
1597 }
1598
1599 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1602 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1603 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1604 } else {
1605 false
1606 };
1607
1608 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1609 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1611 continue;
1612 }
1613 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1614 let hashes = caps.get(2).map_or("", |m| m.as_str());
1615 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1616 let rest = caps.get(4).map_or("", |m| m.as_str());
1617
1618 let level = hashes.len() as u8;
1619 let marker_column = leading_spaces.len();
1620
1621 let (text, has_closing, closing_seq) = {
1623 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1625 if rest[id_start..].trim_end().ends_with('}') {
1627 (&rest[..id_start], &rest[id_start..])
1629 } else {
1630 (rest, "")
1631 }
1632 } else {
1633 (rest, "")
1634 };
1635
1636 let trimmed_rest = rest_without_id.trim_end();
1638 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1639 let mut start_of_hashes = last_hash_pos;
1641 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1642 start_of_hashes -= 1;
1643 }
1644
1645 let has_space_before = start_of_hashes == 0
1647 || trimmed_rest
1648 .chars()
1649 .nth(start_of_hashes - 1)
1650 .is_some_and(|c| c.is_whitespace());
1651
1652 let potential_closing = &trimmed_rest[start_of_hashes..];
1654 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1655
1656 if is_all_hashes && has_space_before {
1657 let closing_hashes = potential_closing.to_string();
1659 let text_part = if !custom_id_part.is_empty() {
1662 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1665 } else {
1666 rest_without_id[..start_of_hashes].trim_end().to_string()
1667 };
1668 (text_part, true, closing_hashes)
1669 } else {
1670 (rest.to_string(), false, String::new())
1672 }
1673 } else {
1674 (rest.to_string(), false, String::new())
1676 }
1677 };
1678
1679 let content_column = marker_column + hashes.len() + spaces_after.len();
1680
1681 let raw_text = text.trim().to_string();
1683 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1684
1685 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1687 let next_line = content_lines[i + 1];
1688 if !lines[i + 1].in_code_block
1689 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1690 && let Some(next_line_id) =
1691 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1692 {
1693 custom_id = Some(next_line_id);
1694 }
1695 }
1696
1697 lines[i].heading = Some(HeadingInfo {
1698 level,
1699 style: HeadingStyle::ATX,
1700 marker: hashes.to_string(),
1701 marker_column,
1702 content_column,
1703 text: clean_text,
1704 custom_id,
1705 raw_text,
1706 has_closing_sequence: has_closing,
1707 closing_sequence: closing_seq,
1708 });
1709 }
1710 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1712 let next_line = content_lines[i + 1];
1713 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1714 if front_matter_end > 0 && i < front_matter_end {
1716 continue;
1717 }
1718
1719 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1721 {
1722 continue;
1723 }
1724
1725 let underline = next_line.trim();
1726
1727 if underline == "---" {
1730 continue;
1731 }
1732
1733 let current_line_trimmed = line.trim();
1735 if current_line_trimmed.contains(':')
1736 && !current_line_trimmed.starts_with('#')
1737 && !current_line_trimmed.contains('[')
1738 && !current_line_trimmed.contains("](")
1739 {
1740 continue;
1742 }
1743
1744 let level = if underline.starts_with('=') { 1 } else { 2 };
1745 let style = if level == 1 {
1746 HeadingStyle::Setext1
1747 } else {
1748 HeadingStyle::Setext2
1749 };
1750
1751 let raw_text = line.trim().to_string();
1753 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1754
1755 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1757 let attr_line = content_lines[i + 2];
1758 if !lines[i + 2].in_code_block
1759 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1760 && let Some(attr_line_id) =
1761 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1762 {
1763 custom_id = Some(attr_line_id);
1764 }
1765 }
1766
1767 lines[i].heading = Some(HeadingInfo {
1768 level,
1769 style,
1770 marker: underline.to_string(),
1771 marker_column: next_line.len() - next_line.trim_start().len(),
1772 content_column: lines[i].indent,
1773 text: clean_text,
1774 custom_id,
1775 raw_text,
1776 has_closing_sequence: false,
1777 closing_sequence: String::new(),
1778 });
1779 }
1780 }
1781 }
1782 }
1783
1784 fn detect_html_blocks(lines: &mut [LineInfo]) {
1786 const BLOCK_ELEMENTS: &[&str] = &[
1788 "address",
1789 "article",
1790 "aside",
1791 "blockquote",
1792 "details",
1793 "dialog",
1794 "dd",
1795 "div",
1796 "dl",
1797 "dt",
1798 "fieldset",
1799 "figcaption",
1800 "figure",
1801 "footer",
1802 "form",
1803 "h1",
1804 "h2",
1805 "h3",
1806 "h4",
1807 "h5",
1808 "h6",
1809 "header",
1810 "hr",
1811 "li",
1812 "main",
1813 "nav",
1814 "ol",
1815 "p",
1816 "pre",
1817 "script",
1818 "section",
1819 "style",
1820 "table",
1821 "tbody",
1822 "td",
1823 "tfoot",
1824 "th",
1825 "thead",
1826 "tr",
1827 "ul",
1828 ];
1829
1830 let mut i = 0;
1831 while i < lines.len() {
1832 if lines[i].in_code_block || lines[i].in_front_matter {
1834 i += 1;
1835 continue;
1836 }
1837
1838 let trimmed = lines[i].content.trim_start();
1839
1840 if trimmed.starts_with('<') && trimmed.len() > 1 {
1842 let after_bracket = &trimmed[1..];
1844 let is_closing = after_bracket.starts_with('/');
1845 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1846
1847 let tag_name = tag_start
1849 .chars()
1850 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1851 .collect::<String>()
1852 .to_lowercase();
1853
1854 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1856 lines[i].in_html_block = true;
1858
1859 if !is_closing {
1862 let closing_tag = format!("</{tag_name}>");
1863 let allow_blank_lines = tag_name == "style" || tag_name == "script";
1865 let mut j = i + 1;
1866 while j < lines.len() && j < i + 100 {
1867 if !allow_blank_lines && lines[j].is_blank {
1870 break;
1871 }
1872
1873 lines[j].in_html_block = true;
1874
1875 if lines[j].content.contains(&closing_tag) {
1877 break;
1878 }
1879 j += 1;
1880 }
1881 }
1882 }
1883 }
1884
1885 i += 1;
1886 }
1887 }
1888
1889 fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
1892 if !flavor.supports_esm_blocks() {
1894 return;
1895 }
1896
1897 for line in lines.iter_mut() {
1898 if line.is_blank || line.in_html_comment {
1900 continue;
1901 }
1902
1903 let trimmed = line.content.trim_start();
1905 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
1906 line.in_esm_block = true;
1907 } else {
1908 break;
1910 }
1911 }
1912 }
1913
1914 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1916 let mut code_spans = Vec::new();
1917
1918 if !content.contains('`') {
1920 return code_spans;
1921 }
1922
1923 let parser = Parser::new(content).into_offset_iter();
1925
1926 for (event, range) in parser {
1927 if let Event::Code(_) = event {
1928 let start_pos = range.start;
1929 let end_pos = range.end;
1930
1931 let full_span = &content[start_pos..end_pos];
1933 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1934
1935 let content_start = start_pos + backtick_count;
1937 let content_end = end_pos - backtick_count;
1938 let span_content = if content_start < content_end {
1939 content[content_start..content_end].to_string()
1940 } else {
1941 String::new()
1942 };
1943
1944 let line_idx = lines
1947 .partition_point(|line| line.byte_offset <= start_pos)
1948 .saturating_sub(1);
1949 let line_num = line_idx + 1;
1950 let col_start = start_pos - lines[line_idx].byte_offset;
1951
1952 let end_line_idx = lines
1954 .partition_point(|line| line.byte_offset <= end_pos)
1955 .saturating_sub(1);
1956 let col_end = end_pos - lines[end_line_idx].byte_offset;
1957
1958 code_spans.push(CodeSpan {
1959 line: line_num,
1960 start_col: col_start,
1961 end_col: col_end,
1962 byte_offset: start_pos,
1963 byte_end: end_pos,
1964 backtick_count,
1965 content: span_content,
1966 });
1967 }
1968 }
1969
1970 code_spans.sort_by_key(|span| span.byte_offset);
1972
1973 code_spans
1974 }
1975
1976 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1978 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1981 let mut last_list_item_line = 0;
1982 let mut current_indent_level = 0;
1983 let mut last_marker_width = 0;
1984
1985 for (line_idx, line_info) in lines.iter().enumerate() {
1986 let line_num = line_idx + 1;
1987
1988 if line_info.in_code_block {
1990 if let Some(ref mut block) = current_block {
1991 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1993
1994 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1996
1997 match context {
1998 CodeBlockContext::Indented => {
1999 block.end_line = line_num;
2001 continue;
2002 }
2003 CodeBlockContext::Standalone => {
2004 let completed_block = current_block.take().unwrap();
2006 list_blocks.push(completed_block);
2007 continue;
2008 }
2009 CodeBlockContext::Adjacent => {
2010 block.end_line = line_num;
2012 continue;
2013 }
2014 }
2015 } else {
2016 continue;
2018 }
2019 }
2020
2021 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2023 caps.get(0).unwrap().as_str().to_string()
2024 } else {
2025 String::new()
2026 };
2027
2028 if let Some(list_item) = &line_info.list_item {
2030 let item_indent = list_item.marker_column;
2032 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
2035 let is_nested = nesting > block.nesting_level;
2039 let same_type =
2040 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2041 let same_context = block.blockquote_prefix == blockquote_prefix;
2042 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
2046 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2047
2048 let has_non_list_content = {
2050 let mut found_non_list = false;
2051 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2053
2054 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2056 let last_line = &lines[block_last_item_line - 1];
2057 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2058 log::debug!(
2059 "After problematic line {}: checking lines {} to {} for non-list content",
2060 block_last_item_line,
2061 block_last_item_line + 1,
2062 line_num
2063 );
2064 if line_num == block_last_item_line + 1 {
2066 log::debug!("Lines are consecutive, no content between");
2067 }
2068 }
2069 }
2070
2071 for check_line in (block_last_item_line + 1)..line_num {
2072 let check_idx = check_line - 1;
2073 if check_idx < lines.len() {
2074 let check_info = &lines[check_idx];
2075 let is_list_breaking_content = if check_info.in_code_block {
2077 let last_item_marker_width =
2079 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2080 lines[block_last_item_line - 1]
2081 .list_item
2082 .as_ref()
2083 .map(|li| {
2084 if li.is_ordered {
2085 li.marker.len() + 1 } else {
2087 li.marker.len()
2088 }
2089 })
2090 .unwrap_or(3) } else {
2092 3 };
2094
2095 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2096
2097 let context = CodeBlockUtils::analyze_code_block_context(
2099 lines,
2100 check_line - 1,
2101 min_continuation,
2102 );
2103
2104 matches!(context, CodeBlockContext::Standalone)
2106 } else if !check_info.is_blank && check_info.list_item.is_none() {
2107 let line_content = check_info.content.trim();
2109
2110 if check_info.heading.is_some()
2112 || line_content.starts_with("---")
2113 || line_content.starts_with("***")
2114 || line_content.starts_with("___")
2115 || (line_content.contains('|')
2116 && !line_content.contains("](")
2117 && !line_content.contains("http")
2118 && (line_content.matches('|').count() > 1
2119 || line_content.starts_with('|')
2120 || line_content.ends_with('|')))
2121 || line_content.starts_with(">")
2122 {
2123 true
2124 }
2125 else {
2127 let last_item_marker_width =
2128 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2129 lines[block_last_item_line - 1]
2130 .list_item
2131 .as_ref()
2132 .map(|li| {
2133 if li.is_ordered {
2134 li.marker.len() + 1 } else {
2136 li.marker.len()
2137 }
2138 })
2139 .unwrap_or(3) } else {
2141 3 };
2143
2144 let min_continuation =
2145 if block.is_ordered { last_item_marker_width } else { 2 };
2146 check_info.indent < min_continuation
2147 }
2148 } else {
2149 false
2150 };
2151
2152 if is_list_breaking_content {
2153 found_non_list = true;
2155 break;
2156 }
2157 }
2158 }
2159 found_non_list
2160 };
2161
2162 let mut continues_list = if is_nested {
2166 same_context && reasonable_distance && !has_non_list_content
2168 } else {
2169 let result = same_type
2171 && same_context
2172 && reasonable_distance
2173 && marker_compatible
2174 && !has_non_list_content;
2175
2176 if block.item_lines.last().is_some_and(|&last_line| {
2178 last_line > 0
2179 && last_line <= lines.len()
2180 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2181 && lines[last_line - 1].content.contains(r"\`")
2182 }) {
2183 log::debug!(
2184 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2185 );
2186 if line_num > 0 && line_num <= lines.len() {
2187 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2188 }
2189 }
2190
2191 result
2192 };
2193
2194 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2197 if block.item_lines.contains(&(line_num - 1)) {
2199 continues_list = true;
2201 }
2202 }
2203
2204 if continues_list {
2205 block.end_line = line_num;
2207 block.item_lines.push(line_num);
2208
2209 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2211 list_item.marker.len() + 1
2212 } else {
2213 list_item.marker.len()
2214 });
2215
2216 if !block.is_ordered
2218 && block.marker.is_some()
2219 && block.marker.as_ref() != Some(&list_item.marker)
2220 {
2221 block.marker = None;
2223 }
2224 } else {
2225 list_blocks.push(block.clone());
2228
2229 *block = ListBlock {
2230 start_line: line_num,
2231 end_line: line_num,
2232 is_ordered: list_item.is_ordered,
2233 marker: if list_item.is_ordered {
2234 None
2235 } else {
2236 Some(list_item.marker.clone())
2237 },
2238 blockquote_prefix: blockquote_prefix.clone(),
2239 item_lines: vec![line_num],
2240 nesting_level: nesting,
2241 max_marker_width: if list_item.is_ordered {
2242 list_item.marker.len() + 1
2243 } else {
2244 list_item.marker.len()
2245 },
2246 };
2247 }
2248 } else {
2249 current_block = Some(ListBlock {
2251 start_line: line_num,
2252 end_line: line_num,
2253 is_ordered: list_item.is_ordered,
2254 marker: if list_item.is_ordered {
2255 None
2256 } else {
2257 Some(list_item.marker.clone())
2258 },
2259 blockquote_prefix,
2260 item_lines: vec![line_num],
2261 nesting_level: nesting,
2262 max_marker_width: list_item.marker.len(),
2263 });
2264 }
2265
2266 last_list_item_line = line_num;
2267 current_indent_level = item_indent;
2268 last_marker_width = if list_item.is_ordered {
2269 list_item.marker.len() + 1 } else {
2271 list_item.marker.len()
2272 };
2273 } else if let Some(ref mut block) = current_block {
2274 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2284 lines[block.end_line - 1].content.trim_end().ends_with('\\')
2285 } else {
2286 false
2287 };
2288
2289 let min_continuation_indent = if block.is_ordered {
2293 current_indent_level + last_marker_width
2294 } else {
2295 current_indent_level + 2 };
2297
2298 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2299 block.end_line = line_num;
2301 } else if line_info.is_blank {
2302 let mut check_idx = line_idx + 1;
2305 let mut found_continuation = false;
2306
2307 while check_idx < lines.len() && lines[check_idx].is_blank {
2309 check_idx += 1;
2310 }
2311
2312 if check_idx < lines.len() {
2313 let next_line = &lines[check_idx];
2314 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2316 found_continuation = true;
2317 }
2318 else if !next_line.in_code_block
2320 && next_line.list_item.is_some()
2321 && let Some(item) = &next_line.list_item
2322 {
2323 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2324 .find(&next_line.content)
2325 .map_or(String::new(), |m| m.as_str().to_string());
2326 if item.marker_column == current_indent_level
2327 && item.is_ordered == block.is_ordered
2328 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2329 {
2330 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2333 if let Some(between_line) = lines.get(idx) {
2334 let trimmed = between_line.content.trim();
2335 if trimmed.is_empty() {
2337 return false;
2338 }
2339 let line_indent =
2341 between_line.content.len() - between_line.content.trim_start().len();
2342
2343 if trimmed.starts_with("```")
2345 || trimmed.starts_with("~~~")
2346 || trimmed.starts_with("---")
2347 || trimmed.starts_with("***")
2348 || trimmed.starts_with("___")
2349 || trimmed.starts_with(">")
2350 || trimmed.contains('|') || between_line.heading.is_some()
2352 {
2353 return true; }
2355
2356 line_indent >= min_continuation_indent
2358 } else {
2359 false
2360 }
2361 });
2362
2363 if block.is_ordered {
2364 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2367 if let Some(between_line) = lines.get(idx) {
2368 let trimmed = between_line.content.trim();
2369 if trimmed.is_empty() {
2370 return false;
2371 }
2372 trimmed.starts_with("```")
2374 || trimmed.starts_with("~~~")
2375 || trimmed.starts_with("---")
2376 || trimmed.starts_with("***")
2377 || trimmed.starts_with("___")
2378 || trimmed.starts_with(">")
2379 || trimmed.contains('|') || between_line.heading.is_some()
2381 } else {
2382 false
2383 }
2384 });
2385 found_continuation = !has_structural_separators;
2386 } else {
2387 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2389 if let Some(between_line) = lines.get(idx) {
2390 let trimmed = between_line.content.trim();
2391 if trimmed.is_empty() {
2392 return false;
2393 }
2394 trimmed.starts_with("```")
2396 || trimmed.starts_with("~~~")
2397 || trimmed.starts_with("---")
2398 || trimmed.starts_with("***")
2399 || trimmed.starts_with("___")
2400 || trimmed.starts_with(">")
2401 || trimmed.contains('|') || between_line.heading.is_some()
2403 } else {
2404 false
2405 }
2406 });
2407 found_continuation = !has_structural_separators;
2408 }
2409 }
2410 }
2411 }
2412
2413 if found_continuation {
2414 block.end_line = line_num;
2416 } else {
2417 list_blocks.push(block.clone());
2419 current_block = None;
2420 }
2421 } else {
2422 let min_required_indent = if block.is_ordered {
2425 current_indent_level + last_marker_width
2426 } else {
2427 current_indent_level + 2
2428 };
2429
2430 let line_content = line_info.content.trim();
2435 let is_structural_separator = line_info.heading.is_some()
2436 || line_content.starts_with("```")
2437 || line_content.starts_with("~~~")
2438 || line_content.starts_with("---")
2439 || line_content.starts_with("***")
2440 || line_content.starts_with("___")
2441 || line_content.starts_with(">")
2442 || (line_content.contains('|')
2443 && !line_content.contains("](")
2444 && !line_content.contains("http")
2445 && (line_content.matches('|').count() > 1
2446 || line_content.starts_with('|')
2447 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2452 && !line_info.is_blank
2453 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2454
2455 if is_lazy_continuation {
2456 let content_to_check = if !blockquote_prefix.is_empty() {
2459 line_info
2461 .content
2462 .strip_prefix(&blockquote_prefix)
2463 .unwrap_or(&line_info.content)
2464 .trim()
2465 } else {
2466 line_info.content.trim()
2467 };
2468
2469 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2470
2471 if starts_with_uppercase && last_list_item_line > 0 {
2474 list_blocks.push(block.clone());
2476 current_block = None;
2477 } else {
2478 block.end_line = line_num;
2480 }
2481 } else {
2482 list_blocks.push(block.clone());
2484 current_block = None;
2485 }
2486 }
2487 }
2488 }
2489
2490 if let Some(block) = current_block {
2492 list_blocks.push(block);
2493 }
2494
2495 merge_adjacent_list_blocks(&mut list_blocks, lines);
2497
2498 list_blocks
2499 }
2500
2501 fn compute_char_frequency(content: &str) -> CharFrequency {
2503 let mut frequency = CharFrequency::default();
2504
2505 for ch in content.chars() {
2506 match ch {
2507 '#' => frequency.hash_count += 1,
2508 '*' => frequency.asterisk_count += 1,
2509 '_' => frequency.underscore_count += 1,
2510 '-' => frequency.hyphen_count += 1,
2511 '+' => frequency.plus_count += 1,
2512 '>' => frequency.gt_count += 1,
2513 '|' => frequency.pipe_count += 1,
2514 '[' => frequency.bracket_count += 1,
2515 '`' => frequency.backtick_count += 1,
2516 '<' => frequency.lt_count += 1,
2517 '!' => frequency.exclamation_count += 1,
2518 '\n' => frequency.newline_count += 1,
2519 _ => {}
2520 }
2521 }
2522
2523 frequency
2524 }
2525
2526 fn parse_html_tags(
2528 content: &str,
2529 lines: &[LineInfo],
2530 code_blocks: &[(usize, usize)],
2531 flavor: MarkdownFlavor,
2532 ) -> Vec<HtmlTag> {
2533 lazy_static! {
2534 static ref HTML_TAG_REGEX: regex::Regex =
2535 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2536 }
2537
2538 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2539
2540 for cap in HTML_TAG_REGEX.captures_iter(content) {
2541 let full_match = cap.get(0).unwrap();
2542 let match_start = full_match.start();
2543 let match_end = full_match.end();
2544
2545 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2547 continue;
2548 }
2549
2550 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2551 let tag_name_original = cap.get(2).unwrap().as_str();
2552 let tag_name = tag_name_original.to_lowercase();
2553 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2554
2555 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2558 continue;
2559 }
2560
2561 let mut line_num = 1;
2563 let mut col_start = match_start;
2564 let mut col_end = match_end;
2565 for (idx, line_info) in lines.iter().enumerate() {
2566 if match_start >= line_info.byte_offset {
2567 line_num = idx + 1;
2568 col_start = match_start - line_info.byte_offset;
2569 col_end = match_end - line_info.byte_offset;
2570 } else {
2571 break;
2572 }
2573 }
2574
2575 html_tags.push(HtmlTag {
2576 line: line_num,
2577 start_col: col_start,
2578 end_col: col_end,
2579 byte_offset: match_start,
2580 byte_end: match_end,
2581 tag_name,
2582 is_closing,
2583 is_self_closing,
2584 raw_content: full_match.as_str().to_string(),
2585 });
2586 }
2587
2588 html_tags
2589 }
2590
2591 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2593 lazy_static! {
2594 static ref EMPHASIS_REGEX: regex::Regex =
2595 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2596 }
2597
2598 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2599
2600 for cap in EMPHASIS_REGEX.captures_iter(content) {
2601 let full_match = cap.get(0).unwrap();
2602 let match_start = full_match.start();
2603 let match_end = full_match.end();
2604
2605 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2607 continue;
2608 }
2609
2610 let opening_markers = cap.get(1).unwrap().as_str();
2611 let content_part = cap.get(2).unwrap().as_str();
2612 let closing_markers = cap.get(3).unwrap().as_str();
2613
2614 if opening_markers.chars().next() != closing_markers.chars().next()
2616 || opening_markers.len() != closing_markers.len()
2617 {
2618 continue;
2619 }
2620
2621 let marker = opening_markers.chars().next().unwrap();
2622 let marker_count = opening_markers.len();
2623
2624 let mut line_num = 1;
2626 let mut col_start = match_start;
2627 let mut col_end = match_end;
2628 for (idx, line_info) in lines.iter().enumerate() {
2629 if match_start >= line_info.byte_offset {
2630 line_num = idx + 1;
2631 col_start = match_start - line_info.byte_offset;
2632 col_end = match_end - line_info.byte_offset;
2633 } else {
2634 break;
2635 }
2636 }
2637
2638 emphasis_spans.push(EmphasisSpan {
2639 line: line_num,
2640 start_col: col_start,
2641 end_col: col_end,
2642 byte_offset: match_start,
2643 byte_end: match_end,
2644 marker,
2645 marker_count,
2646 content: content_part.to_string(),
2647 });
2648 }
2649
2650 emphasis_spans
2651 }
2652
2653 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2655 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2656
2657 for (line_idx, line_info) in lines.iter().enumerate() {
2658 if line_info.in_code_block || line_info.is_blank {
2660 continue;
2661 }
2662
2663 let line = &line_info.content;
2664 let line_num = line_idx + 1;
2665
2666 if !line.contains('|') {
2668 continue;
2669 }
2670
2671 let parts: Vec<&str> = line.split('|').collect();
2673 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2674
2675 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2677 let mut column_alignments = Vec::new();
2678
2679 if is_separator {
2680 for part in &parts[1..parts.len() - 1] {
2681 let trimmed = part.trim();
2683 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2684 "center".to_string()
2685 } else if trimmed.ends_with(':') {
2686 "right".to_string()
2687 } else if trimmed.starts_with(':') {
2688 "left".to_string()
2689 } else {
2690 "none".to_string()
2691 };
2692 column_alignments.push(alignment);
2693 }
2694 }
2695
2696 table_rows.push(TableRow {
2697 line: line_num,
2698 is_separator,
2699 column_count,
2700 column_alignments,
2701 });
2702 }
2703
2704 table_rows
2705 }
2706
2707 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2709 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2710
2711 for cap in BARE_URL_PATTERN.captures_iter(content) {
2713 let full_match = cap.get(0).unwrap();
2714 let match_start = full_match.start();
2715 let match_end = full_match.end();
2716
2717 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2719 continue;
2720 }
2721
2722 let preceding_char = if match_start > 0 {
2724 content.chars().nth(match_start - 1)
2725 } else {
2726 None
2727 };
2728 let following_char = content.chars().nth(match_end);
2729
2730 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2731 continue;
2732 }
2733 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2734 continue;
2735 }
2736
2737 let url = full_match.as_str();
2738 let url_type = if url.starts_with("https://") {
2739 "https"
2740 } else if url.starts_with("http://") {
2741 "http"
2742 } else if url.starts_with("ftp://") {
2743 "ftp"
2744 } else {
2745 "other"
2746 };
2747
2748 let mut line_num = 1;
2750 let mut col_start = match_start;
2751 let mut col_end = match_end;
2752 for (idx, line_info) in lines.iter().enumerate() {
2753 if match_start >= line_info.byte_offset {
2754 line_num = idx + 1;
2755 col_start = match_start - line_info.byte_offset;
2756 col_end = match_end - line_info.byte_offset;
2757 } else {
2758 break;
2759 }
2760 }
2761
2762 bare_urls.push(BareUrl {
2763 line: line_num,
2764 start_col: col_start,
2765 end_col: col_end,
2766 byte_offset: match_start,
2767 byte_end: match_end,
2768 url: url.to_string(),
2769 url_type: url_type.to_string(),
2770 });
2771 }
2772
2773 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2775 let full_match = cap.get(0).unwrap();
2776 let match_start = full_match.start();
2777 let match_end = full_match.end();
2778
2779 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2781 continue;
2782 }
2783
2784 let preceding_char = if match_start > 0 {
2786 content.chars().nth(match_start - 1)
2787 } else {
2788 None
2789 };
2790 let following_char = content.chars().nth(match_end);
2791
2792 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2793 continue;
2794 }
2795 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2796 continue;
2797 }
2798
2799 let email = full_match.as_str();
2800
2801 let mut line_num = 1;
2803 let mut col_start = match_start;
2804 let mut col_end = match_end;
2805 for (idx, line_info) in lines.iter().enumerate() {
2806 if match_start >= line_info.byte_offset {
2807 line_num = idx + 1;
2808 col_start = match_start - line_info.byte_offset;
2809 col_end = match_end - line_info.byte_offset;
2810 } else {
2811 break;
2812 }
2813 }
2814
2815 bare_urls.push(BareUrl {
2816 line: line_num,
2817 start_col: col_start,
2818 end_col: col_end,
2819 byte_offset: match_start,
2820 byte_end: match_end,
2821 url: email.to_string(),
2822 url_type: "email".to_string(),
2823 });
2824 }
2825
2826 bare_urls
2827 }
2828}
2829
2830fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2832 if list_blocks.len() < 2 {
2833 return;
2834 }
2835
2836 let mut merger = ListBlockMerger::new(lines);
2837 *list_blocks = merger.merge(list_blocks);
2838}
2839
2840struct ListBlockMerger<'a> {
2842 lines: &'a [LineInfo],
2843}
2844
2845impl<'a> ListBlockMerger<'a> {
2846 fn new(lines: &'a [LineInfo]) -> Self {
2847 Self { lines }
2848 }
2849
2850 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2851 let mut merged = Vec::with_capacity(list_blocks.len());
2852 let mut current = list_blocks[0].clone();
2853
2854 for next in list_blocks.iter().skip(1) {
2855 if self.should_merge_blocks(¤t, next) {
2856 current = self.merge_two_blocks(current, next);
2857 } else {
2858 merged.push(current);
2859 current = next.clone();
2860 }
2861 }
2862
2863 merged.push(current);
2864 merged
2865 }
2866
2867 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2869 if !self.blocks_are_compatible(current, next) {
2871 return false;
2872 }
2873
2874 let spacing = self.analyze_spacing_between(current, next);
2876 match spacing {
2877 BlockSpacing::Consecutive => true,
2878 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2879 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2880 self.can_merge_with_content_between(current, next)
2881 }
2882 }
2883 }
2884
2885 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2887 current.is_ordered == next.is_ordered
2888 && current.blockquote_prefix == next.blockquote_prefix
2889 && current.nesting_level == next.nesting_level
2890 }
2891
2892 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2894 let gap = next.start_line - current.end_line;
2895
2896 match gap {
2897 1 => BlockSpacing::Consecutive,
2898 2 => BlockSpacing::SingleBlank,
2899 _ if gap > 2 => {
2900 if self.has_only_blank_lines_between(current, next) {
2901 BlockSpacing::MultipleBlanks
2902 } else {
2903 BlockSpacing::ContentBetween
2904 }
2905 }
2906 _ => BlockSpacing::Consecutive, }
2908 }
2909
2910 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2912 if has_meaningful_content_between(current, next, self.lines) {
2915 return false; }
2917
2918 !current.is_ordered && current.marker == next.marker
2920 }
2921
2922 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2924 if has_meaningful_content_between(current, next, self.lines) {
2926 return false; }
2928
2929 current.is_ordered && next.is_ordered
2931 }
2932
2933 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2935 for line_num in (current.end_line + 1)..next.start_line {
2936 if let Some(line_info) = self.lines.get(line_num - 1)
2937 && !line_info.content.trim().is_empty()
2938 {
2939 return false;
2940 }
2941 }
2942 true
2943 }
2944
2945 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2947 current.end_line = next.end_line;
2948 current.item_lines.extend_from_slice(&next.item_lines);
2949
2950 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2952
2953 if !current.is_ordered && self.markers_differ(¤t, next) {
2955 current.marker = None; }
2957
2958 current
2959 }
2960
2961 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2963 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2964 }
2965}
2966
/// How far apart two list blocks are, as classified by
/// `ListBlockMerger::analyze_spacing_between` from the difference between the
/// second block's `start_line` and the first block's `end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends
    /// (difference of 1); a difference of 0 is classified the same way.
    Consecutive,
    /// Exactly one line lies between the blocks (difference of 2). The
    /// classification itself does not inspect that line.
    SingleBlank,
    /// More than one line lies between the blocks, and every such line found
    /// in the line table is empty after trimming.
    MultipleBlanks,
    /// More than one line lies between the blocks, and at least one of them
    /// is not blank.
    ContentBetween,
}
2975
2976fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2978 for line_num in (current.end_line + 1)..next.start_line {
2980 if let Some(line_info) = lines.get(line_num - 1) {
2981 let trimmed = line_info.content.trim();
2983
2984 if trimmed.is_empty() {
2986 continue;
2987 }
2988
2989 if line_info.heading.is_some() {
2993 return true; }
2995
2996 if is_horizontal_rule(trimmed) {
2998 return true; }
3000
3001 if trimmed.contains('|') && trimmed.len() > 1 {
3004 if !trimmed.contains("](") && !trimmed.contains("http") {
3006 let pipe_count = trimmed.matches('|').count();
3008 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3009 return true; }
3011 }
3012 }
3013
3014 if trimmed.starts_with('>') {
3016 return true; }
3018
3019 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3021 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3022
3023 let min_continuation_indent = if current.is_ordered {
3025 current.nesting_level + current.max_marker_width + 1 } else {
3027 current.nesting_level + 2
3028 };
3029
3030 if line_indent < min_continuation_indent {
3031 return true; }
3034 }
3035
3036 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3038
3039 let min_indent = if current.is_ordered {
3041 current.nesting_level + current.max_marker_width
3042 } else {
3043 current.nesting_level + 2
3044 };
3045
3046 if line_indent < min_indent {
3048 return true; }
3050
3051 }
3054 }
3055
3056 false
3058}
3059
/// Returns `true` when `trimmed` consists solely of one repeated marker
/// character (`-`, `*` or `_`), optionally interleaved with spaces or tabs,
/// and the marker occurs at least three times.
///
/// The first character must itself be the marker, so callers are expected to
/// pass text with leading whitespace already removed.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Three markers are the minimum, so anything shorter cannot qualify.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if !matches!(ch, ' ' | '\t') {
            // Anything other than the marker or horizontal whitespace disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
3083
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has no lines but still exposes a single line offset.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A document without a newline is one line starting at offset 0.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected 1-based (line, column))
        let expectations = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Per-line metadata: content, byte offset, indentation and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces; the assertions below depend on it.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // The 1-based accessors must agree with the direct indexing above.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered, nested and ordered list items are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet is indented by two spaces so that its marker sits in column 2.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that land on newline characters and one past the last byte.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // (byte offset, expected 1-based (line, column))
        let expectations = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// Outside the MDX flavor the same `import`/`export` lines are ordinary text.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}