1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use pulldown_cmark::{Event, Parser};
6use regex::Regex;
7
// Regular expressions shared by the parsing passes in this module. Each is compiled once on
// first use.
lazy_static! {
    // Inline or reference link: `[text](url "title")`, `[text](<url>)` or `[text][ref]`.
    // Verbose mode (`x`) allows the in-pattern comments; `s` lets `.` match newlines.
    static ref LINK_PATTERN: Regex = Regex::new(
        r#"(?sx)
 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap();

    // Same shape as LINK_PATTERN but anchored on a leading `!` (image syntax).
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r#"(?sx)
 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap();

    // Reference definition `[id]: url "title"` with at most three leading spaces.
    // Group 1 = id, group 2 = URL, group 3/4 = double-/single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks (a code-span delimiter candidate).
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http`, `https` or `ftp` URL with optional port, path, query and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address: local part, `@`, domain, and a TLD of at least two letters.
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Angle-bracket autolink: `<scheme://...>` or `<user@host.tld>`; group 1 is the inner text.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Blockquote prefix: leading whitespace, one or more `>`, then trailing whitespace.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Per-line facts gathered while a `LintContext` is being built.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines()` (line terminator removed).
    pub content: String,
    /// Byte offset of the start of this line within the whole document.
    pub byte_offset: usize,
    /// Number of bytes of leading whitespace on the line.
    pub indent: usize,
    /// True when the line is empty after trimming; for a `>`-prefixed line the text after
    /// the blockquote prefix is tested instead.
    pub is_blank: bool,
    /// True when the line falls inside one of the detected code-block byte ranges.
    pub in_code_block: bool,
    /// True when the zero-based line index is below the front-matter end line.
    pub in_front_matter: bool,
    /// Initialised to `false`; presumably set afterwards by the HTML-block detection pass.
    pub in_html_block: bool,
    /// True when the start of the line lies inside an HTML comment range.
    pub in_html_comment: bool,
    /// Marker details when the line starts a list item, otherwise `None`.
    pub list_item: Option<ListItemInfo>,
    /// Initialised to `None`; presumably filled in by the heading/blockquote pass.
    pub heading: Option<HeadingInfo>,
    /// Initialised to `None`; presumably filled in by the heading/blockquote pass.
    pub blockquote: Option<BlockquoteInfo>,
    /// True (MkDocs flavor only) when the line start lies inside an autodoc block range.
    pub in_mkdocstrings: bool,
    /// Initialised to `false`; presumably set afterwards by the ESM-block detection pass.
    pub in_esm_block: bool,
}
93
/// Marker details for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*` or `+` for bullets, or the digits plus `.`/`)` for ordered items.
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// Parsed item number for ordered items (`None` for bullets or if parsing fails).
    pub number: Option<usize>,
    /// Byte column of the marker, counted from the line start (includes any blockquote prefix).
    pub marker_column: usize,
    /// Byte column where the item text starts (marker column + marker length + spacing).
    pub content_column: usize,
}
108
/// Syntactic form of a heading.
// NOTE(review): the code assigning these variants is outside this view; the variant notes
// below are inferred from the names and should be confirmed.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Setext heading, presumably the `=` underline form (level 1) — TODO confirm.
    Setext1,
    /// Setext heading, presumably the `-` underline form (level 2) — TODO confirm.
    Setext2,
}
119
/// A link found by `LINK_PATTERN`, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-indexed line on which the link starts.
    pub line: usize,
    /// 0-indexed byte column of the link start within its line.
    pub start_col: usize,
    /// 0-indexed byte column of the link end, relative to the line containing the end.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the link within the document.
    pub byte_end: usize,
    /// Link text (contents of the first pair of brackets).
    pub text: String,
    /// Destination for inline links; empty for reference links.
    pub url: String,
    /// True for `[text][ref]` style links.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased link text when the id is empty.
    pub reference_id: Option<String>,
}
142
/// An image found by `IMAGE_PATTERN`, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-indexed line on which the image starts.
    pub line: usize,
    /// 0-indexed byte column of the image start within its line.
    pub start_col: usize,
    /// 0-indexed byte column of the image end, relative to the line containing the end.
    pub end_col: usize,
    /// Byte offset of the leading `!` within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the image within the document.
    pub byte_end: usize,
    /// Alt text (contents of the first pair of brackets).
    pub alt_text: String,
    /// Source URL for inline images; empty for reference images.
    pub url: String,
    /// True for `![alt][ref]` style images.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased alt text when the id is empty.
    pub reference_id: Option<String>,
}
165
/// A reference definition line such as `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-indexed line of the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Destination URL exactly as written.
    pub url: String,
    /// Optional title (without the surrounding quotes).
    pub title: Option<String>,
    /// Byte offset of the definition start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the definition within the document.
    pub byte_end: usize,
}
182
/// An inline code span.
// NOTE(review): spans are produced by `parse_code_spans`, which is outside this view.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-indexed line numbers in `is_in_code_span`).
    pub line: usize,
    /// Start column; `is_in_code_span` treats it as 0-indexed and inclusive.
    pub start_col: usize,
    /// End column; `is_in_code_span` treats it as exclusive.
    pub end_col: usize,
    /// Byte offset of the span start within the document (inclusive).
    pub byte_offset: usize,
    /// Byte offset of the span end within the document (exclusive).
    pub byte_end: usize,
    /// Presumably the length of the backtick delimiter run — TODO confirm.
    pub backtick_count: usize,
    /// Presumably the text between the delimiters — TODO confirm.
    pub content: String,
}
201
/// Details of a heading line.
// NOTE(review): the code that fills this struct is outside this view; the field notes are
// inferred from names and types and should be confirmed against the heading pass.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level.
    pub level: u8,
    /// Syntactic form (ATX or Setext).
    pub style: HeadingStyle,
    /// Marker text, presumably the `#` run or the underline — TODO confirm.
    pub marker: String,
    /// Column of the marker.
    pub marker_column: usize,
    /// Column where the heading text starts.
    pub content_column: usize,
    /// Heading text, presumably with any custom id removed — TODO confirm.
    pub text: String,
    /// Custom id attached to the heading, when present.
    pub custom_id: Option<String>,
    /// Heading text as written, presumably before any clean-up — TODO confirm.
    pub raw_text: String,
    /// Whether a closing sequence is present.
    pub has_closing_sequence: bool,
    /// The closing sequence text.
    pub closing_sequence: String,
}
226
/// Details of a blockquote line.
// NOTE(review): only the first lines of the code that builds this struct are visible; notes
// marked "presumably" should be confirmed.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth; the detection pass derives it from the number of `>` markers.
    pub nesting_level: usize,
    /// Whitespace before the first `>`.
    pub indent: String,
    /// Column of the first `>`; the detection pass derives it from the indent length.
    pub marker_column: usize,
    /// Indent, markers and following whitespace concatenated.
    pub prefix: String,
    /// Presumably the text after the prefix — TODO confirm.
    pub content: String,
    /// Presumably true when text follows the markers with no space — TODO confirm.
    pub has_no_space_after_marker: bool,
    /// Presumably true when more than one space follows the markers — TODO confirm.
    pub has_multiple_spaces_after_marker: bool,
    /// Flag consumed by rule MD028; exact semantics are not visible here.
    pub needs_md028_fix: bool,
}
247
/// A contiguous group of list lines.
// NOTE(review): blocks are produced by `parse_list_blocks`, which is outside this view.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive bound in `is_in_list_block`).
    pub start_line: usize,
    /// Last line of the block (inclusive bound in `is_in_list_block`).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// Marker associated with the block, if any — exact meaning to be confirmed.
    pub marker: Option<String>,
    /// Blockquote prefix shared by the block's lines, presumably empty outside blockquotes.
    pub blockquote_prefix: String,
    /// Line numbers of the list items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Widest marker seen in the block.
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Occurrence counts for Markdown-significant characters; these back `has_char`,
/// `char_count` and the `likely_has_*` heuristics.
// NOTE(review): the counting itself happens in `compute_char_frequency`, outside this view.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
299
/// An HTML tag found in the document.
// NOTE(review): tags are produced by `parse_html_tags`, which is outside this view.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (matched against `line_num` in `html_tags_on_line`).
    pub line: usize,
    /// Start column of the tag.
    pub start_col: usize,
    /// End column of the tag.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Tag name.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// The tag text as written.
    pub raw_content: String,
}
322
/// An emphasis span.
// NOTE(review): spans are produced by `parse_emphasis_spans`, which is outside this view.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (matched against `line_num` in `emphasis_spans_on_line`).
    pub line: usize,
    /// Start column of the span.
    pub start_col: usize,
    /// End column of the span.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Delimiter character, presumably `*` or `_` — TODO confirm.
    pub marker: char,
    /// Number of delimiter characters on each side, presumably 1 for emphasis and 2 for strong.
    pub marker_count: usize,
    /// Text between the delimiters.
    pub content: String,
}
343
/// A table row.
// NOTE(review): rows are produced by `parse_table_rows`, which is outside this view.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (matched against `line_num` in `table_rows_on_line`).
    pub line: usize,
    /// Whether this row is the header/body separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Per-column alignment labels; the exact strings used are not visible here.
    pub column_alignments: Vec<String>, }
356
/// A URL or e-mail address that appears outside link syntax.
// NOTE(review): entries are produced by `parse_bare_urls`, which is outside this view.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (matched against `line_num` in `bare_urls_on_line`).
    pub line: usize,
    /// Start column of the URL.
    pub start_col: usize,
    /// End column of the URL.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched text.
    pub url: String,
    /// Kind label for the match; the set of values used is not visible here.
    pub url_type: String,
}
375
/// Pre-parsed view of a Markdown document, built once by `LintContext::new`.
pub struct LintContext<'a> {
    /// The document text being linted.
    pub content: &'a str,
    /// Byte offset of the start of every line: `0`, then one entry after each `\n`.
    pub line_offsets: Vec<usize>,
    /// Byte ranges returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// One entry per line of `content`.
    pub lines: Vec<LineInfo>,
    /// Links outside code blocks, code spans and HTML comments.
    pub links: Vec<ParsedLink>,
    /// Images outside code blocks, code spans and HTML comments.
    pub images: Vec<ParsedImage>,
    /// Reference definitions found outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    // Code spans; already populated by `new`, re-parsed by `code_spans()` only if empty.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks computed from `lines`.
    pub list_blocks: Vec<ListBlock>,
    /// Character counts used for cheap "is this syntax present at all" checks.
    pub char_frequency: CharFrequency,
    // Filled on the first call to `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    // Filled on the first call to `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    // Filled on the first call to `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    // Filled on the first call to `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    // Byte ranges covered by HTML comments.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found by `TableUtils::find_table_blocks_with_code_info`.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built from an owned copy of `content`.
    pub line_index: crate::utils::range_utils::LineIndex,
    // Byte ranges returned by `jinja_utils::find_jinja_ranges`.
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
397
/// Borrowed pieces of a blockquote line, as split by `parse_blockquote_detailed`.
struct BlockquoteComponents<'a> {
    /// Spaces and tabs before the first `>`.
    indent: &'a str,
    /// The consecutive run of `>` characters.
    markers: &'a str,
    /// Spaces and tabs immediately after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}
405
406#[inline]
408fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
409 let bytes = line.as_bytes();
410 let mut pos = 0;
411
412 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
414 pos += 1;
415 }
416 let indent_end = pos;
417
418 if pos >= bytes.len() || bytes[pos] != b'>' {
420 return None;
421 }
422
423 while pos < bytes.len() && bytes[pos] == b'>' {
425 pos += 1;
426 }
427 let markers_end = pos;
428
429 while pos < bytes.len() && (bytes[pos] == b' ' || bytes[pos] == b'\t') {
431 pos += 1;
432 }
433 let spaces_end = pos;
434
435 Some(BlockquoteComponents {
436 indent: &line[0..indent_end],
437 markers: &line[indent_end..markers_end],
438 spaces_after: &line[markers_end..spaces_end],
439 content: &line[spaces_end..],
440 })
441}
442
443impl<'a> LintContext<'a> {
444 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
445 use std::time::Instant;
446 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
447
448 let start = Instant::now();
449 let mut line_offsets = vec![0];
450 for (i, c) in content.char_indices() {
451 if c == '\n' {
452 line_offsets.push(i + 1);
453 }
454 }
455 if profile {
456 eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
457 }
458
459 let start = Instant::now();
461 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
462 if profile {
463 eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
464 }
465
466 let start = Instant::now();
468 let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
469 if profile {
470 eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
471 }
472
473 let start = Instant::now();
475 let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
476 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
477 } else {
478 Vec::new()
479 };
480 if profile {
481 eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
482 }
483
484 let start = Instant::now();
486 let mut lines = Self::compute_basic_line_info(
487 content,
488 &line_offsets,
489 &code_blocks,
490 flavor,
491 &html_comment_ranges,
492 &autodoc_ranges,
493 );
494 if profile {
495 eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
496 }
497
498 let start = Instant::now();
500 Self::detect_html_blocks(&mut lines);
501 if profile {
502 eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
503 }
504
505 let start = Instant::now();
507 Self::detect_esm_blocks(&mut lines, flavor);
508 if profile {
509 eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
510 }
511
512 let start = Instant::now();
514 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
515 if profile {
516 eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
517 }
518
519 let start = Instant::now();
521 let code_spans = Self::parse_code_spans(content, &lines);
522 if profile {
523 eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
524 }
525
526 let start = Instant::now();
528 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
529 if profile {
530 eprintln!("[PROFILE] Links: {:?}", start.elapsed());
531 }
532
533 let start = Instant::now();
534 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
535 if profile {
536 eprintln!("[PROFILE] Images: {:?}", start.elapsed());
537 }
538
539 let start = Instant::now();
540 let reference_defs = Self::parse_reference_defs(content, &lines);
541 if profile {
542 eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
543 }
544
545 let start = Instant::now();
546 let list_blocks = Self::parse_list_blocks(&lines);
547 if profile {
548 eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
549 }
550
551 let start = Instant::now();
553 let char_frequency = Self::compute_char_frequency(content);
554 if profile {
555 eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
556 }
557
558 let start = Instant::now();
560 let table_blocks =
561 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(content, &code_blocks, &code_spans);
562 if profile {
563 eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
564 }
565
566 let start = Instant::now();
568 let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
569 if profile {
570 eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
571 }
572
573 let start = Instant::now();
575 let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
576 if profile {
577 eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
578 }
579
580 Self {
581 content,
582 line_offsets,
583 code_blocks,
584 lines,
585 links,
586 images,
587 reference_defs,
588 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
589 list_blocks,
590 char_frequency,
591 html_tags_cache: Mutex::new(None),
592 emphasis_spans_cache: Mutex::new(None),
593 table_rows_cache: Mutex::new(None),
594 bare_urls_cache: Mutex::new(None),
595 html_comment_ranges,
596 table_blocks,
597 line_index,
598 jinja_ranges,
599 flavor,
600 }
601 }
602
603 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
605 let mut cache = self.code_spans_cache.lock().unwrap();
606
607 if cache.is_none() {
609 let code_spans = Self::parse_code_spans(self.content, &self.lines);
610 *cache = Some(Arc::new(code_spans));
611 }
612
613 cache.as_ref().unwrap().clone()
615 }
616
617 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
619 let mut cache = self.html_tags_cache.lock().unwrap();
620
621 if cache.is_none() {
622 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks, self.flavor);
623 *cache = Some(Arc::new(html_tags));
624 }
625
626 cache.as_ref().unwrap().clone()
627 }
628
629 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
631 let mut cache = self.emphasis_spans_cache.lock().unwrap();
632
633 if cache.is_none() {
634 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
635 *cache = Some(Arc::new(emphasis_spans));
636 }
637
638 cache.as_ref().unwrap().clone()
639 }
640
641 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
643 let mut cache = self.table_rows_cache.lock().unwrap();
644
645 if cache.is_none() {
646 let table_rows = Self::parse_table_rows(&self.lines);
647 *cache = Some(Arc::new(table_rows));
648 }
649
650 cache.as_ref().unwrap().clone()
651 }
652
653 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
655 let mut cache = self.bare_urls_cache.lock().unwrap();
656
657 if cache.is_none() {
658 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
659 *cache = Some(Arc::new(bare_urls));
660 }
661
662 cache.as_ref().unwrap().clone()
663 }
664
665 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
667 match self.line_offsets.binary_search(&offset) {
668 Ok(line) => (line + 1, 1),
669 Err(line) => {
670 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
671 (line, offset - line_start + 1)
672 }
673 }
674 }
675
676 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
678 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
680 return true;
681 }
682
683 self.code_spans()
685 .iter()
686 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
687 }
688
689 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
691 if line_num > 0 {
692 self.lines.get(line_num - 1)
693 } else {
694 None
695 }
696 }
697
698 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
700 self.line_info(line_num).map(|info| info.byte_offset)
701 }
702
703 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
705 let normalized_id = ref_id.to_lowercase();
706 self.reference_defs
707 .iter()
708 .find(|def| def.id == normalized_id)
709 .map(|def| def.url.as_str())
710 }
711
712 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
714 self.links.iter().filter(|link| link.line == line_num).collect()
715 }
716
717 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
719 self.images.iter().filter(|img| img.line == line_num).collect()
720 }
721
722 pub fn is_in_list_block(&self, line_num: usize) -> bool {
724 self.list_blocks
725 .iter()
726 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
727 }
728
729 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
731 self.list_blocks
732 .iter()
733 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
734 }
735
736 pub fn is_in_code_block(&self, line_num: usize) -> bool {
740 if line_num == 0 || line_num > self.lines.len() {
741 return false;
742 }
743 self.lines[line_num - 1].in_code_block
744 }
745
746 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
748 if line_num == 0 || line_num > self.lines.len() {
749 return false;
750 }
751 self.lines[line_num - 1].in_front_matter
752 }
753
754 pub fn is_in_html_block(&self, line_num: usize) -> bool {
756 if line_num == 0 || line_num > self.lines.len() {
757 return false;
758 }
759 self.lines[line_num - 1].in_html_block
760 }
761
762 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
764 if line_num == 0 || line_num > self.lines.len() {
765 return false;
766 }
767
768 let col_0indexed = if col > 0 { col - 1 } else { 0 };
772 let code_spans = self.code_spans();
773 code_spans
774 .iter()
775 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
776 }
777
778 #[inline]
781 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
782 self.reference_defs
783 .iter()
784 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
785 }
786
787 #[inline]
791 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
792 self.html_comment_ranges
793 .iter()
794 .any(|range| byte_pos >= range.start && byte_pos < range.end)
795 }
796
797 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
799 self.jinja_ranges
800 .iter()
801 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
802 }
803
804 pub fn has_char(&self, ch: char) -> bool {
806 match ch {
807 '#' => self.char_frequency.hash_count > 0,
808 '*' => self.char_frequency.asterisk_count > 0,
809 '_' => self.char_frequency.underscore_count > 0,
810 '-' => self.char_frequency.hyphen_count > 0,
811 '+' => self.char_frequency.plus_count > 0,
812 '>' => self.char_frequency.gt_count > 0,
813 '|' => self.char_frequency.pipe_count > 0,
814 '[' => self.char_frequency.bracket_count > 0,
815 '`' => self.char_frequency.backtick_count > 0,
816 '<' => self.char_frequency.lt_count > 0,
817 '!' => self.char_frequency.exclamation_count > 0,
818 '\n' => self.char_frequency.newline_count > 0,
819 _ => self.content.contains(ch), }
821 }
822
823 pub fn char_count(&self, ch: char) -> usize {
825 match ch {
826 '#' => self.char_frequency.hash_count,
827 '*' => self.char_frequency.asterisk_count,
828 '_' => self.char_frequency.underscore_count,
829 '-' => self.char_frequency.hyphen_count,
830 '+' => self.char_frequency.plus_count,
831 '>' => self.char_frequency.gt_count,
832 '|' => self.char_frequency.pipe_count,
833 '[' => self.char_frequency.bracket_count,
834 '`' => self.char_frequency.backtick_count,
835 '<' => self.char_frequency.lt_count,
836 '!' => self.char_frequency.exclamation_count,
837 '\n' => self.char_frequency.newline_count,
838 _ => self.content.matches(ch).count(), }
840 }
841
842 pub fn likely_has_headings(&self) -> bool {
844 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
846
847 pub fn likely_has_lists(&self) -> bool {
849 self.char_frequency.asterisk_count > 0
850 || self.char_frequency.hyphen_count > 0
851 || self.char_frequency.plus_count > 0
852 }
853
854 pub fn likely_has_emphasis(&self) -> bool {
856 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
857 }
858
859 pub fn likely_has_tables(&self) -> bool {
861 self.char_frequency.pipe_count > 2
862 }
863
864 pub fn likely_has_blockquotes(&self) -> bool {
866 self.char_frequency.gt_count > 0
867 }
868
869 pub fn likely_has_code(&self) -> bool {
871 self.char_frequency.backtick_count > 0
872 }
873
874 pub fn likely_has_links_or_images(&self) -> bool {
876 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
877 }
878
879 pub fn likely_has_html(&self) -> bool {
881 self.char_frequency.lt_count > 0
882 }
883
884 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
886 self.html_tags()
887 .iter()
888 .filter(|tag| tag.line == line_num)
889 .cloned()
890 .collect()
891 }
892
893 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
895 self.emphasis_spans()
896 .iter()
897 .filter(|span| span.line == line_num)
898 .cloned()
899 .collect()
900 }
901
902 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
904 self.table_rows()
905 .iter()
906 .filter(|row| row.line == line_num)
907 .cloned()
908 .collect()
909 }
910
911 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
913 self.bare_urls()
914 .iter()
915 .filter(|url| url.line == line_num)
916 .cloned()
917 .collect()
918 }
919
920 #[inline]
926 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
927 let idx = match lines.binary_search_by(|line| {
929 if byte_offset < line.byte_offset {
930 std::cmp::Ordering::Greater
931 } else if byte_offset > line.byte_offset + line.content.len() {
932 std::cmp::Ordering::Less
933 } else {
934 std::cmp::Ordering::Equal
935 }
936 }) {
937 Ok(idx) => idx,
938 Err(idx) => idx.saturating_sub(1),
939 };
940
941 let line = &lines[idx];
942 let line_num = idx + 1;
943 let col = byte_offset.saturating_sub(line.byte_offset);
944
945 (idx, line_num, col)
946 }
947
948 #[inline]
950 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
951 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
953
954 if idx > 0 {
956 let span = &code_spans[idx - 1];
957 if offset >= span.byte_offset && offset < span.byte_end {
958 return true;
959 }
960 }
961
962 false
963 }
964
965 fn parse_links(
967 content: &str,
968 lines: &[LineInfo],
969 code_blocks: &[(usize, usize)],
970 code_spans: &[CodeSpan],
971 flavor: MarkdownFlavor,
972 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
973 ) -> Vec<ParsedLink> {
974 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
975
976 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
981 let full_match = cap.get(0).unwrap();
982 let match_start = full_match.start();
983 let match_end = full_match.end();
984
985 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
987 continue;
988 }
989
990 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
992 continue;
993 }
994
995 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
997 continue;
998 }
999
1000 if Self::is_offset_in_code_span(code_spans, match_start) {
1002 continue;
1003 }
1004
1005 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1007 continue;
1008 }
1009
1010 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1012
1013 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1015 continue;
1016 }
1017
1018 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1020
1021 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1022
1023 let inline_url = cap.get(2).or_else(|| cap.get(3));
1025
1026 if let Some(url_match) = inline_url {
1027 links.push(ParsedLink {
1029 line: line_num,
1030 start_col: col_start,
1031 end_col: col_end,
1032 byte_offset: match_start,
1033 byte_end: match_end,
1034 text,
1035 url: url_match.as_str().to_string(),
1036 is_reference: false,
1037 reference_id: None,
1038 });
1039 } else if let Some(ref_id) = cap.get(6) {
1040 let ref_id_str = ref_id.as_str();
1042 let normalized_ref = if ref_id_str.is_empty() {
1043 text.to_lowercase() } else {
1045 ref_id_str.to_lowercase()
1046 };
1047
1048 links.push(ParsedLink {
1049 line: line_num,
1050 start_col: col_start,
1051 end_col: col_end,
1052 byte_offset: match_start,
1053 byte_end: match_end,
1054 text,
1055 url: String::new(), is_reference: true,
1057 reference_id: Some(normalized_ref),
1058 });
1059 }
1060 }
1061
1062 links
1063 }
1064
1065 fn parse_images(
1067 content: &str,
1068 lines: &[LineInfo],
1069 code_blocks: &[(usize, usize)],
1070 code_spans: &[CodeSpan],
1071 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1072 ) -> Vec<ParsedImage> {
1073 use crate::utils::skip_context::is_in_html_comment_ranges;
1074
1075 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
1080 let full_match = cap.get(0).unwrap();
1081 let match_start = full_match.start();
1082 let match_end = full_match.end();
1083
1084 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1086 continue;
1087 }
1088
1089 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1091 continue;
1092 }
1093
1094 if Self::is_offset_in_code_span(code_spans, match_start) {
1096 continue;
1097 }
1098
1099 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1101 continue;
1102 }
1103
1104 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1106
1107 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1109
1110 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1111
1112 let inline_url = cap.get(2).or_else(|| cap.get(3));
1114
1115 if let Some(url_match) = inline_url {
1116 images.push(ParsedImage {
1118 line: line_num,
1119 start_col: col_start,
1120 end_col: col_end,
1121 byte_offset: match_start,
1122 byte_end: match_end,
1123 alt_text,
1124 url: url_match.as_str().to_string(),
1125 is_reference: false,
1126 reference_id: None,
1127 });
1128 } else if let Some(ref_id) = cap.get(6) {
1129 let ref_id_str = ref_id.as_str();
1131 let normalized_ref = if ref_id_str.is_empty() {
1132 alt_text.to_lowercase() } else {
1134 ref_id_str.to_lowercase()
1135 };
1136
1137 images.push(ParsedImage {
1138 line: line_num,
1139 start_col: col_start,
1140 end_col: col_end,
1141 byte_offset: match_start,
1142 byte_end: match_end,
1143 alt_text,
1144 url: String::new(), is_reference: true,
1146 reference_id: Some(normalized_ref),
1147 });
1148 }
1149 }
1150
1151 images
1152 }
1153
1154 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1156 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1160 if line_info.in_code_block {
1162 continue;
1163 }
1164
1165 let line = &line_info.content;
1166 let line_num = line_idx + 1;
1167
1168 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1169 let id = cap.get(1).unwrap().as_str().to_lowercase();
1170 let url = cap.get(2).unwrap().as_str().to_string();
1171 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1172
1173 let match_obj = cap.get(0).unwrap();
1176 let byte_offset = line_info.byte_offset + match_obj.start();
1177 let byte_end = line_info.byte_offset + match_obj.end();
1178
1179 refs.push(ReferenceDef {
1180 line: line_num,
1181 id,
1182 url,
1183 title,
1184 byte_offset,
1185 byte_end,
1186 });
1187 }
1188 }
1189
1190 refs
1191 }
1192
/// Splits a line that starts (after optional whitespace) with `>` into `(prefix, content)`.
///
/// The prefix covers the leading whitespace, the first `>` and any whitespace after it; only
/// one `>` is consumed, so further markers stay in the content. Returns `None` when the line
/// is not a blockquote line.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let after_marker = line.trim_start().strip_prefix('>')?;
    let content = after_marker.trim_start();
    // The prefix is everything in front of the content.
    let prefix_len = line.len() - content.len();
    Some((&line[..prefix_len], content))
}
1211
/// Parses a bullet-list marker at the start of `line`.
///
/// Returns `(leading_whitespace, marker, spacing_after_marker, rest)` when the first
/// character after any spaces/tabs is `-`, `*` or `+`; otherwise `None`. The spacing may be
/// empty; the caller decides whether that still counts as a list item.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    fn is_space_or_tab(b: u8) -> bool {
        b == b' ' || b == b'\t'
    }

    let raw = line.as_bytes();
    let marker_pos = raw.iter().take_while(|&&b| is_space_or_tab(b)).count();

    let marker = match raw.get(marker_pos) {
        Some(b'-') => '-',
        Some(b'*') => '*',
        Some(b'+') => '+',
        _ => return None,
    };

    let spacing_start = marker_pos + 1;
    let spacing_len = raw[spacing_start..].iter().take_while(|&&b| is_space_or_tab(b)).count();
    let content_start = spacing_start + spacing_len;

    Some((
        &line[..marker_pos],
        marker,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1244
/// Parses an ordered-list marker at the start of `line`.
///
/// Returns `(leading_whitespace, digits, delimiter, spacing_after_delimiter, rest)` when the
/// line has, after any spaces/tabs, one or more ASCII digits followed by `.` or `)`;
/// otherwise `None`. The spacing may be empty; the caller decides whether that still counts
/// as a list item.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    fn is_space_or_tab(b: u8) -> bool {
        b == b' ' || b == b'\t'
    }

    let raw = line.as_bytes();
    let number_start = raw.iter().take_while(|&&b| is_space_or_tab(b)).count();

    let digit_count = raw[number_start..].iter().take_while(|&&b| b.is_ascii_digit()).count();
    if digit_count == 0 {
        return None;
    }
    let delimiter_pos = number_start + digit_count;

    let delimiter = match raw.get(delimiter_pos) {
        Some(b'.') => '.',
        Some(b')') => ')',
        _ => return None,
    };

    let spacing_start = delimiter_pos + 1;
    let spacing_len = raw[spacing_start..].iter().take_while(|&&b| is_space_or_tab(b)).count();
    let content_start = spacing_start + spacing_len;

    Some((
        &line[..number_start],
        &line[number_start..delimiter_pos],
        delimiter,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
1292
1293 fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1296 let num_lines = line_offsets.len();
1297 let mut in_code_block = vec![false; num_lines];
1298
1299 for &(start, end) in code_blocks {
1301 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1303 let mut boundary = start;
1304 while boundary > 0 && !content.is_char_boundary(boundary) {
1305 boundary -= 1;
1306 }
1307 boundary
1308 } else {
1309 start
1310 };
1311
1312 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1313 let mut boundary = end;
1314 while boundary < content.len() && !content.is_char_boundary(boundary) {
1315 boundary += 1;
1316 }
1317 boundary
1318 } else {
1319 end.min(content.len())
1320 };
1321
1322 let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1337 let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1338
1339 for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1341 *flag = true;
1342 }
1343 }
1344
1345 in_code_block
1346 }
1347
1348 fn compute_basic_line_info(
1350 content: &str,
1351 line_offsets: &[usize],
1352 code_blocks: &[(usize, usize)],
1353 flavor: MarkdownFlavor,
1354 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1355 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1356 ) -> Vec<LineInfo> {
1357 let content_lines: Vec<&str> = content.lines().collect();
1358 let mut lines = Vec::with_capacity(content_lines.len());
1359
1360 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1362
1363 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1366
1367 for (i, line) in content_lines.iter().enumerate() {
1368 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1369 let indent = line.len() - line.trim_start().len();
1370
1371 let blockquote_parse = Self::parse_blockquote_prefix(line);
1373
1374 let is_blank = if let Some((_, content)) = blockquote_parse {
1376 content.trim().is_empty()
1378 } else {
1379 line.trim().is_empty()
1380 };
1381
1382 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1384
1385 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1387 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1388 let in_html_comment =
1390 crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1391 let list_item = if !(in_code_block
1392 || is_blank
1393 || in_mkdocstrings
1394 || in_html_comment
1395 || (front_matter_end > 0 && i < front_matter_end))
1396 {
1397 let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1399 (content, prefix.len())
1400 } else {
1401 (&**line, 0)
1402 };
1403
1404 if let Some((leading_spaces, marker, spacing, _content)) =
1405 Self::parse_unordered_list(line_for_list_check)
1406 {
1407 let marker_column = blockquote_prefix_len + leading_spaces.len();
1408 let content_column = marker_column + 1 + spacing.len();
1409
1410 if spacing.is_empty() {
1417 None
1418 } else {
1419 Some(ListItemInfo {
1420 marker: marker.to_string(),
1421 is_ordered: false,
1422 number: None,
1423 marker_column,
1424 content_column,
1425 })
1426 }
1427 } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1428 Self::parse_ordered_list(line_for_list_check)
1429 {
1430 let marker = format!("{number_str}{delimiter}");
1431 let marker_column = blockquote_prefix_len + leading_spaces.len();
1432 let content_column = marker_column + marker.len() + spacing.len();
1433
1434 if spacing.is_empty() {
1437 None
1438 } else {
1439 Some(ListItemInfo {
1440 marker,
1441 is_ordered: true,
1442 number: number_str.parse().ok(),
1443 marker_column,
1444 content_column,
1445 })
1446 }
1447 } else {
1448 None
1449 }
1450 } else {
1451 None
1452 };
1453
1454 lines.push(LineInfo {
1455 content: line.to_string(),
1456 byte_offset,
1457 indent,
1458 is_blank,
1459 in_code_block,
1460 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1461 in_html_block: false, in_html_comment,
1463 list_item,
1464 heading: None, blockquote: None, in_mkdocstrings,
1467 in_esm_block: false, });
1469 }
1470
1471 lines
1472 }
1473
1474 fn detect_headings_and_blockquotes(
1476 content: &str,
1477 lines: &mut [LineInfo],
1478 flavor: MarkdownFlavor,
1479 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1480 ) {
1481 lazy_static! {
1482
1483 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1485 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1486 }
1487
1488 let content_lines: Vec<&str> = content.lines().collect();
1489
1490 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1492
1493 for i in 0..lines.len() {
1495 if lines[i].in_code_block {
1496 continue;
1497 }
1498
1499 if front_matter_end > 0 && i < front_matter_end {
1501 continue;
1502 }
1503
1504 if lines[i].in_html_block {
1506 continue;
1507 }
1508
1509 let line = content_lines[i];
1510
1511 if let Some(bq) = parse_blockquote_detailed(line) {
1513 let nesting_level = bq.markers.len(); let marker_column = bq.indent.len();
1515
1516 let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1518
1519 let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1521 let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1523
1524 let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1528
1529 lines[i].blockquote = Some(BlockquoteInfo {
1530 nesting_level,
1531 indent: bq.indent.to_string(),
1532 marker_column,
1533 prefix,
1534 content: bq.content.to_string(),
1535 has_no_space_after_marker: has_no_space,
1536 has_multiple_spaces_after_marker: has_multiple_spaces,
1537 needs_md028_fix,
1538 });
1539 }
1540
1541 if lines[i].is_blank {
1543 continue;
1544 }
1545
1546 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1549 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1550 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1551 } else {
1552 false
1553 };
1554
1555 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1556 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1558 continue;
1559 }
1560 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1561 let hashes = caps.get(2).map_or("", |m| m.as_str());
1562 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1563 let rest = caps.get(4).map_or("", |m| m.as_str());
1564
1565 let level = hashes.len() as u8;
1566 let marker_column = leading_spaces.len();
1567
1568 let (text, has_closing, closing_seq) = {
1570 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1572 if rest[id_start..].trim_end().ends_with('}') {
1574 (&rest[..id_start], &rest[id_start..])
1576 } else {
1577 (rest, "")
1578 }
1579 } else {
1580 (rest, "")
1581 };
1582
1583 let trimmed_rest = rest_without_id.trim_end();
1585 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1586 let mut start_of_hashes = last_hash_pos;
1588 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1589 start_of_hashes -= 1;
1590 }
1591
1592 let has_space_before = start_of_hashes == 0
1594 || trimmed_rest
1595 .chars()
1596 .nth(start_of_hashes - 1)
1597 .is_some_and(|c| c.is_whitespace());
1598
1599 let potential_closing = &trimmed_rest[start_of_hashes..];
1601 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1602
1603 if is_all_hashes && has_space_before {
1604 let closing_hashes = potential_closing.to_string();
1606 let text_part = if !custom_id_part.is_empty() {
1609 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1612 } else {
1613 rest_without_id[..start_of_hashes].trim_end().to_string()
1614 };
1615 (text_part, true, closing_hashes)
1616 } else {
1617 (rest.to_string(), false, String::new())
1619 }
1620 } else {
1621 (rest.to_string(), false, String::new())
1623 }
1624 };
1625
1626 let content_column = marker_column + hashes.len() + spaces_after.len();
1627
1628 let raw_text = text.trim().to_string();
1630 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1631
1632 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1634 let next_line = content_lines[i + 1];
1635 if !lines[i + 1].in_code_block
1636 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1637 && let Some(next_line_id) =
1638 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1639 {
1640 custom_id = Some(next_line_id);
1641 }
1642 }
1643
1644 lines[i].heading = Some(HeadingInfo {
1645 level,
1646 style: HeadingStyle::ATX,
1647 marker: hashes.to_string(),
1648 marker_column,
1649 content_column,
1650 text: clean_text,
1651 custom_id,
1652 raw_text,
1653 has_closing_sequence: has_closing,
1654 closing_sequence: closing_seq,
1655 });
1656 }
1657 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1659 let next_line = content_lines[i + 1];
1660 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1661 if front_matter_end > 0 && i < front_matter_end {
1663 continue;
1664 }
1665
1666 if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1668 {
1669 continue;
1670 }
1671
1672 let underline = next_line.trim();
1673
1674 if underline == "---" {
1677 continue;
1678 }
1679
1680 let current_line_trimmed = line.trim();
1682 if current_line_trimmed.contains(':')
1683 && !current_line_trimmed.starts_with('#')
1684 && !current_line_trimmed.contains('[')
1685 && !current_line_trimmed.contains("](")
1686 {
1687 continue;
1689 }
1690
1691 let level = if underline.starts_with('=') { 1 } else { 2 };
1692 let style = if level == 1 {
1693 HeadingStyle::Setext1
1694 } else {
1695 HeadingStyle::Setext2
1696 };
1697
1698 let raw_text = line.trim().to_string();
1700 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1701
1702 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1704 let attr_line = content_lines[i + 2];
1705 if !lines[i + 2].in_code_block
1706 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1707 && let Some(attr_line_id) =
1708 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1709 {
1710 custom_id = Some(attr_line_id);
1711 }
1712 }
1713
1714 lines[i].heading = Some(HeadingInfo {
1715 level,
1716 style,
1717 marker: underline.to_string(),
1718 marker_column: next_line.len() - next_line.trim_start().len(),
1719 content_column: lines[i].indent,
1720 text: clean_text,
1721 custom_id,
1722 raw_text,
1723 has_closing_sequence: false,
1724 closing_sequence: String::new(),
1725 });
1726 }
1727 }
1728 }
1729 }
1730
1731 fn detect_html_blocks(lines: &mut [LineInfo]) {
1733 const BLOCK_ELEMENTS: &[&str] = &[
1735 "address",
1736 "article",
1737 "aside",
1738 "blockquote",
1739 "details",
1740 "dialog",
1741 "dd",
1742 "div",
1743 "dl",
1744 "dt",
1745 "fieldset",
1746 "figcaption",
1747 "figure",
1748 "footer",
1749 "form",
1750 "h1",
1751 "h2",
1752 "h3",
1753 "h4",
1754 "h5",
1755 "h6",
1756 "header",
1757 "hr",
1758 "li",
1759 "main",
1760 "nav",
1761 "ol",
1762 "p",
1763 "pre",
1764 "script",
1765 "section",
1766 "style",
1767 "table",
1768 "tbody",
1769 "td",
1770 "tfoot",
1771 "th",
1772 "thead",
1773 "tr",
1774 "ul",
1775 ];
1776
1777 let mut i = 0;
1778 while i < lines.len() {
1779 if lines[i].in_code_block || lines[i].in_front_matter {
1781 i += 1;
1782 continue;
1783 }
1784
1785 let trimmed = lines[i].content.trim_start();
1786
1787 if trimmed.starts_with('<') && trimmed.len() > 1 {
1789 let after_bracket = &trimmed[1..];
1791 let is_closing = after_bracket.starts_with('/');
1792 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1793
1794 let tag_name = tag_start
1796 .chars()
1797 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1798 .collect::<String>()
1799 .to_lowercase();
1800
1801 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1803 lines[i].in_html_block = true;
1805
1806 if !is_closing {
1809 let closing_tag = format!("</{tag_name}>");
1810 let allow_blank_lines = tag_name == "style" || tag_name == "script";
1812 let mut j = i + 1;
1813 while j < lines.len() && j < i + 100 {
1814 if !allow_blank_lines && lines[j].is_blank {
1817 break;
1818 }
1819
1820 lines[j].in_html_block = true;
1821
1822 if lines[j].content.contains(&closing_tag) {
1824 break;
1825 }
1826 j += 1;
1827 }
1828 }
1829 }
1830 }
1831
1832 i += 1;
1833 }
1834 }
1835
1836 fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
1839 if !flavor.supports_esm_blocks() {
1841 return;
1842 }
1843
1844 for line in lines.iter_mut() {
1845 if line.is_blank || line.in_html_comment {
1847 continue;
1848 }
1849
1850 let trimmed = line.content.trim_start();
1852 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
1853 line.in_esm_block = true;
1854 } else {
1855 break;
1857 }
1858 }
1859 }
1860
1861 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1863 let mut code_spans = Vec::new();
1864
1865 if !content.contains('`') {
1867 return code_spans;
1868 }
1869
1870 let parser = Parser::new(content).into_offset_iter();
1872
1873 for (event, range) in parser {
1874 if let Event::Code(_) = event {
1875 let start_pos = range.start;
1876 let end_pos = range.end;
1877
1878 let full_span = &content[start_pos..end_pos];
1880 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1881
1882 let content_start = start_pos + backtick_count;
1884 let content_end = end_pos - backtick_count;
1885 let span_content = if content_start < content_end {
1886 content[content_start..content_end].to_string()
1887 } else {
1888 String::new()
1889 };
1890
1891 let line_idx = lines
1894 .partition_point(|line| line.byte_offset <= start_pos)
1895 .saturating_sub(1);
1896 let line_num = line_idx + 1;
1897 let col_start = start_pos - lines[line_idx].byte_offset;
1898
1899 let end_line_idx = lines
1901 .partition_point(|line| line.byte_offset <= end_pos)
1902 .saturating_sub(1);
1903 let col_end = end_pos - lines[end_line_idx].byte_offset;
1904
1905 code_spans.push(CodeSpan {
1906 line: line_num,
1907 start_col: col_start,
1908 end_col: col_end,
1909 byte_offset: start_pos,
1910 byte_end: end_pos,
1911 backtick_count,
1912 content: span_content,
1913 });
1914 }
1915 }
1916
1917 code_spans.sort_by_key(|span| span.byte_offset);
1919
1920 code_spans
1921 }
1922
    /// Groups consecutive list items (and their continuation lines) into `ListBlock`s.
    ///
    /// Walks `lines` once, maintaining at most one "open" block. Each line either
    /// extends the open block, closes it, or starts a new one, depending on whether
    /// it is a list item, a code-block line, a blank line, or other content. All line
    /// numbers stored in the resulting blocks are 1-based (`line_idx + 1`). Before
    /// returning, the collected blocks are passed through `merge_adjacent_list_blocks`.
    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
        // Capacity is only a heuristic guess.
        let mut list_blocks = Vec::with_capacity(lines.len() / 10);
        // The block currently being built, if any.
        let mut current_block: Option<ListBlock> = None;
        // State describing the most recently seen list item (1-based line, marker
        // column, and marker width including one extra column for ordered items).
        let mut last_list_item_line = 0;
        let mut current_indent_level = 0;
        let mut last_marker_width = 0;

        for (line_idx, line_info) in lines.iter().enumerate() {
            let line_num = line_idx + 1;

            // Code-block lines never start a list; they either extend the open block
            // or terminate it, depending on how the code block relates to the list.
            if line_info.in_code_block {
                if let Some(ref mut block) = current_block {
                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);

                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);

                    match context {
                        CodeBlockContext::Indented => {
                            // Treated as part of the list.
                            block.end_line = line_num;
                            continue;
                        }
                        CodeBlockContext::Standalone => {
                            // Closes the open block.
                            let completed_block = current_block.take().unwrap();
                            list_blocks.push(completed_block);
                            continue;
                        }
                        CodeBlockContext::Adjacent => {
                            // Treated as part of the list.
                            block.end_line = line_num;
                            continue;
                        }
                    }
                } else {
                    continue;
                }
            }

            // Leading `>` markers (with surrounding whitespace), or empty if none.
            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
                caps.get(0).unwrap().as_str().to_string()
            } else {
                String::new()
            };

            if let Some(list_item) = &line_info.list_item {
                let item_indent = list_item.marker_column;
                // Nesting depth is approximated as two columns per level.
                let nesting = item_indent / 2;

                if let Some(ref mut block) = current_block {
                    let is_nested = nesting > block.nesting_level;
                    let same_type =
                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
                    let same_context = block.blockquote_prefix == blockquote_prefix;
                    // At most one intervening line since the previous list item.
                    let reasonable_distance = line_num <= last_list_item_line + 2;
                    // Ordered lists and blocks whose marker was reset to `None` accept any marker.
                    let marker_compatible =
                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);

                    // True when some line between the block's last item and this item
                    // breaks the list (standalone code, heading, rule, table row,
                    // blockquote, or under-indented text).
                    let has_non_list_content = {
                        let mut found_non_list = false;
                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);

                        // NOTE(review): this only emits debug logging for lines containing
                        // a specific literal; it looks like leftover diagnostics — confirm
                        // whether it can be removed.
                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                            let last_line = &lines[block_last_item_line - 1];
                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
                                log::debug!(
                                    "After problematic line {}: checking lines {} to {} for non-list content",
                                    block_last_item_line,
                                    block_last_item_line + 1,
                                    line_num
                                );
                                if line_num == block_last_item_line + 1 {
                                    log::debug!("Lines are consecutive, no content between");
                                }
                            }
                        }

                        // `check_line` is 1-based; `check_idx` is the matching slice index.
                        for check_line in (block_last_item_line + 1)..line_num {
                            let check_idx = check_line - 1;
                            if check_idx < lines.len() {
                                let check_info = &lines[check_idx];
                                let is_list_breaking_content = if check_info.in_code_block {
                                    // Marker width of the block's last item (ordered markers
                                    // get one extra column); 3 when it cannot be determined.
                                    let last_item_marker_width =
                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                            lines[block_last_item_line - 1]
                                                .list_item
                                                .as_ref()
                                                .map(|li| {
                                                    if li.is_ordered {
                                                        li.marker.len() + 1
                                                    } else {
                                                        li.marker.len()
                                                    }
                                                })
                                                .unwrap_or(3)
                                        } else {
                                            3
                                        };

                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };

                                    let context = CodeBlockUtils::analyze_code_block_context(
                                        lines,
                                        check_line - 1,
                                        min_continuation,
                                    );

                                    // Only a standalone code block breaks the list.
                                    matches!(context, CodeBlockContext::Standalone)
                                } else if !check_info.is_blank && check_info.list_item.is_none() {
                                    let line_content = check_info.content.trim();

                                    // Structural content: heading, rule-like prefix, something
                                    // that looks like a table row (pipes, but not a link/URL),
                                    // or a blockquote marker.
                                    if check_info.heading.is_some()
                                        || line_content.starts_with("---")
                                        || line_content.starts_with("***")
                                        || line_content.starts_with("___")
                                        || (line_content.contains('|')
                                            && !line_content.contains("](")
                                            && !line_content.contains("http")
                                            && (line_content.matches('|').count() > 1
                                                || line_content.starts_with('|')
                                                || line_content.ends_with('|')))
                                        || line_content.starts_with(">")
                                    {
                                        true
                                    } else {
                                        // Plain text breaks the list only when it is indented
                                        // less than the required continuation indent.
                                        let last_item_marker_width =
                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                                lines[block_last_item_line - 1]
                                                    .list_item
                                                    .as_ref()
                                                    .map(|li| {
                                                        if li.is_ordered {
                                                            li.marker.len() + 1
                                                        } else {
                                                            li.marker.len()
                                                        }
                                                    })
                                                    .unwrap_or(3)
                                            } else {
                                                3
                                            };

                                        let min_continuation =
                                            if block.is_ordered { last_item_marker_width } else { 2 };
                                        check_info.indent < min_continuation
                                    }
                                } else {
                                    // Blank lines and other list items never break the list here.
                                    false
                                };

                                if is_list_breaking_content {
                                    found_non_list = true;
                                    break;
                                }
                            }
                        }
                        found_non_list
                    };

                    // Nested items may differ in type and marker from the enclosing block;
                    // same-level items must match on both.
                    let mut continues_list = if is_nested {
                        same_context && reasonable_distance && !has_non_list_content
                    } else {
                        let result = same_type
                            && same_context
                            && reasonable_distance
                            && marker_compatible
                            && !has_non_list_content;

                        // NOTE(review): debug-only logging keyed on the same literal as
                        // above; has no effect on `result`.
                        if block.item_lines.last().is_some_and(|&last_line| {
                            last_line > 0
                                && last_line <= lines.len()
                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
                                && lines[last_line - 1].content.contains(r"\`")
                        }) {
                            log::debug!(
                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
                            );
                            if line_num > 0 && line_num <= lines.len() {
                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
                            }
                        }

                        result
                    };

                    // Override: an item directly following another item of the same block
                    // (the block ended on the previous line and that line was an item)
                    // always continues the block, even if the checks above said otherwise.
                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
                        if block.item_lines.contains(&(line_num - 1)) {
                            continues_list = true;
                        }
                    }

                    if continues_list {
                        block.end_line = line_num;
                        block.item_lines.push(line_num);

                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
                            list_item.marker.len() + 1
                        } else {
                            list_item.marker.len()
                        });

                        // An unordered block that sees a different marker forgets its marker.
                        if !block.is_ordered
                            && block.marker.is_some()
                            && block.marker.as_ref() != Some(&list_item.marker)
                        {
                            block.marker = None;
                        }
                    } else {
                        // Close the open block and start a fresh one at this item.
                        list_blocks.push(block.clone());

                        *block = ListBlock {
                            start_line: line_num,
                            end_line: line_num,
                            is_ordered: list_item.is_ordered,
                            marker: if list_item.is_ordered {
                                None
                            } else {
                                Some(list_item.marker.clone())
                            },
                            blockquote_prefix: blockquote_prefix.clone(),
                            item_lines: vec![line_num],
                            nesting_level: nesting,
                            max_marker_width: if list_item.is_ordered {
                                list_item.marker.len() + 1
                            } else {
                                list_item.marker.len()
                            },
                        };
                    }
                } else {
                    // No open block: this item starts one.
                    current_block = Some(ListBlock {
                        start_line: line_num,
                        end_line: line_num,
                        is_ordered: list_item.is_ordered,
                        marker: if list_item.is_ordered {
                            None
                        } else {
                            Some(list_item.marker.clone())
                        },
                        blockquote_prefix,
                        item_lines: vec![line_num],
                        nesting_level: nesting,
                        // NOTE(review): unlike the other places that compute a marker
                        // width, this does not add 1 for ordered items — confirm whether
                        // that difference is intentional.
                        max_marker_width: list_item.marker.len(),
                    });
                }

                last_list_item_line = line_num;
                current_indent_level = item_indent;
                last_marker_width = if list_item.is_ordered {
                    list_item.marker.len() + 1
                } else {
                    list_item.marker.len()
                };
            } else if let Some(ref mut block) = current_block {
                // Non-item line while a block is open: decide whether it continues the list.

                // A trailing backslash on the block's last line forces continuation.
                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
                } else {
                    false
                };

                // Indent a line needs to count as content of the last list item.
                let min_continuation_indent = if block.is_ordered {
                    current_indent_level + last_marker_width
                } else {
                    current_indent_level + 2
                };

                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
                    block.end_line = line_num;
                } else if line_info.is_blank {
                    // Look past the run of blank lines to see what follows.
                    let mut check_idx = line_idx + 1;
                    let mut found_continuation = false;

                    while check_idx < lines.len() && lines[check_idx].is_blank {
                        check_idx += 1;
                    }

                    if check_idx < lines.len() {
                        let next_line = &lines[check_idx];
                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
                            // Sufficiently indented content after the blanks continues the list.
                            found_continuation = true;
                        } else if !next_line.in_code_block
                            && next_line.list_item.is_some()
                            && let Some(item) = &next_line.list_item
                        {
                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
                                .find(&next_line.content)
                                .map_or(String::new(), |m| m.as_str().to_string());
                            // A sibling item: same column, same list type, same blockquote context.
                            if item.marker_column == current_indent_level
                                && item.is_ordered == block.is_ordered
                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
                            {
                                // NOTE(review): this value is computed but never read
                                // (underscore-prefixed); likely dead code — confirm.
                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
                                    if let Some(between_line) = lines.get(idx) {
                                        let trimmed = between_line.content.trim();
                                        if trimmed.is_empty() {
                                            return false;
                                        }
                                        let line_indent =
                                            between_line.content.len() - between_line.content.trim_start().len();

                                        if trimmed.starts_with("```")
                                            || trimmed.starts_with("~~~")
                                            || trimmed.starts_with("---")
                                            || trimmed.starts_with("***")
                                            || trimmed.starts_with("___")
                                            || trimmed.starts_with(">")
                                            || trimmed.contains('|')
                                            || between_line.heading.is_some()
                                        {
                                            return true;
                                        }

                                        line_indent >= min_continuation_indent
                                    } else {
                                        false
                                    }
                                });

                                // NOTE(review): both arms below contain the same logic;
                                // the ordered/unordered split currently makes no difference.
                                if block.is_ordered {
                                    // The list continues unless a skipped line has
                                    // structural content (fence, rule, blockquote, pipe, heading).
                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                        if let Some(between_line) = lines.get(idx) {
                                            let trimmed = between_line.content.trim();
                                            if trimmed.is_empty() {
                                                return false;
                                            }
                                            trimmed.starts_with("```")
                                                || trimmed.starts_with("~~~")
                                                || trimmed.starts_with("---")
                                                || trimmed.starts_with("***")
                                                || trimmed.starts_with("___")
                                                || trimmed.starts_with(">")
                                                || trimmed.contains('|')
                                                || between_line.heading.is_some()
                                        } else {
                                            false
                                        }
                                    });
                                    found_continuation = !has_structural_separators;
                                } else {
                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                        if let Some(between_line) = lines.get(idx) {
                                            let trimmed = between_line.content.trim();
                                            if trimmed.is_empty() {
                                                return false;
                                            }
                                            trimmed.starts_with("```")
                                                || trimmed.starts_with("~~~")
                                                || trimmed.starts_with("---")
                                                || trimmed.starts_with("***")
                                                || trimmed.starts_with("___")
                                                || trimmed.starts_with(">")
                                                || trimmed.contains('|')
                                                || between_line.heading.is_some()
                                        } else {
                                            false
                                        }
                                    });
                                    found_continuation = !has_structural_separators;
                                }
                            }
                        }
                    }

                    if found_continuation {
                        block.end_line = line_num;
                    } else {
                        list_blocks.push(block.clone());
                        current_block = None;
                    }
                } else {
                    // Non-blank, under-indented, non-item line: possible lazy continuation.
                    let min_required_indent = if block.is_ordered {
                        current_indent_level + last_marker_width
                    } else {
                        current_indent_level + 2
                    };

                    let line_content = line_info.content.trim();
                    let is_structural_separator = line_info.heading.is_some()
                        || line_content.starts_with("```")
                        || line_content.starts_with("~~~")
                        || line_content.starts_with("---")
                        || line_content.starts_with("***")
                        || line_content.starts_with("___")
                        || line_content.starts_with(">")
                        || (line_content.contains('|')
                            && !line_content.contains("](")
                            && !line_content.contains("http")
                            && (line_content.matches('|').count() > 1
                                || line_content.starts_with('|')
                                || line_content.ends_with('|')));

                    // Lazy continuation: unindented (column 0) or fully indented text
                    // that is not structural.
                    let is_lazy_continuation = !is_structural_separator
                        && !line_info.is_blank
                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);

                    if is_lazy_continuation {
                        // Inspect the text after any blockquote prefix.
                        let content_to_check = if !blockquote_prefix.is_empty() {
                            line_info
                                .content
                                .strip_prefix(&blockquote_prefix)
                                .unwrap_or(&line_info.content)
                                .trim()
                        } else {
                            line_info.content.trim()
                        };

                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());

                        // Heuristic: text starting with an uppercase letter closes the
                        // block instead of continuing it.
                        if starts_with_uppercase && last_list_item_line > 0 {
                            list_blocks.push(block.clone());
                            current_block = None;
                        } else {
                            block.end_line = line_num;
                        }
                    } else {
                        list_blocks.push(block.clone());
                        current_block = None;
                    }
                }
            }
        }

        // Flush the block still open at end of input.
        if let Some(block) = current_block {
            list_blocks.push(block);
        }

        merge_adjacent_list_blocks(&mut list_blocks, lines);

        list_blocks
    }
2447
2448 fn compute_char_frequency(content: &str) -> CharFrequency {
2450 let mut frequency = CharFrequency::default();
2451
2452 for ch in content.chars() {
2453 match ch {
2454 '#' => frequency.hash_count += 1,
2455 '*' => frequency.asterisk_count += 1,
2456 '_' => frequency.underscore_count += 1,
2457 '-' => frequency.hyphen_count += 1,
2458 '+' => frequency.plus_count += 1,
2459 '>' => frequency.gt_count += 1,
2460 '|' => frequency.pipe_count += 1,
2461 '[' => frequency.bracket_count += 1,
2462 '`' => frequency.backtick_count += 1,
2463 '<' => frequency.lt_count += 1,
2464 '!' => frequency.exclamation_count += 1,
2465 '\n' => frequency.newline_count += 1,
2466 _ => {}
2467 }
2468 }
2469
2470 frequency
2471 }
2472
2473 fn parse_html_tags(
2475 content: &str,
2476 lines: &[LineInfo],
2477 code_blocks: &[(usize, usize)],
2478 flavor: MarkdownFlavor,
2479 ) -> Vec<HtmlTag> {
2480 lazy_static! {
2481 static ref HTML_TAG_REGEX: regex::Regex =
2482 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2483 }
2484
2485 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2486
2487 for cap in HTML_TAG_REGEX.captures_iter(content) {
2488 let full_match = cap.get(0).unwrap();
2489 let match_start = full_match.start();
2490 let match_end = full_match.end();
2491
2492 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2494 continue;
2495 }
2496
2497 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2498 let tag_name_original = cap.get(2).unwrap().as_str();
2499 let tag_name = tag_name_original.to_lowercase();
2500 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2501
2502 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2505 continue;
2506 }
2507
2508 let mut line_num = 1;
2510 let mut col_start = match_start;
2511 let mut col_end = match_end;
2512 for (idx, line_info) in lines.iter().enumerate() {
2513 if match_start >= line_info.byte_offset {
2514 line_num = idx + 1;
2515 col_start = match_start - line_info.byte_offset;
2516 col_end = match_end - line_info.byte_offset;
2517 } else {
2518 break;
2519 }
2520 }
2521
2522 html_tags.push(HtmlTag {
2523 line: line_num,
2524 start_col: col_start,
2525 end_col: col_end,
2526 byte_offset: match_start,
2527 byte_end: match_end,
2528 tag_name,
2529 is_closing,
2530 is_self_closing,
2531 raw_content: full_match.as_str().to_string(),
2532 });
2533 }
2534
2535 html_tags
2536 }
2537
2538 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2540 lazy_static! {
2541 static ref EMPHASIS_REGEX: regex::Regex =
2542 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2543 }
2544
2545 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2546
2547 for cap in EMPHASIS_REGEX.captures_iter(content) {
2548 let full_match = cap.get(0).unwrap();
2549 let match_start = full_match.start();
2550 let match_end = full_match.end();
2551
2552 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2554 continue;
2555 }
2556
2557 let opening_markers = cap.get(1).unwrap().as_str();
2558 let content_part = cap.get(2).unwrap().as_str();
2559 let closing_markers = cap.get(3).unwrap().as_str();
2560
2561 if opening_markers.chars().next() != closing_markers.chars().next()
2563 || opening_markers.len() != closing_markers.len()
2564 {
2565 continue;
2566 }
2567
2568 let marker = opening_markers.chars().next().unwrap();
2569 let marker_count = opening_markers.len();
2570
2571 let mut line_num = 1;
2573 let mut col_start = match_start;
2574 let mut col_end = match_end;
2575 for (idx, line_info) in lines.iter().enumerate() {
2576 if match_start >= line_info.byte_offset {
2577 line_num = idx + 1;
2578 col_start = match_start - line_info.byte_offset;
2579 col_end = match_end - line_info.byte_offset;
2580 } else {
2581 break;
2582 }
2583 }
2584
2585 emphasis_spans.push(EmphasisSpan {
2586 line: line_num,
2587 start_col: col_start,
2588 end_col: col_end,
2589 byte_offset: match_start,
2590 byte_end: match_end,
2591 marker,
2592 marker_count,
2593 content: content_part.to_string(),
2594 });
2595 }
2596
2597 emphasis_spans
2598 }
2599
2600 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2602 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2603
2604 for (line_idx, line_info) in lines.iter().enumerate() {
2605 if line_info.in_code_block || line_info.is_blank {
2607 continue;
2608 }
2609
2610 let line = &line_info.content;
2611 let line_num = line_idx + 1;
2612
2613 if !line.contains('|') {
2615 continue;
2616 }
2617
2618 let parts: Vec<&str> = line.split('|').collect();
2620 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2621
2622 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2624 let mut column_alignments = Vec::new();
2625
2626 if is_separator {
2627 for part in &parts[1..parts.len() - 1] {
2628 let trimmed = part.trim();
2630 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2631 "center".to_string()
2632 } else if trimmed.ends_with(':') {
2633 "right".to_string()
2634 } else if trimmed.starts_with(':') {
2635 "left".to_string()
2636 } else {
2637 "none".to_string()
2638 };
2639 column_alignments.push(alignment);
2640 }
2641 }
2642
2643 table_rows.push(TableRow {
2644 line: line_num,
2645 is_separator,
2646 column_count,
2647 column_alignments,
2648 });
2649 }
2650
2651 table_rows
2652 }
2653
2654 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2656 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2657
2658 for cap in BARE_URL_PATTERN.captures_iter(content) {
2660 let full_match = cap.get(0).unwrap();
2661 let match_start = full_match.start();
2662 let match_end = full_match.end();
2663
2664 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2666 continue;
2667 }
2668
2669 let preceding_char = if match_start > 0 {
2671 content.chars().nth(match_start - 1)
2672 } else {
2673 None
2674 };
2675 let following_char = content.chars().nth(match_end);
2676
2677 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2678 continue;
2679 }
2680 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2681 continue;
2682 }
2683
2684 let url = full_match.as_str();
2685 let url_type = if url.starts_with("https://") {
2686 "https"
2687 } else if url.starts_with("http://") {
2688 "http"
2689 } else if url.starts_with("ftp://") {
2690 "ftp"
2691 } else {
2692 "other"
2693 };
2694
2695 let mut line_num = 1;
2697 let mut col_start = match_start;
2698 let mut col_end = match_end;
2699 for (idx, line_info) in lines.iter().enumerate() {
2700 if match_start >= line_info.byte_offset {
2701 line_num = idx + 1;
2702 col_start = match_start - line_info.byte_offset;
2703 col_end = match_end - line_info.byte_offset;
2704 } else {
2705 break;
2706 }
2707 }
2708
2709 bare_urls.push(BareUrl {
2710 line: line_num,
2711 start_col: col_start,
2712 end_col: col_end,
2713 byte_offset: match_start,
2714 byte_end: match_end,
2715 url: url.to_string(),
2716 url_type: url_type.to_string(),
2717 });
2718 }
2719
2720 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2722 let full_match = cap.get(0).unwrap();
2723 let match_start = full_match.start();
2724 let match_end = full_match.end();
2725
2726 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2728 continue;
2729 }
2730
2731 let preceding_char = if match_start > 0 {
2733 content.chars().nth(match_start - 1)
2734 } else {
2735 None
2736 };
2737 let following_char = content.chars().nth(match_end);
2738
2739 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2740 continue;
2741 }
2742 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2743 continue;
2744 }
2745
2746 let email = full_match.as_str();
2747
2748 let mut line_num = 1;
2750 let mut col_start = match_start;
2751 let mut col_end = match_end;
2752 for (idx, line_info) in lines.iter().enumerate() {
2753 if match_start >= line_info.byte_offset {
2754 line_num = idx + 1;
2755 col_start = match_start - line_info.byte_offset;
2756 col_end = match_end - line_info.byte_offset;
2757 } else {
2758 break;
2759 }
2760 }
2761
2762 bare_urls.push(BareUrl {
2763 line: line_num,
2764 start_col: col_start,
2765 end_col: col_end,
2766 byte_offset: match_start,
2767 byte_end: match_end,
2768 url: email.to_string(),
2769 url_type: "email".to_string(),
2770 });
2771 }
2772
2773 bare_urls
2774 }
2775}
2776
2777fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2779 if list_blocks.len() < 2 {
2780 return;
2781 }
2782
2783 let mut merger = ListBlockMerger::new(lines);
2784 *list_blocks = merger.merge(list_blocks);
2785}
2786
2787struct ListBlockMerger<'a> {
2789 lines: &'a [LineInfo],
2790}
2791
2792impl<'a> ListBlockMerger<'a> {
2793 fn new(lines: &'a [LineInfo]) -> Self {
2794 Self { lines }
2795 }
2796
2797 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2798 let mut merged = Vec::with_capacity(list_blocks.len());
2799 let mut current = list_blocks[0].clone();
2800
2801 for next in list_blocks.iter().skip(1) {
2802 if self.should_merge_blocks(¤t, next) {
2803 current = self.merge_two_blocks(current, next);
2804 } else {
2805 merged.push(current);
2806 current = next.clone();
2807 }
2808 }
2809
2810 merged.push(current);
2811 merged
2812 }
2813
2814 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2816 if !self.blocks_are_compatible(current, next) {
2818 return false;
2819 }
2820
2821 let spacing = self.analyze_spacing_between(current, next);
2823 match spacing {
2824 BlockSpacing::Consecutive => true,
2825 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2826 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2827 self.can_merge_with_content_between(current, next)
2828 }
2829 }
2830 }
2831
2832 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2834 current.is_ordered == next.is_ordered
2835 && current.blockquote_prefix == next.blockquote_prefix
2836 && current.nesting_level == next.nesting_level
2837 }
2838
2839 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2841 let gap = next.start_line - current.end_line;
2842
2843 match gap {
2844 1 => BlockSpacing::Consecutive,
2845 2 => BlockSpacing::SingleBlank,
2846 _ if gap > 2 => {
2847 if self.has_only_blank_lines_between(current, next) {
2848 BlockSpacing::MultipleBlanks
2849 } else {
2850 BlockSpacing::ContentBetween
2851 }
2852 }
2853 _ => BlockSpacing::Consecutive, }
2855 }
2856
2857 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2859 if has_meaningful_content_between(current, next, self.lines) {
2862 return false; }
2864
2865 !current.is_ordered && current.marker == next.marker
2867 }
2868
2869 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2871 if has_meaningful_content_between(current, next, self.lines) {
2873 return false; }
2875
2876 current.is_ordered && next.is_ordered
2878 }
2879
2880 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2882 for line_num in (current.end_line + 1)..next.start_line {
2883 if let Some(line_info) = self.lines.get(line_num - 1)
2884 && !line_info.content.trim().is_empty()
2885 {
2886 return false;
2887 }
2888 }
2889 true
2890 }
2891
2892 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2894 current.end_line = next.end_line;
2895 current.item_lines.extend_from_slice(&next.item_lines);
2896
2897 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2899
2900 if !current.is_ordered && self.markers_differ(¤t, next) {
2902 current.marker = None; }
2904
2905 current
2906 }
2907
2908 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2910 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2911 }
2912}
2913
/// How two list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends.
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// Two or more lines lie between the blocks and all of them are blank.
    MultipleBlanks,
    /// Two or more lines lie between the blocks and at least one is not blank.
    ContentBetween,
}
2922
2923fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2925 for line_num in (current.end_line + 1)..next.start_line {
2927 if let Some(line_info) = lines.get(line_num - 1) {
2928 let trimmed = line_info.content.trim();
2930
2931 if trimmed.is_empty() {
2933 continue;
2934 }
2935
2936 if line_info.heading.is_some() {
2940 return true; }
2942
2943 if is_horizontal_rule(trimmed) {
2945 return true; }
2947
2948 if trimmed.contains('|') && trimmed.len() > 1 {
2951 if !trimmed.contains("](") && !trimmed.contains("http") {
2953 let pipe_count = trimmed.matches('|').count();
2955 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2956 return true; }
2958 }
2959 }
2960
2961 if trimmed.starts_with('>') {
2963 return true; }
2965
2966 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2968 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2969
2970 let min_continuation_indent = if current.is_ordered {
2972 current.nesting_level + current.max_marker_width + 1 } else {
2974 current.nesting_level + 2
2975 };
2976
2977 if line_indent < min_continuation_indent {
2978 return true; }
2981 }
2982
2983 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2985
2986 let min_indent = if current.is_ordered {
2988 current.nesting_level + current.max_marker_width
2989 } else {
2990 current.nesting_level + 2
2991 };
2992
2993 if line_indent < min_indent {
2995 return true; }
2997
2998 }
3001 }
3002
3003 false
3005}
3006
/// Returns `true` when `trimmed` is a horizontal rule: it starts with `-`, `*` or `_`,
/// contains at least three of that same character, and otherwise holds only spaces
/// and tabs.
///
/// The argument is matched as given, so a leading space makes the check fail.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Walk the string directly instead of collecting it into a temporary buffer.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
3030
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Offsets inside a single line map to line 1 with 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        for (offset, position) in [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ] {
            assert_eq!(ctx.offset_to_line_col(offset), position);
        }
    }

    /// Per-line records expose content, byte offset, indentation and blank-ness.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces, matching the `indent == 4` checks below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // The lookup helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Ordered and unordered list items are detected together with their marker details.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet sits two columns in, matching `marker_column == 2` below.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let list1 = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let list2 = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let list3 = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Newline bytes belong to the line they terminate; the offset one past the end
    /// of the content maps to the column after the last character.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // (byte offset, expected (line, column))
        for (offset, position) in [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ] {
            assert_eq!(ctx.offset_to_line_col(offset), position);
        }
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");

        for (idx, what) in [(2, "blank"), (3, "heading"), (4, "blank"), (5, "text")] {
            assert!(
                !ctx.lines[idx].in_esm_block,
                "Line {} ({}) should NOT be in_esm_block",
                idx + 1,
                what
            );
        }
    }

    /// Outside the MDX flavor, `import`/`export` lines are not flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        for idx in 0..2 {
            assert!(
                !ctx.lines[idx].in_esm_block,
                "Line {} should NOT be in_esm_block in Standard flavor",
                idx + 1
            );
        }
    }
}