1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use crate::utils::element_cache::ElementCache;
5use crate::utils::regex_cache::URL_SIMPLE_REGEX;
6use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
7use regex::Regex;
8use std::borrow::Cow;
9use std::path::PathBuf;
10use std::sync::LazyLock;
11
/// Evaluates `$code` and yields its value, optionally reporting how long it took.
///
/// The timer is started unconditionally; when `$profile` is true the elapsed
/// time is written to stderr as `[PROFILE] <name>: <duration>`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let started_at = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, started_at.elapsed());
        }
        value
    }};
}
24
/// wasm32 variant of `profile_section!`: expands to `$code` alone.
///
/// `$name` and `$profile` are accepted for call-site compatibility but are not
/// used, so nothing is timed or printed on this target.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
29
/// Lazily compiled regex for `[text](url "title")` and `[text][ref]` links.
///
/// Compiled with `(?sx)`: `.` also matches newlines, and whitespace / `#` comments
/// inside the pattern are ignored. Capture groups: 1 = link text, 2 = URL written
/// in angle brackets, 3 = bare URL, 4 / 5 = double- / single-quoted title,
/// 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
 \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap()
});
43
/// Lazily compiled regex for `![alt](url "title")` and `![alt][ref]` images.
///
/// Same shape as the link pattern but anchored on a leading `!`. Compiled with
/// `(?sx)`. Capture groups: 1 = alt text, 2 = URL written in angle brackets,
/// 3 = bare URL, 4 / 5 = double- / single-quoted title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
 !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
 (?:
 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
 |
 \[([^\]]*)\] # Reference ID in group 6
 )"#
    ).unwrap()
});
57
/// Lazily compiled regex for a reference definition line such as `[id]: url "title"`.
///
/// Multi-line mode (`(?m)`), so `^` / `$` match at line boundaries. Allows up to
/// three leading spaces. Capture groups: 1 = id, 2 = destination (a run of
/// non-whitespace), 3 / 4 = optional double- / single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
61
/// Lazily compiled regex for an email-like token: a local part, `@`, a domain,
/// and a final dot-separated label of at least two ASCII letters.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
67
/// Lazily compiled regex capturing a blockquote prefix at the start of a string:
/// optional whitespace, one or more `>`, then optional whitespace (group 1).
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
70
/// Per-line metadata; one entry per line is stored in `LintContext::lines`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset in the source at which this line starts.
    pub byte_offset: usize,
    /// Length of the line in bytes; `content()` slices
    /// `byte_offset..byte_offset + byte_len` out of the source.
    pub byte_len: usize,
    /// Indentation of the line.
    // NOTE(review): unit (bytes vs. columns) is not visible in this part of the file.
    pub indent: usize,
    /// Visual indentation of the line.
    // NOTE(review): presumably with tabs expanded — confirm where it is computed.
    pub visual_indent: usize,
    /// Whether the line is blank.
    pub is_blank: bool,
    /// Whether the line is inside a code block.
    pub in_code_block: bool,
    /// Whether the line is inside front matter.
    pub in_front_matter: bool,
    /// Whether the line is inside an HTML block.
    pub in_html_block: bool,
    /// Whether the line is inside an HTML comment.
    pub in_html_comment: bool,
    /// List item information, when the line carries a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading information, when the line carries a heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote information, when the line carries a blockquote.
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether the line is inside a mkdocstrings block.
    pub in_mkdocstrings: bool,
    /// Whether the line is inside an ESM block.
    pub in_esm_block: bool,
    /// Set by `LintContext::new` for every line after the first that a
    /// multi-line code span extends onto.
    pub in_code_span_continuation: bool,
    /// Whether the line is a horizontal rule.
    pub is_horizontal_rule: bool,
    /// Whether the line is inside a math block.
    pub in_math_block: bool,
}
112
113impl LineInfo {
114 pub fn content<'a>(&self, source: &'a str) -> &'a str {
116 &source[self.byte_offset..self.byte_offset + self.byte_len]
117 }
118}
119
/// List item details attached to a line through `LineInfo::list_item`.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The list marker text.
    pub marker: String,
    /// Whether the item belongs to an ordered list.
    pub is_ordered: bool,
    /// The item's number, when it has one.
    pub number: Option<usize>,
    /// Column of the marker.
    // NOTE(review): `compute_mixed_list_nesting` maps a value of 1 to position 0,
    // so the base of this column is worth confirming where it is assigned.
    pub marker_column: usize,
    /// Column at which the item's content begins.
    pub content_column: usize,
}
134
/// Syntax style a heading was written in.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX heading (the `#`-prefixed form).
    ATX,
    /// Setext heading, first kind.
    // NOTE(review): presumably the `=`-underlined level-1 form — confirm at the producer.
    Setext1,
    /// Setext heading, second kind.
    // NOTE(review): presumably the `-`-underlined level-2 form — confirm at the producer.
    Setext2,
}
145
/// A link found in the document by `LintContext::parse_links`.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// 1-indexed number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link's start relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the link's end relative to the start of the line containing that end.
    pub end_col: usize,
    /// Byte offset of the link's start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the link's end in the document.
    pub byte_end: usize,
    /// The text between the link's opening bracket and its matching closing bracket.
    pub text: Cow<'a, str>,
    /// Destination URL; empty for reference links recovered by the regex fallback.
    pub url: Cow<'a, str>,
    /// True for reference, collapsed and shortcut link types.
    pub is_reference: bool,
    /// Lowercased reference id for reference-style links (the lowercased link
    /// text when the id itself is empty); `None` for other links.
    pub reference_id: Option<Cow<'a, str>>,
    /// The pulldown-cmark link type.
    pub link_type: LinkType,
}
170
/// A broken link reported through pulldown-cmark's broken-link callback.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text of the broken link, as given by the parser.
    pub reference: String,
    /// The span reported by the parser for the broken link.
    pub span: std::ops::Range<usize>,
}
179
/// A footnote reference emitted by the Markdown parser.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote identifier reported by the parser.
    pub id: String,
    /// 1-indexed number of the line containing the start of the reference.
    pub line: usize,
    /// Byte offset of the reference's start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the reference's end in the document.
    pub byte_end: usize,
}
192
/// An image found in the document by `LintContext::parse_images`.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// 1-indexed number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image's start relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the image's end relative to the start of the line containing that end.
    pub end_col: usize,
    /// Byte offset of the image's start in the document.
    pub byte_offset: usize,
    /// Byte offset of the image's end in the document.
    pub byte_end: usize,
    /// The image's alt text.
    pub alt_text: Cow<'a, str>,
    /// The image's destination URL.
    pub url: Cow<'a, str>,
    /// True for reference, collapsed and shortcut link types.
    pub is_reference: bool,
    /// Reference id for reference-style images.
    // NOTE(review): presumably lowercased like `ParsedLink::reference_id` — confirm in `parse_images`.
    pub reference_id: Option<Cow<'a, str>>,
    /// The pulldown-cmark link type.
    pub link_type: LinkType,
}
217
/// A reference definition (`[id]: url "title"`) found in the document.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line of the definition.
    pub line: usize,
    /// The definition's id; `get_reference_url` compares it against a lowercased lookup key.
    pub id: String,
    /// The definition's destination URL.
    pub url: String,
    /// The definition's title, when present.
    pub title: Option<String>,
    /// Start of the definition's byte range (inclusive), as used by `is_in_reference_def`.
    pub byte_offset: usize,
    /// End of the definition's byte range (exclusive), as used by `is_in_reference_def`.
    pub byte_end: usize,
    /// Start of the title's byte range (inclusive), as used by `is_in_link_title`.
    pub title_byte_start: Option<usize>,
    /// End of the title's byte range (exclusive), as used by `is_in_link_title`.
    pub title_byte_end: Option<usize>,
}
238
/// An inline code span, possibly spanning several lines.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-indexed line on which the span starts.
    pub line: usize,
    /// 1-indexed line on which the span ends.
    pub end_line: usize,
    /// Start column on the first line; compared against 0-indexed columns in `is_in_code_span`.
    pub start_col: usize,
    /// End column (exclusive) on the last line; compared against 0-indexed columns.
    pub end_col: usize,
    /// Start of the span's byte range in the document (inclusive).
    pub byte_offset: usize,
    /// End of the span's byte range in the document (exclusive).
    pub byte_end: usize,
    /// Number of backticks delimiting the span.
    pub backtick_count: usize,
    /// The span's content.
    pub content: String,
}
259
/// Heading details attached to a line through `LineInfo::heading`.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level.
    pub level: u8,
    /// Syntax style the heading was written in.
    pub style: HeadingStyle,
    /// The heading marker text.
    pub marker: String,
    /// Column of the marker.
    pub marker_column: usize,
    /// Column at which the heading content begins.
    pub content_column: usize,
    /// The heading text.
    pub text: String,
    /// Custom id attached to the heading, when present.
    pub custom_id: Option<String>,
    /// The raw heading text.
    // NOTE(review): how this differs from `text` is not visible here — confirm at the producer.
    pub raw_text: String,
    /// Whether the heading has a closing sequence.
    pub has_closing_sequence: bool,
    /// The closing sequence text.
    pub closing_sequence: String,
    /// Whether the heading is considered valid; `ValidHeadingsIter` only yields
    /// headings with this flag set.
    pub is_valid: bool,
}
287
/// A heading yielded by `ValidHeadingsIter`, together with its line.
#[derive(Debug, Clone)]
pub struct ValidHeading<'a> {
    /// 1-indexed line number of the heading.
    pub line_num: usize,
    /// The heading found on that line.
    pub heading: &'a HeadingInfo,
    /// The line that carries the heading.
    pub line_info: &'a LineInfo,
}
301
302pub struct ValidHeadingsIter<'a> {
307 lines: &'a [LineInfo],
308 current_index: usize,
309}
310
311impl<'a> ValidHeadingsIter<'a> {
312 fn new(lines: &'a [LineInfo]) -> Self {
313 Self {
314 lines,
315 current_index: 0,
316 }
317 }
318}
319
320impl<'a> Iterator for ValidHeadingsIter<'a> {
321 type Item = ValidHeading<'a>;
322
323 fn next(&mut self) -> Option<Self::Item> {
324 while self.current_index < self.lines.len() {
325 let idx = self.current_index;
326 self.current_index += 1;
327
328 let line_info = &self.lines[idx];
329 if let Some(heading) = &line_info.heading
330 && heading.is_valid
331 {
332 return Some(ValidHeading {
333 line_num: idx + 1, heading,
335 line_info,
336 });
337 }
338 }
339 None
340 }
341}
342
/// Blockquote details attached to a line through `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Blockquote nesting level.
    pub nesting_level: usize,
    /// Indentation text preceding the blockquote marker.
    pub indent: String,
    /// Column of the blockquote marker.
    pub marker_column: usize,
    /// The blockquote prefix text.
    pub prefix: String,
    /// The content following the prefix.
    pub content: String,
    /// Whether no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// Whether more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// Whether the line needs the MD028 fix.
    // NOTE(review): the exact condition is decided where this struct is built — not visible here.
    pub needs_md028_fix: bool,
}
363
/// A contiguous list block in the document.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block; `is_in_list_block` treats the range as inclusive.
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// The block's marker, when one is recorded.
    pub marker: Option<String>,
    /// Blockquote prefix associated with the block.
    pub blockquote_prefix: String,
    /// Lines of the block's list items.
    pub item_lines: Vec<usize>,
    /// Nesting level of the block.
    pub nesting_level: usize,
    /// Widest marker width among the block's items.
    pub max_marker_width: usize,
}
384
385use std::sync::{Arc, OnceLock};
386
/// Map from a `usize` key to a list-item tuple.
// NOTE(review): the tuple's types line up with `ListItemInfo`
// (is_ordered, marker, marker_column, content_column, number) and the key is
// presumably a line index — confirm at the use site, which is outside this view.
type ListItemMap = std::collections::HashMap<usize, (bool, String, usize, usize, Option<usize>)>;
389
/// Pre-computed character counts, queried through `LintContext::has_char` and
/// `LintContext::char_count`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    // NOTE(review): whether `]` is also counted is decided in `compute_char_frequency` — confirm there.
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
418
/// An HTML tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line of the tag; `html_tags_on_line` filters on this value.
    pub line: usize,
    /// Start column of the tag.
    pub start_col: usize,
    /// End column of the tag.
    pub end_col: usize,
    /// Start of the tag's byte range (inclusive), as used by `is_in_html_tag`.
    pub byte_offset: usize,
    /// End of the tag's byte range (exclusive), as used by `is_in_html_tag`.
    pub byte_end: usize,
    /// The tag name.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// The raw tag text.
    pub raw_content: String,
}
441
/// An emphasis span found in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line of the span; `emphasis_spans_on_line` filters on this value.
    pub line: usize,
    /// Start column of the span.
    pub start_col: usize,
    /// End column of the span.
    pub end_col: usize,
    /// Byte offset of the span's start.
    pub byte_offset: usize,
    /// Byte offset of the span's end.
    pub byte_end: usize,
    /// The emphasis marker character.
    pub marker: char,
    /// Number of marker characters.
    pub marker_count: usize,
    /// The emphasized content.
    pub content: String,
}
462
/// A table row found in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line of the row; `table_rows_on_line` filters on this value.
    pub line: usize,
    /// Whether the row is a separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Column alignments, one string per column.
    // NOTE(review): the set of alignment strings is defined by the producer — not visible here.
    pub column_alignments: Vec<String>,
}
475
/// A bare URL (or similar token) found in the document.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line of the URL; `bare_urls_on_line` filters on this value.
    pub line: usize,
    /// Start column of the URL.
    pub start_col: usize,
    /// End column of the URL.
    pub end_col: usize,
    /// Byte offset of the URL's start.
    pub byte_offset: usize,
    /// Byte offset of the URL's end.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL.
    // NOTE(review): the possible values are chosen by `parse_bare_urls` — not visible here.
    pub url_type: String,
}
494
/// Parsed view of one Markdown document, built by `LintContext::new`.
pub struct LintContext<'a> {
    /// The document text this context was built from.
    pub content: &'a str,
    /// Byte offset of every line start: 0, then the position just past each `\n`.
    pub line_offsets: Vec<usize>,
    /// Code block ranges as returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, one entry per line.
    pub lines: Vec<LineInfo>,
    /// Links found by `parse_links`.
    pub links: Vec<ParsedLink<'a>>,
    /// Images found by `parse_images`.
    pub images: Vec<ParsedImage<'a>>,
    /// Broken links reported by the Markdown parser while parsing links.
    pub broken_links: Vec<BrokenLinkInfo>,
    /// Footnote references found while parsing links.
    pub footnote_refs: Vec<FootnoteRef>,
    /// Reference definitions found by `parse_reference_defs`.
    pub reference_defs: Vec<ReferenceDef>,
    /// Code spans; filled during construction and read through `code_spans()`.
    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>,
    /// List blocks found by `parse_list_blocks`.
    pub list_blocks: Vec<ListBlock>,
    /// Pre-computed character counts.
    pub char_frequency: CharFrequency,
    /// HTML tags; computed on first call to `html_tags()`.
    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>,
    /// Emphasis spans; filled during construction and read through `emphasis_spans()`.
    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>,
    /// Table rows; computed on first call to `table_rows()`.
    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>,
    /// Bare URLs; computed on first call to `bare_urls()`.
    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>,
    /// Cached result of `compute_mixed_list_nesting`; computed on first call to
    /// `has_mixed_list_nesting()`.
    has_mixed_list_nesting_cache: OnceLock<bool>,
    /// Byte ranges of HTML comments, exposed through `html_comment_ranges()`.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Table blocks found by `TableUtils::find_table_blocks_with_code_info`.
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Line index built from the content.
    pub line_index: crate::utils::range_utils::LineIndex<'a>,
    /// Jinja ranges as `(start, end)` byte pairs; `is_in_jinja_range` treats `end` as exclusive.
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
    /// Path of the source file, when known.
    pub source_file: Option<PathBuf>,
}
520
/// The four consecutive pieces of a blockquote line, all borrowed from the line.
struct BlockquoteComponents<'a> {
    /// Leading spaces and tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces and tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything after `spaces_after`.
    content: &'a str,
}

/// Splits `line` into indent, `>` markers, the whitespace after them, and the
/// remaining content.
///
/// Returns `None` when the first character after the leading spaces/tabs is not
/// `>` (including when the line is empty or only whitespace). Only a single
/// uninterrupted run of `>` is treated as markers, so in `"> > x"` the content
/// is `"> x"`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: [char; 2] = [' ', '\t'];

    // Peel the pieces off the front one at a time; each remainder is a suffix
    // of the previous one, so lengths are enough to recover the pieces.
    let after_indent = line.trim_start_matches(BLANKS);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // No `>` directly after the indentation.
        return None;
    }
    let content = after_markers.trim_start_matches(BLANKS);

    let (indent, _) = line.split_at(line.len() - after_indent.len());
    let (markers, _) = after_indent.split_at(after_indent.len() - after_markers.len());
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
565
566impl<'a> LintContext<'a> {
567 pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
568 #[cfg(not(target_arch = "wasm32"))]
569 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
570 #[cfg(target_arch = "wasm32")]
571 let profile = false;
572
573 let line_offsets = profile_section!("Line offsets", profile, {
574 let mut offsets = vec![0];
575 for (i, c) in content.char_indices() {
576 if c == '\n' {
577 offsets.push(i + 1);
578 }
579 }
580 offsets
581 });
582
583 let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
585
586 let html_comment_ranges = profile_section!(
588 "HTML comment ranges",
589 profile,
590 crate::utils::skip_context::compute_html_comment_ranges(content)
591 );
592
593 let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
595 if flavor == MarkdownFlavor::MkDocs {
596 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
597 } else {
598 Vec::new()
599 }
600 });
601
602 let (mut lines, emphasis_spans) = profile_section!(
605 "Basic line info",
606 profile,
607 Self::compute_basic_line_info(
608 content,
609 &line_offsets,
610 &code_blocks,
611 flavor,
612 &html_comment_ranges,
613 &autodoc_ranges,
614 )
615 );
616
617 profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
619
620 profile_section!(
622 "ESM blocks",
623 profile,
624 Self::detect_esm_blocks(content, &mut lines, flavor)
625 );
626
627 let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));
629
630 profile_section!(
632 "Headings & blockquotes",
633 profile,
634 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
635 );
636
637 let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
639
640 for span in &code_spans {
643 if span.end_line > span.line {
644 for line_num in (span.line + 1)..=span.end_line {
646 if let Some(line_info) = lines.get_mut(line_num - 1) {
647 line_info.in_code_span_continuation = true;
648 }
649 }
650 }
651 }
652
653 let (links, broken_links, footnote_refs) = profile_section!(
655 "Links",
656 profile,
657 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
658 );
659
660 let images = profile_section!(
661 "Images",
662 profile,
663 Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
664 );
665
666 let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
667
668 let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
669
670 let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
672
673 let table_blocks = profile_section!(
675 "Table blocks",
676 profile,
677 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
678 content,
679 &code_blocks,
680 &code_spans,
681 &html_comment_ranges,
682 )
683 );
684
685 let line_index = profile_section!(
687 "Line index",
688 profile,
689 crate::utils::range_utils::LineIndex::new(content)
690 );
691
692 let jinja_ranges = profile_section!(
694 "Jinja ranges",
695 profile,
696 crate::utils::jinja_utils::find_jinja_ranges(content)
697 );
698
699 Self {
700 content,
701 line_offsets,
702 code_blocks,
703 lines,
704 links,
705 images,
706 broken_links,
707 footnote_refs,
708 reference_defs,
709 code_spans_cache: OnceLock::from(Arc::new(code_spans)),
710 list_blocks,
711 char_frequency,
712 html_tags_cache: OnceLock::new(),
713 emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
714 table_rows_cache: OnceLock::new(),
715 bare_urls_cache: OnceLock::new(),
716 has_mixed_list_nesting_cache: OnceLock::new(),
717 html_comment_ranges,
718 table_blocks,
719 line_index,
720 jinja_ranges,
721 flavor,
722 source_file,
723 }
724 }
725
726 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
728 Arc::clone(
729 self.code_spans_cache
730 .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
731 )
732 }
733
734 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
736 &self.html_comment_ranges
737 }
738
739 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
741 Arc::clone(self.html_tags_cache.get_or_init(|| {
742 Arc::new(Self::parse_html_tags(
743 self.content,
744 &self.lines,
745 &self.code_blocks,
746 self.flavor,
747 ))
748 }))
749 }
750
751 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
753 Arc::clone(
754 self.emphasis_spans_cache
755 .get()
756 .expect("emphasis_spans_cache initialized during construction"),
757 )
758 }
759
760 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
762 Arc::clone(
763 self.table_rows_cache
764 .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
765 )
766 }
767
768 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
770 Arc::clone(
771 self.bare_urls_cache
772 .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
773 )
774 }
775
776 pub fn has_mixed_list_nesting(&self) -> bool {
780 *self
781 .has_mixed_list_nesting_cache
782 .get_or_init(|| self.compute_mixed_list_nesting())
783 }
784
785 fn compute_mixed_list_nesting(&self) -> bool {
787 let mut stack: Vec<(usize, bool)> = Vec::new();
792 let mut last_was_blank = false;
793
794 for line_info in &self.lines {
795 if line_info.in_code_block
797 || line_info.in_front_matter
798 || line_info.in_mkdocstrings
799 || line_info.in_html_comment
800 || line_info.in_esm_block
801 {
802 continue;
803 }
804
805 if line_info.is_blank {
807 last_was_blank = true;
808 continue;
809 }
810
811 if let Some(list_item) = &line_info.list_item {
812 let current_pos = if list_item.marker_column == 1 {
814 0
815 } else {
816 list_item.marker_column
817 };
818
819 if last_was_blank && current_pos == 0 {
821 stack.clear();
822 }
823 last_was_blank = false;
824
825 while let Some(&(pos, _)) = stack.last() {
827 if pos >= current_pos {
828 stack.pop();
829 } else {
830 break;
831 }
832 }
833
834 if let Some(&(_, parent_is_ordered)) = stack.last()
836 && parent_is_ordered != list_item.is_ordered
837 {
838 return true; }
840
841 stack.push((current_pos, list_item.is_ordered));
842 } else {
843 last_was_blank = false;
845 }
846 }
847
848 false
849 }
850
851 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
853 match self.line_offsets.binary_search(&offset) {
854 Ok(line) => (line + 1, 1),
855 Err(line) => {
856 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
857 (line, offset - line_start + 1)
858 }
859 }
860 }
861
862 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
864 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
866 return true;
867 }
868
869 self.code_spans()
871 .iter()
872 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
873 }
874
875 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
877 if line_num > 0 {
878 self.lines.get(line_num - 1)
879 } else {
880 None
881 }
882 }
883
884 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
886 self.line_info(line_num).map(|info| info.byte_offset)
887 }
888
889 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
891 let normalized_id = ref_id.to_lowercase();
892 self.reference_defs
893 .iter()
894 .find(|def| def.id == normalized_id)
895 .map(|def| def.url.as_str())
896 }
897
898 pub fn is_in_list_block(&self, line_num: usize) -> bool {
900 self.list_blocks
901 .iter()
902 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
903 }
904
905 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
907 self.list_blocks
908 .iter()
909 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
910 }
911
912 pub fn is_in_code_block(&self, line_num: usize) -> bool {
916 if line_num == 0 || line_num > self.lines.len() {
917 return false;
918 }
919 self.lines[line_num - 1].in_code_block
920 }
921
922 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
924 if line_num == 0 || line_num > self.lines.len() {
925 return false;
926 }
927 self.lines[line_num - 1].in_front_matter
928 }
929
930 pub fn is_in_html_block(&self, line_num: usize) -> bool {
932 if line_num == 0 || line_num > self.lines.len() {
933 return false;
934 }
935 self.lines[line_num - 1].in_html_block
936 }
937
938 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
940 if line_num == 0 || line_num > self.lines.len() {
941 return false;
942 }
943
944 let col_0indexed = if col > 0 { col - 1 } else { 0 };
948 let code_spans = self.code_spans();
949 code_spans.iter().any(|span| {
950 if line_num < span.line || line_num > span.end_line {
952 return false;
953 }
954
955 if span.line == span.end_line {
956 col_0indexed >= span.start_col && col_0indexed < span.end_col
958 } else if line_num == span.line {
959 col_0indexed >= span.start_col
961 } else if line_num == span.end_line {
962 col_0indexed < span.end_col
964 } else {
965 true
967 }
968 })
969 }
970
971 #[inline]
973 pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
974 let code_spans = self.code_spans();
975 code_spans
976 .iter()
977 .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
978 }
979
980 #[inline]
983 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
984 self.reference_defs
985 .iter()
986 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
987 }
988
989 #[inline]
993 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
994 self.html_comment_ranges
995 .iter()
996 .any(|range| byte_pos >= range.start && byte_pos < range.end)
997 }
998
999 #[inline]
1002 pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1003 self.html_tags()
1004 .iter()
1005 .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1006 }
1007
1008 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1010 self.jinja_ranges
1011 .iter()
1012 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1013 }
1014
1015 pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1017 self.reference_defs.iter().any(|def| {
1018 if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1019 byte_pos >= start && byte_pos < end
1020 } else {
1021 false
1022 }
1023 })
1024 }
1025
1026 pub fn has_char(&self, ch: char) -> bool {
1028 match ch {
1029 '#' => self.char_frequency.hash_count > 0,
1030 '*' => self.char_frequency.asterisk_count > 0,
1031 '_' => self.char_frequency.underscore_count > 0,
1032 '-' => self.char_frequency.hyphen_count > 0,
1033 '+' => self.char_frequency.plus_count > 0,
1034 '>' => self.char_frequency.gt_count > 0,
1035 '|' => self.char_frequency.pipe_count > 0,
1036 '[' => self.char_frequency.bracket_count > 0,
1037 '`' => self.char_frequency.backtick_count > 0,
1038 '<' => self.char_frequency.lt_count > 0,
1039 '!' => self.char_frequency.exclamation_count > 0,
1040 '\n' => self.char_frequency.newline_count > 0,
1041 _ => self.content.contains(ch), }
1043 }
1044
1045 pub fn char_count(&self, ch: char) -> usize {
1047 match ch {
1048 '#' => self.char_frequency.hash_count,
1049 '*' => self.char_frequency.asterisk_count,
1050 '_' => self.char_frequency.underscore_count,
1051 '-' => self.char_frequency.hyphen_count,
1052 '+' => self.char_frequency.plus_count,
1053 '>' => self.char_frequency.gt_count,
1054 '|' => self.char_frequency.pipe_count,
1055 '[' => self.char_frequency.bracket_count,
1056 '`' => self.char_frequency.backtick_count,
1057 '<' => self.char_frequency.lt_count,
1058 '!' => self.char_frequency.exclamation_count,
1059 '\n' => self.char_frequency.newline_count,
1060 _ => self.content.matches(ch).count(), }
1062 }
1063
1064 pub fn likely_has_headings(&self) -> bool {
1066 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
1068
1069 pub fn likely_has_lists(&self) -> bool {
1071 self.char_frequency.asterisk_count > 0
1072 || self.char_frequency.hyphen_count > 0
1073 || self.char_frequency.plus_count > 0
1074 }
1075
1076 pub fn likely_has_emphasis(&self) -> bool {
1078 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1079 }
1080
1081 pub fn likely_has_tables(&self) -> bool {
1083 self.char_frequency.pipe_count > 2
1084 }
1085
1086 pub fn likely_has_blockquotes(&self) -> bool {
1088 self.char_frequency.gt_count > 0
1089 }
1090
1091 pub fn likely_has_code(&self) -> bool {
1093 self.char_frequency.backtick_count > 0
1094 }
1095
1096 pub fn likely_has_links_or_images(&self) -> bool {
1098 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1099 }
1100
1101 pub fn likely_has_html(&self) -> bool {
1103 self.char_frequency.lt_count > 0
1104 }
1105
1106 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1108 self.html_tags()
1109 .iter()
1110 .filter(|tag| tag.line == line_num)
1111 .cloned()
1112 .collect()
1113 }
1114
1115 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1117 self.emphasis_spans()
1118 .iter()
1119 .filter(|span| span.line == line_num)
1120 .cloned()
1121 .collect()
1122 }
1123
1124 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1126 self.table_rows()
1127 .iter()
1128 .filter(|row| row.line == line_num)
1129 .cloned()
1130 .collect()
1131 }
1132
1133 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1135 self.bare_urls()
1136 .iter()
1137 .filter(|url| url.line == line_num)
1138 .cloned()
1139 .collect()
1140 }
1141
1142 #[inline]
1148 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1149 let idx = match lines.binary_search_by(|line| {
1151 if byte_offset < line.byte_offset {
1152 std::cmp::Ordering::Greater
1153 } else if byte_offset > line.byte_offset + line.byte_len {
1154 std::cmp::Ordering::Less
1155 } else {
1156 std::cmp::Ordering::Equal
1157 }
1158 }) {
1159 Ok(idx) => idx,
1160 Err(idx) => idx.saturating_sub(1),
1161 };
1162
1163 let line = &lines[idx];
1164 let line_num = idx + 1;
1165 let col = byte_offset.saturating_sub(line.byte_offset);
1166
1167 (idx, line_num, col)
1168 }
1169
1170 #[inline]
1172 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1173 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1175
1176 if idx > 0 {
1178 let span = &code_spans[idx - 1];
1179 if offset >= span.byte_offset && offset < span.byte_end {
1180 return true;
1181 }
1182 }
1183
1184 false
1185 }
1186
1187 fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1191 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1192
1193 let mut link_ranges = Vec::new();
1194 let mut options = Options::empty();
1195 options.insert(Options::ENABLE_WIKILINKS);
1196 options.insert(Options::ENABLE_FOOTNOTES);
1197
1198 let parser = Parser::new_ext(content, options).into_offset_iter();
1199 let mut link_stack: Vec<usize> = Vec::new();
1200
1201 for (event, range) in parser {
1202 match event {
1203 Event::Start(Tag::Link { .. }) => {
1204 link_stack.push(range.start);
1205 }
1206 Event::End(TagEnd::Link) => {
1207 if let Some(start_pos) = link_stack.pop() {
1208 link_ranges.push((start_pos, range.end));
1209 }
1210 }
1211 _ => {}
1212 }
1213 }
1214
1215 link_ranges
1216 }
1217
1218 fn parse_links(
1220 content: &'a str,
1221 lines: &[LineInfo],
1222 code_blocks: &[(usize, usize)],
1223 code_spans: &[CodeSpan],
1224 flavor: MarkdownFlavor,
1225 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1226 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1227 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1228 use std::collections::HashSet;
1229
1230 let mut links = Vec::with_capacity(content.len() / 500);
1231 let mut broken_links = Vec::new();
1232 let mut footnote_refs = Vec::new();
1233
1234 let mut found_positions = HashSet::new();
1236
1237 let mut options = Options::empty();
1247 options.insert(Options::ENABLE_WIKILINKS);
1248 options.insert(Options::ENABLE_FOOTNOTES);
1249
1250 let parser = Parser::new_with_broken_link_callback(
1251 content,
1252 options,
1253 Some(|link: BrokenLink<'_>| {
1254 broken_links.push(BrokenLinkInfo {
1255 reference: link.reference.to_string(),
1256 span: link.span.clone(),
1257 });
1258 None
1259 }),
1260 )
1261 .into_offset_iter();
1262
1263 let mut link_stack: Vec<(
1264 usize,
1265 usize,
1266 pulldown_cmark::CowStr<'a>,
1267 LinkType,
1268 pulldown_cmark::CowStr<'a>,
1269 )> = Vec::new();
1270 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1273 match event {
1274 Event::Start(Tag::Link {
1275 link_type,
1276 dest_url,
1277 id,
1278 ..
1279 }) => {
1280 link_stack.push((range.start, range.end, dest_url, link_type, id));
1282 text_chunks.clear();
1283 }
1284 Event::Text(text) if !link_stack.is_empty() => {
1285 text_chunks.push((text.to_string(), range.start, range.end));
1287 }
1288 Event::Code(code) if !link_stack.is_empty() => {
1289 let code_text = format!("`{code}`");
1291 text_chunks.push((code_text, range.start, range.end));
1292 }
1293 Event::End(TagEnd::Link) => {
1294 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1295 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1297 text_chunks.clear();
1298 continue;
1299 }
1300
1301 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303
1304 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1306 text_chunks.clear();
1307 continue;
1308 }
1309
1310 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1311
1312 let is_reference = matches!(
1313 link_type,
1314 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1315 );
1316
1317 let link_text = if start_pos < content.len() {
1320 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1321
1322 let mut close_pos = None;
1326 let mut depth = 0;
1327 let mut in_code_span = false;
1328
1329 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1330 let mut backslash_count = 0;
1332 let mut j = i;
1333 while j > 0 && link_bytes[j - 1] == b'\\' {
1334 backslash_count += 1;
1335 j -= 1;
1336 }
1337 let is_escaped = backslash_count % 2 != 0;
1338
1339 if byte == b'`' && !is_escaped {
1341 in_code_span = !in_code_span;
1342 }
1343
1344 if !is_escaped && !in_code_span {
1346 if byte == b'[' {
1347 depth += 1;
1348 } else if byte == b']' {
1349 if depth == 0 {
1350 close_pos = Some(i);
1352 break;
1353 } else {
1354 depth -= 1;
1355 }
1356 }
1357 }
1358 }
1359
1360 if let Some(pos) = close_pos {
1361 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1362 } else {
1363 Cow::Borrowed("")
1364 }
1365 } else {
1366 Cow::Borrowed("")
1367 };
1368
1369 let reference_id = if is_reference && !ref_id.is_empty() {
1371 Some(Cow::Owned(ref_id.to_lowercase()))
1372 } else if is_reference {
1373 Some(Cow::Owned(link_text.to_lowercase()))
1375 } else {
1376 None
1377 };
1378
1379 found_positions.insert(start_pos);
1381
1382 links.push(ParsedLink {
1383 line: line_num,
1384 start_col: col_start,
1385 end_col: col_end,
1386 byte_offset: start_pos,
1387 byte_end: range.end,
1388 text: link_text,
1389 url: Cow::Owned(url.to_string()),
1390 is_reference,
1391 reference_id,
1392 link_type,
1393 });
1394
1395 text_chunks.clear();
1396 }
1397 }
1398 Event::FootnoteReference(footnote_id) => {
1399 if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1402 continue;
1403 }
1404
1405 let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1406 footnote_refs.push(FootnoteRef {
1407 id: footnote_id.to_string(),
1408 line: line_num,
1409 byte_offset: range.start,
1410 byte_end: range.end,
1411 });
1412 }
1413 _ => {}
1414 }
1415 }
1416
1417 for cap in LINK_PATTERN.captures_iter(content) {
1421 let full_match = cap.get(0).unwrap();
1422 let match_start = full_match.start();
1423 let match_end = full_match.end();
1424
1425 if found_positions.contains(&match_start) {
1427 continue;
1428 }
1429
1430 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1432 continue;
1433 }
1434
1435 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1437 continue;
1438 }
1439
1440 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1442 continue;
1443 }
1444
1445 if Self::is_offset_in_code_span(code_spans, match_start) {
1447 continue;
1448 }
1449
1450 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1452 continue;
1453 }
1454
1455 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1457
1458 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1460 continue;
1461 }
1462
1463 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1464
1465 let text = cap.get(1).map_or("", |m| m.as_str());
1466
1467 if let Some(ref_id) = cap.get(6) {
1469 let ref_id_str = ref_id.as_str();
1470 let normalized_ref = if ref_id_str.is_empty() {
1471 Cow::Owned(text.to_lowercase()) } else {
1473 Cow::Owned(ref_id_str.to_lowercase())
1474 };
1475
1476 links.push(ParsedLink {
1478 line: line_num,
1479 start_col: col_start,
1480 end_col: col_end,
1481 byte_offset: match_start,
1482 byte_end: match_end,
1483 text: Cow::Borrowed(text),
1484 url: Cow::Borrowed(""), is_reference: true,
1486 reference_id: Some(normalized_ref),
1487 link_type: LinkType::Reference, });
1489 }
1490 }
1491
1492 (links, broken_links, footnote_refs)
1493 }
1494
1495 fn parse_images(
1497 content: &'a str,
1498 lines: &[LineInfo],
1499 code_blocks: &[(usize, usize)],
1500 code_spans: &[CodeSpan],
1501 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1502 ) -> Vec<ParsedImage<'a>> {
1503 use crate::utils::skip_context::is_in_html_comment_ranges;
1504 use std::collections::HashSet;
1505
1506 let mut images = Vec::with_capacity(content.len() / 1000);
1508 let mut found_positions = HashSet::new();
1509
1510 let parser = Parser::new(content).into_offset_iter();
1512 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1513 Vec::new();
1514 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); for (event, range) in parser {
1517 match event {
1518 Event::Start(Tag::Image {
1519 link_type,
1520 dest_url,
1521 id,
1522 ..
1523 }) => {
1524 image_stack.push((range.start, dest_url, link_type, id));
1525 text_chunks.clear();
1526 }
1527 Event::Text(text) if !image_stack.is_empty() => {
1528 text_chunks.push((text.to_string(), range.start, range.end));
1529 }
1530 Event::Code(code) if !image_stack.is_empty() => {
1531 let code_text = format!("`{code}`");
1532 text_chunks.push((code_text, range.start, range.end));
1533 }
1534 Event::End(TagEnd::Image) => {
1535 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1536 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1538 continue;
1539 }
1540
1541 if Self::is_offset_in_code_span(code_spans, start_pos) {
1543 continue;
1544 }
1545
1546 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1548 continue;
1549 }
1550
1551 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1553 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1554
1555 let is_reference = matches!(
1556 link_type,
1557 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1558 );
1559
1560 let alt_text = if start_pos < content.len() {
1563 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1564
1565 let mut close_pos = None;
1568 let mut depth = 0;
1569
1570 if image_bytes.len() > 2 {
1571 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1572 let mut backslash_count = 0;
1574 let mut j = i;
1575 while j > 0 && image_bytes[j - 1] == b'\\' {
1576 backslash_count += 1;
1577 j -= 1;
1578 }
1579 let is_escaped = backslash_count % 2 != 0;
1580
1581 if !is_escaped {
1582 if byte == b'[' {
1583 depth += 1;
1584 } else if byte == b']' {
1585 if depth == 0 {
1586 close_pos = Some(i);
1588 break;
1589 } else {
1590 depth -= 1;
1591 }
1592 }
1593 }
1594 }
1595 }
1596
1597 if let Some(pos) = close_pos {
1598 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1599 } else {
1600 Cow::Borrowed("")
1601 }
1602 } else {
1603 Cow::Borrowed("")
1604 };
1605
1606 let reference_id = if is_reference && !ref_id.is_empty() {
1607 Some(Cow::Owned(ref_id.to_lowercase()))
1608 } else if is_reference {
1609 Some(Cow::Owned(alt_text.to_lowercase())) } else {
1611 None
1612 };
1613
1614 found_positions.insert(start_pos);
1615 images.push(ParsedImage {
1616 line: line_num,
1617 start_col: col_start,
1618 end_col: col_end,
1619 byte_offset: start_pos,
1620 byte_end: range.end,
1621 alt_text,
1622 url: Cow::Owned(url.to_string()),
1623 is_reference,
1624 reference_id,
1625 link_type,
1626 });
1627 }
1628 }
1629 _ => {}
1630 }
1631 }
1632
1633 for cap in IMAGE_PATTERN.captures_iter(content) {
1635 let full_match = cap.get(0).unwrap();
1636 let match_start = full_match.start();
1637 let match_end = full_match.end();
1638
1639 if found_positions.contains(&match_start) {
1641 continue;
1642 }
1643
1644 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1646 continue;
1647 }
1648
1649 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1651 || Self::is_offset_in_code_span(code_spans, match_start)
1652 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1653 {
1654 continue;
1655 }
1656
1657 if let Some(ref_id) = cap.get(6) {
1659 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1660 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1661 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1662 let ref_id_str = ref_id.as_str();
1663 let normalized_ref = if ref_id_str.is_empty() {
1664 Cow::Owned(alt_text.to_lowercase())
1665 } else {
1666 Cow::Owned(ref_id_str.to_lowercase())
1667 };
1668
1669 images.push(ParsedImage {
1670 line: line_num,
1671 start_col: col_start,
1672 end_col: col_end,
1673 byte_offset: match_start,
1674 byte_end: match_end,
1675 alt_text: Cow::Borrowed(alt_text),
1676 url: Cow::Borrowed(""),
1677 is_reference: true,
1678 reference_id: Some(normalized_ref),
1679 link_type: LinkType::Reference, });
1681 }
1682 }
1683
1684 images
1685 }
1686
1687 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1689 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1693 if line_info.in_code_block {
1695 continue;
1696 }
1697
1698 let line = line_info.content(content);
1699 let line_num = line_idx + 1;
1700
1701 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1702 let id = cap.get(1).unwrap().as_str().to_lowercase();
1703 let url = cap.get(2).unwrap().as_str().to_string();
1704 let title_match = cap.get(3).or_else(|| cap.get(4));
1705 let title = title_match.map(|m| m.as_str().to_string());
1706
1707 let match_obj = cap.get(0).unwrap();
1710 let byte_offset = line_info.byte_offset + match_obj.start();
1711 let byte_end = line_info.byte_offset + match_obj.end();
1712
1713 let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1715 let start = line_info.byte_offset + m.start().saturating_sub(1);
1717 let end = line_info.byte_offset + m.end() + 1; (Some(start), Some(end))
1719 } else {
1720 (None, None)
1721 };
1722
1723 refs.push(ReferenceDef {
1724 line: line_num,
1725 id,
1726 url,
1727 title,
1728 byte_offset,
1729 byte_end,
1730 title_byte_start,
1731 title_byte_end,
1732 });
1733 }
1734 }
1735
1736 refs
1737 }
1738
/// Splits a line into its blockquote prefix and the text that follows it.
///
/// The prefix is made of one or more `>` markers, each optionally preceded by
/// whitespace and optionally followed by a single space or tab. Returns
/// `None` when the line does not start (after leading whitespace) with `>`.
/// Both returned slices borrow from `line`, and concatenating them yields the
/// original line.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let mut rest = line;

    // Peel off one `>` marker per iteration, together with the whitespace in
    // front of it and at most one space/tab behind it.
    while let Some(after_marker) = rest.trim_start().strip_prefix('>') {
        rest = after_marker.strip_prefix([' ', '\t']).unwrap_or(after_marker);
    }

    // `rest` is always a suffix of `line`; it only shrinks when a marker was
    // consumed, so equal lengths mean there was no blockquote marker at all.
    if rest.len() == line.len() {
        return None;
    }

    Some(line.split_at(line.len() - rest.len()))
}
1779
1780 fn detect_list_items_and_emphasis_with_pulldown(
1804 content: &str,
1805 line_offsets: &[usize],
1806 flavor: MarkdownFlavor,
1807 front_matter_end: usize,
1808 code_blocks: &[(usize, usize)],
1809 ) -> (ListItemMap, Vec<EmphasisSpan>) {
1810 use std::collections::HashMap;
1811
1812 let mut list_items = HashMap::new();
1813 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1814
1815 let mut options = Options::empty();
1816 options.insert(Options::ENABLE_TABLES);
1817 options.insert(Options::ENABLE_FOOTNOTES);
1818 options.insert(Options::ENABLE_STRIKETHROUGH);
1819 options.insert(Options::ENABLE_TASKLISTS);
1820 options.insert(Options::ENABLE_GFM);
1822
1823 let _ = flavor;
1825
1826 let parser = Parser::new_ext(content, options).into_offset_iter();
1827 let mut list_depth: usize = 0;
1828 let mut list_stack: Vec<bool> = Vec::new();
1829
1830 for (event, range) in parser {
1831 match event {
1832 Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1834 let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1835 2
1836 } else {
1837 1
1838 };
1839 let match_start = range.start;
1840 let match_end = range.end;
1841
1842 if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1844 let marker = content[match_start..].chars().next().unwrap_or('*');
1846 if marker == '*' || marker == '_' {
1847 let content_start = match_start + marker_count;
1849 let content_end = if match_end >= marker_count {
1850 match_end - marker_count
1851 } else {
1852 match_end
1853 };
1854 let content_part = if content_start < content_end && content_end <= content.len() {
1855 &content[content_start..content_end]
1856 } else {
1857 ""
1858 };
1859
1860 let line_idx = match line_offsets.binary_search(&match_start) {
1862 Ok(idx) => idx,
1863 Err(idx) => idx.saturating_sub(1),
1864 };
1865 let line_num = line_idx + 1;
1866 let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1867 let col_start = match_start - line_start;
1868 let col_end = match_end - line_start;
1869
1870 emphasis_spans.push(EmphasisSpan {
1871 line: line_num,
1872 start_col: col_start,
1873 end_col: col_end,
1874 byte_offset: match_start,
1875 byte_end: match_end,
1876 marker,
1877 marker_count,
1878 content: content_part.to_string(),
1879 });
1880 }
1881 }
1882 }
1883 Event::Start(Tag::List(start_number)) => {
1884 list_depth += 1;
1885 list_stack.push(start_number.is_some());
1886 }
1887 Event::End(TagEnd::List(_)) => {
1888 list_depth = list_depth.saturating_sub(1);
1889 list_stack.pop();
1890 }
1891 Event::Start(Tag::Item) if list_depth > 0 => {
1892 let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1894 let item_start = range.start;
1896
1897 let mut line_idx = match line_offsets.binary_search(&item_start) {
1899 Ok(idx) => idx,
1900 Err(idx) => idx.saturating_sub(1),
1901 };
1902
1903 if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1907 line_idx += 1;
1908 }
1909
1910 if front_matter_end > 0 && line_idx < front_matter_end {
1912 continue;
1913 }
1914
1915 if line_idx < line_offsets.len() {
1916 let line_start_byte = line_offsets[line_idx];
1917 let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
1918 let line = &content[line_start_byte..line_end.min(content.len())];
1919
1920 let line = line
1922 .strip_suffix('\n')
1923 .or_else(|| line.strip_suffix("\r\n"))
1924 .unwrap_or(line);
1925
1926 let blockquote_parse = Self::parse_blockquote_prefix(line);
1928 let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
1929 (prefix.len(), content)
1930 } else {
1931 (0, line)
1932 };
1933
1934 if current_list_is_ordered {
1936 if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1937 Self::parse_ordered_list(line_to_parse)
1938 {
1939 let marker = format!("{number_str}{delimiter}");
1940 let marker_column = blockquote_prefix_len + leading_spaces.len();
1941 let content_column = marker_column + marker.len() + spacing.len();
1942 let number = number_str.parse().ok();
1943
1944 list_items.entry(line_start_byte).or_insert((
1945 true,
1946 marker,
1947 marker_column,
1948 content_column,
1949 number,
1950 ));
1951 }
1952 } else if let Some((leading_spaces, marker, spacing, _content)) =
1953 Self::parse_unordered_list(line_to_parse)
1954 {
1955 let marker_column = blockquote_prefix_len + leading_spaces.len();
1956 let content_column = marker_column + 1 + spacing.len();
1957
1958 list_items.entry(line_start_byte).or_insert((
1959 false,
1960 marker.to_string(),
1961 marker_column,
1962 content_column,
1963 None,
1964 ));
1965 }
1966 }
1967 }
1968 _ => {}
1969 }
1970 }
1971
1972 (list_items, emphasis_spans)
1973 }
1974
/// Parses the start of an unordered list item.
///
/// Returns `(indent, marker, spacing, content)` where `indent` is the leading
/// run of spaces/tabs, `marker` is one of `-`, `*` or `+`, `spacing` is the
/// run of spaces/tabs after the marker and `content` is the remainder of the
/// line. Returns `None` when the first non-blank character is not a marker.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let mut chars = after_indent.chars();
    let marker = chars.next().filter(|c| matches!(c, '-' | '*' | '+'))?;
    let after_marker = chars.as_str();

    let content = after_marker.trim_start_matches(BLANKS);
    let spacing = &after_marker[..after_marker.len() - content.len()];

    Some((indent, marker, spacing, content))
}
2007
/// Parses the start of an ordered list item.
///
/// Returns `(indent, number, delimiter, spacing, content)` where `indent` is
/// the leading run of spaces/tabs, `number` is the run of ASCII digits,
/// `delimiter` is `.` or `)`, `spacing` is the run of spaces/tabs after the
/// delimiter and `content` is the remainder of the line. Returns `None` when
/// there are no digits or the digits are not followed by a delimiter.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let digit_len = after_indent.bytes().take_while(u8::is_ascii_digit).count();
    if digit_len == 0 {
        return None;
    }
    // ASCII digits are single bytes, so this split is on a char boundary.
    let (number, after_number) = after_indent.split_at(digit_len);

    let mut chars = after_number.chars();
    let delimiter = chars.next().filter(|c| matches!(c, '.' | ')'))?;
    let after_delimiter = chars.as_str();

    let content = after_delimiter.trim_start_matches(BLANKS);
    let spacing = &after_delimiter[..after_delimiter.len() - content.len()];

    Some((indent, number, delimiter, spacing, content))
}
2055
/// Builds a per-line flag vector telling whether each line overlaps a code block.
///
/// `line_offsets` holds the starting byte offset of every line (ascending);
/// the result has the same length. For each `(start, end)` range in
/// `code_blocks`, the start is moved back and the end moved forward to the
/// nearest UTF-8 character boundary (the end is also clamped to the content
/// length), and every line from the one containing the start up to, but not
/// including, the first line starting at or after the end is flagged.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(block_start, block_end) in code_blocks {
        // Snap the start backwards onto a character boundary.
        let mut lo = block_start;
        while lo > 0 && !content.is_char_boundary(lo) {
            lo -= 1;
        }

        // Snap the end forwards onto a character boundary, never past the content.
        let mut hi = block_end;
        while hi < total_len && !content.is_char_boundary(hi) {
            hi += 1;
        }
        let hi = hi.min(total_len);

        // Last line starting at or before `lo` .. first line starting at or after `hi`.
        let first_line = line_offsets.partition_point(|&offset| offset <= lo).saturating_sub(1);
        let end_line = line_offsets.partition_point(|&offset| offset < hi);

        // `get_mut` yields `None` for an inverted range, in which case nothing is flagged.
        if let Some(covered) = flags.get_mut(first_line..end_line) {
            covered.fill(true);
        }
    }

    flags
}
2115
/// Builds a per-line flag vector marking lines that belong to a `$$` math block.
///
/// A line whose trimmed text is exactly `$$` toggles the "inside math" state
/// and is itself flagged; every line seen while the state is on is flagged
/// too. Lines marked in `code_block_map` are never flagged and never toggle
/// the state. The result has one entry per line of `content.lines()`.
fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
    let mut math_open = false;

    content
        .lines()
        .enumerate()
        .map(|(idx, text)| {
            let in_code = code_block_map.get(idx).copied().unwrap_or(false);
            if in_code {
                // Code lines are transparent: no flag, no state change.
                false
            } else if text.trim() == "$$" {
                // Both the opening and the closing delimiter count as math lines.
                math_open = !math_open;
                true
            } else {
                math_open
            }
        })
        .collect()
}
2153
2154 fn compute_basic_line_info(
2157 content: &str,
2158 line_offsets: &[usize],
2159 code_blocks: &[(usize, usize)],
2160 flavor: MarkdownFlavor,
2161 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2162 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2163 ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2164 let content_lines: Vec<&str> = content.lines().collect();
2165 let mut lines = Vec::with_capacity(content_lines.len());
2166
2167 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2169
2170 let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2172
2173 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2176
2177 let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2180 content,
2181 line_offsets,
2182 flavor,
2183 front_matter_end,
2184 code_blocks,
2185 );
2186
2187 for (i, line) in content_lines.iter().enumerate() {
2188 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2189 let indent = line.len() - line.trim_start().len();
2190 let visual_indent = ElementCache::calculate_indentation_width_default(line);
2192
2193 let blockquote_parse = Self::parse_blockquote_prefix(line);
2195
2196 let is_blank = if let Some((_, content)) = blockquote_parse {
2198 content.trim().is_empty()
2200 } else {
2201 line.trim().is_empty()
2202 };
2203
2204 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2206
2207 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2209 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2210 let line_end_offset = byte_offset + line.len();
2213 let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2214 html_comment_ranges,
2215 byte_offset,
2216 line_end_offset,
2217 );
2218 let list_item =
2221 list_item_map
2222 .get(&byte_offset)
2223 .map(
2224 |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2225 marker: marker.clone(),
2226 is_ordered: *is_ordered,
2227 number: *number,
2228 marker_column: *marker_column,
2229 content_column: *content_column,
2230 },
2231 );
2232
2233 let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2236 let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2237
2238 let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2240
2241 lines.push(LineInfo {
2242 byte_offset,
2243 byte_len: line.len(),
2244 indent,
2245 visual_indent,
2246 is_blank,
2247 in_code_block,
2248 in_front_matter,
2249 in_html_block: false, in_html_comment,
2251 list_item,
2252 heading: None, blockquote: None, in_mkdocstrings,
2255 in_esm_block: false, in_code_span_continuation: false, is_horizontal_rule: is_hr,
2258 in_math_block,
2259 });
2260 }
2261
2262 (lines, emphasis_spans)
2263 }
2264
/// Fills in the `blockquote` and `heading` fields of `lines`.
///
/// For every line (index `i` is shared between `lines` and `content.lines()`):
///
/// * outside front matter, a successful `parse_blockquote_detailed` result is
///   stored as `BlockquoteInfo`;
/// * lines in code blocks, front matter, HTML blocks, and blank lines are then
///   skipped for heading detection;
/// * an ATX heading (`#` .. `######`) is recorded unless the line is a MkDocs
///   snippet marker, starts inside an HTML comment, or starts strictly inside
///   one of `link_byte_ranges`;
/// * otherwise, if the next line is a run of `=` or `-`, the line is recorded
///   as a Setext heading unless it looks like another construct (list item,
///   thematic break, blockquote, fence, HTML).
///
/// # Parameters
/// * `content` - full document text.
/// * `lines` - per-line records, updated in place.
/// * `flavor` - only `MarkdownFlavor::MkDocs` changes behavior (snippet markers).
/// * `html_comment_ranges` - byte ranges of HTML comments.
/// * `link_byte_ranges` - `(start, end)` byte ranges of links.
fn detect_headings_and_blockquotes(
    content: &str,
    lines: &mut [LineInfo],
    flavor: MarkdownFlavor,
    html_comment_ranges: &[crate::utils::skip_context::ByteRange],
    link_byte_ranges: &[(usize, usize)],
) {
    // Groups: 1 = leading whitespace, 2 = one to six `#`, 3 = whitespace after
    // the hashes, 4 = rest of the line.
    static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
        LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
    // A line made only of `=` or only of `-`, with optional surrounding whitespace.
    static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
        LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());

    let content_lines: Vec<&str> = content.lines().collect();

    let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);

    for i in 0..lines.len() {
        // Indexing assumes `lines` has no more entries than `content.lines()`.
        let line = content_lines[i];

        // Blockquote detection runs before the skip checks below, so it also
        // applies to lines inside code blocks and HTML blocks (but not front matter).
        if !(front_matter_end > 0 && i < front_matter_end)
            && let Some(bq) = parse_blockquote_detailed(line)
        {
            let nesting_level = bq.markers.len();
            let marker_column = bq.indent.len();
            let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
            // `>text` - content directly after the marker.
            let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
            // Counts space characters only; tabs are not counted.
            let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
            // A bare `>` with neither spacing nor content.
            let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();

            lines[i].blockquote = Some(BlockquoteInfo {
                nesting_level,
                indent: bq.indent.to_string(),
                marker_column,
                prefix,
                content: bq.content.to_string(),
                has_no_space_after_marker: has_no_space,
                has_multiple_spaces_after_marker: has_multiple_spaces,
                needs_md028_fix,
            });
        }

        // From here on only heading detection remains; skip lines that cannot
        // carry a heading.
        if lines[i].in_code_block {
            continue;
        }

        if front_matter_end > 0 && i < front_matter_end {
            continue;
        }

        if lines[i].in_html_block {
            continue;
        }

        if lines[i].is_blank {
            continue;
        }

        // MkDocs snippet section markers are never treated as ATX headings.
        let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
            crate::utils::mkdocs_snippets::is_snippet_section_start(line)
                || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
        } else {
            false
        };

        if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
            if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
                continue;
            }
            // Skip lines that begin strictly inside a link's byte range
            // (both bounds are exclusive).
            let line_offset = lines[i].byte_offset;
            if link_byte_ranges
                .iter()
                .any(|&(start, end)| line_offset > start && line_offset < end)
            {
                continue;
            }
            let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
            let hashes = caps.get(2).map_or("", |m| m.as_str());
            let spaces_after = caps.get(3).map_or("", |m| m.as_str());
            let rest = caps.get(4).map_or("", |m| m.as_str());

            // The regex limits the hash run to 1..=6, so the cast cannot truncate.
            let level = hashes.len() as u8;
            let marker_column = leading_spaces.len();

            // Split `rest` into heading text and an optional closing `#` sequence.
            let (text, has_closing, closing_seq) = {
                // A trailing ` {#...}` custom-id block is set aside first so it
                // does not hide a closing sequence in front of it.
                let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
                    if rest[id_start..].trim_end().ends_with('}') {
                        (&rest[..id_start], &rest[id_start..])
                    } else {
                        (rest, "")
                    }
                } else {
                    (rest, "")
                };

                let trimmed_rest = rest_without_id.trim_end();
                if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
                    // Work on (byte position, char) pairs so walking backwards
                    // stays on character boundaries.
                    let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();

                    let last_hash_char_idx = char_positions
                        .iter()
                        .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);

                    if let Some(mut char_idx) = last_hash_char_idx {
                        // Walk back to the first `#` of the trailing run.
                        while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
                            char_idx -= 1;
                        }

                        let start_of_hashes = char_positions[char_idx].0;

                        // The run must start the text or follow whitespace.
                        let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();

                        // Everything from the run to the end must be `#` only.
                        let potential_closing = &trimmed_rest[start_of_hashes..];
                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');

                        if is_all_hashes && has_space_before {
                            let closing_hashes = potential_closing.to_string();
                            // Re-attach the custom-id block to the text so it can
                            // be extracted further down.
                            let text_part = if !custom_id_part.is_empty() {
                                format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
                            } else {
                                trimmed_rest[..start_of_hashes].trim_end().to_string()
                            };
                            (text_part, true, closing_hashes)
                        } else {
                            (rest.to_string(), false, String::new())
                        }
                    } else {
                        (rest.to_string(), false, String::new())
                    }
                } else {
                    (rest.to_string(), false, String::new())
                }
            };

            let content_column = marker_column + hashes.len() + spaces_after.len();

            let raw_text = text.trim().to_string();
            let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

            // Without an inline id, a standalone attribute list on the next
            // line (outside code blocks) may supply one.
            if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
                let next_line = content_lines[i + 1];
                if !lines[i + 1].in_code_block
                    && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
                    && let Some(next_line_id) =
                        crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
                {
                    custom_id = Some(next_line_id);
                }
            }

            // A heading with no space after the hashes is flagged invalid only
            // when it is level 1, has text, and that text does not start with
            // an uppercase character.
            let is_valid = !spaces_after.is_empty()
                || rest.is_empty()
                || level > 1
                || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());

            lines[i].heading = Some(HeadingInfo {
                level,
                style: HeadingStyle::ATX,
                marker: hashes.to_string(),
                marker_column,
                content_column,
                text: clean_text,
                custom_id,
                raw_text,
                has_closing_sequence: has_closing,
                closing_sequence: closing_seq,
                is_valid,
            });
        } else if i + 1 < content_lines.len() && i + 1 < lines.len() {
            // Setext candidate: this line is the text, the next line the underline.
            let next_line = content_lines[i + 1];
            if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
                // Front-matter lines were already skipped above; this check
                // cannot trigger here.
                if front_matter_end > 0 && i < front_matter_end {
                    continue;
                }

                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
                {
                    continue;
                }

                let content_line = line.trim();

                // Lines starting with a list marker character are not heading text.
                if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
                    continue;
                }

                // Three or more underscores (ignoring whitespace) and nothing
                // else: not heading text.
                if content_line.starts_with('_') {
                    let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
                    if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
                        continue;
                    }
                }

                // Digits followed by `.` or `)` look like an ordered list item.
                if let Some(first_char) = content_line.chars().next()
                    && first_char.is_ascii_digit()
                {
                    let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
                    if num_end < content_line.len() {
                        let next = content_line.chars().nth(num_end);
                        if next == Some('.') || next == Some(')') {
                            continue;
                        }
                    }
                }

                // Reached with an ATX-shaped line only when the ATX branch
                // above was bypassed (snippet marker lines).
                if ATX_HEADING_REGEX.is_match(line) {
                    continue;
                }

                if content_line.starts_with('>') {
                    continue;
                }

                // Lines opening with three backticks or tildes are skipped.
                let trimmed_start = line.trim_start();
                if trimmed_start.len() >= 3 {
                    let first_three: String = trimmed_start.chars().take(3).collect();
                    if first_three == "```" || first_three == "~~~" {
                        continue;
                    }
                }

                if content_line.starts_with('<') {
                    continue;
                }

                let underline = next_line.trim();

                // `=` underline is level 1, `-` underline is level 2.
                let level = if underline.starts_with('=') { 1 } else { 2 };
                let style = if level == 1 {
                    HeadingStyle::Setext1
                } else {
                    HeadingStyle::Setext2
                };

                let raw_text = line.trim().to_string();
                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

                // For Setext headings the standalone attribute list is looked
                // for on the line after the underline.
                if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
                    let attr_line = content_lines[i + 2];
                    if !lines[i + 2].in_code_block
                        && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
                        && let Some(attr_line_id) =
                            crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
                    {
                        custom_id = Some(attr_line_id);
                    }
                }

                lines[i].heading = Some(HeadingInfo {
                    level,
                    style,
                    // For Setext headings the marker is the trimmed underline
                    // and its column is the underline's indentation.
                    marker: underline.to_string(),
                    marker_column: next_line.len() - next_line.trim_start().len(),
                    content_column: lines[i].indent,
                    text: clean_text,
                    custom_id,
                    raw_text,
                    has_closing_sequence: false,
                    closing_sequence: String::new(),
                    is_valid: true,
                });
            }
        }
    }
}
2592
2593 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2595 const BLOCK_ELEMENTS: &[&str] = &[
2598 "address",
2599 "article",
2600 "aside",
2601 "audio",
2602 "blockquote",
2603 "canvas",
2604 "details",
2605 "dialog",
2606 "dd",
2607 "div",
2608 "dl",
2609 "dt",
2610 "embed",
2611 "fieldset",
2612 "figcaption",
2613 "figure",
2614 "footer",
2615 "form",
2616 "h1",
2617 "h2",
2618 "h3",
2619 "h4",
2620 "h5",
2621 "h6",
2622 "header",
2623 "hr",
2624 "iframe",
2625 "li",
2626 "main",
2627 "menu",
2628 "nav",
2629 "noscript",
2630 "object",
2631 "ol",
2632 "p",
2633 "picture",
2634 "pre",
2635 "script",
2636 "search",
2637 "section",
2638 "source",
2639 "style",
2640 "summary",
2641 "svg",
2642 "table",
2643 "tbody",
2644 "td",
2645 "template",
2646 "textarea",
2647 "tfoot",
2648 "th",
2649 "thead",
2650 "tr",
2651 "track",
2652 "ul",
2653 "video",
2654 ];
2655
2656 let mut i = 0;
2657 while i < lines.len() {
2658 if lines[i].in_code_block || lines[i].in_front_matter {
2660 i += 1;
2661 continue;
2662 }
2663
2664 let trimmed = lines[i].content(content).trim_start();
2665
2666 if trimmed.starts_with('<') && trimmed.len() > 1 {
2668 let after_bracket = &trimmed[1..];
2670 let is_closing = after_bracket.starts_with('/');
2671 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2672
2673 let tag_name = tag_start
2675 .chars()
2676 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2677 .collect::<String>()
2678 .to_lowercase();
2679
2680 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2682 lines[i].in_html_block = true;
2684
2685 if !is_closing {
2688 let closing_tag = format!("</{tag_name}>");
2689 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2691 let mut j = i + 1;
2692 let mut found_closing_tag = false;
2693 while j < lines.len() && j < i + 100 {
2694 if !allow_blank_lines && lines[j].is_blank {
2697 break;
2698 }
2699
2700 lines[j].in_html_block = true;
2701
2702 if lines[j].content(content).contains(&closing_tag) {
2704 found_closing_tag = true;
2705 }
2706
2707 if found_closing_tag {
2710 j += 1;
2711 while j < lines.len() && j < i + 100 {
2713 if lines[j].is_blank {
2714 break;
2715 }
2716 lines[j].in_html_block = true;
2717 j += 1;
2718 }
2719 break;
2720 }
2721 j += 1;
2722 }
2723 }
2724 }
2725 }
2726
2727 i += 1;
2728 }
2729 }
2730
2731 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2734 if !flavor.supports_esm_blocks() {
2736 return;
2737 }
2738
2739 let mut in_multiline_comment = false;
2740
2741 for line in lines.iter_mut() {
2742 if line.is_blank || line.in_html_comment {
2744 continue;
2745 }
2746
2747 let trimmed = line.content(content).trim_start();
2748
2749 if in_multiline_comment {
2751 if trimmed.contains("*/") {
2752 in_multiline_comment = false;
2753 }
2754 continue;
2755 }
2756
2757 if trimmed.starts_with("//") {
2759 continue;
2760 }
2761
2762 if trimmed.starts_with("/*") {
2764 if !trimmed.contains("*/") {
2765 in_multiline_comment = true;
2766 }
2767 continue;
2768 }
2769
2770 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2772 line.in_esm_block = true;
2773 } else {
2774 break;
2776 }
2777 }
2778 }
2779
2780 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2782 let mut code_spans = Vec::new();
2783
2784 if !content.contains('`') {
2786 return code_spans;
2787 }
2788
2789 let parser = Parser::new(content).into_offset_iter();
2791
2792 for (event, range) in parser {
2793 if let Event::Code(_) = event {
2794 let start_pos = range.start;
2795 let end_pos = range.end;
2796
2797 let full_span = &content[start_pos..end_pos];
2799 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2800
2801 let content_start = start_pos + backtick_count;
2803 let content_end = end_pos - backtick_count;
2804 let span_content = if content_start < content_end {
2805 content[content_start..content_end].to_string()
2806 } else {
2807 String::new()
2808 };
2809
2810 let line_idx = lines
2813 .partition_point(|line| line.byte_offset <= start_pos)
2814 .saturating_sub(1);
2815 let line_num = line_idx + 1;
2816 let byte_col_start = start_pos - lines[line_idx].byte_offset;
2817
2818 let end_line_idx = lines
2820 .partition_point(|line| line.byte_offset <= end_pos)
2821 .saturating_sub(1);
2822 let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2823
2824 let line_content = lines[line_idx].content(content);
2827 let col_start = if byte_col_start <= line_content.len() {
2828 line_content[..byte_col_start].chars().count()
2829 } else {
2830 line_content.chars().count()
2831 };
2832
2833 let end_line_content = lines[end_line_idx].content(content);
2834 let col_end = if byte_col_end <= end_line_content.len() {
2835 end_line_content[..byte_col_end].chars().count()
2836 } else {
2837 end_line_content.chars().count()
2838 };
2839
2840 code_spans.push(CodeSpan {
2841 line: line_num,
2842 end_line: end_line_idx + 1,
2843 start_col: col_start,
2844 end_col: col_end,
2845 byte_offset: start_pos,
2846 byte_end: end_pos,
2847 backtick_count,
2848 content: span_content,
2849 });
2850 }
2851 }
2852
2853 code_spans.sort_by_key(|span| span.byte_offset);
2855
2856 code_spans
2857 }
2858
2859 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2870 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2872
2873 #[inline]
2876 fn reset_tracking_state(
2877 list_item: &ListItemInfo,
2878 has_list_breaking_content: &mut bool,
2879 min_continuation: &mut usize,
2880 ) {
2881 *has_list_breaking_content = false;
2882 let marker_width = if list_item.is_ordered {
2883 list_item.marker.len() + 1 } else {
2885 list_item.marker.len()
2886 };
2887 *min_continuation = if list_item.is_ordered {
2888 marker_width
2889 } else {
2890 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2891 };
2892 }
2893
2894 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
2897 let mut last_list_item_line = 0;
2898 let mut current_indent_level = 0;
2899 let mut last_marker_width = 0;
2900
2901 let mut has_list_breaking_content_since_last_item = false;
2903 let mut min_continuation_for_tracking = 0;
2904
2905 for (line_idx, line_info) in lines.iter().enumerate() {
2906 let line_num = line_idx + 1;
2907
2908 if line_info.in_code_block {
2910 if let Some(ref mut block) = current_block {
2911 let min_continuation_indent =
2913 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2914
2915 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2917
2918 match context {
2919 CodeBlockContext::Indented => {
2920 block.end_line = line_num;
2922 continue;
2923 }
2924 CodeBlockContext::Standalone => {
2925 let completed_block = current_block.take().unwrap();
2927 list_blocks.push(completed_block);
2928 continue;
2929 }
2930 CodeBlockContext::Adjacent => {
2931 block.end_line = line_num;
2933 continue;
2934 }
2935 }
2936 } else {
2937 continue;
2939 }
2940 }
2941
2942 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2944 caps.get(0).unwrap().as_str().to_string()
2945 } else {
2946 String::new()
2947 };
2948
2949 if let Some(ref block) = current_block
2952 && line_info.list_item.is_none()
2953 && !line_info.is_blank
2954 && !line_info.in_code_span_continuation
2955 {
2956 let line_content = line_info.content(content).trim();
2957
2958 let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2963
2964 let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
2967
2968 let breaks_list = line_info.heading.is_some()
2969 || line_content.starts_with("---")
2970 || line_content.starts_with("***")
2971 || line_content.starts_with("___")
2972 || crate::utils::skip_context::is_table_line(line_content)
2973 || blockquote_prefix_changes
2974 || (line_info.indent > 0
2975 && line_info.indent < min_continuation_for_tracking
2976 && !is_lazy_continuation);
2977
2978 if breaks_list {
2979 has_list_breaking_content_since_last_item = true;
2980 }
2981 }
2982
2983 if line_info.in_code_span_continuation
2986 && line_info.list_item.is_none()
2987 && let Some(ref mut block) = current_block
2988 {
2989 block.end_line = line_num;
2990 }
2991
2992 let effective_continuation_indent = if let Some(ref block) = current_block {
2998 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
2999 let line_content = line_info.content(content);
3000 let line_bq_level = line_content
3001 .chars()
3002 .take_while(|c| *c == '>' || c.is_whitespace())
3003 .filter(|&c| c == '>')
3004 .count();
3005 if line_bq_level > 0 && line_bq_level == block_bq_level {
3006 let mut pos = 0;
3008 let mut found_markers = 0;
3009 for c in line_content.chars() {
3010 pos += c.len_utf8();
3011 if c == '>' {
3012 found_markers += 1;
3013 if found_markers == line_bq_level {
3014 if line_content.get(pos..pos + 1) == Some(" ") {
3015 pos += 1;
3016 }
3017 break;
3018 }
3019 }
3020 }
3021 let after_bq = &line_content[pos..];
3022 after_bq.len() - after_bq.trim_start().len()
3023 } else {
3024 line_info.indent
3025 }
3026 } else {
3027 line_info.indent
3028 };
3029 let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3030 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3031 if block_bq_level > 0 {
3032 if block.is_ordered { last_marker_width } else { 2 }
3033 } else {
3034 min_continuation_for_tracking
3035 }
3036 } else {
3037 min_continuation_for_tracking
3038 };
3039 let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3040 || (line_info.indent == 0 && !line_info.is_blank); if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3043 eprintln!(
3044 "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3045 line_num,
3046 effective_continuation_indent,
3047 adjusted_min_continuation_for_tracking,
3048 is_valid_continuation,
3049 line_info.in_code_span_continuation,
3050 line_info.in_code_block,
3051 current_block.is_some()
3052 );
3053 }
3054
3055 if !line_info.in_code_span_continuation
3056 && line_info.list_item.is_none()
3057 && !line_info.is_blank
3058 && !line_info.in_code_block
3059 && is_valid_continuation
3060 && let Some(ref mut block) = current_block
3061 {
3062 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3063 eprintln!(
3064 "[DEBUG] Line {}: extending block.end_line from {} to {}",
3065 line_num, block.end_line, line_num
3066 );
3067 }
3068 block.end_line = line_num;
3069 }
3070
3071 if let Some(list_item) = &line_info.list_item {
3073 let item_indent = list_item.marker_column;
3075 let nesting = item_indent / 2; if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3078 eprintln!(
3079 "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3080 line_num, list_item.marker, item_indent
3081 );
3082 }
3083
3084 if let Some(ref mut block) = current_block {
3085 let is_nested = nesting > block.nesting_level;
3089 let same_type =
3090 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3091 let same_context = block.blockquote_prefix == blockquote_prefix;
3092 let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3094
3095 let marker_compatible =
3097 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3098
3099 let has_non_list_content = has_list_breaking_content_since_last_item;
3102
3103 let mut continues_list = if is_nested {
3107 same_context && reasonable_distance && !has_non_list_content
3109 } else {
3110 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3112 };
3113
3114 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3115 eprintln!(
3116 "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3117 line_num,
3118 continues_list,
3119 is_nested,
3120 same_type,
3121 same_context,
3122 reasonable_distance,
3123 marker_compatible,
3124 has_non_list_content,
3125 last_list_item_line,
3126 block.end_line
3127 );
3128 }
3129
3130 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
3133 if block.item_lines.contains(&(line_num - 1)) {
3136 continues_list = true;
3138 } else {
3139 continues_list = true;
3143 }
3144 }
3145
3146 if continues_list {
3147 block.end_line = line_num;
3149 block.item_lines.push(line_num);
3150
3151 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3153 list_item.marker.len() + 1
3154 } else {
3155 list_item.marker.len()
3156 });
3157
3158 if !block.is_ordered
3160 && block.marker.is_some()
3161 && block.marker.as_ref() != Some(&list_item.marker)
3162 {
3163 block.marker = None;
3165 }
3166
3167 reset_tracking_state(
3169 list_item,
3170 &mut has_list_breaking_content_since_last_item,
3171 &mut min_continuation_for_tracking,
3172 );
3173 } else {
3174 list_blocks.push(block.clone());
3177
3178 *block = ListBlock {
3179 start_line: line_num,
3180 end_line: line_num,
3181 is_ordered: list_item.is_ordered,
3182 marker: if list_item.is_ordered {
3183 None
3184 } else {
3185 Some(list_item.marker.clone())
3186 },
3187 blockquote_prefix: blockquote_prefix.clone(),
3188 item_lines: vec![line_num],
3189 nesting_level: nesting,
3190 max_marker_width: if list_item.is_ordered {
3191 list_item.marker.len() + 1
3192 } else {
3193 list_item.marker.len()
3194 },
3195 };
3196
3197 reset_tracking_state(
3199 list_item,
3200 &mut has_list_breaking_content_since_last_item,
3201 &mut min_continuation_for_tracking,
3202 );
3203 }
3204 } else {
3205 current_block = Some(ListBlock {
3207 start_line: line_num,
3208 end_line: line_num,
3209 is_ordered: list_item.is_ordered,
3210 marker: if list_item.is_ordered {
3211 None
3212 } else {
3213 Some(list_item.marker.clone())
3214 },
3215 blockquote_prefix,
3216 item_lines: vec![line_num],
3217 nesting_level: nesting,
3218 max_marker_width: list_item.marker.len(),
3219 });
3220
3221 reset_tracking_state(
3223 list_item,
3224 &mut has_list_breaking_content_since_last_item,
3225 &mut min_continuation_for_tracking,
3226 );
3227 }
3228
3229 last_list_item_line = line_num;
3230 current_indent_level = item_indent;
3231 last_marker_width = if list_item.is_ordered {
3232 list_item.marker.len() + 1 } else {
3234 list_item.marker.len()
3235 };
3236 } else if let Some(ref mut block) = current_block {
3237 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3239 eprintln!(
3240 "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3241 line_num, line_info.is_blank
3242 );
3243 }
3244
3245 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3253 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3254 } else {
3255 false
3256 };
3257
3258 let min_continuation_indent = if block.is_ordered {
3262 current_indent_level + last_marker_width
3263 } else {
3264 current_indent_level + 2 };
3266
3267 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3268 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3270 eprintln!(
3271 "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3272 line_num, line_info.indent, min_continuation_indent
3273 );
3274 }
3275 block.end_line = line_num;
3276 } else if line_info.is_blank {
3277 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3280 eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3281 }
3282 let mut check_idx = line_idx + 1;
3283 let mut found_continuation = false;
3284
3285 while check_idx < lines.len() && lines[check_idx].is_blank {
3287 check_idx += 1;
3288 }
3289
3290 if check_idx < lines.len() {
3291 let next_line = &lines[check_idx];
3292 let next_content = next_line.content(content);
3294 let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3297 let next_bq_level_for_indent = next_content
3298 .chars()
3299 .take_while(|c| *c == '>' || c.is_whitespace())
3300 .filter(|&c| c == '>')
3301 .count();
3302 let effective_indent =
3303 if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3304 let mut pos = 0;
3307 let mut found_markers = 0;
3308 for c in next_content.chars() {
3309 pos += c.len_utf8();
3310 if c == '>' {
3311 found_markers += 1;
3312 if found_markers == next_bq_level_for_indent {
3313 if next_content.get(pos..pos + 1) == Some(" ") {
3315 pos += 1;
3316 }
3317 break;
3318 }
3319 }
3320 }
3321 let after_blockquote_marker = &next_content[pos..];
3322 after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3323 } else {
3324 next_line.indent
3325 };
3326 let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3329 if block.is_ordered { last_marker_width } else { 2 }
3332 } else {
3333 min_continuation_indent
3334 };
3335 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3337 eprintln!(
3338 "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3339 line_num,
3340 check_idx + 1,
3341 effective_indent,
3342 adjusted_min_continuation,
3343 next_line.list_item.is_some(),
3344 next_line.in_code_block
3345 );
3346 }
3347 if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3348 found_continuation = true;
3349 }
3350 else if !next_line.in_code_block
3352 && next_line.list_item.is_some()
3353 && let Some(item) = &next_line.list_item
3354 {
3355 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3356 .find(next_line.content(content))
3357 .map_or(String::new(), |m| m.as_str().to_string());
3358 if item.marker_column == current_indent_level
3359 && item.is_ordered == block.is_ordered
3360 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3361 {
3362 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3366 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3367 if let Some(between_line) = lines.get(idx) {
3368 let between_content = between_line.content(content);
3369 let trimmed = between_content.trim();
3370 if trimmed.is_empty() {
3372 return false;
3373 }
3374 let line_indent = between_content.len() - between_content.trim_start().len();
3376
3377 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3379 .find(between_content)
3380 .map_or(String::new(), |m| m.as_str().to_string());
3381 let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3382 let blockquote_level_changed =
3383 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3384
3385 if trimmed.starts_with("```")
3387 || trimmed.starts_with("~~~")
3388 || trimmed.starts_with("---")
3389 || trimmed.starts_with("***")
3390 || trimmed.starts_with("___")
3391 || blockquote_level_changed
3392 || crate::utils::skip_context::is_table_line(trimmed)
3393 || between_line.heading.is_some()
3394 {
3395 return true; }
3397
3398 line_indent >= min_continuation_indent
3400 } else {
3401 false
3402 }
3403 });
3404
3405 if block.is_ordered {
3406 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3409 if let Some(between_line) = lines.get(idx) {
3410 let between_content = between_line.content(content);
3411 let trimmed = between_content.trim();
3412 if trimmed.is_empty() {
3413 return false;
3414 }
3415 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3417 .find(between_content)
3418 .map_or(String::new(), |m| m.as_str().to_string());
3419 let between_bq_level =
3420 between_bq_prefix.chars().filter(|&c| c == '>').count();
3421 let blockquote_level_changed =
3422 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3423 trimmed.starts_with("```")
3425 || trimmed.starts_with("~~~")
3426 || trimmed.starts_with("---")
3427 || trimmed.starts_with("***")
3428 || trimmed.starts_with("___")
3429 || blockquote_level_changed
3430 || crate::utils::skip_context::is_table_line(trimmed)
3431 || between_line.heading.is_some()
3432 } else {
3433 false
3434 }
3435 });
3436 found_continuation = !has_structural_separators;
3437 } else {
3438 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3440 if let Some(between_line) = lines.get(idx) {
3441 let between_content = between_line.content(content);
3442 let trimmed = between_content.trim();
3443 if trimmed.is_empty() {
3444 return false;
3445 }
3446 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3448 .find(between_content)
3449 .map_or(String::new(), |m| m.as_str().to_string());
3450 let between_bq_level =
3451 between_bq_prefix.chars().filter(|&c| c == '>').count();
3452 let blockquote_level_changed =
3453 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3454 trimmed.starts_with("```")
3456 || trimmed.starts_with("~~~")
3457 || trimmed.starts_with("---")
3458 || trimmed.starts_with("***")
3459 || trimmed.starts_with("___")
3460 || blockquote_level_changed
3461 || crate::utils::skip_context::is_table_line(trimmed)
3462 || between_line.heading.is_some()
3463 } else {
3464 false
3465 }
3466 });
3467 found_continuation = !has_structural_separators;
3468 }
3469 }
3470 }
3471 }
3472
3473 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3474 eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3475 }
3476 if found_continuation {
3477 block.end_line = line_num;
3479 } else {
3480 list_blocks.push(block.clone());
3482 current_block = None;
3483 }
3484 } else {
3485 let min_required_indent = if block.is_ordered {
3488 current_indent_level + last_marker_width
3489 } else {
3490 current_indent_level + 2
3491 };
3492
3493 let line_content = line_info.content(content).trim();
3498
3499 let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3501
3502 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3505 let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3506 let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3507
3508 let is_structural_separator = line_info.heading.is_some()
3509 || line_content.starts_with("```")
3510 || line_content.starts_with("~~~")
3511 || line_content.starts_with("---")
3512 || line_content.starts_with("***")
3513 || line_content.starts_with("___")
3514 || blockquote_level_changed
3515 || looks_like_table;
3516
3517 let is_lazy_continuation = !is_structural_separator
3520 && !line_info.is_blank
3521 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3522
3523 if is_lazy_continuation {
3524 let line_content_raw = line_info.content(content);
3528 let block_bq_level_lazy = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3529 let line_bq_level_lazy = line_content_raw
3530 .chars()
3531 .take_while(|c| *c == '>' || c.is_whitespace())
3532 .filter(|&c| c == '>')
3533 .count();
3534 let has_proper_blockquote_indent =
3535 if line_bq_level_lazy > 0 && line_bq_level_lazy == block_bq_level_lazy {
3536 let mut pos = 0;
3538 let mut found_markers = 0;
3539 for c in line_content_raw.chars() {
3540 pos += c.len_utf8();
3541 if c == '>' {
3542 found_markers += 1;
3543 if found_markers == line_bq_level_lazy {
3544 if line_content_raw.get(pos..pos + 1) == Some(" ") {
3545 pos += 1;
3546 }
3547 break;
3548 }
3549 }
3550 }
3551 let after_bq = &line_content_raw[pos..];
3552 let effective_indent_lazy = after_bq.len() - after_bq.trim_start().len();
3553 let min_required_for_bq = if block.is_ordered { last_marker_width } else { 2 };
3554 effective_indent_lazy >= min_required_for_bq
3555 } else {
3556 false
3557 };
3558
3559 if has_proper_blockquote_indent {
3561 block.end_line = line_num;
3562 } else {
3563 let content_to_check = if !blockquote_prefix.is_empty() {
3564 line_info
3566 .content(content)
3567 .strip_prefix(&blockquote_prefix)
3568 .unwrap_or(line_info.content(content))
3569 .trim()
3570 } else {
3571 line_info.content(content).trim()
3572 };
3573
3574 let starts_with_uppercase =
3575 content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3576
3577 if starts_with_uppercase && last_list_item_line > 0 {
3580 list_blocks.push(block.clone());
3582 current_block = None;
3583 } else {
3584 block.end_line = line_num;
3586 }
3587 }
3588 } else {
3589 list_blocks.push(block.clone());
3591 current_block = None;
3592 }
3593 }
3594 }
3595 }
3596
3597 if let Some(block) = current_block {
3599 list_blocks.push(block);
3600 }
3601
3602 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3604
3605 list_blocks
3606 }
3607
3608 fn compute_char_frequency(content: &str) -> CharFrequency {
3610 let mut frequency = CharFrequency::default();
3611
3612 for ch in content.chars() {
3613 match ch {
3614 '#' => frequency.hash_count += 1,
3615 '*' => frequency.asterisk_count += 1,
3616 '_' => frequency.underscore_count += 1,
3617 '-' => frequency.hyphen_count += 1,
3618 '+' => frequency.plus_count += 1,
3619 '>' => frequency.gt_count += 1,
3620 '|' => frequency.pipe_count += 1,
3621 '[' => frequency.bracket_count += 1,
3622 '`' => frequency.backtick_count += 1,
3623 '<' => frequency.lt_count += 1,
3624 '!' => frequency.exclamation_count += 1,
3625 '\n' => frequency.newline_count += 1,
3626 _ => {}
3627 }
3628 }
3629
3630 frequency
3631 }
3632
3633 fn parse_html_tags(
3635 content: &str,
3636 lines: &[LineInfo],
3637 code_blocks: &[(usize, usize)],
3638 flavor: MarkdownFlavor,
3639 ) -> Vec<HtmlTag> {
3640 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3641 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3642
3643 let mut html_tags = Vec::with_capacity(content.matches('<').count());
3644
3645 for cap in HTML_TAG_REGEX.captures_iter(content) {
3646 let full_match = cap.get(0).unwrap();
3647 let match_start = full_match.start();
3648 let match_end = full_match.end();
3649
3650 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3652 continue;
3653 }
3654
3655 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3656 let tag_name_original = cap.get(2).unwrap().as_str();
3657 let tag_name = tag_name_original.to_lowercase();
3658 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3659
3660 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3663 continue;
3664 }
3665
3666 let mut line_num = 1;
3668 let mut col_start = match_start;
3669 let mut col_end = match_end;
3670 for (idx, line_info) in lines.iter().enumerate() {
3671 if match_start >= line_info.byte_offset {
3672 line_num = idx + 1;
3673 col_start = match_start - line_info.byte_offset;
3674 col_end = match_end - line_info.byte_offset;
3675 } else {
3676 break;
3677 }
3678 }
3679
3680 html_tags.push(HtmlTag {
3681 line: line_num,
3682 start_col: col_start,
3683 end_col: col_end,
3684 byte_offset: match_start,
3685 byte_end: match_end,
3686 tag_name,
3687 is_closing,
3688 is_self_closing,
3689 raw_content: full_match.as_str().to_string(),
3690 });
3691 }
3692
3693 html_tags
3694 }
3695
3696 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3698 let mut table_rows = Vec::with_capacity(lines.len() / 20);
3699
3700 for (line_idx, line_info) in lines.iter().enumerate() {
3701 if line_info.in_code_block || line_info.is_blank {
3703 continue;
3704 }
3705
3706 let line = line_info.content(content);
3707 let line_num = line_idx + 1;
3708
3709 if !line.contains('|') {
3711 continue;
3712 }
3713
3714 let parts: Vec<&str> = line.split('|').collect();
3716 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3717
3718 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3720 let mut column_alignments = Vec::new();
3721
3722 if is_separator {
3723 for part in &parts[1..parts.len() - 1] {
3724 let trimmed = part.trim();
3726 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3727 "center".to_string()
3728 } else if trimmed.ends_with(':') {
3729 "right".to_string()
3730 } else if trimmed.starts_with(':') {
3731 "left".to_string()
3732 } else {
3733 "none".to_string()
3734 };
3735 column_alignments.push(alignment);
3736 }
3737 }
3738
3739 table_rows.push(TableRow {
3740 line: line_num,
3741 is_separator,
3742 column_count,
3743 column_alignments,
3744 });
3745 }
3746
3747 table_rows
3748 }
3749
3750 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3752 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3753
3754 for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3756 let full_match = cap.get(0).unwrap();
3757 let match_start = full_match.start();
3758 let match_end = full_match.end();
3759
3760 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3762 continue;
3763 }
3764
3765 let preceding_char = if match_start > 0 {
3767 content.chars().nth(match_start - 1)
3768 } else {
3769 None
3770 };
3771 let following_char = content.chars().nth(match_end);
3772
3773 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3774 continue;
3775 }
3776 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3777 continue;
3778 }
3779
3780 let url = full_match.as_str();
3781 let url_type = if url.starts_with("https://") {
3782 "https"
3783 } else if url.starts_with("http://") {
3784 "http"
3785 } else if url.starts_with("ftp://") {
3786 "ftp"
3787 } else {
3788 "other"
3789 };
3790
3791 let mut line_num = 1;
3793 let mut col_start = match_start;
3794 let mut col_end = match_end;
3795 for (idx, line_info) in lines.iter().enumerate() {
3796 if match_start >= line_info.byte_offset {
3797 line_num = idx + 1;
3798 col_start = match_start - line_info.byte_offset;
3799 col_end = match_end - line_info.byte_offset;
3800 } else {
3801 break;
3802 }
3803 }
3804
3805 bare_urls.push(BareUrl {
3806 line: line_num,
3807 start_col: col_start,
3808 end_col: col_end,
3809 byte_offset: match_start,
3810 byte_end: match_end,
3811 url: url.to_string(),
3812 url_type: url_type.to_string(),
3813 });
3814 }
3815
3816 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3818 let full_match = cap.get(0).unwrap();
3819 let match_start = full_match.start();
3820 let match_end = full_match.end();
3821
3822 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3824 continue;
3825 }
3826
3827 let preceding_char = if match_start > 0 {
3829 content.chars().nth(match_start - 1)
3830 } else {
3831 None
3832 };
3833 let following_char = content.chars().nth(match_end);
3834
3835 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3836 continue;
3837 }
3838 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3839 continue;
3840 }
3841
3842 let email = full_match.as_str();
3843
3844 let mut line_num = 1;
3846 let mut col_start = match_start;
3847 let mut col_end = match_end;
3848 for (idx, line_info) in lines.iter().enumerate() {
3849 if match_start >= line_info.byte_offset {
3850 line_num = idx + 1;
3851 col_start = match_start - line_info.byte_offset;
3852 col_end = match_end - line_info.byte_offset;
3853 } else {
3854 break;
3855 }
3856 }
3857
3858 bare_urls.push(BareUrl {
3859 line: line_num,
3860 start_col: col_start,
3861 end_col: col_end,
3862 byte_offset: match_start,
3863 byte_end: match_end,
3864 url: email.to_string(),
3865 url_type: "email".to_string(),
3866 });
3867 }
3868
3869 bare_urls
3870 }
3871
    /// Returns a [`ValidHeadingsIter`] constructed over this context's `lines`.
    ///
    /// NOTE(review): judging by the type name, the iterator presumably yields only headings
    /// flagged as valid — confirm against `ValidHeadingsIter`.
    #[must_use]
    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
        ValidHeadingsIter::new(&self.lines)
    }
3895
3896 #[must_use]
3900 pub fn has_valid_headings(&self) -> bool {
3901 self.lines
3902 .iter()
3903 .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3904 }
3905}
3906
3907fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3909 if list_blocks.len() < 2 {
3910 return;
3911 }
3912
3913 let mut merger = ListBlockMerger::new(content, lines);
3914 *list_blocks = merger.merge(list_blocks);
3915}
3916
3917struct ListBlockMerger<'a> {
3919 content: &'a str,
3920 lines: &'a [LineInfo],
3921}
3922
3923impl<'a> ListBlockMerger<'a> {
3924 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3925 Self { content, lines }
3926 }
3927
3928 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3929 let mut merged = Vec::with_capacity(list_blocks.len());
3930 let mut current = list_blocks[0].clone();
3931
3932 for next in list_blocks.iter().skip(1) {
3933 if self.should_merge_blocks(¤t, next) {
3934 current = self.merge_two_blocks(current, next);
3935 } else {
3936 merged.push(current);
3937 current = next.clone();
3938 }
3939 }
3940
3941 merged.push(current);
3942 merged
3943 }
3944
3945 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3947 if !self.blocks_are_compatible(current, next) {
3949 return false;
3950 }
3951
3952 let spacing = self.analyze_spacing_between(current, next);
3954 match spacing {
3955 BlockSpacing::Consecutive => true,
3956 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3957 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3958 self.can_merge_with_content_between(current, next)
3959 }
3960 }
3961 }
3962
3963 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3965 current.is_ordered == next.is_ordered
3966 && current.blockquote_prefix == next.blockquote_prefix
3967 && current.nesting_level == next.nesting_level
3968 }
3969
3970 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3972 let gap = next.start_line - current.end_line;
3973
3974 match gap {
3975 1 => BlockSpacing::Consecutive,
3976 2 => BlockSpacing::SingleBlank,
3977 _ if gap > 2 => {
3978 if self.has_only_blank_lines_between(current, next) {
3979 BlockSpacing::MultipleBlanks
3980 } else {
3981 BlockSpacing::ContentBetween
3982 }
3983 }
3984 _ => BlockSpacing::Consecutive, }
3986 }
3987
3988 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3990 if has_meaningful_content_between(self.content, current, next, self.lines) {
3993 return false; }
3995
3996 !current.is_ordered && current.marker == next.marker
3998 }
3999
4000 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4002 if has_meaningful_content_between(self.content, current, next, self.lines) {
4004 return false; }
4006
4007 current.is_ordered && next.is_ordered
4009 }
4010
4011 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4013 for line_num in (current.end_line + 1)..next.start_line {
4014 if let Some(line_info) = self.lines.get(line_num - 1)
4015 && !line_info.content(self.content).trim().is_empty()
4016 {
4017 return false;
4018 }
4019 }
4020 true
4021 }
4022
4023 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4025 current.end_line = next.end_line;
4026 current.item_lines.extend_from_slice(&next.item_lines);
4027
4028 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4030
4031 if !current.is_ordered && self.markers_differ(¤t, next) {
4033 current.marker = None; }
4035
4036 current
4037 }
4038
4039 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4041 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4042 }
4043}
4044
/// How far apart two list blocks are, as classified by
/// `ListBlockMerger::analyze_spacing_between` from the difference between the next block's
/// `start_line` and the current block's `end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The difference is 1 (also used as the fallback for a difference of 0).
    Consecutive,
    /// The difference is exactly 2, i.e. one line lies between the blocks.
    SingleBlank,
    /// The difference is larger than 2 and every line in between is empty after trimming.
    MultipleBlanks,
    /// The difference is larger than 2 and at least one line in between has content.
    ContentBetween,
}
4053
4054fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4056 for line_num in (current.end_line + 1)..next.start_line {
4058 if let Some(line_info) = lines.get(line_num - 1) {
4059 let trimmed = line_info.content(content).trim();
4061
4062 if trimmed.is_empty() {
4064 continue;
4065 }
4066
4067 if line_info.heading.is_some() {
4071 return true; }
4073
4074 if is_horizontal_rule(trimmed) {
4076 return true; }
4078
4079 if crate::utils::skip_context::is_table_line(trimmed) {
4081 return true; }
4083
4084 if trimmed.starts_with('>') {
4086 return true; }
4088
4089 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4091 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4092
4093 let min_continuation_indent = if current.is_ordered {
4095 current.nesting_level + current.max_marker_width + 1 } else {
4097 current.nesting_level + 2
4098 };
4099
4100 if line_indent < min_continuation_indent {
4101 return true; }
4104 }
4105
4106 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4108
4109 let min_indent = if current.is_ordered {
4111 current.nesting_level + current.max_marker_width
4112 } else {
4113 current.nesting_level + 2
4114 };
4115
4116 if line_indent < min_indent {
4118 return true; }
4120
4121 }
4124 }
4125
4126 false
4128}
4129
4130pub fn is_horizontal_rule_line(line: &str) -> bool {
4137 let leading_spaces = line.len() - line.trim_start_matches(' ').len();
4139 if leading_spaces > 3 || line.starts_with('\t') {
4140 return false;
4141 }
4142
4143 is_horizontal_rule_content(line.trim())
4144}
4145
/// Returns `true` when `trimmed` (text with surrounding whitespace already
/// removed) consists of a horizontal-rule marker pattern.
///
/// The text must start with `-`, `*` or `_`, contain that same character at
/// least three times, and contain nothing else except spaces and tabs.
///
/// The check walks the characters once and does not allocate.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // Three marker characters are the minimum and each is a single byte,
    // so anything shorter than three bytes cannot match.
    if trimmed.len() < 3 {
        return false;
    }

    let mut rest = trimmed.chars();
    let marker = match rest.next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // The first character has already been consumed and is a marker.
    let mut marker_count = 1usize;
    for ch in rest {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Any other character disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
4170
4171pub fn is_horizontal_rule(trimmed: &str) -> bool {
4173 is_horizontal_rule_content(trimmed)
4174}
4175
/// Unit tests for `LintContext` construction: line offsets, per-line
/// metadata, list items, MDX ESM blocks and blockquotes.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: one line offset, no line records.
    #[test]
    fn test_empty_content() {
        let empty = LintContext::new("", MarkdownFlavor::Standard, None);

        assert_eq!(empty.content, "");
        assert_eq!(empty.line_offsets, vec![0]);
        assert_eq!(empty.offset_to_line_col(0), (1, 1));
        assert_eq!(empty.lines.len(), 0);
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let single = LintContext::new("# Hello", MarkdownFlavor::Standard, None);

        assert_eq!(single.content, "# Hello");
        assert_eq!(single.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(single.offset_to_line_col(offset), expected);
        }
    }

    /// Offsets are mapped to 1-based (line, column) pairs across several lines.
    #[test]
    fn test_multi_line() {
        let text = "# Title\n\nSecond line\nThird line";
        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        assert_eq!(parsed.line_offsets, vec![0, 8, 9, 21]);

        let cases = [(0, (1, 1)), (8, (2, 1)), (9, (3, 1)), (15, (3, 7)), (21, (4, 1))];
        for (offset, expected) in cases {
            assert_eq!(parsed.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indent, blank flag.
    #[test]
    fn test_line_info() {
        // NOTE(review): the assertions below expect line 2 to have indent 4,
        // while this fixture shows a single leading space — confirm the
        // fixture's whitespace is intact.
        let text = "# Title\n indented\n\ncode:\n```rust\nfn main() {}\n```";
        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        assert_eq!(parsed.lines.len(), 7);

        let heading = &parsed.lines[0];
        assert_eq!(heading.content(parsed.content), "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        let indented = &parsed.lines[1];
        assert_eq!(indented.content(parsed.content), " indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        let blank = &parsed.lines[2];
        assert_eq!(blank.content(parsed.content), "");
        assert!(blank.is_blank);

        // The 1-based lookup helpers agree with direct indexing.
        assert_eq!(parsed.line_to_byte_offset(1), Some(0));
        assert_eq!(parsed.line_to_byte_offset(2), Some(8));
        assert_eq!(parsed.line_info(1).map(|info| info.indent), Some(0));
        assert_eq!(parsed.line_info(2).map(|info| info.indent), Some(4));
    }

    /// Ordered and unordered list markers are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        // NOTE(review): the nested-item assertion expects marker column 2,
        // while this fixture shows a single leading space — confirm the
        // fixture's whitespace is intact.
        let text = "- Unordered item\n * Nested item\n1. Ordered item\n 2) Nested ordered\n\nNot a list";
        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        let dash_item = parsed.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        let star_item = parsed.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        let numbered_item = parsed.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        assert!(parsed.lines[5].list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let text = "a\nb\nc";
        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        let cases = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in cases {
            assert_eq!(parsed.offset_to_line_col(offset), expected);
        }
    }

    /// In the MDX flavor, leading import/export lines are flagged as ESM.
    #[test]
    fn test_mdx_esm_blocks() {
        let text = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let parsed = LintContext::new(text, MarkdownFlavor::MDX, None);

        assert_eq!(parsed.lines.len(), 10);

        let expectations = [
            (0, true, "Line 1 (import) should be in_esm_block"),
            (1, true, "Line 2 (export) should be in_esm_block"),
            (2, false, "Line 3 (blank) should NOT be in_esm_block"),
            (3, false, "Line 4 (heading) should NOT be in_esm_block"),
            (4, false, "Line 5 (blank) should NOT be in_esm_block"),
            (5, false, "Line 6 (text) should NOT be in_esm_block"),
        ];
        for (index, expected, message) in expectations {
            assert!(parsed.lines[index].in_esm_block == expected, "{}", message);
        }
    }

    /// The Standard flavor does not flag import/export lines as ESM.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let text = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        let expectations = [
            (0, "Line 1 should NOT be in_esm_block in Standard flavor"),
            (1, "Line 2 should NOT be in_esm_block in Standard flavor"),
        ];
        for (index, message) in expectations {
            assert!(!parsed.lines[index].in_esm_block, "{}", message);
        }
    }

    /// Blockquote lines expose their content and nesting level.
    #[test]
    fn test_blockquote_with_indented_content() {
        // NOTE(review): the test name and the `has_multiple_spaces_after_marker`
        // assertion suggest extra spaces after `>`, while this fixture shows a
        // single space — confirm the fixture's whitespace is intact.
        let text = r#"# Heading

> -S socket-path
> More text
"#;
        let parsed = LintContext::new(text, MarkdownFlavor::Standard, None);

        let quote_at = |index: usize| parsed.lines.get(index).and_then(|line| line.blockquote.as_ref());

        assert!(quote_at(2).is_some(), "Line 3 should be a blockquote");
        assert!(quote_at(3).is_some(), "Line 4 should be a blockquote");

        let third_line_quote = quote_at(2).unwrap();
        assert_eq!(third_line_quote.content, "-S socket-path");
        assert_eq!(third_line_quote.nesting_level, 1);
        assert!(third_line_quote.has_multiple_spaces_after_marker);

        let fourth_line_quote = quote_at(3).unwrap();
        assert_eq!(fourth_line_quote.content, "More text");
        assert_eq!(fourth_line_quote.nesting_level, 1);
    }
}