1use crate::config::MarkdownFlavor;
2use crate::utils::ast_utils::get_cached_ast;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use markdown::mdast::Node;
6use regex::Regex;
7
// Regular expressions shared by the parsers in this file. Each one is compiled
// once, on first use, by `lazy_static!`.
lazy_static! {
    // Inline or reference-style link: `[text](url)` or `[text][id]`.
    // Group 1 = link text (one level of nested brackets and backslash escapes
    // are accepted), group 2 = inline URL, group 3 = reference id.
    // Because the pattern starts at `[`, it also matches the bracket part of an
    // image; `parse_links` skips matches that are preceded by `!`.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    // Inline or reference-style image: `![alt](url)` or `![alt][id]`.
    // Same group layout as LINK_PATTERN, with the alt text in group 1.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    // Reference definition line: `[id]: url "title"` with at most three leading
    // spaces. Group 1 = id, group 2 = url, group 3 = double-quoted title,
    // group 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http://`, `https://` or `ftp://` URL with optional port, path,
    // query string and fragment. Group 1 = scheme.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address of the form `local@domain.tld` (TLD of two or more
    // ASCII letters).
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Autolink in angle brackets: `<scheme://...>` or `<user@host.tld>`.
    // Group 1 = the text between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Blockquote prefix at the start of a line: optional whitespace, one or
    // more `>` characters, then optional whitespace. Group 1 = whole prefix.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Facts computed once per source line by `LintContext::compute_line_info`
/// (and later refined by `detect_html_blocks`).
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines`, i.e. without its line terminator.
    pub content: String,
    /// Byte offset of the start of this line within the whole document.
    pub byte_offset: usize,
    /// Number of bytes of leading whitespace on the line.
    pub indent: usize,
    /// True when the line is empty or whitespace-only; a line that consists of
    /// just a blockquote prefix (`>` plus whitespace) also counts as blank.
    pub is_blank: bool,
    /// True when the start of the line falls inside a detected code block.
    pub in_code_block: bool,
    /// True when the line belongs to the `---` delimited front matter at the top of the document.
    pub in_front_matter: bool,
    /// Initialised to `false` by `compute_line_info`; filled in by `detect_html_blocks`.
    pub in_html_block: bool,
    /// List marker details when the line starts a list item, otherwise `None`.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with one or more `>` markers.
    pub blockquote: Option<BlockquoteInfo>,
    /// True only for the MkDocs flavor, when the line start lies within a
    /// mkdocstrings autodoc block.
    pub in_mkdocstrings: bool,
}
89
/// Description of the list marker found at the start of a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*` or `+` for unordered items, digits plus `.` or `)` for ordered ones.
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// Parsed item number for ordered items; `None` for unordered items or when parsing fails.
    pub number: Option<usize>,
    /// 0-based byte column of the marker, counting any blockquote prefix and leading whitespace.
    pub marker_column: usize,
    /// 0-based byte column where the item content starts (marker column + marker + following spacing).
    pub content_column: usize,
}
104
/// Syntactic form in which a heading was written.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// Heading introduced by one to six `#` characters.
    ATX,
    /// Setext heading underlined with `=` (level 1).
    Setext1,
    /// Setext heading underlined with `-` (level 2).
    Setext2,
}
115
/// A link found by `LintContext::parse_links`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// 0-based byte column of the opening `[` within its line.
    pub start_col: usize,
    /// Byte column of the end of the link, measured from the start of the line on which the link ends.
    pub end_col: usize,
    /// Byte offset of the start of the link within the document.
    pub byte_offset: usize,
    /// Byte offset one past the end of the link within the document.
    pub byte_end: usize,
    /// The text between the first pair of brackets.
    pub text: String,
    /// Inline destination; empty for reference-style links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lowercased reference id for reference links; when the id brackets are
    /// empty the lowercased link text is used instead. `None` for inline links.
    pub reference_id: Option<String>,
}
138
/// An image found by `LintContext::parse_images`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// 0-based byte column of the leading `!` within its line.
    pub start_col: usize,
    /// Byte column of the end of the image, measured from the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the start of the image within the document.
    pub byte_offset: usize,
    /// Byte offset one past the end of the image within the document.
    pub byte_end: usize,
    /// The alt text between the first pair of brackets.
    pub alt_text: String,
    /// Inline destination; empty for reference-style images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lowercased reference id for reference images; when the id brackets are
    /// empty the lowercased alt text is used instead. `None` for inline images.
    pub reference_id: Option<String>,
}
161
/// A reference definition (`[id]: url "title"`) found by `LintContext::parse_reference_defs`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Reference id, stored lowercased.
    pub id: String,
    /// Destination exactly as written (first run of non-whitespace after the colon).
    pub url: String,
    /// Optional title taken from the double- or single-quoted text after the URL.
    pub title: Option<String>,
}
174
/// An inline code span. Values are produced by `parse_code_spans`, which is not
/// part of this excerpt; the notes below describe how the fields are consumed here.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number; compared against 1-based line numbers in `is_in_code_span`.
    pub line: usize,
    /// Start column; `is_in_code_span` compares it with a 0-based column (inclusive bound).
    pub start_col: usize,
    /// End column; treated as an exclusive bound by `is_in_code_span`.
    pub end_col: usize,
    /// Start byte offset in the document (inclusive bound in the range checks in this file).
    pub byte_offset: usize,
    /// End byte offset in the document (exclusive bound in the range checks in this file).
    pub byte_end: usize,
    /// Presumably the length of the delimiting backtick run — TODO confirm against `parse_code_spans`.
    pub backtick_count: usize,
    /// Presumably the text between the delimiters — TODO confirm against `parse_code_spans`.
    pub content: String,
}
193
/// Details of a heading, attached to the line that carries the heading text.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: the number of `#` characters for ATX headings, 1 (`=`) or 2 (`-`) for setext headings.
    pub level: u8,
    /// Whether the heading is ATX or one of the two setext forms.
    pub style: HeadingStyle,
    /// The opening `#` run for ATX headings, or the trimmed underline for setext headings.
    pub marker: String,
    /// Byte column of the marker: leading whitespace of the heading line (ATX)
    /// or of the underline line (setext).
    pub marker_column: usize,
    /// Byte column where the heading text starts.
    pub content_column: usize,
    /// Heading text as returned by `header_id_utils::extract_header_id`
    /// (presumably with any custom-id suffix removed — confirm in that helper).
    pub text: String,
    /// Custom id taken from the heading itself or, failing that, from a
    /// standalone attribute list on the line that follows the heading.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction.
    pub raw_text: String,
    /// True when an ATX heading ends with a closing run of `#`; always false for setext headings.
    pub has_closing_sequence: bool,
    /// The closing `#` run, or an empty string when there is none.
    pub closing_sequence: String,
}
218
/// Details of the blockquote prefix found at the start of a line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading marker run.
    pub nesting_level: usize,
    /// Whitespace that precedes the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (the length of `indent`).
    pub marker_column: usize,
    /// Indent, markers and the whitespace after them, concatenated.
    pub prefix: String,
    /// Everything on the line after `prefix`.
    pub content: String,
    /// True when content follows the markers with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace byte, or a tab, follows the markers.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the line is only markers: no whitespace and no content after them.
    pub needs_md028_fix: bool,
}
239
/// A run of lines forming one list. Values are produced by `parse_list_blocks`,
/// which is not part of this excerpt, so most field notes are hedged.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block; treated as an inclusive bound by `is_in_list_block`.
    pub start_line: usize,
    /// Last line of the block; treated as an inclusive bound by `is_in_list_block`.
    pub end_line: usize,
    /// Presumably whether the block is a numbered list — TODO confirm.
    pub is_ordered: bool,
    /// Presumably the bullet marker shared by the items, when there is a single one — TODO confirm.
    pub marker: Option<String>,
    /// Presumably the blockquote prefix the list is nested in — TODO confirm.
    pub blockquote_prefix: String,
    /// Presumably the line numbers of the individual items — TODO confirm.
    pub item_lines: Vec<usize>,
    /// Presumably the nesting depth of the block — TODO confirm.
    pub nesting_level: usize,
    /// Presumably the widest marker among the items — TODO confirm.
    pub max_marker_width: usize,
}
260
261use std::sync::{Arc, Mutex};
262
/// Per-document character counts used by the `likely_has_*`, `has_char` and
/// `char_count` helpers. The counts are filled in by `compute_char_frequency`,
/// which is not part of this excerpt; the character each field tracks is
/// taken from the lookups in `has_char` / `char_count`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Looked up for `#`.
    pub hash_count: usize,
    /// Looked up for `*`.
    pub asterisk_count: usize,
    /// Looked up for `_`.
    pub underscore_count: usize,
    /// Looked up for `-`.
    pub hyphen_count: usize,
    /// Looked up for `+`.
    pub plus_count: usize,
    /// Looked up for `>`.
    pub gt_count: usize,
    /// Looked up for `|`.
    pub pipe_count: usize,
    /// Looked up for `[` (whether `]` is also counted cannot be told from here).
    pub bracket_count: usize,
    /// Looked up for a backtick.
    pub backtick_count: usize,
    /// Looked up for `<`.
    pub lt_count: usize,
    /// Looked up for `!`.
    pub exclamation_count: usize,
    /// Looked up for `\n`.
    pub newline_count: usize,
}
291
/// An HTML tag occurrence. Values are produced by `parse_html_tags`, which is
/// not part of this excerpt, so the field notes below are hedged.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag; compared with the caller's line number in `html_tags_on_line`.
    pub line: usize,
    /// Start column of the tag within its line — indexing base to be confirmed.
    pub start_col: usize,
    /// End column of the tag within its line — indexing base to be confirmed.
    pub end_col: usize,
    /// Presumably the byte offset of the tag start in the document — TODO confirm.
    pub byte_offset: usize,
    /// Presumably the byte offset of the tag end in the document — TODO confirm.
    pub byte_end: usize,
    /// Name of the element.
    pub tag_name: String,
    /// Presumably true for `</tag>` forms — TODO confirm.
    pub is_closing: bool,
    /// Presumably true for `<tag/>` forms — TODO confirm.
    pub is_self_closing: bool,
    /// Presumably the tag text as written in the source — TODO confirm.
    pub raw_content: String,
}
314
/// An emphasis span. Values are produced by `parse_emphasis_spans`, which is
/// not part of this excerpt, so the field notes below are hedged.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span; compared with the caller's line number in `emphasis_spans_on_line`.
    pub line: usize,
    /// Start column within the line — indexing base to be confirmed.
    pub start_col: usize,
    /// End column within the line — indexing base to be confirmed.
    pub end_col: usize,
    /// Presumably the byte offset of the span start in the document — TODO confirm.
    pub byte_offset: usize,
    /// Presumably the byte offset of the span end in the document — TODO confirm.
    pub byte_end: usize,
    /// The delimiter character (presumably `*` or `_` — TODO confirm).
    pub marker: char,
    /// Presumably the number of delimiter characters on each side — TODO confirm.
    pub marker_count: usize,
    /// Presumably the text between the delimiters — TODO confirm.
    pub content: String,
}
335
/// A table row. Values are produced by `parse_table_rows`, which is not part
/// of this excerpt, so the field notes below are hedged.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row; compared with the caller's line number in `table_rows_on_line`.
    pub line: usize,
    /// Presumably true for the delimiter row between header and body — TODO confirm.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Per-column alignment descriptors; the exact string values are defined by `parse_table_rows`.
    pub column_alignments: Vec<String>,
}
348
/// A bare URL or e-mail occurrence. Values are produced by `parse_bare_urls`,
/// which is not part of this excerpt, so the field notes below are hedged.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL; compared with the caller's line number in `bare_urls_on_line`.
    pub line: usize,
    /// Start column within the line — indexing base to be confirmed.
    pub start_col: usize,
    /// End column within the line — indexing base to be confirmed.
    pub end_col: usize,
    /// Presumably the byte offset of the URL start in the document — TODO confirm.
    pub byte_offset: usize,
    /// Presumably the byte offset of the URL end in the document — TODO confirm.
    pub byte_end: usize,
    /// The matched URL or address text.
    pub url: String,
    /// Kind of match; the set of values is defined by `parse_bare_urls` — TODO confirm.
    pub url_type: String,
}
367
/// Pre-computed view of one markdown document.
///
/// `new` fills the public fields eagerly; the `*_cache` fields hold results
/// that are computed on first access through the accessor of the same name.
pub struct LintContext<'a> {
    /// The full document text.
    pub content: &'a str,
    /// Byte offset at which each line starts (`0`, then one past every `\n`).
    pub line_offsets: Vec<usize>,
    /// Byte ranges returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, one entry per line of `content`.
    pub lines: Vec<LineInfo>,
    /// Links found outside code blocks and code spans.
    pub links: Vec<ParsedLink>,
    /// Images found outside code blocks and code spans.
    pub images: Vec<ParsedImage>,
    /// Reference definitions found outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    /// Cached code spans; see `code_spans()`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks computed from `lines`.
    pub list_blocks: Vec<ListBlock>,
    /// Character counts used by the `likely_has_*` quick checks.
    pub char_frequency: CharFrequency,
    /// Cached HTML tags; see `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Cached emphasis spans; see `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Cached table rows; see `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Cached bare URLs; see `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Cached AST; see `get_ast()`.
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
386
387impl<'a> LintContext<'a> {
388 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
389 let mut line_offsets = vec![0];
390 for (i, c) in content.char_indices() {
391 if c == '\n' {
392 line_offsets.push(i + 1);
393 }
394 }
395
396 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
398
399 let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
401
402 let ast = get_cached_ast(content);
404 let code_spans = Self::parse_code_spans(content, &lines, &ast);
405
406 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
408 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
409 let reference_defs = Self::parse_reference_defs(content, &lines);
410 let list_blocks = Self::parse_list_blocks(&lines);
413
414 Self::detect_html_blocks(&mut lines);
416
417 let char_frequency = Self::compute_char_frequency(content);
419
420 Self {
421 content,
422 line_offsets,
423 code_blocks,
424 lines,
425 links,
426 images,
427 reference_defs,
428 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
429 list_blocks,
430 char_frequency,
431 html_tags_cache: Mutex::new(None),
432 emphasis_spans_cache: Mutex::new(None),
433 table_rows_cache: Mutex::new(None),
434 bare_urls_cache: Mutex::new(None),
435 ast_cache: Mutex::new(None),
436 flavor,
437 }
438 }
439
440 pub fn get_ast(&self) -> Arc<Node> {
442 let mut cache = self.ast_cache.lock().unwrap();
443
444 if cache.is_none() {
445 *cache = Some(get_cached_ast(self.content));
448 }
449
450 cache.as_ref().unwrap().clone()
451 }
452
453 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
455 let mut cache = self.code_spans_cache.lock().unwrap();
456
457 if cache.is_none() {
459 let ast = self.get_ast();
460 let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
461 *cache = Some(Arc::new(code_spans));
462 }
463
464 cache.as_ref().unwrap().clone()
466 }
467
468 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
470 let mut cache = self.html_tags_cache.lock().unwrap();
471
472 if cache.is_none() {
473 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
474 *cache = Some(Arc::new(html_tags));
475 }
476
477 cache.as_ref().unwrap().clone()
478 }
479
480 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
482 let mut cache = self.emphasis_spans_cache.lock().unwrap();
483
484 if cache.is_none() {
485 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
486 *cache = Some(Arc::new(emphasis_spans));
487 }
488
489 cache.as_ref().unwrap().clone()
490 }
491
492 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
494 let mut cache = self.table_rows_cache.lock().unwrap();
495
496 if cache.is_none() {
497 let table_rows = Self::parse_table_rows(&self.lines);
498 *cache = Some(Arc::new(table_rows));
499 }
500
501 cache.as_ref().unwrap().clone()
502 }
503
504 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
506 let mut cache = self.bare_urls_cache.lock().unwrap();
507
508 if cache.is_none() {
509 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
510 *cache = Some(Arc::new(bare_urls));
511 }
512
513 cache.as_ref().unwrap().clone()
514 }
515
516 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
518 match self.line_offsets.binary_search(&offset) {
519 Ok(line) => (line + 1, 1),
520 Err(line) => {
521 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
522 (line, offset - line_start + 1)
523 }
524 }
525 }
526
527 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
529 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
531 return true;
532 }
533
534 self.code_spans()
536 .iter()
537 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
538 }
539
540 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
542 if line_num > 0 {
543 self.lines.get(line_num - 1)
544 } else {
545 None
546 }
547 }
548
549 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
551 self.line_info(line_num).map(|info| info.byte_offset)
552 }
553
554 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
556 let normalized_id = ref_id.to_lowercase();
557 self.reference_defs
558 .iter()
559 .find(|def| def.id == normalized_id)
560 .map(|def| def.url.as_str())
561 }
562
563 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
565 self.links.iter().filter(|link| link.line == line_num).collect()
566 }
567
568 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
570 self.images.iter().filter(|img| img.line == line_num).collect()
571 }
572
573 pub fn is_in_list_block(&self, line_num: usize) -> bool {
575 self.list_blocks
576 .iter()
577 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
578 }
579
580 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
582 self.list_blocks
583 .iter()
584 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
585 }
586
587 pub fn is_in_code_block(&self, line_num: usize) -> bool {
591 if line_num == 0 || line_num > self.lines.len() {
592 return false;
593 }
594 self.lines[line_num - 1].in_code_block
595 }
596
597 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
599 if line_num == 0 || line_num > self.lines.len() {
600 return false;
601 }
602 self.lines[line_num - 1].in_front_matter
603 }
604
605 pub fn is_in_html_block(&self, line_num: usize) -> bool {
607 if line_num == 0 || line_num > self.lines.len() {
608 return false;
609 }
610 self.lines[line_num - 1].in_html_block
611 }
612
613 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
615 if line_num == 0 || line_num > self.lines.len() {
616 return false;
617 }
618
619 let col_0indexed = if col > 0 { col - 1 } else { 0 };
623 let code_spans = self.code_spans();
624 code_spans
625 .iter()
626 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
627 }
628
629 pub fn has_char(&self, ch: char) -> bool {
631 match ch {
632 '#' => self.char_frequency.hash_count > 0,
633 '*' => self.char_frequency.asterisk_count > 0,
634 '_' => self.char_frequency.underscore_count > 0,
635 '-' => self.char_frequency.hyphen_count > 0,
636 '+' => self.char_frequency.plus_count > 0,
637 '>' => self.char_frequency.gt_count > 0,
638 '|' => self.char_frequency.pipe_count > 0,
639 '[' => self.char_frequency.bracket_count > 0,
640 '`' => self.char_frequency.backtick_count > 0,
641 '<' => self.char_frequency.lt_count > 0,
642 '!' => self.char_frequency.exclamation_count > 0,
643 '\n' => self.char_frequency.newline_count > 0,
644 _ => self.content.contains(ch), }
646 }
647
648 pub fn char_count(&self, ch: char) -> usize {
650 match ch {
651 '#' => self.char_frequency.hash_count,
652 '*' => self.char_frequency.asterisk_count,
653 '_' => self.char_frequency.underscore_count,
654 '-' => self.char_frequency.hyphen_count,
655 '+' => self.char_frequency.plus_count,
656 '>' => self.char_frequency.gt_count,
657 '|' => self.char_frequency.pipe_count,
658 '[' => self.char_frequency.bracket_count,
659 '`' => self.char_frequency.backtick_count,
660 '<' => self.char_frequency.lt_count,
661 '!' => self.char_frequency.exclamation_count,
662 '\n' => self.char_frequency.newline_count,
663 _ => self.content.matches(ch).count(), }
665 }
666
667 pub fn likely_has_headings(&self) -> bool {
669 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
671
672 pub fn likely_has_lists(&self) -> bool {
674 self.char_frequency.asterisk_count > 0
675 || self.char_frequency.hyphen_count > 0
676 || self.char_frequency.plus_count > 0
677 }
678
679 pub fn likely_has_emphasis(&self) -> bool {
681 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
682 }
683
684 pub fn likely_has_tables(&self) -> bool {
686 self.char_frequency.pipe_count > 2
687 }
688
689 pub fn likely_has_blockquotes(&self) -> bool {
691 self.char_frequency.gt_count > 0
692 }
693
694 pub fn likely_has_code(&self) -> bool {
696 self.char_frequency.backtick_count > 0
697 }
698
699 pub fn likely_has_links_or_images(&self) -> bool {
701 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
702 }
703
704 pub fn likely_has_html(&self) -> bool {
706 self.char_frequency.lt_count > 0
707 }
708
709 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
711 self.html_tags()
712 .iter()
713 .filter(|tag| tag.line == line_num)
714 .cloned()
715 .collect()
716 }
717
718 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
720 self.emphasis_spans()
721 .iter()
722 .filter(|span| span.line == line_num)
723 .cloned()
724 .collect()
725 }
726
727 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
729 self.table_rows()
730 .iter()
731 .filter(|row| row.line == line_num)
732 .cloned()
733 .collect()
734 }
735
736 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
738 self.bare_urls()
739 .iter()
740 .filter(|url| url.line == line_num)
741 .cloned()
742 .collect()
743 }
744
745 fn parse_links(
747 content: &str,
748 lines: &[LineInfo],
749 code_blocks: &[(usize, usize)],
750 code_spans: &[CodeSpan],
751 flavor: MarkdownFlavor,
752 ) -> Vec<ParsedLink> {
753 use crate::utils::skip_context::is_mkdocs_snippet_line;
754
755 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
760 let full_match = cap.get(0).unwrap();
761 let match_start = full_match.start();
762 let match_end = full_match.end();
763
764 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
766 continue;
767 }
768
769 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
771 continue;
772 }
773
774 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
776 continue;
777 }
778
779 if code_spans
781 .iter()
782 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
783 {
784 continue;
785 }
786
787 let line_idx = lines
790 .iter()
791 .position(|line| {
792 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
793 })
794 .unwrap_or(0);
795
796 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
797 continue;
798 }
799
800 let mut line_num = 1;
802 let mut col_start = match_start;
803 for (idx, line_info) in lines.iter().enumerate() {
804 if match_start >= line_info.byte_offset {
805 line_num = idx + 1;
806 col_start = match_start - line_info.byte_offset;
807 } else {
808 break;
809 }
810 }
811
812 let mut end_line_num = 1;
814 let mut col_end = match_end;
815 for (idx, line_info) in lines.iter().enumerate() {
816 if match_end > line_info.byte_offset {
817 end_line_num = idx + 1;
818 col_end = match_end - line_info.byte_offset;
819 } else {
820 break;
821 }
822 }
823
824 if line_num == end_line_num {
826 } else {
828 }
831
832 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
833
834 if let Some(inline_url) = cap.get(2) {
835 links.push(ParsedLink {
837 line: line_num,
838 start_col: col_start,
839 end_col: col_end,
840 byte_offset: match_start,
841 byte_end: match_end,
842 text,
843 url: inline_url.as_str().to_string(),
844 is_reference: false,
845 reference_id: None,
846 });
847 } else if let Some(ref_id) = cap.get(3) {
848 let ref_id_str = ref_id.as_str();
850 let normalized_ref = if ref_id_str.is_empty() {
851 text.to_lowercase() } else {
853 ref_id_str.to_lowercase()
854 };
855
856 links.push(ParsedLink {
857 line: line_num,
858 start_col: col_start,
859 end_col: col_end,
860 byte_offset: match_start,
861 byte_end: match_end,
862 text,
863 url: String::new(), is_reference: true,
865 reference_id: Some(normalized_ref),
866 });
867 }
868 }
869
870 links
871 }
872
873 fn parse_images(
875 content: &str,
876 lines: &[LineInfo],
877 code_blocks: &[(usize, usize)],
878 code_spans: &[CodeSpan],
879 ) -> Vec<ParsedImage> {
880 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
885 let full_match = cap.get(0).unwrap();
886 let match_start = full_match.start();
887 let match_end = full_match.end();
888
889 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
891 continue;
892 }
893
894 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
896 continue;
897 }
898
899 if code_spans
901 .iter()
902 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
903 {
904 continue;
905 }
906
907 let mut line_num = 1;
909 let mut col_start = match_start;
910 for (idx, line_info) in lines.iter().enumerate() {
911 if match_start >= line_info.byte_offset {
912 line_num = idx + 1;
913 col_start = match_start - line_info.byte_offset;
914 } else {
915 break;
916 }
917 }
918
919 let mut end_line_num = 1;
921 let mut col_end = match_end;
922 for (idx, line_info) in lines.iter().enumerate() {
923 if match_end > line_info.byte_offset {
924 end_line_num = idx + 1;
925 col_end = match_end - line_info.byte_offset;
926 } else {
927 break;
928 }
929 }
930
931 if line_num == end_line_num {
933 } else {
935 }
938
939 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
940
941 if let Some(inline_url) = cap.get(2) {
942 images.push(ParsedImage {
944 line: line_num,
945 start_col: col_start,
946 end_col: col_end,
947 byte_offset: match_start,
948 byte_end: match_end,
949 alt_text,
950 url: inline_url.as_str().to_string(),
951 is_reference: false,
952 reference_id: None,
953 });
954 } else if let Some(ref_id) = cap.get(3) {
955 let ref_id_str = ref_id.as_str();
957 let normalized_ref = if ref_id_str.is_empty() {
958 alt_text.to_lowercase() } else {
960 ref_id_str.to_lowercase()
961 };
962
963 images.push(ParsedImage {
964 line: line_num,
965 start_col: col_start,
966 end_col: col_end,
967 byte_offset: match_start,
968 byte_end: match_end,
969 alt_text,
970 url: String::new(), is_reference: true,
972 reference_id: Some(normalized_ref),
973 });
974 }
975 }
976
977 images
978 }
979
980 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
982 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
986 if line_info.in_code_block {
988 continue;
989 }
990
991 let line = &line_info.content;
992 let line_num = line_idx + 1;
993
994 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
995 let id = cap.get(1).unwrap().as_str().to_lowercase();
996 let url = cap.get(2).unwrap().as_str().to_string();
997 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
998
999 refs.push(ReferenceDef {
1000 line: line_num,
1001 id,
1002 url,
1003 title,
1004 });
1005 }
1006 }
1007
1008 refs
1009 }
1010
1011 fn compute_line_info(
1013 content: &str,
1014 line_offsets: &[usize],
1015 code_blocks: &[(usize, usize)],
1016 flavor: MarkdownFlavor,
1017 ) -> Vec<LineInfo> {
1018 lazy_static! {
1019 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1021 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1022
1023 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1025
1026 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1028 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1029
1030 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1032 }
1033
1034 let content_lines: Vec<&str> = content.lines().collect();
1035 let mut lines = Vec::with_capacity(content_lines.len());
1036
1037 let mut in_front_matter = false;
1039 let mut front_matter_end = 0;
1040 if content_lines.first().map(|l| l.trim()) == Some("---") {
1041 in_front_matter = true;
1042 for (idx, line) in content_lines.iter().enumerate().skip(1) {
1043 if line.trim() == "---" {
1044 front_matter_end = idx;
1045 break;
1046 }
1047 }
1048 }
1049
1050 for (i, line) in content_lines.iter().enumerate() {
1051 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1052 let indent = line.len() - line.trim_start().len();
1053 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1055 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1057 after_prefix.trim().is_empty()
1058 } else {
1059 line.trim().is_empty()
1060 };
1061 let in_code_block = code_blocks.iter().any(|&(start, end)| {
1064 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1069 let mut boundary = start;
1071 while boundary > 0 && !content.is_char_boundary(boundary) {
1072 boundary -= 1;
1073 }
1074 boundary
1075 } else {
1076 start
1077 };
1078
1079 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1080 let mut boundary = end;
1082 while boundary < content.len() && !content.is_char_boundary(boundary) {
1083 boundary += 1;
1084 }
1085 boundary
1086 } else {
1087 end.min(content.len())
1088 };
1089
1090 let block_content = &content[safe_start..safe_end];
1091 let is_multiline = block_content.contains('\n');
1092 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1093 let is_indented = !is_fenced
1094 && block_content
1095 .lines()
1096 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
1097
1098 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1099 });
1100
1101 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1103 && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1104 let list_item =
1105 if !(in_code_block || is_blank || in_mkdocstrings || in_front_matter && i <= front_matter_end) {
1106 let (line_for_list_check, blockquote_prefix_len) =
1108 if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1109 let prefix = caps.get(1).unwrap().as_str();
1110 let content = caps.get(2).unwrap().as_str();
1111 (content, prefix.len())
1112 } else {
1113 (&**line, 0)
1114 };
1115
1116 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1117 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1118 let marker = caps.get(2).map_or("", |m| m.as_str());
1119 let spacing = caps.get(3).map_or("", |m| m.as_str());
1120 let _content = caps.get(4).map_or("", |m| m.as_str());
1121 let marker_column = blockquote_prefix_len + leading_spaces.len();
1122 let content_column = marker_column + marker.len() + spacing.len();
1123
1124 if spacing.is_empty() {
1131 None
1132 } else {
1133 Some(ListItemInfo {
1134 marker: marker.to_string(),
1135 is_ordered: false,
1136 number: None,
1137 marker_column,
1138 content_column,
1139 })
1140 }
1141 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1142 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1143 let number_str = caps.get(2).map_or("", |m| m.as_str());
1144 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1145 let spacing = caps.get(4).map_or("", |m| m.as_str());
1146 let _content = caps.get(5).map_or("", |m| m.as_str());
1147 let marker = format!("{number_str}{delimiter}");
1148 let marker_column = blockquote_prefix_len + leading_spaces.len();
1149 let content_column = marker_column + marker.len() + spacing.len();
1150
1151 if spacing.is_empty() {
1154 None
1155 } else {
1156 Some(ListItemInfo {
1157 marker,
1158 is_ordered: true,
1159 number: number_str.parse().ok(),
1160 marker_column,
1161 content_column,
1162 })
1163 }
1164 } else {
1165 None
1166 }
1167 } else {
1168 None
1169 };
1170
1171 lines.push(LineInfo {
1172 content: line.to_string(),
1173 byte_offset,
1174 indent,
1175 is_blank,
1176 in_code_block,
1177 in_front_matter: in_front_matter && i <= front_matter_end,
1178 in_html_block: false, list_item,
1180 heading: None, blockquote: None, in_mkdocstrings,
1183 });
1184 }
1185
1186 for i in 0..content_lines.len() {
1188 if lines[i].in_code_block {
1189 continue;
1190 }
1191
1192 if in_front_matter && i <= front_matter_end {
1194 continue;
1195 }
1196
1197 let line = content_lines[i];
1198
1199 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1201 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1202 let markers = caps.get(2).map_or("", |m| m.as_str());
1203 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1204 let content = caps.get(4).map_or("", |m| m.as_str());
1205
1206 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1207 let marker_column = indent_str.len();
1208
1209 let prefix = format!("{indent_str}{markers}{spaces_after}");
1211
1212 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1214 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1216
1217 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1221
1222 lines[i].blockquote = Some(BlockquoteInfo {
1223 nesting_level,
1224 indent: indent_str.to_string(),
1225 marker_column,
1226 prefix,
1227 content: content.to_string(),
1228 has_no_space_after_marker: has_no_space,
1229 has_multiple_spaces_after_marker: has_multiple_spaces,
1230 needs_md028_fix,
1231 });
1232 }
1233
1234 if lines[i].is_blank {
1236 continue;
1237 }
1238
1239 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1242 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1243 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1244 } else {
1245 false
1246 };
1247
1248 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1249 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1251 continue;
1252 }
1253 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1254 let hashes = caps.get(2).map_or("", |m| m.as_str());
1255 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1256 let rest = caps.get(4).map_or("", |m| m.as_str());
1257
1258 let level = hashes.len() as u8;
1259 let marker_column = leading_spaces.len();
1260
1261 let (text, has_closing, closing_seq) = {
1263 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1265 if rest[id_start..].trim_end().ends_with('}') {
1267 (&rest[..id_start], &rest[id_start..])
1269 } else {
1270 (rest, "")
1271 }
1272 } else {
1273 (rest, "")
1274 };
1275
1276 let trimmed_rest = rest_without_id.trim_end();
1278 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1279 let mut start_of_hashes = last_hash_pos;
1281 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1282 start_of_hashes -= 1;
1283 }
1284
1285 let has_space_before = start_of_hashes == 0
1287 || trimmed_rest
1288 .chars()
1289 .nth(start_of_hashes - 1)
1290 .is_some_and(|c| c.is_whitespace());
1291
1292 let potential_closing = &trimmed_rest[start_of_hashes..];
1294 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1295
1296 if is_all_hashes && has_space_before {
1297 let closing_hashes = potential_closing.to_string();
1299 let text_part = if !custom_id_part.is_empty() {
1302 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1305 } else {
1306 rest_without_id[..start_of_hashes].trim_end().to_string()
1307 };
1308 (text_part, true, closing_hashes)
1309 } else {
1310 (rest.to_string(), false, String::new())
1312 }
1313 } else {
1314 (rest.to_string(), false, String::new())
1316 }
1317 };
1318
1319 let content_column = marker_column + hashes.len() + spaces_after.len();
1320
1321 let raw_text = text.trim().to_string();
1323 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1324
1325 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1327 let next_line = content_lines[i + 1];
1328 if !lines[i + 1].in_code_block
1329 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1330 && let Some(next_line_id) =
1331 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1332 {
1333 custom_id = Some(next_line_id);
1334 }
1335 }
1336
1337 lines[i].heading = Some(HeadingInfo {
1338 level,
1339 style: HeadingStyle::ATX,
1340 marker: hashes.to_string(),
1341 marker_column,
1342 content_column,
1343 text: clean_text,
1344 custom_id,
1345 raw_text,
1346 has_closing_sequence: has_closing,
1347 closing_sequence: closing_seq,
1348 });
1349 }
1350 else if i + 1 < content_lines.len() {
1352 let next_line = content_lines[i + 1];
1353 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1354 if in_front_matter && i < front_matter_end {
1356 continue;
1357 }
1358
1359 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1361 continue;
1362 }
1363
1364 let underline = next_line.trim();
1365
1366 if underline == "---" {
1369 continue;
1370 }
1371
1372 let current_line_trimmed = line.trim();
1374 if current_line_trimmed.contains(':')
1375 && !current_line_trimmed.starts_with('#')
1376 && !current_line_trimmed.contains('[')
1377 && !current_line_trimmed.contains("](")
1378 {
1379 continue;
1381 }
1382
1383 let level = if underline.starts_with('=') { 1 } else { 2 };
1384 let style = if level == 1 {
1385 HeadingStyle::Setext1
1386 } else {
1387 HeadingStyle::Setext2
1388 };
1389
1390 let raw_text = line.trim().to_string();
1392 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1393
1394 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1396 let attr_line = content_lines[i + 2];
1397 if !lines[i + 2].in_code_block
1398 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1399 && let Some(attr_line_id) =
1400 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1401 {
1402 custom_id = Some(attr_line_id);
1403 }
1404 }
1405
1406 lines[i].heading = Some(HeadingInfo {
1407 level,
1408 style,
1409 marker: underline.to_string(),
1410 marker_column: next_line.len() - next_line.trim_start().len(),
1411 content_column: lines[i].indent,
1412 text: clean_text,
1413 custom_id,
1414 raw_text,
1415 has_closing_sequence: false,
1416 closing_sequence: String::new(),
1417 });
1418 }
1419 }
1420 }
1421
1422 lines
1423 }
1424
1425 fn detect_html_blocks(lines: &mut [LineInfo]) {
1427 const BLOCK_ELEMENTS: &[&str] = &[
1429 "address",
1430 "article",
1431 "aside",
1432 "blockquote",
1433 "details",
1434 "dialog",
1435 "dd",
1436 "div",
1437 "dl",
1438 "dt",
1439 "fieldset",
1440 "figcaption",
1441 "figure",
1442 "footer",
1443 "form",
1444 "h1",
1445 "h2",
1446 "h3",
1447 "h4",
1448 "h5",
1449 "h6",
1450 "header",
1451 "hr",
1452 "li",
1453 "main",
1454 "nav",
1455 "ol",
1456 "p",
1457 "pre",
1458 "section",
1459 "table",
1460 "tbody",
1461 "td",
1462 "tfoot",
1463 "th",
1464 "thead",
1465 "tr",
1466 "ul",
1467 ];
1468
1469 let mut i = 0;
1470 while i < lines.len() {
1471 if lines[i].in_code_block || lines[i].in_front_matter {
1473 i += 1;
1474 continue;
1475 }
1476
1477 let trimmed = lines[i].content.trim_start();
1478
1479 if trimmed.starts_with('<') && trimmed.len() > 1 {
1481 let after_bracket = &trimmed[1..];
1483 let is_closing = after_bracket.starts_with('/');
1484 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1485
1486 let tag_name = tag_start
1488 .chars()
1489 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1490 .collect::<String>()
1491 .to_lowercase();
1492
1493 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1495 lines[i].in_html_block = true;
1497
1498 if !is_closing {
1501 let closing_tag = format!("</{tag_name}>");
1502 let mut j = i + 1;
1503 while j < lines.len() && j < i + 100 {
1504 if lines[j].is_blank {
1507 break;
1508 }
1509
1510 lines[j].in_html_block = true;
1511
1512 if lines[j].content.contains(&closing_tag) {
1514 break;
1515 }
1516 j += 1;
1517 }
1518 }
1519 }
1520 }
1521
1522 i += 1;
1523 }
1524 }
1525
1526 fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1528 let mut code_spans = Vec::new();
1529
1530 if !content.contains('`') {
1532 return code_spans;
1533 }
1534
1535 fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1537 match node {
1538 Node::InlineCode(inline_code) => {
1539 if let Some(pos) = &inline_code.position {
1540 let start_pos = pos.start.offset;
1541 let end_pos = pos.end.offset;
1542
1543 let full_span = &content[start_pos..end_pos];
1545 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1546
1547 let content_start = start_pos + backtick_count;
1549 let content_end = end_pos - backtick_count;
1550 let span_content = if content_start < content_end {
1551 content[content_start..content_end].to_string()
1552 } else {
1553 String::new()
1554 };
1555
1556 let mut line_num = 1;
1558 let mut col_start = start_pos;
1559 for (idx, line_info) in lines.iter().enumerate() {
1560 if start_pos >= line_info.byte_offset {
1561 line_num = idx + 1;
1562 col_start = start_pos - line_info.byte_offset;
1563 } else {
1564 break;
1565 }
1566 }
1567
1568 let mut col_end = end_pos;
1570 for line_info in lines.iter() {
1571 if end_pos > line_info.byte_offset {
1572 col_end = end_pos - line_info.byte_offset;
1573 } else {
1574 break;
1575 }
1576 }
1577
1578 spans.push(CodeSpan {
1579 line: line_num,
1580 start_col: col_start,
1581 end_col: col_end,
1582 byte_offset: start_pos,
1583 byte_end: end_pos,
1584 backtick_count,
1585 content: span_content,
1586 });
1587 }
1588 }
1589 Node::Root(root) => {
1591 for child in &root.children {
1592 extract_code_spans(child, content, lines, spans);
1593 }
1594 }
1595 Node::Paragraph(para) => {
1596 for child in ¶.children {
1597 extract_code_spans(child, content, lines, spans);
1598 }
1599 }
1600 Node::Heading(heading) => {
1601 for child in &heading.children {
1602 extract_code_spans(child, content, lines, spans);
1603 }
1604 }
1605 Node::List(list) => {
1606 for child in &list.children {
1607 extract_code_spans(child, content, lines, spans);
1608 }
1609 }
1610 Node::ListItem(item) => {
1611 for child in &item.children {
1612 extract_code_spans(child, content, lines, spans);
1613 }
1614 }
1615 Node::Blockquote(blockquote) => {
1616 for child in &blockquote.children {
1617 extract_code_spans(child, content, lines, spans);
1618 }
1619 }
1620 Node::Table(table) => {
1621 for child in &table.children {
1622 extract_code_spans(child, content, lines, spans);
1623 }
1624 }
1625 Node::TableRow(row) => {
1626 for child in &row.children {
1627 extract_code_spans(child, content, lines, spans);
1628 }
1629 }
1630 Node::TableCell(cell) => {
1631 for child in &cell.children {
1632 extract_code_spans(child, content, lines, spans);
1633 }
1634 }
1635 Node::Emphasis(emphasis) => {
1636 for child in &emphasis.children {
1637 extract_code_spans(child, content, lines, spans);
1638 }
1639 }
1640 Node::Strong(strong) => {
1641 for child in &strong.children {
1642 extract_code_spans(child, content, lines, spans);
1643 }
1644 }
1645 Node::Link(link) => {
1646 for child in &link.children {
1647 extract_code_spans(child, content, lines, spans);
1648 }
1649 }
1650 Node::LinkReference(link_ref) => {
1651 for child in &link_ref.children {
1652 extract_code_spans(child, content, lines, spans);
1653 }
1654 }
1655 Node::FootnoteDefinition(footnote) => {
1656 for child in &footnote.children {
1657 extract_code_spans(child, content, lines, spans);
1658 }
1659 }
1660 Node::Delete(delete) => {
1661 for child in &delete.children {
1662 extract_code_spans(child, content, lines, spans);
1663 }
1664 }
1665 Node::Code(_)
1667 | Node::Text(_)
1668 | Node::Html(_)
1669 | Node::Image(_)
1670 | Node::ImageReference(_)
1671 | Node::FootnoteReference(_)
1672 | Node::Break(_)
1673 | Node::ThematicBreak(_)
1674 | Node::Definition(_)
1675 | Node::Yaml(_)
1676 | Node::Toml(_)
1677 | Node::Math(_)
1678 | Node::InlineMath(_)
1679 | Node::MdxJsxFlowElement(_)
1680 | Node::MdxFlowExpression(_)
1681 | Node::MdxJsxTextElement(_)
1682 | Node::MdxTextExpression(_)
1683 | Node::MdxjsEsm(_) => {
1684 }
1686 }
1687 }
1688
1689 extract_code_spans(ast, content, lines, &mut code_spans);
1691
1692 code_spans.sort_by_key(|span| span.byte_offset);
1694
1695 code_spans
1696 }
1697
1698 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1700 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1703 let mut last_list_item_line = 0;
1704 let mut current_indent_level = 0;
1705 let mut last_marker_width = 0;
1706
1707 for (line_idx, line_info) in lines.iter().enumerate() {
1708 let line_num = line_idx + 1;
1709
1710 if line_info.in_code_block {
1712 if let Some(ref mut block) = current_block {
1713 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1715
1716 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1718
1719 match context {
1720 CodeBlockContext::Indented => {
1721 block.end_line = line_num;
1723 continue;
1724 }
1725 CodeBlockContext::Standalone => {
1726 let completed_block = current_block.take().unwrap();
1728 list_blocks.push(completed_block);
1729 continue;
1730 }
1731 CodeBlockContext::Adjacent => {
1732 block.end_line = line_num;
1734 continue;
1735 }
1736 }
1737 } else {
1738 continue;
1740 }
1741 }
1742
1743 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1745 caps.get(0).unwrap().as_str().to_string()
1746 } else {
1747 String::new()
1748 };
1749
1750 if let Some(list_item) = &line_info.list_item {
1752 let item_indent = list_item.marker_column;
1754 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1757 let is_nested = nesting > block.nesting_level;
1761 let same_type =
1762 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1763 let same_context = block.blockquote_prefix == blockquote_prefix;
1764 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1768 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1769
1770 let has_non_list_content = {
1772 let mut found_non_list = false;
1773 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1775
1776 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1778 let last_line = &lines[block_last_item_line - 1];
1779 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1780 log::debug!(
1781 "After problematic line {}: checking lines {} to {} for non-list content",
1782 block_last_item_line,
1783 block_last_item_line + 1,
1784 line_num
1785 );
1786 if line_num == block_last_item_line + 1 {
1788 log::debug!("Lines are consecutive, no content between");
1789 }
1790 }
1791 }
1792
1793 for check_line in (block_last_item_line + 1)..line_num {
1794 let check_idx = check_line - 1;
1795 if check_idx < lines.len() {
1796 let check_info = &lines[check_idx];
1797 let is_list_breaking_content = if check_info.in_code_block {
1799 let last_item_marker_width =
1801 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1802 lines[block_last_item_line - 1]
1803 .list_item
1804 .as_ref()
1805 .map(|li| {
1806 if li.is_ordered {
1807 li.marker.len() + 1 } else {
1809 li.marker.len()
1810 }
1811 })
1812 .unwrap_or(3) } else {
1814 3 };
1816
1817 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1818
1819 let context = CodeBlockUtils::analyze_code_block_context(
1821 lines,
1822 check_line - 1,
1823 min_continuation,
1824 );
1825
1826 matches!(context, CodeBlockContext::Standalone)
1828 } else if !check_info.is_blank && check_info.list_item.is_none() {
1829 let line_content = check_info.content.trim();
1831
1832 if check_info.heading.is_some()
1834 || line_content.starts_with("---")
1835 || line_content.starts_with("***")
1836 || line_content.starts_with("___")
1837 || (line_content.contains('|')
1838 && !line_content.contains("](")
1839 && !line_content.contains("http")
1840 && (line_content.matches('|').count() > 1
1841 || line_content.starts_with('|')
1842 || line_content.ends_with('|')))
1843 || line_content.starts_with(">")
1844 {
1845 true
1846 }
1847 else {
1849 let last_item_marker_width =
1850 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1851 lines[block_last_item_line - 1]
1852 .list_item
1853 .as_ref()
1854 .map(|li| {
1855 if li.is_ordered {
1856 li.marker.len() + 1 } else {
1858 li.marker.len()
1859 }
1860 })
1861 .unwrap_or(3) } else {
1863 3 };
1865
1866 let min_continuation =
1867 if block.is_ordered { last_item_marker_width } else { 2 };
1868 check_info.indent < min_continuation
1869 }
1870 } else {
1871 false
1872 };
1873
1874 if is_list_breaking_content {
1875 found_non_list = true;
1877 break;
1878 }
1879 }
1880 }
1881 found_non_list
1882 };
1883
1884 let mut continues_list = if is_nested {
1888 same_context && reasonable_distance && !has_non_list_content
1890 } else {
1891 let result = same_type
1893 && same_context
1894 && reasonable_distance
1895 && marker_compatible
1896 && !has_non_list_content;
1897
1898 if block.item_lines.last().is_some_and(|&last_line| {
1900 last_line > 0
1901 && last_line <= lines.len()
1902 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1903 && lines[last_line - 1].content.contains(r"\`")
1904 }) {
1905 log::debug!(
1906 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1907 );
1908 if line_num > 0 && line_num <= lines.len() {
1909 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1910 }
1911 }
1912
1913 result
1914 };
1915
1916 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1919 if block.item_lines.contains(&(line_num - 1)) {
1921 continues_list = true;
1923 }
1924 }
1925
1926 if continues_list {
1927 block.end_line = line_num;
1929 block.item_lines.push(line_num);
1930
1931 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1933 list_item.marker.len() + 1
1934 } else {
1935 list_item.marker.len()
1936 });
1937
1938 if !block.is_ordered
1940 && block.marker.is_some()
1941 && block.marker.as_ref() != Some(&list_item.marker)
1942 {
1943 block.marker = None;
1945 }
1946 } else {
1947 list_blocks.push(block.clone());
1950
1951 *block = ListBlock {
1952 start_line: line_num,
1953 end_line: line_num,
1954 is_ordered: list_item.is_ordered,
1955 marker: if list_item.is_ordered {
1956 None
1957 } else {
1958 Some(list_item.marker.clone())
1959 },
1960 blockquote_prefix: blockquote_prefix.clone(),
1961 item_lines: vec![line_num],
1962 nesting_level: nesting,
1963 max_marker_width: if list_item.is_ordered {
1964 list_item.marker.len() + 1
1965 } else {
1966 list_item.marker.len()
1967 },
1968 };
1969 }
1970 } else {
1971 current_block = Some(ListBlock {
1973 start_line: line_num,
1974 end_line: line_num,
1975 is_ordered: list_item.is_ordered,
1976 marker: if list_item.is_ordered {
1977 None
1978 } else {
1979 Some(list_item.marker.clone())
1980 },
1981 blockquote_prefix,
1982 item_lines: vec![line_num],
1983 nesting_level: nesting,
1984 max_marker_width: list_item.marker.len(),
1985 });
1986 }
1987
1988 last_list_item_line = line_num;
1989 current_indent_level = item_indent;
1990 last_marker_width = if list_item.is_ordered {
1991 list_item.marker.len() + 1 } else {
1993 list_item.marker.len()
1994 };
1995 } else if let Some(ref mut block) = current_block {
1996 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2006 lines[block.end_line - 1].content.trim_end().ends_with('\\')
2007 } else {
2008 false
2009 };
2010
2011 let min_continuation_indent = if block.is_ordered {
2015 current_indent_level + last_marker_width
2016 } else {
2017 current_indent_level + 2 };
2019
2020 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2021 block.end_line = line_num;
2023 } else if line_info.is_blank {
2024 let mut check_idx = line_idx + 1;
2027 let mut found_continuation = false;
2028
2029 while check_idx < lines.len() && lines[check_idx].is_blank {
2031 check_idx += 1;
2032 }
2033
2034 if check_idx < lines.len() {
2035 let next_line = &lines[check_idx];
2036 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2038 found_continuation = true;
2039 }
2040 else if !next_line.in_code_block
2042 && next_line.list_item.is_some()
2043 && let Some(item) = &next_line.list_item
2044 {
2045 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2046 .find(&next_line.content)
2047 .map_or(String::new(), |m| m.as_str().to_string());
2048 if item.marker_column == current_indent_level
2049 && item.is_ordered == block.is_ordered
2050 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2051 {
2052 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2055 if let Some(between_line) = lines.get(idx) {
2056 let trimmed = between_line.content.trim();
2057 if trimmed.is_empty() {
2059 return false;
2060 }
2061 let line_indent =
2063 between_line.content.len() - between_line.content.trim_start().len();
2064
2065 if trimmed.starts_with("```")
2067 || trimmed.starts_with("~~~")
2068 || trimmed.starts_with("---")
2069 || trimmed.starts_with("***")
2070 || trimmed.starts_with("___")
2071 || trimmed.starts_with(">")
2072 || trimmed.contains('|') || between_line.heading.is_some()
2074 {
2075 return true; }
2077
2078 line_indent >= min_continuation_indent
2080 } else {
2081 false
2082 }
2083 });
2084
2085 if block.is_ordered {
2086 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2089 if let Some(between_line) = lines.get(idx) {
2090 let trimmed = between_line.content.trim();
2091 if trimmed.is_empty() {
2092 return false;
2093 }
2094 trimmed.starts_with("```")
2096 || trimmed.starts_with("~~~")
2097 || trimmed.starts_with("---")
2098 || trimmed.starts_with("***")
2099 || trimmed.starts_with("___")
2100 || trimmed.starts_with(">")
2101 || trimmed.contains('|') || between_line.heading.is_some()
2103 } else {
2104 false
2105 }
2106 });
2107 found_continuation = !has_structural_separators;
2108 } else {
2109 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2111 if let Some(between_line) = lines.get(idx) {
2112 let trimmed = between_line.content.trim();
2113 if trimmed.is_empty() {
2114 return false;
2115 }
2116 trimmed.starts_with("```")
2118 || trimmed.starts_with("~~~")
2119 || trimmed.starts_with("---")
2120 || trimmed.starts_with("***")
2121 || trimmed.starts_with("___")
2122 || trimmed.starts_with(">")
2123 || trimmed.contains('|') || between_line.heading.is_some()
2125 } else {
2126 false
2127 }
2128 });
2129 found_continuation = !has_structural_separators;
2130 }
2131 }
2132 }
2133 }
2134
2135 if found_continuation {
2136 block.end_line = line_num;
2138 } else {
2139 list_blocks.push(block.clone());
2141 current_block = None;
2142 }
2143 } else {
2144 let min_required_indent = if block.is_ordered {
2147 current_indent_level + last_marker_width
2148 } else {
2149 current_indent_level + 2
2150 };
2151
2152 let line_content = line_info.content.trim();
2157 let is_structural_separator = line_info.heading.is_some()
2158 || line_content.starts_with("```")
2159 || line_content.starts_with("~~~")
2160 || line_content.starts_with("---")
2161 || line_content.starts_with("***")
2162 || line_content.starts_with("___")
2163 || line_content.starts_with(">")
2164 || (line_content.contains('|')
2165 && !line_content.contains("](")
2166 && !line_content.contains("http")
2167 && (line_content.matches('|').count() > 1
2168 || line_content.starts_with('|')
2169 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2174 && !line_info.is_blank
2175 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2176
2177 if is_lazy_continuation {
2178 let content_to_check = if !blockquote_prefix.is_empty() {
2181 line_info
2183 .content
2184 .strip_prefix(&blockquote_prefix)
2185 .unwrap_or(&line_info.content)
2186 .trim()
2187 } else {
2188 line_info.content.trim()
2189 };
2190
2191 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2192
2193 if starts_with_uppercase && last_list_item_line > 0 {
2196 list_blocks.push(block.clone());
2198 current_block = None;
2199 } else {
2200 block.end_line = line_num;
2202 }
2203 } else {
2204 list_blocks.push(block.clone());
2206 current_block = None;
2207 }
2208 }
2209 }
2210 }
2211
2212 if let Some(block) = current_block {
2214 list_blocks.push(block);
2215 }
2216
2217 merge_adjacent_list_blocks(&mut list_blocks, lines);
2219
2220 list_blocks
2221 }
2222
2223 fn compute_char_frequency(content: &str) -> CharFrequency {
2225 let mut frequency = CharFrequency::default();
2226
2227 for ch in content.chars() {
2228 match ch {
2229 '#' => frequency.hash_count += 1,
2230 '*' => frequency.asterisk_count += 1,
2231 '_' => frequency.underscore_count += 1,
2232 '-' => frequency.hyphen_count += 1,
2233 '+' => frequency.plus_count += 1,
2234 '>' => frequency.gt_count += 1,
2235 '|' => frequency.pipe_count += 1,
2236 '[' => frequency.bracket_count += 1,
2237 '`' => frequency.backtick_count += 1,
2238 '<' => frequency.lt_count += 1,
2239 '!' => frequency.exclamation_count += 1,
2240 '\n' => frequency.newline_count += 1,
2241 _ => {}
2242 }
2243 }
2244
2245 frequency
2246 }
2247
2248 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2250 lazy_static! {
2251 static ref HTML_TAG_REGEX: regex::Regex =
2252 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2253 }
2254
2255 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2256
2257 for cap in HTML_TAG_REGEX.captures_iter(content) {
2258 let full_match = cap.get(0).unwrap();
2259 let match_start = full_match.start();
2260 let match_end = full_match.end();
2261
2262 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2264 continue;
2265 }
2266
2267 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2268 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2269 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2270
2271 let mut line_num = 1;
2273 let mut col_start = match_start;
2274 let mut col_end = match_end;
2275 for (idx, line_info) in lines.iter().enumerate() {
2276 if match_start >= line_info.byte_offset {
2277 line_num = idx + 1;
2278 col_start = match_start - line_info.byte_offset;
2279 col_end = match_end - line_info.byte_offset;
2280 } else {
2281 break;
2282 }
2283 }
2284
2285 html_tags.push(HtmlTag {
2286 line: line_num,
2287 start_col: col_start,
2288 end_col: col_end,
2289 byte_offset: match_start,
2290 byte_end: match_end,
2291 tag_name,
2292 is_closing,
2293 is_self_closing,
2294 raw_content: full_match.as_str().to_string(),
2295 });
2296 }
2297
2298 html_tags
2299 }
2300
2301 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2303 lazy_static! {
2304 static ref EMPHASIS_REGEX: regex::Regex =
2305 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2306 }
2307
2308 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2309
2310 for cap in EMPHASIS_REGEX.captures_iter(content) {
2311 let full_match = cap.get(0).unwrap();
2312 let match_start = full_match.start();
2313 let match_end = full_match.end();
2314
2315 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2317 continue;
2318 }
2319
2320 let opening_markers = cap.get(1).unwrap().as_str();
2321 let content_part = cap.get(2).unwrap().as_str();
2322 let closing_markers = cap.get(3).unwrap().as_str();
2323
2324 if opening_markers.chars().next() != closing_markers.chars().next()
2326 || opening_markers.len() != closing_markers.len()
2327 {
2328 continue;
2329 }
2330
2331 let marker = opening_markers.chars().next().unwrap();
2332 let marker_count = opening_markers.len();
2333
2334 let mut line_num = 1;
2336 let mut col_start = match_start;
2337 let mut col_end = match_end;
2338 for (idx, line_info) in lines.iter().enumerate() {
2339 if match_start >= line_info.byte_offset {
2340 line_num = idx + 1;
2341 col_start = match_start - line_info.byte_offset;
2342 col_end = match_end - line_info.byte_offset;
2343 } else {
2344 break;
2345 }
2346 }
2347
2348 emphasis_spans.push(EmphasisSpan {
2349 line: line_num,
2350 start_col: col_start,
2351 end_col: col_end,
2352 byte_offset: match_start,
2353 byte_end: match_end,
2354 marker,
2355 marker_count,
2356 content: content_part.to_string(),
2357 });
2358 }
2359
2360 emphasis_spans
2361 }
2362
2363 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2365 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2366
2367 for (line_idx, line_info) in lines.iter().enumerate() {
2368 if line_info.in_code_block || line_info.is_blank {
2370 continue;
2371 }
2372
2373 let line = &line_info.content;
2374 let line_num = line_idx + 1;
2375
2376 if !line.contains('|') {
2378 continue;
2379 }
2380
2381 let parts: Vec<&str> = line.split('|').collect();
2383 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2384
2385 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2387 let mut column_alignments = Vec::new();
2388
2389 if is_separator {
2390 for part in &parts[1..parts.len() - 1] {
2391 let trimmed = part.trim();
2393 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2394 "center".to_string()
2395 } else if trimmed.ends_with(':') {
2396 "right".to_string()
2397 } else if trimmed.starts_with(':') {
2398 "left".to_string()
2399 } else {
2400 "none".to_string()
2401 };
2402 column_alignments.push(alignment);
2403 }
2404 }
2405
2406 table_rows.push(TableRow {
2407 line: line_num,
2408 is_separator,
2409 column_count,
2410 column_alignments,
2411 });
2412 }
2413
2414 table_rows
2415 }
2416
2417 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2419 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2420
2421 for cap in BARE_URL_PATTERN.captures_iter(content) {
2423 let full_match = cap.get(0).unwrap();
2424 let match_start = full_match.start();
2425 let match_end = full_match.end();
2426
2427 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2429 continue;
2430 }
2431
2432 let preceding_char = if match_start > 0 {
2434 content.chars().nth(match_start - 1)
2435 } else {
2436 None
2437 };
2438 let following_char = content.chars().nth(match_end);
2439
2440 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2441 continue;
2442 }
2443 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2444 continue;
2445 }
2446
2447 let url = full_match.as_str();
2448 let url_type = if url.starts_with("https://") {
2449 "https"
2450 } else if url.starts_with("http://") {
2451 "http"
2452 } else if url.starts_with("ftp://") {
2453 "ftp"
2454 } else {
2455 "other"
2456 };
2457
2458 let mut line_num = 1;
2460 let mut col_start = match_start;
2461 let mut col_end = match_end;
2462 for (idx, line_info) in lines.iter().enumerate() {
2463 if match_start >= line_info.byte_offset {
2464 line_num = idx + 1;
2465 col_start = match_start - line_info.byte_offset;
2466 col_end = match_end - line_info.byte_offset;
2467 } else {
2468 break;
2469 }
2470 }
2471
2472 bare_urls.push(BareUrl {
2473 line: line_num,
2474 start_col: col_start,
2475 end_col: col_end,
2476 byte_offset: match_start,
2477 byte_end: match_end,
2478 url: url.to_string(),
2479 url_type: url_type.to_string(),
2480 });
2481 }
2482
2483 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2485 let full_match = cap.get(0).unwrap();
2486 let match_start = full_match.start();
2487 let match_end = full_match.end();
2488
2489 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2491 continue;
2492 }
2493
2494 let preceding_char = if match_start > 0 {
2496 content.chars().nth(match_start - 1)
2497 } else {
2498 None
2499 };
2500 let following_char = content.chars().nth(match_end);
2501
2502 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2503 continue;
2504 }
2505 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2506 continue;
2507 }
2508
2509 let email = full_match.as_str();
2510
2511 let mut line_num = 1;
2513 let mut col_start = match_start;
2514 let mut col_end = match_end;
2515 for (idx, line_info) in lines.iter().enumerate() {
2516 if match_start >= line_info.byte_offset {
2517 line_num = idx + 1;
2518 col_start = match_start - line_info.byte_offset;
2519 col_end = match_end - line_info.byte_offset;
2520 } else {
2521 break;
2522 }
2523 }
2524
2525 bare_urls.push(BareUrl {
2526 line: line_num,
2527 start_col: col_start,
2528 end_col: col_end,
2529 byte_offset: match_start,
2530 byte_end: match_end,
2531 url: email.to_string(),
2532 url_type: "email".to_string(),
2533 });
2534 }
2535
2536 bare_urls
2537 }
2538}
2539
2540fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2542 if list_blocks.len() < 2 {
2543 return;
2544 }
2545
2546 let mut merger = ListBlockMerger::new(lines);
2547 *list_blocks = merger.merge(list_blocks);
2548}
2549
2550struct ListBlockMerger<'a> {
2552 lines: &'a [LineInfo],
2553}
2554
2555impl<'a> ListBlockMerger<'a> {
2556 fn new(lines: &'a [LineInfo]) -> Self {
2557 Self { lines }
2558 }
2559
2560 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2561 let mut merged = Vec::with_capacity(list_blocks.len());
2562 let mut current = list_blocks[0].clone();
2563
2564 for next in list_blocks.iter().skip(1) {
2565 if self.should_merge_blocks(¤t, next) {
2566 current = self.merge_two_blocks(current, next);
2567 } else {
2568 merged.push(current);
2569 current = next.clone();
2570 }
2571 }
2572
2573 merged.push(current);
2574 merged
2575 }
2576
2577 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2579 if !self.blocks_are_compatible(current, next) {
2581 return false;
2582 }
2583
2584 let spacing = self.analyze_spacing_between(current, next);
2586 match spacing {
2587 BlockSpacing::Consecutive => true,
2588 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2589 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2590 self.can_merge_with_content_between(current, next)
2591 }
2592 }
2593 }
2594
2595 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2597 current.is_ordered == next.is_ordered
2598 && current.blockquote_prefix == next.blockquote_prefix
2599 && current.nesting_level == next.nesting_level
2600 }
2601
2602 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2604 let gap = next.start_line - current.end_line;
2605
2606 match gap {
2607 1 => BlockSpacing::Consecutive,
2608 2 => BlockSpacing::SingleBlank,
2609 _ if gap > 2 => {
2610 if self.has_only_blank_lines_between(current, next) {
2611 BlockSpacing::MultipleBlanks
2612 } else {
2613 BlockSpacing::ContentBetween
2614 }
2615 }
2616 _ => BlockSpacing::Consecutive, }
2618 }
2619
2620 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2622 if has_meaningful_content_between(current, next, self.lines) {
2625 return false; }
2627
2628 !current.is_ordered && current.marker == next.marker
2630 }
2631
2632 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2634 if has_meaningful_content_between(current, next, self.lines) {
2636 return false; }
2638
2639 current.is_ordered && next.is_ordered
2641 }
2642
2643 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2645 for line_num in (current.end_line + 1)..next.start_line {
2646 if let Some(line_info) = self.lines.get(line_num - 1)
2647 && !line_info.content.trim().is_empty()
2648 {
2649 return false;
2650 }
2651 }
2652 true
2653 }
2654
2655 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2657 current.end_line = next.end_line;
2658 current.item_lines.extend_from_slice(&next.item_lines);
2659
2660 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2662
2663 if !current.is_ordered && self.markers_differ(¤t, next) {
2665 current.marker = None; }
2667
2668 current
2669 }
2670
2671 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2673 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2674 }
2675}
2676
/// How two neighbouring list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The line distance between the blocks is 1 (or the fallback case).
    Consecutive,
    /// The line distance between the blocks is 2.
    SingleBlank,
    /// The line distance exceeds 2 and only blank lines lie in between.
    MultipleBlanks,
    /// The line distance exceeds 2 and some line in between is not blank.
    ContentBetween,
}
2685
2686fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2688 for line_num in (current.end_line + 1)..next.start_line {
2690 if let Some(line_info) = lines.get(line_num - 1) {
2691 let trimmed = line_info.content.trim();
2693
2694 if trimmed.is_empty() {
2696 continue;
2697 }
2698
2699 if line_info.heading.is_some() {
2703 return true; }
2705
2706 if is_horizontal_rule(trimmed) {
2708 return true; }
2710
2711 if trimmed.contains('|') && trimmed.len() > 1 {
2714 if !trimmed.contains("](") && !trimmed.contains("http") {
2716 let pipe_count = trimmed.matches('|').count();
2718 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2719 return true; }
2721 }
2722 }
2723
2724 if trimmed.starts_with('>') {
2726 return true; }
2728
2729 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2731 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2732
2733 let min_continuation_indent = if current.is_ordered {
2735 current.nesting_level + current.max_marker_width + 1 } else {
2737 current.nesting_level + 2
2738 };
2739
2740 if line_indent < min_continuation_indent {
2741 return true; }
2744 }
2745
2746 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2748
2749 let min_indent = if current.is_ordered {
2751 current.nesting_level + current.max_marker_width
2752 } else {
2753 current.nesting_level + 2
2754 };
2755
2756 if line_indent < min_indent {
2758 return true; }
2760
2761 }
2764 }
2765
2766 false
2768}
2769
/// Returns `true` when `trimmed` is a horizontal rule: at least three copies
/// of one rule character (`-`, `*` or `_`), with nothing else on the line
/// except spaces and tabs.
///
/// The first character decides which rule character is expected, so the
/// caller should pass text without leading whitespace. Inputs shorter than
/// three bytes are rejected immediately.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let rule_char = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Anything other than the rule character or inline whitespace disqualifies the line.
    let only_rule_and_blanks = trimmed.chars().all(|c| c == rule_char || c == ' ' || c == '\t');

    only_rule_and_blanks && trimmed.chars().filter(|&c| c == rule_char).count() >= 3
}
2793
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: one line offset, no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start of "# Title"
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // the empty second line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of "Second line"
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // inside "Second line"
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of "Third line"
    }

    /// Per-line metadata: content, byte offset, indent and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces so that it matches the
        // `indent == 4` assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // The helper accessors take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers, ordering and columns for ordered and unordered items.
    #[test]
    fn test_list_item_detection() {
        // "* Nested item" sits two columns in, matching the `marker_column == 2`
        // assertion. "2) Nested ordered" is indented to the content column of
        // "1. " so that it is nested under the ordered item.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Plain paragraph text after the blank line is not a list item.
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that land on newline bytes and on the end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // newline after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // newline after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // one past 'c' (end of content)
    }
}