1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::ast_utils::get_cached_ast;
4use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
5use lazy_static::lazy_static;
6use markdown::mdast::Node;
7use regex::Regex;
8
// Shared, lazily compiled regular expressions used while parsing a document.
lazy_static! {
    // Inline link `[text](url)` or reference link `[text][id]`.
    // The `x` flag allows the pattern to span lines and carry `#` comments;
    // the `s` flag lets `.` match newlines.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    // Same shape as LINK_PATTERN but requires the leading `!` of an image.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    // Reference definition `[id]: url "title"` with up to three leading spaces.
    // Group 1 = id, group 2 = url, group 3 / 4 = double- / single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks (a code span delimiter candidate).
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http`, `https` or `ftp` URL with optional port, path, query and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address (ASCII local part, domain and a TLD of two or more letters).
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Angle-bracket autolink: `<scheme://...>` or `<user@host.tld>`; group 1 is the target.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Leading blockquote prefix: optional whitespace, one or more `>`, optional whitespace.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
63
/// Pre-computed facts about a single line of the document.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines` (no line terminator).
    pub content: String,
    /// Byte offset of the start of this line within the whole document.
    pub byte_offset: usize,
    /// Number of bytes of leading whitespace.
    pub indent: usize,
    /// True when the line is whitespace-only, or is a blockquote line whose
    /// text after the `>` prefix is whitespace-only.
    pub is_blank: bool,
    /// True when the start of the line falls inside a detected code block.
    pub in_code_block: bool,
    /// True when the line lies before the end of the front matter.
    pub in_front_matter: bool,
    /// True when the line was marked as part of an HTML block
    /// (initialised to `false`, filled in by a later pass).
    pub in_html_block: bool,
    /// List marker details when the line starts a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a Setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with one or more `>` markers.
    pub blockquote: Option<BlockquoteInfo>,
    /// True when the flavor is MkDocs and the line is within an autodoc block.
    pub in_mkdocstrings: bool,
}
90
/// Marker information for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*`, `+`, or the number plus its delimiter (e.g. `1.` / `1)`).
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// Parsed number of an ordered item; `None` for bullets or when the digits do not fit `usize`.
    pub number: Option<usize>,
    /// Byte column of the marker: blockquote prefix length plus leading whitespace.
    pub marker_column: usize,
    /// Byte column where the item text starts: marker column plus marker and spacing lengths.
    pub content_column: usize,
}
105
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Text underlined with `=` (level 1).
    Setext1,
    /// Text underlined with `-` (level 2).
    Setext2,
}
116
/// A link found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// Link text (the part between the first pair of brackets).
    pub text: String,
    /// Inline URL; empty for reference-style links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased text when the id is empty.
    pub reference_id: Option<String>,
}
139
/// An image found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the image start (the `!`) within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// Alternative text (the part between the first pair of brackets).
    pub alt_text: String,
    /// Inline URL; empty for reference-style images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased alt text when the id is empty.
    pub reference_id: Option<String>,
}
162
/// A reference definition of the form `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Target URL exactly as written.
    pub url: String,
    /// Optional title taken from the double- or single-quoted part.
    pub title: Option<String>,
}
175
/// An inline code span.
///
/// NOTE(review): the code that fills this struct is outside the visible part of
/// the file; the field notes below reflect how the fields are consumed here.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-based line numbers).
    pub line: usize,
    /// Start column; compared against a 0-based column as an inclusive bound.
    pub start_col: usize,
    /// End column; compared against a 0-based column as an exclusive bound.
    pub end_col: usize,
    /// Byte offset of the span start in the document (inclusive).
    pub byte_offset: usize,
    /// Byte offset of the span end in the document (exclusive).
    pub byte_end: usize,
    /// Presumably the number of backticks in the delimiter — TODO confirm.
    pub backtick_count: usize,
    /// Presumably the text between the delimiters — TODO confirm.
    pub content: String,
}
194
/// Details of a heading attached to the line that carries its text.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: number of `#` for ATX, 1 for `=` and 2 for `-` underlines.
    pub level: u8,
    /// Syntax the heading was written in.
    pub style: HeadingStyle,
    /// The `#` run for ATX headings, or the trimmed underline for Setext headings.
    pub marker: String,
    /// Byte column of the marker (for Setext: leading whitespace of the underline line).
    pub marker_column: usize,
    /// Byte column where the heading text starts.
    pub content_column: usize,
    /// Heading text with any custom id removed.
    pub text: String,
    /// Custom id taken from the heading itself or from a standalone attribute list on a following line.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction.
    pub raw_text: String,
    /// True when an ATX heading ends with a closing `#` sequence.
    pub has_closing_sequence: bool,
    /// The closing `#` sequence, or empty when there is none.
    pub closing_sequence: String,
}
219
/// Details of a line that starts with blockquote markers.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading marker run.
    pub nesting_level: usize,
    /// Whitespace before the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (length of `indent`).
    pub marker_column: usize,
    /// Indent, markers and the whitespace following them, concatenated.
    pub prefix: String,
    /// Text after the prefix.
    pub content: String,
    /// True when text follows the markers with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace byte, or a tab, follows the markers.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the line holds only markers: no text and no whitespace after them.
    pub needs_md028_fix: bool,
}
240
/// A contiguous group of list lines.
///
/// NOTE(review): the code that builds list blocks is outside the visible part of
/// the file; notes marked "presumably" are inferred from field names only.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (inclusive bound in the lookups in this file).
    pub start_line: usize,
    /// Last line of the block (inclusive bound in the lookups in this file).
    pub end_line: usize,
    /// Presumably whether the block is an ordered list — TODO confirm.
    pub is_ordered: bool,
    /// Presumably the shared marker of the block, if consistent — TODO confirm.
    pub marker: Option<String>,
    /// Presumably the blockquote prefix shared by the block's lines — TODO confirm.
    pub blockquote_prefix: String,
    /// Presumably the line numbers of the list items in the block — TODO confirm.
    pub item_lines: Vec<usize>,
    /// Presumably the nesting depth of the block — TODO confirm.
    pub nesting_level: usize,
    /// Presumably the widest marker seen in the block — TODO confirm.
    pub max_marker_width: usize,
}
261
262use std::sync::{Arc, Mutex};
263
/// Per-document counts of characters that matter to Markdown syntax.
///
/// The character each counter is reported for is taken from `has_char` /
/// `char_count`; the counting code itself is not in the visible part of the file.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[` (NOTE(review): may also include `]` — confirm against the counting code).
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for `\n`.
    pub newline_count: usize,
}
292
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this struct is outside the visible part of
/// the file; field notes are inferred from names unless stated otherwise.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (used for per-line filtering).
    pub line: usize,
    /// Presumably the start column of the tag on its line — TODO confirm indexing base.
    pub start_col: usize,
    /// Presumably the end column of the tag on its line — TODO confirm indexing base.
    pub end_col: usize,
    /// Presumably the byte offset of the tag start in the document.
    pub byte_offset: usize,
    /// Presumably the byte offset of the tag end in the document.
    pub byte_end: usize,
    /// Presumably the element name — TODO confirm whether it is case-normalised.
    pub tag_name: String,
    /// Presumably true for `</name>` tags.
    pub is_closing: bool,
    /// Presumably true for `<name/>` tags.
    pub is_self_closing: bool,
    /// Presumably the tag text as written in the source.
    pub raw_content: String,
}
315
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this struct is outside the visible part of
/// the file; field notes are inferred from names unless stated otherwise.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (used for per-line filtering).
    pub line: usize,
    /// Presumably the start column of the span on its line — TODO confirm indexing base.
    pub start_col: usize,
    /// Presumably the end column of the span on its line — TODO confirm indexing base.
    pub end_col: usize,
    /// Presumably the byte offset of the span start in the document.
    pub byte_offset: usize,
    /// Presumably the byte offset of the span end in the document.
    pub byte_end: usize,
    /// Presumably the delimiter character (`*` or `_`) — TODO confirm.
    pub marker: char,
    /// Presumably the number of delimiter characters on each side — TODO confirm.
    pub marker_count: usize,
    /// Presumably the text between the delimiters.
    pub content: String,
}
336
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this struct is outside the visible part of
/// the file; field notes are inferred from names unless stated otherwise.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (used for per-line filtering).
    pub line: usize,
    /// Presumably true for the delimiter row between header and body — TODO confirm.
    pub is_separator: bool,
    /// Presumably the number of cells in the row.
    pub column_count: usize,
    /// Presumably the per-column alignment read from a separator row — TODO confirm value format.
    pub column_alignments: Vec<String>,
}
349
/// A bare URL or e-mail address found in the document.
///
/// NOTE(review): the parser that fills this struct is outside the visible part of
/// the file; field notes are inferred from names unless stated otherwise.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (used for per-line filtering).
    pub line: usize,
    /// Presumably the start column of the URL on its line — TODO confirm indexing base.
    pub start_col: usize,
    /// Presumably the end column of the URL on its line — TODO confirm indexing base.
    pub end_col: usize,
    /// Presumably the byte offset of the URL start in the document.
    pub byte_offset: usize,
    /// Presumably the byte offset of the URL end in the document.
    pub byte_end: usize,
    /// Presumably the matched URL or address text.
    pub url: String,
    /// Presumably a tag describing the kind of match (scheme or e-mail) — TODO confirm values.
    pub url_type: String,
}
368
/// Parsed view of one Markdown document, shared by lint rules.
///
/// Line, link, image, reference and list data are computed in `new`; the
/// private `*_cache` fields are filled on first use by their accessor methods.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset at which each line starts: `0`, then the byte after every `\n`.
    pub line_offsets: Vec<usize>,
    /// Byte ranges `(start, end)` of detected code blocks.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, in document order.
    pub lines: Vec<LineInfo>,
    /// Links outside code blocks and code spans.
    pub links: Vec<ParsedLink>,
    /// Images outside code blocks and code spans.
    pub images: Vec<ParsedImage>,
    /// Reference definitions outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    /// Inline code spans (pre-filled by `new`).
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Groups of list lines.
    pub list_blocks: Vec<ListBlock>,
    /// Counts of syntax-relevant characters in the document.
    pub char_frequency: CharFrequency,
    /// Lazily parsed HTML tags.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazily parsed emphasis spans.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazily parsed table rows.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazily parsed bare URLs.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Lazily fetched document AST.
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
387
388impl<'a> LintContext<'a> {
389 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
390 let mut line_offsets = vec![0];
391 for (i, c) in content.char_indices() {
392 if c == '\n' {
393 line_offsets.push(i + 1);
394 }
395 }
396
397 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
399
400 let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
402
403 let ast = get_cached_ast(content);
405 let code_spans = Self::parse_code_spans(content, &lines, &ast);
406
407 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
409 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
410 let reference_defs = Self::parse_reference_defs(content, &lines);
411 let list_blocks = Self::parse_list_blocks(&lines);
414
415 Self::detect_html_blocks(&mut lines);
417
418 let char_frequency = Self::compute_char_frequency(content);
420
421 Self {
422 content,
423 line_offsets,
424 code_blocks,
425 lines,
426 links,
427 images,
428 reference_defs,
429 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
430 list_blocks,
431 char_frequency,
432 html_tags_cache: Mutex::new(None),
433 emphasis_spans_cache: Mutex::new(None),
434 table_rows_cache: Mutex::new(None),
435 bare_urls_cache: Mutex::new(None),
436 ast_cache: Mutex::new(None),
437 flavor,
438 }
439 }
440
441 pub fn get_ast(&self) -> Arc<Node> {
443 let mut cache = self.ast_cache.lock().unwrap();
444
445 if cache.is_none() {
446 *cache = Some(get_cached_ast(self.content));
449 }
450
451 cache.as_ref().unwrap().clone()
452 }
453
454 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
456 let mut cache = self.code_spans_cache.lock().unwrap();
457
458 if cache.is_none() {
460 let ast = self.get_ast();
461 let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
462 *cache = Some(Arc::new(code_spans));
463 }
464
465 cache.as_ref().unwrap().clone()
467 }
468
469 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
471 let mut cache = self.html_tags_cache.lock().unwrap();
472
473 if cache.is_none() {
474 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
475 *cache = Some(Arc::new(html_tags));
476 }
477
478 cache.as_ref().unwrap().clone()
479 }
480
481 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
483 let mut cache = self.emphasis_spans_cache.lock().unwrap();
484
485 if cache.is_none() {
486 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
487 *cache = Some(Arc::new(emphasis_spans));
488 }
489
490 cache.as_ref().unwrap().clone()
491 }
492
493 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
495 let mut cache = self.table_rows_cache.lock().unwrap();
496
497 if cache.is_none() {
498 let table_rows = Self::parse_table_rows(&self.lines);
499 *cache = Some(Arc::new(table_rows));
500 }
501
502 cache.as_ref().unwrap().clone()
503 }
504
505 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
507 let mut cache = self.bare_urls_cache.lock().unwrap();
508
509 if cache.is_none() {
510 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
511 *cache = Some(Arc::new(bare_urls));
512 }
513
514 cache.as_ref().unwrap().clone()
515 }
516
517 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
519 match self.line_offsets.binary_search(&offset) {
520 Ok(line) => (line + 1, 1),
521 Err(line) => {
522 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
523 (line, offset - line_start + 1)
524 }
525 }
526 }
527
528 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
530 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
532 return true;
533 }
534
535 self.code_spans()
537 .iter()
538 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
539 }
540
541 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
543 if line_num > 0 {
544 self.lines.get(line_num - 1)
545 } else {
546 None
547 }
548 }
549
550 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
552 self.line_info(line_num).map(|info| info.byte_offset)
553 }
554
555 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
557 let normalized_id = ref_id.to_lowercase();
558 self.reference_defs
559 .iter()
560 .find(|def| def.id == normalized_id)
561 .map(|def| def.url.as_str())
562 }
563
564 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
566 self.links.iter().filter(|link| link.line == line_num).collect()
567 }
568
569 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
571 self.images.iter().filter(|img| img.line == line_num).collect()
572 }
573
574 pub fn is_in_list_block(&self, line_num: usize) -> bool {
576 self.list_blocks
577 .iter()
578 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
579 }
580
581 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
583 self.list_blocks
584 .iter()
585 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
586 }
587
588 pub fn is_in_code_block(&self, line_num: usize) -> bool {
592 if line_num == 0 || line_num > self.lines.len() {
593 return false;
594 }
595 self.lines[line_num - 1].in_code_block
596 }
597
598 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
600 if line_num == 0 || line_num > self.lines.len() {
601 return false;
602 }
603 self.lines[line_num - 1].in_front_matter
604 }
605
606 pub fn is_in_html_block(&self, line_num: usize) -> bool {
608 if line_num == 0 || line_num > self.lines.len() {
609 return false;
610 }
611 self.lines[line_num - 1].in_html_block
612 }
613
614 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
616 if line_num == 0 || line_num > self.lines.len() {
617 return false;
618 }
619
620 let col_0indexed = if col > 0 { col - 1 } else { 0 };
624 let code_spans = self.code_spans();
625 code_spans
626 .iter()
627 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
628 }
629
630 pub fn has_char(&self, ch: char) -> bool {
632 match ch {
633 '#' => self.char_frequency.hash_count > 0,
634 '*' => self.char_frequency.asterisk_count > 0,
635 '_' => self.char_frequency.underscore_count > 0,
636 '-' => self.char_frequency.hyphen_count > 0,
637 '+' => self.char_frequency.plus_count > 0,
638 '>' => self.char_frequency.gt_count > 0,
639 '|' => self.char_frequency.pipe_count > 0,
640 '[' => self.char_frequency.bracket_count > 0,
641 '`' => self.char_frequency.backtick_count > 0,
642 '<' => self.char_frequency.lt_count > 0,
643 '!' => self.char_frequency.exclamation_count > 0,
644 '\n' => self.char_frequency.newline_count > 0,
645 _ => self.content.contains(ch), }
647 }
648
649 pub fn char_count(&self, ch: char) -> usize {
651 match ch {
652 '#' => self.char_frequency.hash_count,
653 '*' => self.char_frequency.asterisk_count,
654 '_' => self.char_frequency.underscore_count,
655 '-' => self.char_frequency.hyphen_count,
656 '+' => self.char_frequency.plus_count,
657 '>' => self.char_frequency.gt_count,
658 '|' => self.char_frequency.pipe_count,
659 '[' => self.char_frequency.bracket_count,
660 '`' => self.char_frequency.backtick_count,
661 '<' => self.char_frequency.lt_count,
662 '!' => self.char_frequency.exclamation_count,
663 '\n' => self.char_frequency.newline_count,
664 _ => self.content.matches(ch).count(), }
666 }
667
668 pub fn likely_has_headings(&self) -> bool {
670 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
672
673 pub fn likely_has_lists(&self) -> bool {
675 self.char_frequency.asterisk_count > 0
676 || self.char_frequency.hyphen_count > 0
677 || self.char_frequency.plus_count > 0
678 }
679
680 pub fn likely_has_emphasis(&self) -> bool {
682 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
683 }
684
685 pub fn likely_has_tables(&self) -> bool {
687 self.char_frequency.pipe_count > 2
688 }
689
690 pub fn likely_has_blockquotes(&self) -> bool {
692 self.char_frequency.gt_count > 0
693 }
694
695 pub fn likely_has_code(&self) -> bool {
697 self.char_frequency.backtick_count > 0
698 }
699
700 pub fn likely_has_links_or_images(&self) -> bool {
702 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
703 }
704
705 pub fn likely_has_html(&self) -> bool {
707 self.char_frequency.lt_count > 0
708 }
709
710 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
712 self.html_tags()
713 .iter()
714 .filter(|tag| tag.line == line_num)
715 .cloned()
716 .collect()
717 }
718
719 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
721 self.emphasis_spans()
722 .iter()
723 .filter(|span| span.line == line_num)
724 .cloned()
725 .collect()
726 }
727
728 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
730 self.table_rows()
731 .iter()
732 .filter(|row| row.line == line_num)
733 .cloned()
734 .collect()
735 }
736
737 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
739 self.bare_urls()
740 .iter()
741 .filter(|url| url.line == line_num)
742 .cloned()
743 .collect()
744 }
745
746 fn parse_links(
748 content: &str,
749 lines: &[LineInfo],
750 code_blocks: &[(usize, usize)],
751 code_spans: &[CodeSpan],
752 flavor: MarkdownFlavor,
753 ) -> Vec<ParsedLink> {
754 use crate::utils::skip_context::is_mkdocs_snippet_line;
755
756 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
761 let full_match = cap.get(0).unwrap();
762 let match_start = full_match.start();
763 let match_end = full_match.end();
764
765 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
767 continue;
768 }
769
770 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
772 continue;
773 }
774
775 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
777 continue;
778 }
779
780 if code_spans
782 .iter()
783 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
784 {
785 continue;
786 }
787
788 let line_idx = lines
791 .iter()
792 .position(|line| {
793 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
794 })
795 .unwrap_or(0);
796
797 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
798 continue;
799 }
800
801 let mut line_num = 1;
803 let mut col_start = match_start;
804 for (idx, line_info) in lines.iter().enumerate() {
805 if match_start >= line_info.byte_offset {
806 line_num = idx + 1;
807 col_start = match_start - line_info.byte_offset;
808 } else {
809 break;
810 }
811 }
812
813 let mut end_line_num = 1;
815 let mut col_end = match_end;
816 for (idx, line_info) in lines.iter().enumerate() {
817 if match_end > line_info.byte_offset {
818 end_line_num = idx + 1;
819 col_end = match_end - line_info.byte_offset;
820 } else {
821 break;
822 }
823 }
824
825 if line_num == end_line_num {
827 } else {
829 }
832
833 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
834
835 if let Some(inline_url) = cap.get(2) {
836 links.push(ParsedLink {
838 line: line_num,
839 start_col: col_start,
840 end_col: col_end,
841 byte_offset: match_start,
842 byte_end: match_end,
843 text,
844 url: inline_url.as_str().to_string(),
845 is_reference: false,
846 reference_id: None,
847 });
848 } else if let Some(ref_id) = cap.get(3) {
849 let ref_id_str = ref_id.as_str();
851 let normalized_ref = if ref_id_str.is_empty() {
852 text.to_lowercase() } else {
854 ref_id_str.to_lowercase()
855 };
856
857 links.push(ParsedLink {
858 line: line_num,
859 start_col: col_start,
860 end_col: col_end,
861 byte_offset: match_start,
862 byte_end: match_end,
863 text,
864 url: String::new(), is_reference: true,
866 reference_id: Some(normalized_ref),
867 });
868 }
869 }
870
871 links
872 }
873
874 fn parse_images(
876 content: &str,
877 lines: &[LineInfo],
878 code_blocks: &[(usize, usize)],
879 code_spans: &[CodeSpan],
880 ) -> Vec<ParsedImage> {
881 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
886 let full_match = cap.get(0).unwrap();
887 let match_start = full_match.start();
888 let match_end = full_match.end();
889
890 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
892 continue;
893 }
894
895 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
897 continue;
898 }
899
900 if code_spans
902 .iter()
903 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
904 {
905 continue;
906 }
907
908 let mut line_num = 1;
910 let mut col_start = match_start;
911 for (idx, line_info) in lines.iter().enumerate() {
912 if match_start >= line_info.byte_offset {
913 line_num = idx + 1;
914 col_start = match_start - line_info.byte_offset;
915 } else {
916 break;
917 }
918 }
919
920 let mut end_line_num = 1;
922 let mut col_end = match_end;
923 for (idx, line_info) in lines.iter().enumerate() {
924 if match_end > line_info.byte_offset {
925 end_line_num = idx + 1;
926 col_end = match_end - line_info.byte_offset;
927 } else {
928 break;
929 }
930 }
931
932 if line_num == end_line_num {
934 } else {
936 }
939
940 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
941
942 if let Some(inline_url) = cap.get(2) {
943 images.push(ParsedImage {
945 line: line_num,
946 start_col: col_start,
947 end_col: col_end,
948 byte_offset: match_start,
949 byte_end: match_end,
950 alt_text,
951 url: inline_url.as_str().to_string(),
952 is_reference: false,
953 reference_id: None,
954 });
955 } else if let Some(ref_id) = cap.get(3) {
956 let ref_id_str = ref_id.as_str();
958 let normalized_ref = if ref_id_str.is_empty() {
959 alt_text.to_lowercase() } else {
961 ref_id_str.to_lowercase()
962 };
963
964 images.push(ParsedImage {
965 line: line_num,
966 start_col: col_start,
967 end_col: col_end,
968 byte_offset: match_start,
969 byte_end: match_end,
970 alt_text,
971 url: String::new(), is_reference: true,
973 reference_id: Some(normalized_ref),
974 });
975 }
976 }
977
978 images
979 }
980
981 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
983 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
987 if line_info.in_code_block {
989 continue;
990 }
991
992 let line = &line_info.content;
993 let line_num = line_idx + 1;
994
995 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
996 let id = cap.get(1).unwrap().as_str().to_lowercase();
997 let url = cap.get(2).unwrap().as_str().to_string();
998 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
999
1000 refs.push(ReferenceDef {
1001 line: line_num,
1002 id,
1003 url,
1004 title,
1005 });
1006 }
1007 }
1008
1009 refs
1010 }
1011
1012 fn compute_line_info(
1014 content: &str,
1015 line_offsets: &[usize],
1016 code_blocks: &[(usize, usize)],
1017 flavor: MarkdownFlavor,
1018 ) -> Vec<LineInfo> {
1019 lazy_static! {
1020 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1022 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1023
1024 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1026
1027 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1029 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1030
1031 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1033 }
1034
1035 let content_lines: Vec<&str> = content.lines().collect();
1036 let mut lines = Vec::with_capacity(content_lines.len());
1037
1038 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1041
1042 for (i, line) in content_lines.iter().enumerate() {
1043 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1044 let indent = line.len() - line.trim_start().len();
1045 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1047 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1049 after_prefix.trim().is_empty()
1050 } else {
1051 line.trim().is_empty()
1052 };
1053 let in_code_block = code_blocks.iter().any(|&(start, end)| {
1056 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1061 let mut boundary = start;
1063 while boundary > 0 && !content.is_char_boundary(boundary) {
1064 boundary -= 1;
1065 }
1066 boundary
1067 } else {
1068 start
1069 };
1070
1071 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1072 let mut boundary = end;
1074 while boundary < content.len() && !content.is_char_boundary(boundary) {
1075 boundary += 1;
1076 }
1077 boundary
1078 } else {
1079 end.min(content.len())
1080 };
1081
1082 let block_content = &content[safe_start..safe_end];
1083 let is_multiline = block_content.contains('\n');
1084 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1085 let is_indented = !is_fenced
1086 && block_content
1087 .lines()
1088 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
1089
1090 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1091 });
1092
1093 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1095 && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1096 let list_item =
1097 if !(in_code_block || is_blank || in_mkdocstrings || (front_matter_end > 0 && i < front_matter_end)) {
1098 let (line_for_list_check, blockquote_prefix_len) =
1100 if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1101 let prefix = caps.get(1).unwrap().as_str();
1102 let content = caps.get(2).unwrap().as_str();
1103 (content, prefix.len())
1104 } else {
1105 (&**line, 0)
1106 };
1107
1108 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1109 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1110 let marker = caps.get(2).map_or("", |m| m.as_str());
1111 let spacing = caps.get(3).map_or("", |m| m.as_str());
1112 let _content = caps.get(4).map_or("", |m| m.as_str());
1113 let marker_column = blockquote_prefix_len + leading_spaces.len();
1114 let content_column = marker_column + marker.len() + spacing.len();
1115
1116 if spacing.is_empty() {
1123 None
1124 } else {
1125 Some(ListItemInfo {
1126 marker: marker.to_string(),
1127 is_ordered: false,
1128 number: None,
1129 marker_column,
1130 content_column,
1131 })
1132 }
1133 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1134 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1135 let number_str = caps.get(2).map_or("", |m| m.as_str());
1136 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1137 let spacing = caps.get(4).map_or("", |m| m.as_str());
1138 let _content = caps.get(5).map_or("", |m| m.as_str());
1139 let marker = format!("{number_str}{delimiter}");
1140 let marker_column = blockquote_prefix_len + leading_spaces.len();
1141 let content_column = marker_column + marker.len() + spacing.len();
1142
1143 if spacing.is_empty() {
1146 None
1147 } else {
1148 Some(ListItemInfo {
1149 marker,
1150 is_ordered: true,
1151 number: number_str.parse().ok(),
1152 marker_column,
1153 content_column,
1154 })
1155 }
1156 } else {
1157 None
1158 }
1159 } else {
1160 None
1161 };
1162
1163 lines.push(LineInfo {
1164 content: line.to_string(),
1165 byte_offset,
1166 indent,
1167 is_blank,
1168 in_code_block,
1169 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1170 in_html_block: false, list_item,
1172 heading: None, blockquote: None, in_mkdocstrings,
1175 });
1176 }
1177
1178 for i in 0..content_lines.len() {
1180 if lines[i].in_code_block {
1181 continue;
1182 }
1183
1184 if front_matter_end > 0 && i < front_matter_end {
1186 continue;
1187 }
1188
1189 let line = content_lines[i];
1190
1191 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1193 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1194 let markers = caps.get(2).map_or("", |m| m.as_str());
1195 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1196 let content = caps.get(4).map_or("", |m| m.as_str());
1197
1198 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1199 let marker_column = indent_str.len();
1200
1201 let prefix = format!("{indent_str}{markers}{spaces_after}");
1203
1204 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1206 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1208
1209 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1213
1214 lines[i].blockquote = Some(BlockquoteInfo {
1215 nesting_level,
1216 indent: indent_str.to_string(),
1217 marker_column,
1218 prefix,
1219 content: content.to_string(),
1220 has_no_space_after_marker: has_no_space,
1221 has_multiple_spaces_after_marker: has_multiple_spaces,
1222 needs_md028_fix,
1223 });
1224 }
1225
1226 if lines[i].is_blank {
1228 continue;
1229 }
1230
1231 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1234 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1235 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1236 } else {
1237 false
1238 };
1239
1240 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1241 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1243 continue;
1244 }
1245 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1246 let hashes = caps.get(2).map_or("", |m| m.as_str());
1247 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1248 let rest = caps.get(4).map_or("", |m| m.as_str());
1249
1250 let level = hashes.len() as u8;
1251 let marker_column = leading_spaces.len();
1252
1253 let (text, has_closing, closing_seq) = {
1255 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1257 if rest[id_start..].trim_end().ends_with('}') {
1259 (&rest[..id_start], &rest[id_start..])
1261 } else {
1262 (rest, "")
1263 }
1264 } else {
1265 (rest, "")
1266 };
1267
1268 let trimmed_rest = rest_without_id.trim_end();
1270 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1271 let mut start_of_hashes = last_hash_pos;
1273 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1274 start_of_hashes -= 1;
1275 }
1276
1277 let has_space_before = start_of_hashes == 0
1279 || trimmed_rest
1280 .chars()
1281 .nth(start_of_hashes - 1)
1282 .is_some_and(|c| c.is_whitespace());
1283
1284 let potential_closing = &trimmed_rest[start_of_hashes..];
1286 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1287
1288 if is_all_hashes && has_space_before {
1289 let closing_hashes = potential_closing.to_string();
1291 let text_part = if !custom_id_part.is_empty() {
1294 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1297 } else {
1298 rest_without_id[..start_of_hashes].trim_end().to_string()
1299 };
1300 (text_part, true, closing_hashes)
1301 } else {
1302 (rest.to_string(), false, String::new())
1304 }
1305 } else {
1306 (rest.to_string(), false, String::new())
1308 }
1309 };
1310
1311 let content_column = marker_column + hashes.len() + spaces_after.len();
1312
1313 let raw_text = text.trim().to_string();
1315 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1316
1317 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1319 let next_line = content_lines[i + 1];
1320 if !lines[i + 1].in_code_block
1321 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1322 && let Some(next_line_id) =
1323 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1324 {
1325 custom_id = Some(next_line_id);
1326 }
1327 }
1328
1329 lines[i].heading = Some(HeadingInfo {
1330 level,
1331 style: HeadingStyle::ATX,
1332 marker: hashes.to_string(),
1333 marker_column,
1334 content_column,
1335 text: clean_text,
1336 custom_id,
1337 raw_text,
1338 has_closing_sequence: has_closing,
1339 closing_sequence: closing_seq,
1340 });
1341 }
1342 else if i + 1 < content_lines.len() {
1344 let next_line = content_lines[i + 1];
1345 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1346 if front_matter_end > 0 && i < front_matter_end {
1348 continue;
1349 }
1350
1351 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1353 continue;
1354 }
1355
1356 let underline = next_line.trim();
1357
1358 if underline == "---" {
1361 continue;
1362 }
1363
1364 let current_line_trimmed = line.trim();
1366 if current_line_trimmed.contains(':')
1367 && !current_line_trimmed.starts_with('#')
1368 && !current_line_trimmed.contains('[')
1369 && !current_line_trimmed.contains("](")
1370 {
1371 continue;
1373 }
1374
1375 let level = if underline.starts_with('=') { 1 } else { 2 };
1376 let style = if level == 1 {
1377 HeadingStyle::Setext1
1378 } else {
1379 HeadingStyle::Setext2
1380 };
1381
1382 let raw_text = line.trim().to_string();
1384 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1385
1386 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1388 let attr_line = content_lines[i + 2];
1389 if !lines[i + 2].in_code_block
1390 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1391 && let Some(attr_line_id) =
1392 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1393 {
1394 custom_id = Some(attr_line_id);
1395 }
1396 }
1397
1398 lines[i].heading = Some(HeadingInfo {
1399 level,
1400 style,
1401 marker: underline.to_string(),
1402 marker_column: next_line.len() - next_line.trim_start().len(),
1403 content_column: lines[i].indent,
1404 text: clean_text,
1405 custom_id,
1406 raw_text,
1407 has_closing_sequence: false,
1408 closing_sequence: String::new(),
1409 });
1410 }
1411 }
1412 }
1413
1414 lines
1415 }
1416
1417 fn detect_html_blocks(lines: &mut [LineInfo]) {
1419 const BLOCK_ELEMENTS: &[&str] = &[
1421 "address",
1422 "article",
1423 "aside",
1424 "blockquote",
1425 "details",
1426 "dialog",
1427 "dd",
1428 "div",
1429 "dl",
1430 "dt",
1431 "fieldset",
1432 "figcaption",
1433 "figure",
1434 "footer",
1435 "form",
1436 "h1",
1437 "h2",
1438 "h3",
1439 "h4",
1440 "h5",
1441 "h6",
1442 "header",
1443 "hr",
1444 "li",
1445 "main",
1446 "nav",
1447 "ol",
1448 "p",
1449 "pre",
1450 "section",
1451 "table",
1452 "tbody",
1453 "td",
1454 "tfoot",
1455 "th",
1456 "thead",
1457 "tr",
1458 "ul",
1459 ];
1460
1461 let mut i = 0;
1462 while i < lines.len() {
1463 if lines[i].in_code_block || lines[i].in_front_matter {
1465 i += 1;
1466 continue;
1467 }
1468
1469 let trimmed = lines[i].content.trim_start();
1470
1471 if trimmed.starts_with('<') && trimmed.len() > 1 {
1473 let after_bracket = &trimmed[1..];
1475 let is_closing = after_bracket.starts_with('/');
1476 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1477
1478 let tag_name = tag_start
1480 .chars()
1481 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1482 .collect::<String>()
1483 .to_lowercase();
1484
1485 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1487 lines[i].in_html_block = true;
1489
1490 if !is_closing {
1493 let closing_tag = format!("</{tag_name}>");
1494 let mut j = i + 1;
1495 while j < lines.len() && j < i + 100 {
1496 if lines[j].is_blank {
1499 break;
1500 }
1501
1502 lines[j].in_html_block = true;
1503
1504 if lines[j].content.contains(&closing_tag) {
1506 break;
1507 }
1508 j += 1;
1509 }
1510 }
1511 }
1512 }
1513
1514 i += 1;
1515 }
1516 }
1517
1518 fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1520 let mut code_spans = Vec::new();
1521
1522 if !content.contains('`') {
1524 return code_spans;
1525 }
1526
1527 fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1529 match node {
1530 Node::InlineCode(inline_code) => {
1531 if let Some(pos) = &inline_code.position {
1532 let start_pos = pos.start.offset;
1533 let end_pos = pos.end.offset;
1534
1535 let full_span = &content[start_pos..end_pos];
1537 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1538
1539 let content_start = start_pos + backtick_count;
1541 let content_end = end_pos - backtick_count;
1542 let span_content = if content_start < content_end {
1543 content[content_start..content_end].to_string()
1544 } else {
1545 String::new()
1546 };
1547
1548 let mut line_num = 1;
1550 let mut col_start = start_pos;
1551 for (idx, line_info) in lines.iter().enumerate() {
1552 if start_pos >= line_info.byte_offset {
1553 line_num = idx + 1;
1554 col_start = start_pos - line_info.byte_offset;
1555 } else {
1556 break;
1557 }
1558 }
1559
1560 let mut col_end = end_pos;
1562 for line_info in lines.iter() {
1563 if end_pos > line_info.byte_offset {
1564 col_end = end_pos - line_info.byte_offset;
1565 } else {
1566 break;
1567 }
1568 }
1569
1570 spans.push(CodeSpan {
1571 line: line_num,
1572 start_col: col_start,
1573 end_col: col_end,
1574 byte_offset: start_pos,
1575 byte_end: end_pos,
1576 backtick_count,
1577 content: span_content,
1578 });
1579 }
1580 }
1581 Node::Root(root) => {
1583 for child in &root.children {
1584 extract_code_spans(child, content, lines, spans);
1585 }
1586 }
1587 Node::Paragraph(para) => {
1588 for child in ¶.children {
1589 extract_code_spans(child, content, lines, spans);
1590 }
1591 }
1592 Node::Heading(heading) => {
1593 for child in &heading.children {
1594 extract_code_spans(child, content, lines, spans);
1595 }
1596 }
1597 Node::List(list) => {
1598 for child in &list.children {
1599 extract_code_spans(child, content, lines, spans);
1600 }
1601 }
1602 Node::ListItem(item) => {
1603 for child in &item.children {
1604 extract_code_spans(child, content, lines, spans);
1605 }
1606 }
1607 Node::Blockquote(blockquote) => {
1608 for child in &blockquote.children {
1609 extract_code_spans(child, content, lines, spans);
1610 }
1611 }
1612 Node::Table(table) => {
1613 for child in &table.children {
1614 extract_code_spans(child, content, lines, spans);
1615 }
1616 }
1617 Node::TableRow(row) => {
1618 for child in &row.children {
1619 extract_code_spans(child, content, lines, spans);
1620 }
1621 }
1622 Node::TableCell(cell) => {
1623 for child in &cell.children {
1624 extract_code_spans(child, content, lines, spans);
1625 }
1626 }
1627 Node::Emphasis(emphasis) => {
1628 for child in &emphasis.children {
1629 extract_code_spans(child, content, lines, spans);
1630 }
1631 }
1632 Node::Strong(strong) => {
1633 for child in &strong.children {
1634 extract_code_spans(child, content, lines, spans);
1635 }
1636 }
1637 Node::Link(link) => {
1638 for child in &link.children {
1639 extract_code_spans(child, content, lines, spans);
1640 }
1641 }
1642 Node::LinkReference(link_ref) => {
1643 for child in &link_ref.children {
1644 extract_code_spans(child, content, lines, spans);
1645 }
1646 }
1647 Node::FootnoteDefinition(footnote) => {
1648 for child in &footnote.children {
1649 extract_code_spans(child, content, lines, spans);
1650 }
1651 }
1652 Node::Delete(delete) => {
1653 for child in &delete.children {
1654 extract_code_spans(child, content, lines, spans);
1655 }
1656 }
1657 Node::Code(_)
1659 | Node::Text(_)
1660 | Node::Html(_)
1661 | Node::Image(_)
1662 | Node::ImageReference(_)
1663 | Node::FootnoteReference(_)
1664 | Node::Break(_)
1665 | Node::ThematicBreak(_)
1666 | Node::Definition(_)
1667 | Node::Yaml(_)
1668 | Node::Toml(_)
1669 | Node::Math(_)
1670 | Node::InlineMath(_)
1671 | Node::MdxJsxFlowElement(_)
1672 | Node::MdxFlowExpression(_)
1673 | Node::MdxJsxTextElement(_)
1674 | Node::MdxTextExpression(_)
1675 | Node::MdxjsEsm(_) => {
1676 }
1678 }
1679 }
1680
1681 extract_code_spans(ast, content, lines, &mut code_spans);
1683
1684 code_spans.sort_by_key(|span| span.byte_offset);
1686
1687 code_spans
1688 }
1689
1690 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1692 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1695 let mut last_list_item_line = 0;
1696 let mut current_indent_level = 0;
1697 let mut last_marker_width = 0;
1698
1699 for (line_idx, line_info) in lines.iter().enumerate() {
1700 let line_num = line_idx + 1;
1701
1702 if line_info.in_code_block {
1704 if let Some(ref mut block) = current_block {
1705 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1707
1708 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1710
1711 match context {
1712 CodeBlockContext::Indented => {
1713 block.end_line = line_num;
1715 continue;
1716 }
1717 CodeBlockContext::Standalone => {
1718 let completed_block = current_block.take().unwrap();
1720 list_blocks.push(completed_block);
1721 continue;
1722 }
1723 CodeBlockContext::Adjacent => {
1724 block.end_line = line_num;
1726 continue;
1727 }
1728 }
1729 } else {
1730 continue;
1732 }
1733 }
1734
1735 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1737 caps.get(0).unwrap().as_str().to_string()
1738 } else {
1739 String::new()
1740 };
1741
1742 if let Some(list_item) = &line_info.list_item {
1744 let item_indent = list_item.marker_column;
1746 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1749 let is_nested = nesting > block.nesting_level;
1753 let same_type =
1754 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1755 let same_context = block.blockquote_prefix == blockquote_prefix;
1756 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1760 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1761
1762 let has_non_list_content = {
1764 let mut found_non_list = false;
1765 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1767
1768 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1770 let last_line = &lines[block_last_item_line - 1];
1771 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1772 log::debug!(
1773 "After problematic line {}: checking lines {} to {} for non-list content",
1774 block_last_item_line,
1775 block_last_item_line + 1,
1776 line_num
1777 );
1778 if line_num == block_last_item_line + 1 {
1780 log::debug!("Lines are consecutive, no content between");
1781 }
1782 }
1783 }
1784
1785 for check_line in (block_last_item_line + 1)..line_num {
1786 let check_idx = check_line - 1;
1787 if check_idx < lines.len() {
1788 let check_info = &lines[check_idx];
1789 let is_list_breaking_content = if check_info.in_code_block {
1791 let last_item_marker_width =
1793 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1794 lines[block_last_item_line - 1]
1795 .list_item
1796 .as_ref()
1797 .map(|li| {
1798 if li.is_ordered {
1799 li.marker.len() + 1 } else {
1801 li.marker.len()
1802 }
1803 })
1804 .unwrap_or(3) } else {
1806 3 };
1808
1809 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1810
1811 let context = CodeBlockUtils::analyze_code_block_context(
1813 lines,
1814 check_line - 1,
1815 min_continuation,
1816 );
1817
1818 matches!(context, CodeBlockContext::Standalone)
1820 } else if !check_info.is_blank && check_info.list_item.is_none() {
1821 let line_content = check_info.content.trim();
1823
1824 if check_info.heading.is_some()
1826 || line_content.starts_with("---")
1827 || line_content.starts_with("***")
1828 || line_content.starts_with("___")
1829 || (line_content.contains('|')
1830 && !line_content.contains("](")
1831 && !line_content.contains("http")
1832 && (line_content.matches('|').count() > 1
1833 || line_content.starts_with('|')
1834 || line_content.ends_with('|')))
1835 || line_content.starts_with(">")
1836 {
1837 true
1838 }
1839 else {
1841 let last_item_marker_width =
1842 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1843 lines[block_last_item_line - 1]
1844 .list_item
1845 .as_ref()
1846 .map(|li| {
1847 if li.is_ordered {
1848 li.marker.len() + 1 } else {
1850 li.marker.len()
1851 }
1852 })
1853 .unwrap_or(3) } else {
1855 3 };
1857
1858 let min_continuation =
1859 if block.is_ordered { last_item_marker_width } else { 2 };
1860 check_info.indent < min_continuation
1861 }
1862 } else {
1863 false
1864 };
1865
1866 if is_list_breaking_content {
1867 found_non_list = true;
1869 break;
1870 }
1871 }
1872 }
1873 found_non_list
1874 };
1875
1876 let mut continues_list = if is_nested {
1880 same_context && reasonable_distance && !has_non_list_content
1882 } else {
1883 let result = same_type
1885 && same_context
1886 && reasonable_distance
1887 && marker_compatible
1888 && !has_non_list_content;
1889
1890 if block.item_lines.last().is_some_and(|&last_line| {
1892 last_line > 0
1893 && last_line <= lines.len()
1894 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1895 && lines[last_line - 1].content.contains(r"\`")
1896 }) {
1897 log::debug!(
1898 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1899 );
1900 if line_num > 0 && line_num <= lines.len() {
1901 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1902 }
1903 }
1904
1905 result
1906 };
1907
1908 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1911 if block.item_lines.contains(&(line_num - 1)) {
1913 continues_list = true;
1915 }
1916 }
1917
1918 if continues_list {
1919 block.end_line = line_num;
1921 block.item_lines.push(line_num);
1922
1923 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1925 list_item.marker.len() + 1
1926 } else {
1927 list_item.marker.len()
1928 });
1929
1930 if !block.is_ordered
1932 && block.marker.is_some()
1933 && block.marker.as_ref() != Some(&list_item.marker)
1934 {
1935 block.marker = None;
1937 }
1938 } else {
1939 list_blocks.push(block.clone());
1942
1943 *block = ListBlock {
1944 start_line: line_num,
1945 end_line: line_num,
1946 is_ordered: list_item.is_ordered,
1947 marker: if list_item.is_ordered {
1948 None
1949 } else {
1950 Some(list_item.marker.clone())
1951 },
1952 blockquote_prefix: blockquote_prefix.clone(),
1953 item_lines: vec![line_num],
1954 nesting_level: nesting,
1955 max_marker_width: if list_item.is_ordered {
1956 list_item.marker.len() + 1
1957 } else {
1958 list_item.marker.len()
1959 },
1960 };
1961 }
1962 } else {
1963 current_block = Some(ListBlock {
1965 start_line: line_num,
1966 end_line: line_num,
1967 is_ordered: list_item.is_ordered,
1968 marker: if list_item.is_ordered {
1969 None
1970 } else {
1971 Some(list_item.marker.clone())
1972 },
1973 blockquote_prefix,
1974 item_lines: vec![line_num],
1975 nesting_level: nesting,
1976 max_marker_width: list_item.marker.len(),
1977 });
1978 }
1979
1980 last_list_item_line = line_num;
1981 current_indent_level = item_indent;
1982 last_marker_width = if list_item.is_ordered {
1983 list_item.marker.len() + 1 } else {
1985 list_item.marker.len()
1986 };
1987 } else if let Some(ref mut block) = current_block {
1988 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
1998 lines[block.end_line - 1].content.trim_end().ends_with('\\')
1999 } else {
2000 false
2001 };
2002
2003 let min_continuation_indent = if block.is_ordered {
2007 current_indent_level + last_marker_width
2008 } else {
2009 current_indent_level + 2 };
2011
2012 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2013 block.end_line = line_num;
2015 } else if line_info.is_blank {
2016 let mut check_idx = line_idx + 1;
2019 let mut found_continuation = false;
2020
2021 while check_idx < lines.len() && lines[check_idx].is_blank {
2023 check_idx += 1;
2024 }
2025
2026 if check_idx < lines.len() {
2027 let next_line = &lines[check_idx];
2028 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2030 found_continuation = true;
2031 }
2032 else if !next_line.in_code_block
2034 && next_line.list_item.is_some()
2035 && let Some(item) = &next_line.list_item
2036 {
2037 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2038 .find(&next_line.content)
2039 .map_or(String::new(), |m| m.as_str().to_string());
2040 if item.marker_column == current_indent_level
2041 && item.is_ordered == block.is_ordered
2042 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2043 {
2044 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2047 if let Some(between_line) = lines.get(idx) {
2048 let trimmed = between_line.content.trim();
2049 if trimmed.is_empty() {
2051 return false;
2052 }
2053 let line_indent =
2055 between_line.content.len() - between_line.content.trim_start().len();
2056
2057 if trimmed.starts_with("```")
2059 || trimmed.starts_with("~~~")
2060 || trimmed.starts_with("---")
2061 || trimmed.starts_with("***")
2062 || trimmed.starts_with("___")
2063 || trimmed.starts_with(">")
2064 || trimmed.contains('|') || between_line.heading.is_some()
2066 {
2067 return true; }
2069
2070 line_indent >= min_continuation_indent
2072 } else {
2073 false
2074 }
2075 });
2076
2077 if block.is_ordered {
2078 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2081 if let Some(between_line) = lines.get(idx) {
2082 let trimmed = between_line.content.trim();
2083 if trimmed.is_empty() {
2084 return false;
2085 }
2086 trimmed.starts_with("```")
2088 || trimmed.starts_with("~~~")
2089 || trimmed.starts_with("---")
2090 || trimmed.starts_with("***")
2091 || trimmed.starts_with("___")
2092 || trimmed.starts_with(">")
2093 || trimmed.contains('|') || between_line.heading.is_some()
2095 } else {
2096 false
2097 }
2098 });
2099 found_continuation = !has_structural_separators;
2100 } else {
2101 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2103 if let Some(between_line) = lines.get(idx) {
2104 let trimmed = between_line.content.trim();
2105 if trimmed.is_empty() {
2106 return false;
2107 }
2108 trimmed.starts_with("```")
2110 || trimmed.starts_with("~~~")
2111 || trimmed.starts_with("---")
2112 || trimmed.starts_with("***")
2113 || trimmed.starts_with("___")
2114 || trimmed.starts_with(">")
2115 || trimmed.contains('|') || between_line.heading.is_some()
2117 } else {
2118 false
2119 }
2120 });
2121 found_continuation = !has_structural_separators;
2122 }
2123 }
2124 }
2125 }
2126
2127 if found_continuation {
2128 block.end_line = line_num;
2130 } else {
2131 list_blocks.push(block.clone());
2133 current_block = None;
2134 }
2135 } else {
2136 let min_required_indent = if block.is_ordered {
2139 current_indent_level + last_marker_width
2140 } else {
2141 current_indent_level + 2
2142 };
2143
2144 let line_content = line_info.content.trim();
2149 let is_structural_separator = line_info.heading.is_some()
2150 || line_content.starts_with("```")
2151 || line_content.starts_with("~~~")
2152 || line_content.starts_with("---")
2153 || line_content.starts_with("***")
2154 || line_content.starts_with("___")
2155 || line_content.starts_with(">")
2156 || (line_content.contains('|')
2157 && !line_content.contains("](")
2158 && !line_content.contains("http")
2159 && (line_content.matches('|').count() > 1
2160 || line_content.starts_with('|')
2161 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2166 && !line_info.is_blank
2167 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2168
2169 if is_lazy_continuation {
2170 let content_to_check = if !blockquote_prefix.is_empty() {
2173 line_info
2175 .content
2176 .strip_prefix(&blockquote_prefix)
2177 .unwrap_or(&line_info.content)
2178 .trim()
2179 } else {
2180 line_info.content.trim()
2181 };
2182
2183 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2184
2185 if starts_with_uppercase && last_list_item_line > 0 {
2188 list_blocks.push(block.clone());
2190 current_block = None;
2191 } else {
2192 block.end_line = line_num;
2194 }
2195 } else {
2196 list_blocks.push(block.clone());
2198 current_block = None;
2199 }
2200 }
2201 }
2202 }
2203
2204 if let Some(block) = current_block {
2206 list_blocks.push(block);
2207 }
2208
2209 merge_adjacent_list_blocks(&mut list_blocks, lines);
2211
2212 list_blocks
2213 }
2214
2215 fn compute_char_frequency(content: &str) -> CharFrequency {
2217 let mut frequency = CharFrequency::default();
2218
2219 for ch in content.chars() {
2220 match ch {
2221 '#' => frequency.hash_count += 1,
2222 '*' => frequency.asterisk_count += 1,
2223 '_' => frequency.underscore_count += 1,
2224 '-' => frequency.hyphen_count += 1,
2225 '+' => frequency.plus_count += 1,
2226 '>' => frequency.gt_count += 1,
2227 '|' => frequency.pipe_count += 1,
2228 '[' => frequency.bracket_count += 1,
2229 '`' => frequency.backtick_count += 1,
2230 '<' => frequency.lt_count += 1,
2231 '!' => frequency.exclamation_count += 1,
2232 '\n' => frequency.newline_count += 1,
2233 _ => {}
2234 }
2235 }
2236
2237 frequency
2238 }
2239
2240 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2242 lazy_static! {
2243 static ref HTML_TAG_REGEX: regex::Regex =
2244 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2245 }
2246
2247 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2248
2249 for cap in HTML_TAG_REGEX.captures_iter(content) {
2250 let full_match = cap.get(0).unwrap();
2251 let match_start = full_match.start();
2252 let match_end = full_match.end();
2253
2254 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2256 continue;
2257 }
2258
2259 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2260 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2261 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2262
2263 let mut line_num = 1;
2265 let mut col_start = match_start;
2266 let mut col_end = match_end;
2267 for (idx, line_info) in lines.iter().enumerate() {
2268 if match_start >= line_info.byte_offset {
2269 line_num = idx + 1;
2270 col_start = match_start - line_info.byte_offset;
2271 col_end = match_end - line_info.byte_offset;
2272 } else {
2273 break;
2274 }
2275 }
2276
2277 html_tags.push(HtmlTag {
2278 line: line_num,
2279 start_col: col_start,
2280 end_col: col_end,
2281 byte_offset: match_start,
2282 byte_end: match_end,
2283 tag_name,
2284 is_closing,
2285 is_self_closing,
2286 raw_content: full_match.as_str().to_string(),
2287 });
2288 }
2289
2290 html_tags
2291 }
2292
2293 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2295 lazy_static! {
2296 static ref EMPHASIS_REGEX: regex::Regex =
2297 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2298 }
2299
2300 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2301
2302 for cap in EMPHASIS_REGEX.captures_iter(content) {
2303 let full_match = cap.get(0).unwrap();
2304 let match_start = full_match.start();
2305 let match_end = full_match.end();
2306
2307 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2309 continue;
2310 }
2311
2312 let opening_markers = cap.get(1).unwrap().as_str();
2313 let content_part = cap.get(2).unwrap().as_str();
2314 let closing_markers = cap.get(3).unwrap().as_str();
2315
2316 if opening_markers.chars().next() != closing_markers.chars().next()
2318 || opening_markers.len() != closing_markers.len()
2319 {
2320 continue;
2321 }
2322
2323 let marker = opening_markers.chars().next().unwrap();
2324 let marker_count = opening_markers.len();
2325
2326 let mut line_num = 1;
2328 let mut col_start = match_start;
2329 let mut col_end = match_end;
2330 for (idx, line_info) in lines.iter().enumerate() {
2331 if match_start >= line_info.byte_offset {
2332 line_num = idx + 1;
2333 col_start = match_start - line_info.byte_offset;
2334 col_end = match_end - line_info.byte_offset;
2335 } else {
2336 break;
2337 }
2338 }
2339
2340 emphasis_spans.push(EmphasisSpan {
2341 line: line_num,
2342 start_col: col_start,
2343 end_col: col_end,
2344 byte_offset: match_start,
2345 byte_end: match_end,
2346 marker,
2347 marker_count,
2348 content: content_part.to_string(),
2349 });
2350 }
2351
2352 emphasis_spans
2353 }
2354
2355 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2357 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2358
2359 for (line_idx, line_info) in lines.iter().enumerate() {
2360 if line_info.in_code_block || line_info.is_blank {
2362 continue;
2363 }
2364
2365 let line = &line_info.content;
2366 let line_num = line_idx + 1;
2367
2368 if !line.contains('|') {
2370 continue;
2371 }
2372
2373 let parts: Vec<&str> = line.split('|').collect();
2375 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2376
2377 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2379 let mut column_alignments = Vec::new();
2380
2381 if is_separator {
2382 for part in &parts[1..parts.len() - 1] {
2383 let trimmed = part.trim();
2385 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2386 "center".to_string()
2387 } else if trimmed.ends_with(':') {
2388 "right".to_string()
2389 } else if trimmed.starts_with(':') {
2390 "left".to_string()
2391 } else {
2392 "none".to_string()
2393 };
2394 column_alignments.push(alignment);
2395 }
2396 }
2397
2398 table_rows.push(TableRow {
2399 line: line_num,
2400 is_separator,
2401 column_count,
2402 column_alignments,
2403 });
2404 }
2405
2406 table_rows
2407 }
2408
2409 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2411 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2412
2413 for cap in BARE_URL_PATTERN.captures_iter(content) {
2415 let full_match = cap.get(0).unwrap();
2416 let match_start = full_match.start();
2417 let match_end = full_match.end();
2418
2419 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2421 continue;
2422 }
2423
2424 let preceding_char = if match_start > 0 {
2426 content.chars().nth(match_start - 1)
2427 } else {
2428 None
2429 };
2430 let following_char = content.chars().nth(match_end);
2431
2432 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2433 continue;
2434 }
2435 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2436 continue;
2437 }
2438
2439 let url = full_match.as_str();
2440 let url_type = if url.starts_with("https://") {
2441 "https"
2442 } else if url.starts_with("http://") {
2443 "http"
2444 } else if url.starts_with("ftp://") {
2445 "ftp"
2446 } else {
2447 "other"
2448 };
2449
2450 let mut line_num = 1;
2452 let mut col_start = match_start;
2453 let mut col_end = match_end;
2454 for (idx, line_info) in lines.iter().enumerate() {
2455 if match_start >= line_info.byte_offset {
2456 line_num = idx + 1;
2457 col_start = match_start - line_info.byte_offset;
2458 col_end = match_end - line_info.byte_offset;
2459 } else {
2460 break;
2461 }
2462 }
2463
2464 bare_urls.push(BareUrl {
2465 line: line_num,
2466 start_col: col_start,
2467 end_col: col_end,
2468 byte_offset: match_start,
2469 byte_end: match_end,
2470 url: url.to_string(),
2471 url_type: url_type.to_string(),
2472 });
2473 }
2474
2475 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2477 let full_match = cap.get(0).unwrap();
2478 let match_start = full_match.start();
2479 let match_end = full_match.end();
2480
2481 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2483 continue;
2484 }
2485
2486 let preceding_char = if match_start > 0 {
2488 content.chars().nth(match_start - 1)
2489 } else {
2490 None
2491 };
2492 let following_char = content.chars().nth(match_end);
2493
2494 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2495 continue;
2496 }
2497 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2498 continue;
2499 }
2500
2501 let email = full_match.as_str();
2502
2503 let mut line_num = 1;
2505 let mut col_start = match_start;
2506 let mut col_end = match_end;
2507 for (idx, line_info) in lines.iter().enumerate() {
2508 if match_start >= line_info.byte_offset {
2509 line_num = idx + 1;
2510 col_start = match_start - line_info.byte_offset;
2511 col_end = match_end - line_info.byte_offset;
2512 } else {
2513 break;
2514 }
2515 }
2516
2517 bare_urls.push(BareUrl {
2518 line: line_num,
2519 start_col: col_start,
2520 end_col: col_end,
2521 byte_offset: match_start,
2522 byte_end: match_end,
2523 url: email.to_string(),
2524 url_type: "email".to_string(),
2525 });
2526 }
2527
2528 bare_urls
2529 }
2530}
2531
2532fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2534 if list_blocks.len() < 2 {
2535 return;
2536 }
2537
2538 let mut merger = ListBlockMerger::new(lines);
2539 *list_blocks = merger.merge(list_blocks);
2540}
2541
2542struct ListBlockMerger<'a> {
2544 lines: &'a [LineInfo],
2545}
2546
2547impl<'a> ListBlockMerger<'a> {
2548 fn new(lines: &'a [LineInfo]) -> Self {
2549 Self { lines }
2550 }
2551
2552 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2553 let mut merged = Vec::with_capacity(list_blocks.len());
2554 let mut current = list_blocks[0].clone();
2555
2556 for next in list_blocks.iter().skip(1) {
2557 if self.should_merge_blocks(¤t, next) {
2558 current = self.merge_two_blocks(current, next);
2559 } else {
2560 merged.push(current);
2561 current = next.clone();
2562 }
2563 }
2564
2565 merged.push(current);
2566 merged
2567 }
2568
2569 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2571 if !self.blocks_are_compatible(current, next) {
2573 return false;
2574 }
2575
2576 let spacing = self.analyze_spacing_between(current, next);
2578 match spacing {
2579 BlockSpacing::Consecutive => true,
2580 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2581 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2582 self.can_merge_with_content_between(current, next)
2583 }
2584 }
2585 }
2586
2587 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2589 current.is_ordered == next.is_ordered
2590 && current.blockquote_prefix == next.blockquote_prefix
2591 && current.nesting_level == next.nesting_level
2592 }
2593
2594 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2596 let gap = next.start_line - current.end_line;
2597
2598 match gap {
2599 1 => BlockSpacing::Consecutive,
2600 2 => BlockSpacing::SingleBlank,
2601 _ if gap > 2 => {
2602 if self.has_only_blank_lines_between(current, next) {
2603 BlockSpacing::MultipleBlanks
2604 } else {
2605 BlockSpacing::ContentBetween
2606 }
2607 }
2608 _ => BlockSpacing::Consecutive, }
2610 }
2611
2612 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2614 if has_meaningful_content_between(current, next, self.lines) {
2617 return false; }
2619
2620 !current.is_ordered && current.marker == next.marker
2622 }
2623
2624 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2626 if has_meaningful_content_between(current, next, self.lines) {
2628 return false; }
2630
2631 current.is_ordered && next.is_ordered
2633 }
2634
2635 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2637 for line_num in (current.end_line + 1)..next.start_line {
2638 if let Some(line_info) = self.lines.get(line_num - 1)
2639 && !line_info.content.trim().is_empty()
2640 {
2641 return false;
2642 }
2643 }
2644 true
2645 }
2646
2647 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2649 current.end_line = next.end_line;
2650 current.item_lines.extend_from_slice(&next.item_lines);
2651
2652 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2654
2655 if !current.is_ordered && self.markers_differ(¤t, next) {
2657 current.marker = None; }
2659
2660 current
2661 }
2662
2663 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2665 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2666 }
2667}
2668
/// How far apart two neighbouring list blocks are, measured as
/// `next.start_line - current.end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The blocks are on adjacent lines (also the fallback for a gap below 1).
    Consecutive,
    /// Exactly one line lies between the blocks; that line is not inspected when
    /// the spacing is classified.
    SingleBlank,
    /// Two or more lines lie between the blocks and all of them are whitespace-only.
    MultipleBlanks,
    /// Two or more lines lie between the blocks and at least one has content.
    ContentBetween,
}
2677
2678fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2680 for line_num in (current.end_line + 1)..next.start_line {
2682 if let Some(line_info) = lines.get(line_num - 1) {
2683 let trimmed = line_info.content.trim();
2685
2686 if trimmed.is_empty() {
2688 continue;
2689 }
2690
2691 if line_info.heading.is_some() {
2695 return true; }
2697
2698 if is_horizontal_rule(trimmed) {
2700 return true; }
2702
2703 if trimmed.contains('|') && trimmed.len() > 1 {
2706 if !trimmed.contains("](") && !trimmed.contains("http") {
2708 let pipe_count = trimmed.matches('|').count();
2710 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2711 return true; }
2713 }
2714 }
2715
2716 if trimmed.starts_with('>') {
2718 return true; }
2720
2721 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2723 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2724
2725 let min_continuation_indent = if current.is_ordered {
2727 current.nesting_level + current.max_marker_width + 1 } else {
2729 current.nesting_level + 2
2730 };
2731
2732 if line_indent < min_continuation_indent {
2733 return true; }
2736 }
2737
2738 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2740
2741 let min_indent = if current.is_ordered {
2743 current.nesting_level + current.max_marker_width
2744 } else {
2745 current.nesting_level + 2
2746 };
2747
2748 if line_indent < min_indent {
2750 return true; }
2752
2753 }
2756 }
2757
2758 false
2760}
2761
/// Returns `true` when `trimmed` is a horizontal rule (thematic break).
///
/// A rule starts with `-`, `*` or `_`, contains at least three occurrences of
/// that same character, and has nothing else on the line except spaces and
/// tabs. The caller is expected to pass a line with surrounding whitespace
/// already removed; a leading space makes the first character a non-marker
/// and the function returns `false`.
///
/// The string is scanned in place, without copying its characters into a
/// temporary buffer.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Three marker characters need at least three bytes; cheap early exit.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == marker => marker_count += 1,
            ' ' | '\t' => {}
            // Any other character (including a different rule marker) disqualifies the line.
            _ => return false,
        }
    }

    marker_count >= 3
}
2785
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each byte offset maps to the expected 1-based `(line, column)` pair.
    fn assert_positions(ctx: &LintContext, cases: &[(usize, (usize, usize))]) {
        for &(offset, expected) in cases {
            assert_eq!(
                ctx.offset_to_line_col(offset),
                expected,
                "unexpected position for byte offset {offset}"
            );
        }
    }

    /// An empty document has a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A document without a newline is one line; columns are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_positions(&ctx, &[(0, (1, 1)), (3, (1, 4))]);
    }

    /// Line offsets point at the first byte after each newline.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_positions(
            &ctx,
            &[
                (0, (1, 1)),
                (8, (2, 1)),
                (9, (3, 1)),
                (15, (3, 7)),
                (21, (4, 1)),
            ],
        );
    }

    /// Per-line records expose content, byte offset, indent and blank/code flags.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces; the `indent == 4`
        // assertions below depend on that exact amount.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // The 1-based lookup helpers must agree with the raw records.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Ordered and unordered markers are recognised, including nested ones.
    #[test]
    fn test_list_item_detection() {
        // `* Nested item` is indented by two spaces (asserted via
        // `marker_column == 2`); `2) Nested ordered` is indented by three so
        // that it sits at the content column of `1. `.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let list1 = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let list2 = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let list3 = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Plain paragraph text must not be classified as a list item.
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets on newline bytes and one past the end still resolve to a position.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_positions(
            &ctx,
            &[
                (0, (1, 1)),
                (1, (1, 2)),
                (2, (2, 1)),
                (3, (2, 2)),
                (4, (3, 1)),
                (5, (3, 2)),
            ],
        );
    }
}