1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::ast_utils::get_cached_ast;
4use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
5use lazy_static::lazy_static;
6use markdown::mdast::Node;
7use regex::Regex;
8
9lazy_static! {
10 static ref LINK_PATTERN: Regex = Regex::new(
13 r#"(?sx)
14 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
15 (?:
16 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
17 |
18 \[([^\]]*)\] # Reference ID in group 6
19 )"#
20 ).unwrap();
21
22 static ref IMAGE_PATTERN: Regex = Regex::new(
25 r#"(?sx)
26 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
27 (?:
28 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
29 |
30 \[([^\]]*)\] # Reference ID in group 6
31 )"#
32 ).unwrap();
33
34 static ref REF_DEF_PATTERN: Regex = Regex::new(
36 r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
37 ).unwrap();
38
39 static ref CODE_SPAN_PATTERN: Regex = Regex::new(
42 r"`+"
43 ).unwrap();
44
45 static ref BARE_URL_PATTERN: Regex = Regex::new(
47 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
48 ).unwrap();
49
50 static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
52 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
53 ).unwrap();
54
55 static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
57 r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
58 ).unwrap();
59
60 static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
62}
63
64#[derive(Debug, Clone)]
66pub struct LineInfo {
67 pub content: String,
69 pub byte_offset: usize,
71 pub indent: usize,
73 pub is_blank: bool,
75 pub in_code_block: bool,
77 pub in_front_matter: bool,
79 pub in_html_block: bool,
81 pub in_html_comment: bool,
83 pub list_item: Option<ListItemInfo>,
85 pub heading: Option<HeadingInfo>,
87 pub blockquote: Option<BlockquoteInfo>,
89 pub in_mkdocstrings: bool,
91}
92
/// Describes the list marker found at the start of a list item line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// Marker text: `-`, `*`, `+`, or a number followed by `.` / `)`.
    pub marker: String,
    /// `true` for numbered items, `false` for bullet items.
    pub is_ordered: bool,
    /// Parsed number of an ordered item; `None` for bullets or when the
    /// digits do not fit in `usize`.
    pub number: Option<usize>,
    /// Byte column (0-based) at which the marker starts, including any
    /// blockquote prefix.
    pub marker_column: usize,
    /// Byte column (0-based) at which the item content starts.
    pub content_column: usize,
}
107
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Heading underlined with `=` (level 1).
    Setext1,
    /// Heading underlined with `-` (level 2).
    Setext2,
}
118
/// A link found in the document, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line on
    /// which the link ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// Text between the square brackets.
    pub text: String,
    /// Destination of an inline link; empty for reference links.
    pub url: String,
    /// `true` for `[text][id]`-style links.
    pub is_reference: bool,
    /// Lower-cased reference id; `None` for inline links.
    pub reference_id: Option<String>,
}
141
/// An image found in the document, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line on
    /// which the image ends.
    pub end_col: usize,
    /// Byte offset of the image start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// Text between the square brackets.
    pub alt_text: String,
    /// Source of an inline image; empty for reference images.
    pub url: String,
    /// `true` for `![alt][id]`-style images.
    pub is_reference: bool,
    /// Lower-cased reference id; `None` for inline images.
    pub reference_id: Option<String>,
}
164
/// A reference definition of the form `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Destination URL.
    pub url: String,
    /// Optional title (quotes removed).
    pub title: Option<String>,
}
177
/// An inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number of the span (compared against 1-based line numbers).
    pub line: usize,
    /// Start column; compared against 0-based columns, inclusive.
    pub start_col: usize,
    /// End column; compared against 0-based columns, exclusive.
    pub end_col: usize,
    /// Byte offset of the span start within the document (inclusive).
    pub byte_offset: usize,
    /// Byte offset of the span end within the document (exclusive).
    pub byte_end: usize,
    /// Number of backticks in the delimiter (presumably of the opening
    /// run - the parser is not visible here).
    pub backtick_count: usize,
    /// Text of the span (presumably without the delimiters - confirm).
    pub content: String,
}
196
197#[derive(Debug, Clone)]
199pub struct HeadingInfo {
200 pub level: u8,
202 pub style: HeadingStyle,
204 pub marker: String,
206 pub marker_column: usize,
208 pub content_column: usize,
210 pub text: String,
212 pub custom_id: Option<String>,
214 pub raw_text: String,
216 pub has_closing_sequence: bool,
218 pub closing_sequence: String,
220}
221
/// Details of the blockquote prefix found at the start of a line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading marker run.
    pub nesting_level: usize,
    /// Whitespace preceding the first `>`.
    pub indent: String,
    /// Byte column of the first `>`.
    pub marker_column: usize,
    /// Indent, markers and following whitespace concatenated.
    pub prefix: String,
    /// Remainder of the line after the prefix.
    pub content: String,
    /// `true` when content follows the marker with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// `true` when more than one whitespace byte, or a tab, follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// `true` when the line is a bare marker with nothing after it.
    pub needs_md028_fix: bool,
}
242
/// A contiguous group of list lines.
///
/// NOTE(review): the code that builds these values is not visible here; the
/// field notes below follow from the names and from how the fields are read.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (compared inclusively against line numbers).
    pub start_line: usize,
    /// Last line of the block (compared inclusively against line numbers).
    pub end_line: usize,
    /// Whether the block is an ordered list.
    pub is_ordered: bool,
    /// Marker associated with the block, if any.
    pub marker: Option<String>,
    /// Blockquote prefix associated with the block.
    pub blockquote_prefix: String,
    /// Line numbers of the items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Widest marker recorded for the block.
    pub max_marker_width: usize,
}
263
264use std::sync::{Arc, Mutex};
265
/// Occurrence counters for characters that matter to Markdown syntax,
/// used for cheap "can this document contain X at all?" checks.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for line feeds.
    pub newline_count: usize,
}
294
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this in is not visible here; column
/// and offset conventions are presumably the same as for links - confirm.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag.
    pub line: usize,
    /// Column at which the tag starts.
    pub start_col: usize,
    /// Column at which the tag ends.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// Tag text as it appears in the source.
    pub raw_content: String,
}
317
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this in is not visible here; column
/// and offset conventions are presumably the same as for links - confirm.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span.
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Delimiter character.
    pub marker: char,
    /// Number of delimiter characters.
    pub marker_count: usize,
    /// Text of the span.
    pub content: String,
}
338
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this in is not visible here.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row.
    pub line: usize,
    /// Whether the row is a header/body separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment recorded per column (value format not visible here).
    pub column_alignments: Vec<String>,
}
351
/// A URL or e-mail address written without link syntax.
///
/// NOTE(review): the parser that fills this in is not visible here; column
/// and offset conventions are presumably the same as for links - confirm.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL.
    pub line: usize,
    /// Column at which the URL starts.
    pub start_col: usize,
    /// Column at which the URL ends.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched text.
    pub url: String,
    /// Kind of the match (set of possible values not visible here).
    pub url_type: String,
}
370
371pub struct LintContext<'a> {
372 pub content: &'a str,
373 pub line_offsets: Vec<usize>,
374 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink>, pub images: Vec<ParsedImage>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, ast_cache: Mutex<Option<Arc<Node>>>, pub flavor: MarkdownFlavor, }
389
390impl<'a> LintContext<'a> {
391 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
392 let mut line_offsets = vec![0];
393 for (i, c) in content.char_indices() {
394 if c == '\n' {
395 line_offsets.push(i + 1);
396 }
397 }
398
399 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
401
402 let mut lines = Self::compute_basic_line_info(content, &line_offsets, &code_blocks, flavor);
404
405 Self::detect_html_blocks(&mut lines);
408
409 Self::detect_headings_and_blockquotes(content, &mut lines, flavor);
411
412 let ast = get_cached_ast(content);
414 let code_spans = Self::parse_code_spans(content, &lines, &ast);
415
416 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
418 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
419 let reference_defs = Self::parse_reference_defs(content, &lines);
420 let list_blocks = Self::parse_list_blocks(&lines);
423
424 let char_frequency = Self::compute_char_frequency(content);
426
427 Self {
428 content,
429 line_offsets,
430 code_blocks,
431 lines,
432 links,
433 images,
434 reference_defs,
435 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
436 list_blocks,
437 char_frequency,
438 html_tags_cache: Mutex::new(None),
439 emphasis_spans_cache: Mutex::new(None),
440 table_rows_cache: Mutex::new(None),
441 bare_urls_cache: Mutex::new(None),
442 ast_cache: Mutex::new(None),
443 flavor,
444 }
445 }
446
447 pub fn get_ast(&self) -> Arc<Node> {
449 let mut cache = self.ast_cache.lock().unwrap();
450
451 if cache.is_none() {
452 *cache = Some(get_cached_ast(self.content));
455 }
456
457 cache.as_ref().unwrap().clone()
458 }
459
460 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
462 let mut cache = self.code_spans_cache.lock().unwrap();
463
464 if cache.is_none() {
466 let ast = self.get_ast();
467 let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
468 *cache = Some(Arc::new(code_spans));
469 }
470
471 cache.as_ref().unwrap().clone()
473 }
474
475 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
477 let mut cache = self.html_tags_cache.lock().unwrap();
478
479 if cache.is_none() {
480 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
481 *cache = Some(Arc::new(html_tags));
482 }
483
484 cache.as_ref().unwrap().clone()
485 }
486
487 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
489 let mut cache = self.emphasis_spans_cache.lock().unwrap();
490
491 if cache.is_none() {
492 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
493 *cache = Some(Arc::new(emphasis_spans));
494 }
495
496 cache.as_ref().unwrap().clone()
497 }
498
499 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
501 let mut cache = self.table_rows_cache.lock().unwrap();
502
503 if cache.is_none() {
504 let table_rows = Self::parse_table_rows(&self.lines);
505 *cache = Some(Arc::new(table_rows));
506 }
507
508 cache.as_ref().unwrap().clone()
509 }
510
511 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
513 let mut cache = self.bare_urls_cache.lock().unwrap();
514
515 if cache.is_none() {
516 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
517 *cache = Some(Arc::new(bare_urls));
518 }
519
520 cache.as_ref().unwrap().clone()
521 }
522
523 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
525 match self.line_offsets.binary_search(&offset) {
526 Ok(line) => (line + 1, 1),
527 Err(line) => {
528 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
529 (line, offset - line_start + 1)
530 }
531 }
532 }
533
534 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
536 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
538 return true;
539 }
540
541 self.code_spans()
543 .iter()
544 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
545 }
546
547 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
549 if line_num > 0 {
550 self.lines.get(line_num - 1)
551 } else {
552 None
553 }
554 }
555
556 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
558 self.line_info(line_num).map(|info| info.byte_offset)
559 }
560
561 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
563 let normalized_id = ref_id.to_lowercase();
564 self.reference_defs
565 .iter()
566 .find(|def| def.id == normalized_id)
567 .map(|def| def.url.as_str())
568 }
569
570 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
572 self.links.iter().filter(|link| link.line == line_num).collect()
573 }
574
575 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
577 self.images.iter().filter(|img| img.line == line_num).collect()
578 }
579
580 pub fn is_in_list_block(&self, line_num: usize) -> bool {
582 self.list_blocks
583 .iter()
584 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
585 }
586
587 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
589 self.list_blocks
590 .iter()
591 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
592 }
593
594 pub fn is_in_code_block(&self, line_num: usize) -> bool {
598 if line_num == 0 || line_num > self.lines.len() {
599 return false;
600 }
601 self.lines[line_num - 1].in_code_block
602 }
603
604 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
606 if line_num == 0 || line_num > self.lines.len() {
607 return false;
608 }
609 self.lines[line_num - 1].in_front_matter
610 }
611
612 pub fn is_in_html_block(&self, line_num: usize) -> bool {
614 if line_num == 0 || line_num > self.lines.len() {
615 return false;
616 }
617 self.lines[line_num - 1].in_html_block
618 }
619
620 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
622 if line_num == 0 || line_num > self.lines.len() {
623 return false;
624 }
625
626 let col_0indexed = if col > 0 { col - 1 } else { 0 };
630 let code_spans = self.code_spans();
631 code_spans
632 .iter()
633 .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
634 }
635
636 pub fn has_char(&self, ch: char) -> bool {
638 match ch {
639 '#' => self.char_frequency.hash_count > 0,
640 '*' => self.char_frequency.asterisk_count > 0,
641 '_' => self.char_frequency.underscore_count > 0,
642 '-' => self.char_frequency.hyphen_count > 0,
643 '+' => self.char_frequency.plus_count > 0,
644 '>' => self.char_frequency.gt_count > 0,
645 '|' => self.char_frequency.pipe_count > 0,
646 '[' => self.char_frequency.bracket_count > 0,
647 '`' => self.char_frequency.backtick_count > 0,
648 '<' => self.char_frequency.lt_count > 0,
649 '!' => self.char_frequency.exclamation_count > 0,
650 '\n' => self.char_frequency.newline_count > 0,
651 _ => self.content.contains(ch), }
653 }
654
655 pub fn char_count(&self, ch: char) -> usize {
657 match ch {
658 '#' => self.char_frequency.hash_count,
659 '*' => self.char_frequency.asterisk_count,
660 '_' => self.char_frequency.underscore_count,
661 '-' => self.char_frequency.hyphen_count,
662 '+' => self.char_frequency.plus_count,
663 '>' => self.char_frequency.gt_count,
664 '|' => self.char_frequency.pipe_count,
665 '[' => self.char_frequency.bracket_count,
666 '`' => self.char_frequency.backtick_count,
667 '<' => self.char_frequency.lt_count,
668 '!' => self.char_frequency.exclamation_count,
669 '\n' => self.char_frequency.newline_count,
670 _ => self.content.matches(ch).count(), }
672 }
673
674 pub fn likely_has_headings(&self) -> bool {
676 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
678
679 pub fn likely_has_lists(&self) -> bool {
681 self.char_frequency.asterisk_count > 0
682 || self.char_frequency.hyphen_count > 0
683 || self.char_frequency.plus_count > 0
684 }
685
686 pub fn likely_has_emphasis(&self) -> bool {
688 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
689 }
690
691 pub fn likely_has_tables(&self) -> bool {
693 self.char_frequency.pipe_count > 2
694 }
695
696 pub fn likely_has_blockquotes(&self) -> bool {
698 self.char_frequency.gt_count > 0
699 }
700
701 pub fn likely_has_code(&self) -> bool {
703 self.char_frequency.backtick_count > 0
704 }
705
706 pub fn likely_has_links_or_images(&self) -> bool {
708 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
709 }
710
711 pub fn likely_has_html(&self) -> bool {
713 self.char_frequency.lt_count > 0
714 }
715
716 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
718 self.html_tags()
719 .iter()
720 .filter(|tag| tag.line == line_num)
721 .cloned()
722 .collect()
723 }
724
725 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
727 self.emphasis_spans()
728 .iter()
729 .filter(|span| span.line == line_num)
730 .cloned()
731 .collect()
732 }
733
734 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
736 self.table_rows()
737 .iter()
738 .filter(|row| row.line == line_num)
739 .cloned()
740 .collect()
741 }
742
743 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
745 self.bare_urls()
746 .iter()
747 .filter(|url| url.line == line_num)
748 .cloned()
749 .collect()
750 }
751
752 fn parse_links(
754 content: &str,
755 lines: &[LineInfo],
756 code_blocks: &[(usize, usize)],
757 code_spans: &[CodeSpan],
758 flavor: MarkdownFlavor,
759 ) -> Vec<ParsedLink> {
760 use crate::utils::skip_context::{is_in_html_comment, is_mkdocs_snippet_line};
761
762 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
767 let full_match = cap.get(0).unwrap();
768 let match_start = full_match.start();
769 let match_end = full_match.end();
770
771 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
773 continue;
774 }
775
776 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
778 continue;
779 }
780
781 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
783 continue;
784 }
785
786 if code_spans
788 .iter()
789 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
790 {
791 continue;
792 }
793
794 if is_in_html_comment(content, match_start) {
796 continue;
797 }
798
799 let line_idx = lines
802 .iter()
803 .position(|line| {
804 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
805 })
806 .unwrap_or(0);
807
808 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
809 continue;
810 }
811
812 let mut line_num = 1;
814 let mut col_start = match_start;
815 for (idx, line_info) in lines.iter().enumerate() {
816 if match_start >= line_info.byte_offset {
817 line_num = idx + 1;
818 col_start = match_start - line_info.byte_offset;
819 } else {
820 break;
821 }
822 }
823
824 let mut end_line_num = 1;
826 let mut col_end = match_end;
827 for (idx, line_info) in lines.iter().enumerate() {
828 if match_end > line_info.byte_offset {
829 end_line_num = idx + 1;
830 col_end = match_end - line_info.byte_offset;
831 } else {
832 break;
833 }
834 }
835
836 if line_num == end_line_num {
838 } else {
840 }
843
844 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
845
846 let inline_url = cap.get(2).or_else(|| cap.get(3));
848
849 if let Some(url_match) = inline_url {
850 links.push(ParsedLink {
852 line: line_num,
853 start_col: col_start,
854 end_col: col_end,
855 byte_offset: match_start,
856 byte_end: match_end,
857 text,
858 url: url_match.as_str().to_string(),
859 is_reference: false,
860 reference_id: None,
861 });
862 } else if let Some(ref_id) = cap.get(6) {
863 let ref_id_str = ref_id.as_str();
865 let normalized_ref = if ref_id_str.is_empty() {
866 text.to_lowercase() } else {
868 ref_id_str.to_lowercase()
869 };
870
871 links.push(ParsedLink {
872 line: line_num,
873 start_col: col_start,
874 end_col: col_end,
875 byte_offset: match_start,
876 byte_end: match_end,
877 text,
878 url: String::new(), is_reference: true,
880 reference_id: Some(normalized_ref),
881 });
882 }
883 }
884
885 links
886 }
887
888 fn parse_images(
890 content: &str,
891 lines: &[LineInfo],
892 code_blocks: &[(usize, usize)],
893 code_spans: &[CodeSpan],
894 ) -> Vec<ParsedImage> {
895 use crate::utils::skip_context::is_in_html_comment;
896
897 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
902 let full_match = cap.get(0).unwrap();
903 let match_start = full_match.start();
904 let match_end = full_match.end();
905
906 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
908 continue;
909 }
910
911 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
913 continue;
914 }
915
916 if code_spans
918 .iter()
919 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
920 {
921 continue;
922 }
923
924 if is_in_html_comment(content, match_start) {
926 continue;
927 }
928
929 let mut line_num = 1;
931 let mut col_start = match_start;
932 for (idx, line_info) in lines.iter().enumerate() {
933 if match_start >= line_info.byte_offset {
934 line_num = idx + 1;
935 col_start = match_start - line_info.byte_offset;
936 } else {
937 break;
938 }
939 }
940
941 let mut end_line_num = 1;
943 let mut col_end = match_end;
944 for (idx, line_info) in lines.iter().enumerate() {
945 if match_end > line_info.byte_offset {
946 end_line_num = idx + 1;
947 col_end = match_end - line_info.byte_offset;
948 } else {
949 break;
950 }
951 }
952
953 if line_num == end_line_num {
955 } else {
957 }
960
961 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
962
963 let inline_url = cap.get(2).or_else(|| cap.get(3));
965
966 if let Some(url_match) = inline_url {
967 images.push(ParsedImage {
969 line: line_num,
970 start_col: col_start,
971 end_col: col_end,
972 byte_offset: match_start,
973 byte_end: match_end,
974 alt_text,
975 url: url_match.as_str().to_string(),
976 is_reference: false,
977 reference_id: None,
978 });
979 } else if let Some(ref_id) = cap.get(6) {
980 let ref_id_str = ref_id.as_str();
982 let normalized_ref = if ref_id_str.is_empty() {
983 alt_text.to_lowercase() } else {
985 ref_id_str.to_lowercase()
986 };
987
988 images.push(ParsedImage {
989 line: line_num,
990 start_col: col_start,
991 end_col: col_end,
992 byte_offset: match_start,
993 byte_end: match_end,
994 alt_text,
995 url: String::new(), is_reference: true,
997 reference_id: Some(normalized_ref),
998 });
999 }
1000 }
1001
1002 images
1003 }
1004
1005 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1007 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
1011 if line_info.in_code_block {
1013 continue;
1014 }
1015
1016 let line = &line_info.content;
1017 let line_num = line_idx + 1;
1018
1019 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1020 let id = cap.get(1).unwrap().as_str().to_lowercase();
1021 let url = cap.get(2).unwrap().as_str().to_string();
1022 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1023
1024 refs.push(ReferenceDef {
1025 line: line_num,
1026 id,
1027 url,
1028 title,
1029 });
1030 }
1031 }
1032
1033 refs
1034 }
1035
1036 fn compute_basic_line_info(
1038 content: &str,
1039 line_offsets: &[usize],
1040 code_blocks: &[(usize, usize)],
1041 flavor: MarkdownFlavor,
1042 ) -> Vec<LineInfo> {
1043 lazy_static! {
1044 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1046 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1047
1048 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1050 }
1051
1052 let content_lines: Vec<&str> = content.lines().collect();
1053 let mut lines = Vec::with_capacity(content_lines.len());
1054
1055 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1058
1059 for (i, line) in content_lines.iter().enumerate() {
1060 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1061 let indent = line.len() - line.trim_start().len();
1062 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1064 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1066 after_prefix.trim().is_empty()
1067 } else {
1068 line.trim().is_empty()
1069 };
1070 let in_code_block = code_blocks.iter().any(|&(start, end)| {
1073 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1078 let mut boundary = start;
1080 while boundary > 0 && !content.is_char_boundary(boundary) {
1081 boundary -= 1;
1082 }
1083 boundary
1084 } else {
1085 start
1086 };
1087
1088 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1089 let mut boundary = end;
1091 while boundary < content.len() && !content.is_char_boundary(boundary) {
1092 boundary += 1;
1093 }
1094 boundary
1095 } else {
1096 end.min(content.len())
1097 };
1098
1099 let block_content = &content[safe_start..safe_end];
1100 let is_multiline = block_content.contains('\n');
1101 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1102 let is_indented = !is_fenced
1103 && block_content
1104 .lines()
1105 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
1106
1107 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1108 });
1109
1110 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1112 && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1113 let in_html_comment = crate::utils::skip_context::is_in_html_comment(content, byte_offset);
1114 let list_item = if !(in_code_block
1115 || is_blank
1116 || in_mkdocstrings
1117 || in_html_comment
1118 || (front_matter_end > 0 && i < front_matter_end))
1119 {
1120 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1122 let prefix = caps.get(1).unwrap().as_str();
1123 let content = caps.get(2).unwrap().as_str();
1124 (content, prefix.len())
1125 } else {
1126 (&**line, 0)
1127 };
1128
1129 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1130 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1131 let marker = caps.get(2).map_or("", |m| m.as_str());
1132 let spacing = caps.get(3).map_or("", |m| m.as_str());
1133 let _content = caps.get(4).map_or("", |m| m.as_str());
1134 let marker_column = blockquote_prefix_len + leading_spaces.len();
1135 let content_column = marker_column + marker.len() + spacing.len();
1136
1137 if spacing.is_empty() {
1144 None
1145 } else {
1146 Some(ListItemInfo {
1147 marker: marker.to_string(),
1148 is_ordered: false,
1149 number: None,
1150 marker_column,
1151 content_column,
1152 })
1153 }
1154 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1155 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1156 let number_str = caps.get(2).map_or("", |m| m.as_str());
1157 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1158 let spacing = caps.get(4).map_or("", |m| m.as_str());
1159 let _content = caps.get(5).map_or("", |m| m.as_str());
1160 let marker = format!("{number_str}{delimiter}");
1161 let marker_column = blockquote_prefix_len + leading_spaces.len();
1162 let content_column = marker_column + marker.len() + spacing.len();
1163
1164 if spacing.is_empty() {
1167 None
1168 } else {
1169 Some(ListItemInfo {
1170 marker,
1171 is_ordered: true,
1172 number: number_str.parse().ok(),
1173 marker_column,
1174 content_column,
1175 })
1176 }
1177 } else {
1178 None
1179 }
1180 } else {
1181 None
1182 };
1183
1184 lines.push(LineInfo {
1185 content: line.to_string(),
1186 byte_offset,
1187 indent,
1188 is_blank,
1189 in_code_block,
1190 in_front_matter: front_matter_end > 0 && i < front_matter_end,
1191 in_html_block: false, in_html_comment,
1193 list_item,
1194 heading: None, blockquote: None, in_mkdocstrings,
1197 });
1198 }
1199
1200 lines
1201 }
1202
1203 fn detect_headings_and_blockquotes(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
1205 lazy_static! {
1206 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1208
1209 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1211 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1212 }
1213
1214 let content_lines: Vec<&str> = content.lines().collect();
1215
1216 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1218
1219 for i in 0..lines.len() {
1221 if lines[i].in_code_block {
1222 continue;
1223 }
1224
1225 if front_matter_end > 0 && i < front_matter_end {
1227 continue;
1228 }
1229
1230 if lines[i].in_html_block {
1232 continue;
1233 }
1234
1235 let line = content_lines[i];
1236
1237 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1239 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1240 let markers = caps.get(2).map_or("", |m| m.as_str());
1241 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1242 let content = caps.get(4).map_or("", |m| m.as_str());
1243
1244 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1245 let marker_column = indent_str.len();
1246
1247 let prefix = format!("{indent_str}{markers}{spaces_after}");
1249
1250 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1252 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1254
1255 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1259
1260 lines[i].blockquote = Some(BlockquoteInfo {
1261 nesting_level,
1262 indent: indent_str.to_string(),
1263 marker_column,
1264 prefix,
1265 content: content.to_string(),
1266 has_no_space_after_marker: has_no_space,
1267 has_multiple_spaces_after_marker: has_multiple_spaces,
1268 needs_md028_fix,
1269 });
1270 }
1271
1272 if lines[i].is_blank {
1274 continue;
1275 }
1276
1277 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1280 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1281 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1282 } else {
1283 false
1284 };
1285
1286 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1287 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1289 continue;
1290 }
1291 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1292 let hashes = caps.get(2).map_or("", |m| m.as_str());
1293 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1294 let rest = caps.get(4).map_or("", |m| m.as_str());
1295
1296 let level = hashes.len() as u8;
1297 let marker_column = leading_spaces.len();
1298
1299 let (text, has_closing, closing_seq) = {
1301 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1303 if rest[id_start..].trim_end().ends_with('}') {
1305 (&rest[..id_start], &rest[id_start..])
1307 } else {
1308 (rest, "")
1309 }
1310 } else {
1311 (rest, "")
1312 };
1313
1314 let trimmed_rest = rest_without_id.trim_end();
1316 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1317 let mut start_of_hashes = last_hash_pos;
1319 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1320 start_of_hashes -= 1;
1321 }
1322
1323 let has_space_before = start_of_hashes == 0
1325 || trimmed_rest
1326 .chars()
1327 .nth(start_of_hashes - 1)
1328 .is_some_and(|c| c.is_whitespace());
1329
1330 let potential_closing = &trimmed_rest[start_of_hashes..];
1332 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1333
1334 if is_all_hashes && has_space_before {
1335 let closing_hashes = potential_closing.to_string();
1337 let text_part = if !custom_id_part.is_empty() {
1340 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1343 } else {
1344 rest_without_id[..start_of_hashes].trim_end().to_string()
1345 };
1346 (text_part, true, closing_hashes)
1347 } else {
1348 (rest.to_string(), false, String::new())
1350 }
1351 } else {
1352 (rest.to_string(), false, String::new())
1354 }
1355 };
1356
1357 let content_column = marker_column + hashes.len() + spaces_after.len();
1358
1359 let raw_text = text.trim().to_string();
1361 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1362
1363 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1365 let next_line = content_lines[i + 1];
1366 if !lines[i + 1].in_code_block
1367 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1368 && let Some(next_line_id) =
1369 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1370 {
1371 custom_id = Some(next_line_id);
1372 }
1373 }
1374
1375 lines[i].heading = Some(HeadingInfo {
1376 level,
1377 style: HeadingStyle::ATX,
1378 marker: hashes.to_string(),
1379 marker_column,
1380 content_column,
1381 text: clean_text,
1382 custom_id,
1383 raw_text,
1384 has_closing_sequence: has_closing,
1385 closing_sequence: closing_seq,
1386 });
1387 }
1388 else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1390 let next_line = content_lines[i + 1];
1391 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1392 if front_matter_end > 0 && i < front_matter_end {
1394 continue;
1395 }
1396
1397 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1399 continue;
1400 }
1401
1402 let underline = next_line.trim();
1403
1404 if underline == "---" {
1407 continue;
1408 }
1409
1410 let current_line_trimmed = line.trim();
1412 if current_line_trimmed.contains(':')
1413 && !current_line_trimmed.starts_with('#')
1414 && !current_line_trimmed.contains('[')
1415 && !current_line_trimmed.contains("](")
1416 {
1417 continue;
1419 }
1420
1421 let level = if underline.starts_with('=') { 1 } else { 2 };
1422 let style = if level == 1 {
1423 HeadingStyle::Setext1
1424 } else {
1425 HeadingStyle::Setext2
1426 };
1427
1428 let raw_text = line.trim().to_string();
1430 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1431
1432 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1434 let attr_line = content_lines[i + 2];
1435 if !lines[i + 2].in_code_block
1436 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1437 && let Some(attr_line_id) =
1438 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1439 {
1440 custom_id = Some(attr_line_id);
1441 }
1442 }
1443
1444 lines[i].heading = Some(HeadingInfo {
1445 level,
1446 style,
1447 marker: underline.to_string(),
1448 marker_column: next_line.len() - next_line.trim_start().len(),
1449 content_column: lines[i].indent,
1450 text: clean_text,
1451 custom_id,
1452 raw_text,
1453 has_closing_sequence: false,
1454 closing_sequence: String::new(),
1455 });
1456 }
1457 }
1458 }
1459 }
1460
1461 fn detect_html_blocks(lines: &mut [LineInfo]) {
1463 const BLOCK_ELEMENTS: &[&str] = &[
1465 "address",
1466 "article",
1467 "aside",
1468 "blockquote",
1469 "details",
1470 "dialog",
1471 "dd",
1472 "div",
1473 "dl",
1474 "dt",
1475 "fieldset",
1476 "figcaption",
1477 "figure",
1478 "footer",
1479 "form",
1480 "h1",
1481 "h2",
1482 "h3",
1483 "h4",
1484 "h5",
1485 "h6",
1486 "header",
1487 "hr",
1488 "li",
1489 "main",
1490 "nav",
1491 "ol",
1492 "p",
1493 "pre",
1494 "script",
1495 "section",
1496 "style",
1497 "table",
1498 "tbody",
1499 "td",
1500 "tfoot",
1501 "th",
1502 "thead",
1503 "tr",
1504 "ul",
1505 ];
1506
1507 let mut i = 0;
1508 while i < lines.len() {
1509 if lines[i].in_code_block || lines[i].in_front_matter {
1511 i += 1;
1512 continue;
1513 }
1514
1515 let trimmed = lines[i].content.trim_start();
1516
1517 if trimmed.starts_with('<') && trimmed.len() > 1 {
1519 let after_bracket = &trimmed[1..];
1521 let is_closing = after_bracket.starts_with('/');
1522 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1523
1524 let tag_name = tag_start
1526 .chars()
1527 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1528 .collect::<String>()
1529 .to_lowercase();
1530
1531 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1533 lines[i].in_html_block = true;
1535
1536 if !is_closing {
1539 let closing_tag = format!("</{tag_name}>");
1540 let allow_blank_lines = tag_name == "style" || tag_name == "script";
1542 let mut j = i + 1;
1543 while j < lines.len() && j < i + 100 {
1544 if !allow_blank_lines && lines[j].is_blank {
1547 break;
1548 }
1549
1550 lines[j].in_html_block = true;
1551
1552 if lines[j].content.contains(&closing_tag) {
1554 break;
1555 }
1556 j += 1;
1557 }
1558 }
1559 }
1560 }
1561
1562 i += 1;
1563 }
1564 }
1565
1566 fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1568 let mut code_spans = Vec::new();
1569
1570 if !content.contains('`') {
1572 return code_spans;
1573 }
1574
1575 fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1577 match node {
1578 Node::InlineCode(inline_code) => {
1579 if let Some(pos) = &inline_code.position {
1580 let start_pos = pos.start.offset;
1581 let end_pos = pos.end.offset;
1582
1583 let full_span = &content[start_pos..end_pos];
1585 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1586
1587 let content_start = start_pos + backtick_count;
1589 let content_end = end_pos - backtick_count;
1590 let span_content = if content_start < content_end {
1591 content[content_start..content_end].to_string()
1592 } else {
1593 String::new()
1594 };
1595
1596 let mut line_num = 1;
1598 let mut col_start = start_pos;
1599 for (idx, line_info) in lines.iter().enumerate() {
1600 if start_pos >= line_info.byte_offset {
1601 line_num = idx + 1;
1602 col_start = start_pos - line_info.byte_offset;
1603 } else {
1604 break;
1605 }
1606 }
1607
1608 let mut col_end = end_pos;
1610 for line_info in lines.iter() {
1611 if end_pos > line_info.byte_offset {
1612 col_end = end_pos - line_info.byte_offset;
1613 } else {
1614 break;
1615 }
1616 }
1617
1618 spans.push(CodeSpan {
1619 line: line_num,
1620 start_col: col_start,
1621 end_col: col_end,
1622 byte_offset: start_pos,
1623 byte_end: end_pos,
1624 backtick_count,
1625 content: span_content,
1626 });
1627 }
1628 }
1629 Node::Root(root) => {
1631 for child in &root.children {
1632 extract_code_spans(child, content, lines, spans);
1633 }
1634 }
1635 Node::Paragraph(para) => {
1636 for child in ¶.children {
1637 extract_code_spans(child, content, lines, spans);
1638 }
1639 }
1640 Node::Heading(heading) => {
1641 for child in &heading.children {
1642 extract_code_spans(child, content, lines, spans);
1643 }
1644 }
1645 Node::List(list) => {
1646 for child in &list.children {
1647 extract_code_spans(child, content, lines, spans);
1648 }
1649 }
1650 Node::ListItem(item) => {
1651 for child in &item.children {
1652 extract_code_spans(child, content, lines, spans);
1653 }
1654 }
1655 Node::Blockquote(blockquote) => {
1656 for child in &blockquote.children {
1657 extract_code_spans(child, content, lines, spans);
1658 }
1659 }
1660 Node::Table(table) => {
1661 for child in &table.children {
1662 extract_code_spans(child, content, lines, spans);
1663 }
1664 }
1665 Node::TableRow(row) => {
1666 for child in &row.children {
1667 extract_code_spans(child, content, lines, spans);
1668 }
1669 }
1670 Node::TableCell(cell) => {
1671 for child in &cell.children {
1672 extract_code_spans(child, content, lines, spans);
1673 }
1674 }
1675 Node::Emphasis(emphasis) => {
1676 for child in &emphasis.children {
1677 extract_code_spans(child, content, lines, spans);
1678 }
1679 }
1680 Node::Strong(strong) => {
1681 for child in &strong.children {
1682 extract_code_spans(child, content, lines, spans);
1683 }
1684 }
1685 Node::Link(link) => {
1686 for child in &link.children {
1687 extract_code_spans(child, content, lines, spans);
1688 }
1689 }
1690 Node::LinkReference(link_ref) => {
1691 for child in &link_ref.children {
1692 extract_code_spans(child, content, lines, spans);
1693 }
1694 }
1695 Node::FootnoteDefinition(footnote) => {
1696 for child in &footnote.children {
1697 extract_code_spans(child, content, lines, spans);
1698 }
1699 }
1700 Node::Delete(delete) => {
1701 for child in &delete.children {
1702 extract_code_spans(child, content, lines, spans);
1703 }
1704 }
1705 Node::Code(_)
1707 | Node::Text(_)
1708 | Node::Html(_)
1709 | Node::Image(_)
1710 | Node::ImageReference(_)
1711 | Node::FootnoteReference(_)
1712 | Node::Break(_)
1713 | Node::ThematicBreak(_)
1714 | Node::Definition(_)
1715 | Node::Yaml(_)
1716 | Node::Toml(_)
1717 | Node::Math(_)
1718 | Node::InlineMath(_)
1719 | Node::MdxJsxFlowElement(_)
1720 | Node::MdxFlowExpression(_)
1721 | Node::MdxJsxTextElement(_)
1722 | Node::MdxTextExpression(_)
1723 | Node::MdxjsEsm(_) => {
1724 }
1726 }
1727 }
1728
1729 extract_code_spans(ast, content, lines, &mut code_spans);
1731
1732 code_spans.sort_by_key(|span| span.byte_offset);
1734
1735 code_spans
1736 }
1737
1738 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1740 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1743 let mut last_list_item_line = 0;
1744 let mut current_indent_level = 0;
1745 let mut last_marker_width = 0;
1746
1747 for (line_idx, line_info) in lines.iter().enumerate() {
1748 let line_num = line_idx + 1;
1749
1750 if line_info.in_code_block {
1752 if let Some(ref mut block) = current_block {
1753 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1755
1756 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1758
1759 match context {
1760 CodeBlockContext::Indented => {
1761 block.end_line = line_num;
1763 continue;
1764 }
1765 CodeBlockContext::Standalone => {
1766 let completed_block = current_block.take().unwrap();
1768 list_blocks.push(completed_block);
1769 continue;
1770 }
1771 CodeBlockContext::Adjacent => {
1772 block.end_line = line_num;
1774 continue;
1775 }
1776 }
1777 } else {
1778 continue;
1780 }
1781 }
1782
1783 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1785 caps.get(0).unwrap().as_str().to_string()
1786 } else {
1787 String::new()
1788 };
1789
1790 if let Some(list_item) = &line_info.list_item {
1792 let item_indent = list_item.marker_column;
1794 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1797 let is_nested = nesting > block.nesting_level;
1801 let same_type =
1802 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1803 let same_context = block.blockquote_prefix == blockquote_prefix;
1804 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1808 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1809
1810 let has_non_list_content = {
1812 let mut found_non_list = false;
1813 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1815
1816 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1818 let last_line = &lines[block_last_item_line - 1];
1819 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1820 log::debug!(
1821 "After problematic line {}: checking lines {} to {} for non-list content",
1822 block_last_item_line,
1823 block_last_item_line + 1,
1824 line_num
1825 );
1826 if line_num == block_last_item_line + 1 {
1828 log::debug!("Lines are consecutive, no content between");
1829 }
1830 }
1831 }
1832
1833 for check_line in (block_last_item_line + 1)..line_num {
1834 let check_idx = check_line - 1;
1835 if check_idx < lines.len() {
1836 let check_info = &lines[check_idx];
1837 let is_list_breaking_content = if check_info.in_code_block {
1839 let last_item_marker_width =
1841 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1842 lines[block_last_item_line - 1]
1843 .list_item
1844 .as_ref()
1845 .map(|li| {
1846 if li.is_ordered {
1847 li.marker.len() + 1 } else {
1849 li.marker.len()
1850 }
1851 })
1852 .unwrap_or(3) } else {
1854 3 };
1856
1857 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1858
1859 let context = CodeBlockUtils::analyze_code_block_context(
1861 lines,
1862 check_line - 1,
1863 min_continuation,
1864 );
1865
1866 matches!(context, CodeBlockContext::Standalone)
1868 } else if !check_info.is_blank && check_info.list_item.is_none() {
1869 let line_content = check_info.content.trim();
1871
1872 if check_info.heading.is_some()
1874 || line_content.starts_with("---")
1875 || line_content.starts_with("***")
1876 || line_content.starts_with("___")
1877 || (line_content.contains('|')
1878 && !line_content.contains("](")
1879 && !line_content.contains("http")
1880 && (line_content.matches('|').count() > 1
1881 || line_content.starts_with('|')
1882 || line_content.ends_with('|')))
1883 || line_content.starts_with(">")
1884 {
1885 true
1886 }
1887 else {
1889 let last_item_marker_width =
1890 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1891 lines[block_last_item_line - 1]
1892 .list_item
1893 .as_ref()
1894 .map(|li| {
1895 if li.is_ordered {
1896 li.marker.len() + 1 } else {
1898 li.marker.len()
1899 }
1900 })
1901 .unwrap_or(3) } else {
1903 3 };
1905
1906 let min_continuation =
1907 if block.is_ordered { last_item_marker_width } else { 2 };
1908 check_info.indent < min_continuation
1909 }
1910 } else {
1911 false
1912 };
1913
1914 if is_list_breaking_content {
1915 found_non_list = true;
1917 break;
1918 }
1919 }
1920 }
1921 found_non_list
1922 };
1923
1924 let mut continues_list = if is_nested {
1928 same_context && reasonable_distance && !has_non_list_content
1930 } else {
1931 let result = same_type
1933 && same_context
1934 && reasonable_distance
1935 && marker_compatible
1936 && !has_non_list_content;
1937
1938 if block.item_lines.last().is_some_and(|&last_line| {
1940 last_line > 0
1941 && last_line <= lines.len()
1942 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1943 && lines[last_line - 1].content.contains(r"\`")
1944 }) {
1945 log::debug!(
1946 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1947 );
1948 if line_num > 0 && line_num <= lines.len() {
1949 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1950 }
1951 }
1952
1953 result
1954 };
1955
1956 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1959 if block.item_lines.contains(&(line_num - 1)) {
1961 continues_list = true;
1963 }
1964 }
1965
1966 if continues_list {
1967 block.end_line = line_num;
1969 block.item_lines.push(line_num);
1970
1971 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1973 list_item.marker.len() + 1
1974 } else {
1975 list_item.marker.len()
1976 });
1977
1978 if !block.is_ordered
1980 && block.marker.is_some()
1981 && block.marker.as_ref() != Some(&list_item.marker)
1982 {
1983 block.marker = None;
1985 }
1986 } else {
1987 list_blocks.push(block.clone());
1990
1991 *block = ListBlock {
1992 start_line: line_num,
1993 end_line: line_num,
1994 is_ordered: list_item.is_ordered,
1995 marker: if list_item.is_ordered {
1996 None
1997 } else {
1998 Some(list_item.marker.clone())
1999 },
2000 blockquote_prefix: blockquote_prefix.clone(),
2001 item_lines: vec![line_num],
2002 nesting_level: nesting,
2003 max_marker_width: if list_item.is_ordered {
2004 list_item.marker.len() + 1
2005 } else {
2006 list_item.marker.len()
2007 },
2008 };
2009 }
2010 } else {
2011 current_block = Some(ListBlock {
2013 start_line: line_num,
2014 end_line: line_num,
2015 is_ordered: list_item.is_ordered,
2016 marker: if list_item.is_ordered {
2017 None
2018 } else {
2019 Some(list_item.marker.clone())
2020 },
2021 blockquote_prefix,
2022 item_lines: vec![line_num],
2023 nesting_level: nesting,
2024 max_marker_width: list_item.marker.len(),
2025 });
2026 }
2027
2028 last_list_item_line = line_num;
2029 current_indent_level = item_indent;
2030 last_marker_width = if list_item.is_ordered {
2031 list_item.marker.len() + 1 } else {
2033 list_item.marker.len()
2034 };
2035 } else if let Some(ref mut block) = current_block {
2036 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2046 lines[block.end_line - 1].content.trim_end().ends_with('\\')
2047 } else {
2048 false
2049 };
2050
2051 let min_continuation_indent = if block.is_ordered {
2055 current_indent_level + last_marker_width
2056 } else {
2057 current_indent_level + 2 };
2059
2060 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2061 block.end_line = line_num;
2063 } else if line_info.is_blank {
2064 let mut check_idx = line_idx + 1;
2067 let mut found_continuation = false;
2068
2069 while check_idx < lines.len() && lines[check_idx].is_blank {
2071 check_idx += 1;
2072 }
2073
2074 if check_idx < lines.len() {
2075 let next_line = &lines[check_idx];
2076 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2078 found_continuation = true;
2079 }
2080 else if !next_line.in_code_block
2082 && next_line.list_item.is_some()
2083 && let Some(item) = &next_line.list_item
2084 {
2085 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2086 .find(&next_line.content)
2087 .map_or(String::new(), |m| m.as_str().to_string());
2088 if item.marker_column == current_indent_level
2089 && item.is_ordered == block.is_ordered
2090 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2091 {
2092 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2095 if let Some(between_line) = lines.get(idx) {
2096 let trimmed = between_line.content.trim();
2097 if trimmed.is_empty() {
2099 return false;
2100 }
2101 let line_indent =
2103 between_line.content.len() - between_line.content.trim_start().len();
2104
2105 if trimmed.starts_with("```")
2107 || trimmed.starts_with("~~~")
2108 || trimmed.starts_with("---")
2109 || trimmed.starts_with("***")
2110 || trimmed.starts_with("___")
2111 || trimmed.starts_with(">")
2112 || trimmed.contains('|') || between_line.heading.is_some()
2114 {
2115 return true; }
2117
2118 line_indent >= min_continuation_indent
2120 } else {
2121 false
2122 }
2123 });
2124
2125 if block.is_ordered {
2126 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2129 if let Some(between_line) = lines.get(idx) {
2130 let trimmed = between_line.content.trim();
2131 if trimmed.is_empty() {
2132 return false;
2133 }
2134 trimmed.starts_with("```")
2136 || trimmed.starts_with("~~~")
2137 || trimmed.starts_with("---")
2138 || trimmed.starts_with("***")
2139 || trimmed.starts_with("___")
2140 || trimmed.starts_with(">")
2141 || trimmed.contains('|') || between_line.heading.is_some()
2143 } else {
2144 false
2145 }
2146 });
2147 found_continuation = !has_structural_separators;
2148 } else {
2149 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2151 if let Some(between_line) = lines.get(idx) {
2152 let trimmed = between_line.content.trim();
2153 if trimmed.is_empty() {
2154 return false;
2155 }
2156 trimmed.starts_with("```")
2158 || trimmed.starts_with("~~~")
2159 || trimmed.starts_with("---")
2160 || trimmed.starts_with("***")
2161 || trimmed.starts_with("___")
2162 || trimmed.starts_with(">")
2163 || trimmed.contains('|') || between_line.heading.is_some()
2165 } else {
2166 false
2167 }
2168 });
2169 found_continuation = !has_structural_separators;
2170 }
2171 }
2172 }
2173 }
2174
2175 if found_continuation {
2176 block.end_line = line_num;
2178 } else {
2179 list_blocks.push(block.clone());
2181 current_block = None;
2182 }
2183 } else {
2184 let min_required_indent = if block.is_ordered {
2187 current_indent_level + last_marker_width
2188 } else {
2189 current_indent_level + 2
2190 };
2191
2192 let line_content = line_info.content.trim();
2197 let is_structural_separator = line_info.heading.is_some()
2198 || line_content.starts_with("```")
2199 || line_content.starts_with("~~~")
2200 || line_content.starts_with("---")
2201 || line_content.starts_with("***")
2202 || line_content.starts_with("___")
2203 || line_content.starts_with(">")
2204 || (line_content.contains('|')
2205 && !line_content.contains("](")
2206 && !line_content.contains("http")
2207 && (line_content.matches('|').count() > 1
2208 || line_content.starts_with('|')
2209 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2214 && !line_info.is_blank
2215 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2216
2217 if is_lazy_continuation {
2218 let content_to_check = if !blockquote_prefix.is_empty() {
2221 line_info
2223 .content
2224 .strip_prefix(&blockquote_prefix)
2225 .unwrap_or(&line_info.content)
2226 .trim()
2227 } else {
2228 line_info.content.trim()
2229 };
2230
2231 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2232
2233 if starts_with_uppercase && last_list_item_line > 0 {
2236 list_blocks.push(block.clone());
2238 current_block = None;
2239 } else {
2240 block.end_line = line_num;
2242 }
2243 } else {
2244 list_blocks.push(block.clone());
2246 current_block = None;
2247 }
2248 }
2249 }
2250 }
2251
2252 if let Some(block) = current_block {
2254 list_blocks.push(block);
2255 }
2256
2257 merge_adjacent_list_blocks(&mut list_blocks, lines);
2259
2260 list_blocks
2261 }
2262
2263 fn compute_char_frequency(content: &str) -> CharFrequency {
2265 let mut frequency = CharFrequency::default();
2266
2267 for ch in content.chars() {
2268 match ch {
2269 '#' => frequency.hash_count += 1,
2270 '*' => frequency.asterisk_count += 1,
2271 '_' => frequency.underscore_count += 1,
2272 '-' => frequency.hyphen_count += 1,
2273 '+' => frequency.plus_count += 1,
2274 '>' => frequency.gt_count += 1,
2275 '|' => frequency.pipe_count += 1,
2276 '[' => frequency.bracket_count += 1,
2277 '`' => frequency.backtick_count += 1,
2278 '<' => frequency.lt_count += 1,
2279 '!' => frequency.exclamation_count += 1,
2280 '\n' => frequency.newline_count += 1,
2281 _ => {}
2282 }
2283 }
2284
2285 frequency
2286 }
2287
2288 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2290 lazy_static! {
2291 static ref HTML_TAG_REGEX: regex::Regex =
2292 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2293 }
2294
2295 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2296
2297 for cap in HTML_TAG_REGEX.captures_iter(content) {
2298 let full_match = cap.get(0).unwrap();
2299 let match_start = full_match.start();
2300 let match_end = full_match.end();
2301
2302 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2304 continue;
2305 }
2306
2307 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2308 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2309 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2310
2311 let mut line_num = 1;
2313 let mut col_start = match_start;
2314 let mut col_end = match_end;
2315 for (idx, line_info) in lines.iter().enumerate() {
2316 if match_start >= line_info.byte_offset {
2317 line_num = idx + 1;
2318 col_start = match_start - line_info.byte_offset;
2319 col_end = match_end - line_info.byte_offset;
2320 } else {
2321 break;
2322 }
2323 }
2324
2325 html_tags.push(HtmlTag {
2326 line: line_num,
2327 start_col: col_start,
2328 end_col: col_end,
2329 byte_offset: match_start,
2330 byte_end: match_end,
2331 tag_name,
2332 is_closing,
2333 is_self_closing,
2334 raw_content: full_match.as_str().to_string(),
2335 });
2336 }
2337
2338 html_tags
2339 }
2340
2341 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2343 lazy_static! {
2344 static ref EMPHASIS_REGEX: regex::Regex =
2345 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2346 }
2347
2348 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2349
2350 for cap in EMPHASIS_REGEX.captures_iter(content) {
2351 let full_match = cap.get(0).unwrap();
2352 let match_start = full_match.start();
2353 let match_end = full_match.end();
2354
2355 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2357 continue;
2358 }
2359
2360 let opening_markers = cap.get(1).unwrap().as_str();
2361 let content_part = cap.get(2).unwrap().as_str();
2362 let closing_markers = cap.get(3).unwrap().as_str();
2363
2364 if opening_markers.chars().next() != closing_markers.chars().next()
2366 || opening_markers.len() != closing_markers.len()
2367 {
2368 continue;
2369 }
2370
2371 let marker = opening_markers.chars().next().unwrap();
2372 let marker_count = opening_markers.len();
2373
2374 let mut line_num = 1;
2376 let mut col_start = match_start;
2377 let mut col_end = match_end;
2378 for (idx, line_info) in lines.iter().enumerate() {
2379 if match_start >= line_info.byte_offset {
2380 line_num = idx + 1;
2381 col_start = match_start - line_info.byte_offset;
2382 col_end = match_end - line_info.byte_offset;
2383 } else {
2384 break;
2385 }
2386 }
2387
2388 emphasis_spans.push(EmphasisSpan {
2389 line: line_num,
2390 start_col: col_start,
2391 end_col: col_end,
2392 byte_offset: match_start,
2393 byte_end: match_end,
2394 marker,
2395 marker_count,
2396 content: content_part.to_string(),
2397 });
2398 }
2399
2400 emphasis_spans
2401 }
2402
2403 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2405 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2406
2407 for (line_idx, line_info) in lines.iter().enumerate() {
2408 if line_info.in_code_block || line_info.is_blank {
2410 continue;
2411 }
2412
2413 let line = &line_info.content;
2414 let line_num = line_idx + 1;
2415
2416 if !line.contains('|') {
2418 continue;
2419 }
2420
2421 let parts: Vec<&str> = line.split('|').collect();
2423 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2424
2425 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2427 let mut column_alignments = Vec::new();
2428
2429 if is_separator {
2430 for part in &parts[1..parts.len() - 1] {
2431 let trimmed = part.trim();
2433 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2434 "center".to_string()
2435 } else if trimmed.ends_with(':') {
2436 "right".to_string()
2437 } else if trimmed.starts_with(':') {
2438 "left".to_string()
2439 } else {
2440 "none".to_string()
2441 };
2442 column_alignments.push(alignment);
2443 }
2444 }
2445
2446 table_rows.push(TableRow {
2447 line: line_num,
2448 is_separator,
2449 column_count,
2450 column_alignments,
2451 });
2452 }
2453
2454 table_rows
2455 }
2456
2457 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2459 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2460
2461 for cap in BARE_URL_PATTERN.captures_iter(content) {
2463 let full_match = cap.get(0).unwrap();
2464 let match_start = full_match.start();
2465 let match_end = full_match.end();
2466
2467 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2469 continue;
2470 }
2471
2472 let preceding_char = if match_start > 0 {
2474 content.chars().nth(match_start - 1)
2475 } else {
2476 None
2477 };
2478 let following_char = content.chars().nth(match_end);
2479
2480 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2481 continue;
2482 }
2483 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2484 continue;
2485 }
2486
2487 let url = full_match.as_str();
2488 let url_type = if url.starts_with("https://") {
2489 "https"
2490 } else if url.starts_with("http://") {
2491 "http"
2492 } else if url.starts_with("ftp://") {
2493 "ftp"
2494 } else {
2495 "other"
2496 };
2497
2498 let mut line_num = 1;
2500 let mut col_start = match_start;
2501 let mut col_end = match_end;
2502 for (idx, line_info) in lines.iter().enumerate() {
2503 if match_start >= line_info.byte_offset {
2504 line_num = idx + 1;
2505 col_start = match_start - line_info.byte_offset;
2506 col_end = match_end - line_info.byte_offset;
2507 } else {
2508 break;
2509 }
2510 }
2511
2512 bare_urls.push(BareUrl {
2513 line: line_num,
2514 start_col: col_start,
2515 end_col: col_end,
2516 byte_offset: match_start,
2517 byte_end: match_end,
2518 url: url.to_string(),
2519 url_type: url_type.to_string(),
2520 });
2521 }
2522
2523 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2525 let full_match = cap.get(0).unwrap();
2526 let match_start = full_match.start();
2527 let match_end = full_match.end();
2528
2529 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2531 continue;
2532 }
2533
2534 let preceding_char = if match_start > 0 {
2536 content.chars().nth(match_start - 1)
2537 } else {
2538 None
2539 };
2540 let following_char = content.chars().nth(match_end);
2541
2542 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2543 continue;
2544 }
2545 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2546 continue;
2547 }
2548
2549 let email = full_match.as_str();
2550
2551 let mut line_num = 1;
2553 let mut col_start = match_start;
2554 let mut col_end = match_end;
2555 for (idx, line_info) in lines.iter().enumerate() {
2556 if match_start >= line_info.byte_offset {
2557 line_num = idx + 1;
2558 col_start = match_start - line_info.byte_offset;
2559 col_end = match_end - line_info.byte_offset;
2560 } else {
2561 break;
2562 }
2563 }
2564
2565 bare_urls.push(BareUrl {
2566 line: line_num,
2567 start_col: col_start,
2568 end_col: col_end,
2569 byte_offset: match_start,
2570 byte_end: match_end,
2571 url: email.to_string(),
2572 url_type: "email".to_string(),
2573 });
2574 }
2575
2576 bare_urls
2577 }
2578}
2579
2580fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2582 if list_blocks.len() < 2 {
2583 return;
2584 }
2585
2586 let mut merger = ListBlockMerger::new(lines);
2587 *list_blocks = merger.merge(list_blocks);
2588}
2589
2590struct ListBlockMerger<'a> {
2592 lines: &'a [LineInfo],
2593}
2594
2595impl<'a> ListBlockMerger<'a> {
2596 fn new(lines: &'a [LineInfo]) -> Self {
2597 Self { lines }
2598 }
2599
2600 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2601 let mut merged = Vec::with_capacity(list_blocks.len());
2602 let mut current = list_blocks[0].clone();
2603
2604 for next in list_blocks.iter().skip(1) {
2605 if self.should_merge_blocks(¤t, next) {
2606 current = self.merge_two_blocks(current, next);
2607 } else {
2608 merged.push(current);
2609 current = next.clone();
2610 }
2611 }
2612
2613 merged.push(current);
2614 merged
2615 }
2616
2617 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2619 if !self.blocks_are_compatible(current, next) {
2621 return false;
2622 }
2623
2624 let spacing = self.analyze_spacing_between(current, next);
2626 match spacing {
2627 BlockSpacing::Consecutive => true,
2628 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2629 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2630 self.can_merge_with_content_between(current, next)
2631 }
2632 }
2633 }
2634
2635 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2637 current.is_ordered == next.is_ordered
2638 && current.blockquote_prefix == next.blockquote_prefix
2639 && current.nesting_level == next.nesting_level
2640 }
2641
2642 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2644 let gap = next.start_line - current.end_line;
2645
2646 match gap {
2647 1 => BlockSpacing::Consecutive,
2648 2 => BlockSpacing::SingleBlank,
2649 _ if gap > 2 => {
2650 if self.has_only_blank_lines_between(current, next) {
2651 BlockSpacing::MultipleBlanks
2652 } else {
2653 BlockSpacing::ContentBetween
2654 }
2655 }
2656 _ => BlockSpacing::Consecutive, }
2658 }
2659
2660 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2662 if has_meaningful_content_between(current, next, self.lines) {
2665 return false; }
2667
2668 !current.is_ordered && current.marker == next.marker
2670 }
2671
2672 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2674 if has_meaningful_content_between(current, next, self.lines) {
2676 return false; }
2678
2679 current.is_ordered && next.is_ordered
2681 }
2682
2683 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2685 for line_num in (current.end_line + 1)..next.start_line {
2686 if let Some(line_info) = self.lines.get(line_num - 1)
2687 && !line_info.content.trim().is_empty()
2688 {
2689 return false;
2690 }
2691 }
2692 true
2693 }
2694
2695 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2697 current.end_line = next.end_line;
2698 current.item_lines.extend_from_slice(&next.item_lines);
2699
2700 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2702
2703 if !current.is_ordered && self.markers_differ(¤t, next) {
2705 current.marker = None; }
2707
2708 current
2709 }
2710
2711 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2713 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2714 }
2715}
2716
/// How two list blocks are separated, as classified from the distance between
/// the end line of the first block and the start line of the second.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends
    /// (also used as the fallback for any smaller distance).
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
2725
2726fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2728 for line_num in (current.end_line + 1)..next.start_line {
2730 if let Some(line_info) = lines.get(line_num - 1) {
2731 let trimmed = line_info.content.trim();
2733
2734 if trimmed.is_empty() {
2736 continue;
2737 }
2738
2739 if line_info.heading.is_some() {
2743 return true; }
2745
2746 if is_horizontal_rule(trimmed) {
2748 return true; }
2750
2751 if trimmed.contains('|') && trimmed.len() > 1 {
2754 if !trimmed.contains("](") && !trimmed.contains("http") {
2756 let pipe_count = trimmed.matches('|').count();
2758 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2759 return true; }
2761 }
2762 }
2763
2764 if trimmed.starts_with('>') {
2766 return true; }
2768
2769 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2771 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2772
2773 let min_continuation_indent = if current.is_ordered {
2775 current.nesting_level + current.max_marker_width + 1 } else {
2777 current.nesting_level + 2
2778 };
2779
2780 if line_indent < min_continuation_indent {
2781 return true; }
2784 }
2785
2786 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2788
2789 let min_indent = if current.is_ordered {
2791 current.nesting_level + current.max_marker_width
2792 } else {
2793 current.nesting_level + 2
2794 };
2795
2796 if line_indent < min_indent {
2798 return true; }
2800
2801 }
2804 }
2805
2806 false
2808}
2809
/// Returns `true` when `trimmed` is a horizontal rule: at least three copies
/// of the same marker character (`-`, `*` or `_`), optionally interleaved with
/// spaces or tabs, and nothing else.
///
/// The input is expected to be already trimmed; the first character must be
/// the marker itself, so a string with leading whitespace is rejected.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Cheap rejection: three markers need at least three bytes.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Walk the characters directly instead of collecting them into a buffer.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
2833
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has one line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert!(ctx.lines.is_empty());
    }

    /// A single line without a trailing newline maps offsets onto line 1.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Offsets are resolved to 1-based (line, column) pairs across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let cases = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indent and blank/code flags.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let title = &ctx.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // The lookup helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered and ordered markers are detected; plain text is not a list item.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let dash_item = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be detected as a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        let star_item = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be detected as a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        let numbered_item = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be detected as a list item");
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets on newline bytes and at end of input still resolve to a position.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // (byte offset, expected (line, column))
        let cases = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }
}