1use crate::config::MarkdownFlavor;
2use crate::utils::ast_utils::get_cached_ast;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use markdown::mdast::Node;
6use regex::Regex;
7
lazy_static! {
    // Inline or reference-style link: `[text](url)` or `[text][ref]`.
    // Group 1 = link text, group 2 = inline URL, group 3 = reference id.
    // Verbose (`x`) mode: whitespace and `# ...` comments inside the pattern are ignored.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    // Same shape as LINK_PATTERN but requires the leading `!` of an image.
    // Group 1 = alt text, group 2 = inline URL, group 3 = reference id.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition line: `[id]: url "title"` with up to three leading spaces.
    // Group 1 = id, group 2 = URL, group 3 = double-quoted title, group 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http`, `https` or `ftp` URL with optional port, path, query and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address (`local@domain.tld`, TLD of at least two ASCII letters).
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Autolink in angle brackets: `<scheme://...>` or `<user@host.tld>`; group 1 = target.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Leading blockquote prefix: optional whitespace, one or more `>`, trailing whitespace.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Pre-computed facts about one line of the document, built by `compute_line_info`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Line text as yielded by `str::lines` (no line terminator).
    pub content: String,
    /// Byte offset of the start of this line within the document.
    pub byte_offset: usize,
    /// Number of leading whitespace bytes (`len - trim_start().len()`).
    pub indent: usize,
    /// True when the line is whitespace-only, also after removing a `>` blockquote prefix.
    pub is_blank: bool,
    /// True when the line start lies inside a detected code block.
    pub in_code_block: bool,
    /// True for lines from the opening `---` on line 1 through the closing `---`.
    pub in_front_matter: bool,
    /// Initialised to `false`; set later by the HTML block detection pass.
    pub in_html_block: bool,
    /// List marker details when the line starts a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a Setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with `>` markers.
    pub blockquote: Option<BlockquoteInfo>,
}
87
/// Marker information for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*`, `+`, or digits followed by `.` / `)`.
    pub marker: String,
    /// True for numbered markers, false for bullet markers.
    pub is_ordered: bool,
    /// Parsed item number for ordered markers (`None` for bullets or if parsing fails).
    pub number: Option<usize>,
    /// Byte column of the marker, counting any blockquote prefix that precedes it.
    pub marker_column: usize,
    /// Byte column where the item text starts (marker column + marker + following spaces/tabs).
    pub content_column: usize,
}
102
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Text line underlined with `=` (level 1).
    Setext1,
    /// Text line underlined with `-` (level 2).
    Setext2,
}
113
/// A link found by `parse_links` (inline `[text](url)` or reference `[text][id]`).
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based line number on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset one past the link end within the document.
    pub byte_end: usize,
    /// Text between the first pair of brackets.
    pub text: String,
    /// Inline URL; empty for reference-style links.
    pub url: String,
    /// True for reference-style links.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased text for `[text][]`.
    pub reference_id: Option<String>,
}
136
/// An image found by `parse_images` (inline `![alt](url)` or reference `![alt][id]`).
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based line number on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line on which it ends.
    pub end_col: usize,
    /// Byte offset of the image start (the `!`) within the document.
    pub byte_offset: usize,
    /// Byte offset one past the image end within the document.
    pub byte_end: usize,
    /// Text between the first pair of brackets.
    pub alt_text: String,
    /// Inline URL; empty for reference-style images.
    pub url: String,
    /// True for reference-style images.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased alt text for `![alt][]`.
    pub reference_id: Option<String>,
}
159
/// A reference definition line (`[id]: url "title"`) found by `parse_reference_defs`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Destination URL as written.
    pub url: String,
    /// Optional title (double- or single-quoted in the source).
    pub title: Option<String>,
}
172
/// An inline code span. Produced by `parse_code_spans`, which is not visible here,
/// so field notes below describe how this file *uses* each field.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number; compared against 1-based line numbers by `is_in_code_span`.
    pub line: usize,
    /// Start column; `is_in_code_span` treats `start_col..=end_col` as inclusive.
    pub start_col: usize,
    /// End column (see `start_col`).
    pub end_col: usize,
    /// Document byte offset of the span start; used as the inclusive lower bound in range checks.
    pub byte_offset: usize,
    /// Document byte offset of the span end; used as the exclusive upper bound in range checks.
    pub byte_end: usize,
    /// Presumably the number of backticks delimiting the span — TODO confirm against the parser.
    pub backtick_count: usize,
    /// Presumably the text between the delimiters — TODO confirm against the parser.
    pub content: String,
}
191
/// Details of a heading, attached to the line that carries the heading text.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: number of `#` for ATX, 1 for `=` underline, 2 for `-` underline.
    pub level: u8,
    /// Which heading syntax was used.
    pub style: HeadingStyle,
    /// The `#` run for ATX headings, or the trimmed underline for Setext headings.
    pub marker: String,
    /// Byte column of the marker (for Setext: indentation of the underline line).
    pub marker_column: usize,
    /// Byte column where the heading text starts (for Setext: indentation of the text line).
    pub content_column: usize,
    /// Heading text as returned by `extract_header_id` (presumably with any `{#id}` removed).
    pub text: String,
    /// Custom id from the heading itself or from a standalone attribute-list line after it.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction.
    pub raw_text: String,
    /// True when an ATX heading ends with a closing `#` run.
    pub has_closing_sequence: bool,
    /// The closing `#` run, or empty when there is none.
    pub closing_sequence: String,
}
216
/// Details of a line that begins with blockquote markers.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading, uninterrupted marker run.
    pub nesting_level: usize,
    /// Whitespace that precedes the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (length of `indent`).
    pub marker_column: usize,
    /// `indent` + marker run + the whitespace that follows it.
    pub prefix: String,
    /// Remainder of the line after `prefix`.
    pub content: String,
    /// True when content follows the marker directly, with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace byte, or a tab, follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the line consists of the marker only (no whitespace, no content).
    pub needs_md028_fix: bool,
}
237
/// A contiguous list. Produced by `parse_list_blocks`, which is not visible here,
/// so several field notes are assumptions to confirm against that parser.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block; `is_in_list_block` treats `start_line..=end_line` as inclusive.
    pub start_line: usize,
    /// Last line of the block (inclusive).
    pub end_line: usize,
    /// Presumably true when the block is an ordered list — TODO confirm.
    pub is_ordered: bool,
    /// Presumably the shared bullet marker, if any — TODO confirm.
    pub marker: Option<String>,
    /// Presumably the blockquote prefix the list is nested in — TODO confirm.
    pub blockquote_prefix: String,
    /// Presumably the line numbers of the individual items — TODO confirm.
    pub item_lines: Vec<usize>,
    /// Presumably the nesting depth of the block — TODO confirm.
    pub nesting_level: usize,
    /// Presumably the widest marker in the block — TODO confirm.
    pub max_marker_width: usize,
}
258
259use std::sync::{Arc, Mutex};
260
/// Counters for Markdown-significant characters, used by `has_char`, `char_count`
/// and the `likely_has_*` pre-checks. Filled by `compute_char_frequency` (not visible
/// here); the character named on each field is the one `has_char` maps to it.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Counter for `#`.
    pub hash_count: usize,
    /// Counter for `*`.
    pub asterisk_count: usize,
    /// Counter for `_`.
    pub underscore_count: usize,
    /// Counter for `-`.
    pub hyphen_count: usize,
    /// Counter for `+`.
    pub plus_count: usize,
    /// Counter for `>`.
    pub gt_count: usize,
    /// Counter for `|`.
    pub pipe_count: usize,
    /// Counter looked up for `[`.
    pub bracket_count: usize,
    /// Counter for a backtick.
    pub backtick_count: usize,
    /// Counter for `<`.
    pub lt_count: usize,
    /// Counter for `!`.
    pub exclamation_count: usize,
    /// Counter for `\n`.
    pub newline_count: usize,
}
289
/// An HTML tag occurrence. Produced by `parse_html_tags`, which is not visible here;
/// field notes are inferred from names and should be confirmed against that parser.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number; matched against the argument of `html_tags_on_line`.
    pub line: usize,
    /// Start column on the line — TODO confirm base and unit.
    pub start_col: usize,
    /// End column on the line — TODO confirm base and unit.
    pub end_col: usize,
    /// Presumably the document byte offset of the tag start.
    pub byte_offset: usize,
    /// Presumably the document byte offset of the tag end.
    pub byte_end: usize,
    /// Presumably the element name.
    pub tag_name: String,
    /// Presumably true for `</tag>`.
    pub is_closing: bool,
    /// Presumably true for `<tag/>`.
    pub is_self_closing: bool,
    /// Presumably the tag text as written.
    pub raw_content: String,
}
312
/// An emphasis span. Produced by `parse_emphasis_spans`, which is not visible here;
/// field notes are inferred from names and should be confirmed against that parser.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number; matched against the argument of `emphasis_spans_on_line`.
    pub line: usize,
    /// Start column on the line — TODO confirm base and unit.
    pub start_col: usize,
    /// End column on the line — TODO confirm base and unit.
    pub end_col: usize,
    /// Presumably the document byte offset of the span start.
    pub byte_offset: usize,
    /// Presumably the document byte offset of the span end.
    pub byte_end: usize,
    /// Presumably the delimiter character (`*` or `_`).
    pub marker: char,
    /// Presumably the number of delimiter characters on each side.
    pub marker_count: usize,
    /// Presumably the text between the delimiters.
    pub content: String,
}
333
/// A table row. Produced by `parse_table_rows`, which is not visible here;
/// field notes are inferred from names and should be confirmed against that parser.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number; matched against the argument of `table_rows_on_line`.
    pub line: usize,
    /// Presumably true for the header/body delimiter row.
    pub is_separator: bool,
    /// Presumably the number of cells in the row.
    pub column_count: usize,
    /// Presumably the per-column alignment read from a separator row — TODO confirm values.
    pub column_alignments: Vec<String>,
}
346
/// A bare URL or e-mail occurrence. Produced by `parse_bare_urls`, which is not visible
/// here; field notes are inferred from names and should be confirmed against that parser.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number; matched against the argument of `bare_urls_on_line`.
    pub line: usize,
    /// Start column on the line — TODO confirm base and unit.
    pub start_col: usize,
    /// End column on the line — TODO confirm base and unit.
    pub end_col: usize,
    /// Presumably the document byte offset of the URL start.
    pub byte_offset: usize,
    /// Presumably the document byte offset of the URL end.
    pub byte_end: usize,
    /// Presumably the matched URL or address text.
    pub url: String,
    /// Presumably a scheme/kind label — TODO confirm the set of values.
    pub url_type: String,
}
365
/// Shared, pre-parsed view of one Markdown document.
///
/// Line data, links, images, reference definitions and list blocks are computed in
/// `new`; HTML tags, emphasis spans, table rows, bare URLs and the AST are computed
/// on first access and memoised behind a `Mutex`.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset of the start of every line (first entry is 0).
    pub line_offsets: Vec<usize>,
    /// Byte ranges `(start, end)` returned by `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line metadata, indexed by `line_number - 1`.
    pub lines: Vec<LineInfo>,
    /// Links found in the document.
    pub links: Vec<ParsedLink>,
    /// Images found in the document.
    pub images: Vec<ParsedImage>,
    /// Reference definitions found in the document.
    pub reference_defs: Vec<ReferenceDef>,
    /// Memoised code spans (filled by `new`).
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks found in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Character counters backing `has_char`, `char_count` and `likely_has_*`.
    pub char_frequency: CharFrequency,
    /// Memoised HTML tags (filled on first call to `html_tags`).
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Memoised emphasis spans (filled on first call to `emphasis_spans`).
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Memoised table rows (filled on first call to `table_rows`).
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Memoised bare URLs (filled on first call to `bare_urls`).
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Memoised AST (see `get_ast`).
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
384
385impl<'a> LintContext<'a> {
386 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
387 let mut line_offsets = vec![0];
388 for (i, c) in content.char_indices() {
389 if c == '\n' {
390 line_offsets.push(i + 1);
391 }
392 }
393
394 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
396
397 let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
399
400 let ast = get_cached_ast(content);
402 let code_spans = Self::parse_code_spans(content, &lines, &ast);
403
404 let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
406 let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
407 let reference_defs = Self::parse_reference_defs(content, &lines);
408 let list_blocks = Self::parse_list_blocks(&lines);
409
410 Self::detect_html_blocks(&mut lines);
412
413 let char_frequency = Self::compute_char_frequency(content);
415
416 Self {
417 content,
418 line_offsets,
419 code_blocks,
420 lines,
421 links,
422 images,
423 reference_defs,
424 code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
425 list_blocks,
426 char_frequency,
427 html_tags_cache: Mutex::new(None),
428 emphasis_spans_cache: Mutex::new(None),
429 table_rows_cache: Mutex::new(None),
430 bare_urls_cache: Mutex::new(None),
431 ast_cache: Mutex::new(None),
432 flavor,
433 }
434 }
435
436 pub fn get_ast(&self) -> Arc<Node> {
438 let mut cache = self.ast_cache.lock().unwrap();
439
440 if cache.is_none() {
441 *cache = Some(get_cached_ast(self.content));
444 }
445
446 cache.as_ref().unwrap().clone()
447 }
448
449 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
451 let mut cache = self.code_spans_cache.lock().unwrap();
452
453 if cache.is_none() {
455 let ast = self.get_ast();
456 let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
457 *cache = Some(Arc::new(code_spans));
458 }
459
460 cache.as_ref().unwrap().clone()
462 }
463
464 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
466 let mut cache = self.html_tags_cache.lock().unwrap();
467
468 if cache.is_none() {
469 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
470 *cache = Some(Arc::new(html_tags));
471 }
472
473 cache.as_ref().unwrap().clone()
474 }
475
476 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
478 let mut cache = self.emphasis_spans_cache.lock().unwrap();
479
480 if cache.is_none() {
481 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
482 *cache = Some(Arc::new(emphasis_spans));
483 }
484
485 cache.as_ref().unwrap().clone()
486 }
487
488 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
490 let mut cache = self.table_rows_cache.lock().unwrap();
491
492 if cache.is_none() {
493 let table_rows = Self::parse_table_rows(&self.lines);
494 *cache = Some(Arc::new(table_rows));
495 }
496
497 cache.as_ref().unwrap().clone()
498 }
499
500 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
502 let mut cache = self.bare_urls_cache.lock().unwrap();
503
504 if cache.is_none() {
505 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
506 *cache = Some(Arc::new(bare_urls));
507 }
508
509 cache.as_ref().unwrap().clone()
510 }
511
512 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
514 match self.line_offsets.binary_search(&offset) {
515 Ok(line) => (line + 1, 1),
516 Err(line) => {
517 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
518 (line, offset - line_start + 1)
519 }
520 }
521 }
522
523 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
525 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
527 return true;
528 }
529
530 self.code_spans()
532 .iter()
533 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
534 }
535
536 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
538 if line_num > 0 {
539 self.lines.get(line_num - 1)
540 } else {
541 None
542 }
543 }
544
545 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
547 self.line_info(line_num).map(|info| info.byte_offset)
548 }
549
550 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
552 let normalized_id = ref_id.to_lowercase();
553 self.reference_defs
554 .iter()
555 .find(|def| def.id == normalized_id)
556 .map(|def| def.url.as_str())
557 }
558
559 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
561 self.links.iter().filter(|link| link.line == line_num).collect()
562 }
563
564 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
566 self.images.iter().filter(|img| img.line == line_num).collect()
567 }
568
569 pub fn is_in_list_block(&self, line_num: usize) -> bool {
571 self.list_blocks
572 .iter()
573 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
574 }
575
576 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
578 self.list_blocks
579 .iter()
580 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
581 }
582
583 pub fn is_in_code_block(&self, line_num: usize) -> bool {
587 if line_num == 0 || line_num > self.lines.len() {
588 return false;
589 }
590 self.lines[line_num - 1].in_code_block
591 }
592
593 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
595 if line_num == 0 || line_num > self.lines.len() {
596 return false;
597 }
598 self.lines[line_num - 1].in_front_matter
599 }
600
601 pub fn is_in_html_block(&self, line_num: usize) -> bool {
603 if line_num == 0 || line_num > self.lines.len() {
604 return false;
605 }
606 self.lines[line_num - 1].in_html_block
607 }
608
609 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
611 if line_num == 0 || line_num > self.lines.len() {
612 return false;
613 }
614
615 let code_spans = self.code_spans();
617 code_spans
618 .iter()
619 .any(|span| span.line == line_num && col >= span.start_col && col <= span.end_col)
620 }
621
622 pub fn has_char(&self, ch: char) -> bool {
624 match ch {
625 '#' => self.char_frequency.hash_count > 0,
626 '*' => self.char_frequency.asterisk_count > 0,
627 '_' => self.char_frequency.underscore_count > 0,
628 '-' => self.char_frequency.hyphen_count > 0,
629 '+' => self.char_frequency.plus_count > 0,
630 '>' => self.char_frequency.gt_count > 0,
631 '|' => self.char_frequency.pipe_count > 0,
632 '[' => self.char_frequency.bracket_count > 0,
633 '`' => self.char_frequency.backtick_count > 0,
634 '<' => self.char_frequency.lt_count > 0,
635 '!' => self.char_frequency.exclamation_count > 0,
636 '\n' => self.char_frequency.newline_count > 0,
637 _ => self.content.contains(ch), }
639 }
640
641 pub fn char_count(&self, ch: char) -> usize {
643 match ch {
644 '#' => self.char_frequency.hash_count,
645 '*' => self.char_frequency.asterisk_count,
646 '_' => self.char_frequency.underscore_count,
647 '-' => self.char_frequency.hyphen_count,
648 '+' => self.char_frequency.plus_count,
649 '>' => self.char_frequency.gt_count,
650 '|' => self.char_frequency.pipe_count,
651 '[' => self.char_frequency.bracket_count,
652 '`' => self.char_frequency.backtick_count,
653 '<' => self.char_frequency.lt_count,
654 '!' => self.char_frequency.exclamation_count,
655 '\n' => self.char_frequency.newline_count,
656 _ => self.content.matches(ch).count(), }
658 }
659
660 pub fn likely_has_headings(&self) -> bool {
662 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
664
665 pub fn likely_has_lists(&self) -> bool {
667 self.char_frequency.asterisk_count > 0
668 || self.char_frequency.hyphen_count > 0
669 || self.char_frequency.plus_count > 0
670 }
671
672 pub fn likely_has_emphasis(&self) -> bool {
674 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
675 }
676
677 pub fn likely_has_tables(&self) -> bool {
679 self.char_frequency.pipe_count > 2
680 }
681
682 pub fn likely_has_blockquotes(&self) -> bool {
684 self.char_frequency.gt_count > 0
685 }
686
687 pub fn likely_has_code(&self) -> bool {
689 self.char_frequency.backtick_count > 0
690 }
691
692 pub fn likely_has_links_or_images(&self) -> bool {
694 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
695 }
696
697 pub fn likely_has_html(&self) -> bool {
699 self.char_frequency.lt_count > 0
700 }
701
702 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
704 self.html_tags()
705 .iter()
706 .filter(|tag| tag.line == line_num)
707 .cloned()
708 .collect()
709 }
710
711 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
713 self.emphasis_spans()
714 .iter()
715 .filter(|span| span.line == line_num)
716 .cloned()
717 .collect()
718 }
719
720 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
722 self.table_rows()
723 .iter()
724 .filter(|row| row.line == line_num)
725 .cloned()
726 .collect()
727 }
728
729 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
731 self.bare_urls()
732 .iter()
733 .filter(|url| url.line == line_num)
734 .cloned()
735 .collect()
736 }
737
738 fn parse_links(
740 content: &str,
741 lines: &[LineInfo],
742 code_blocks: &[(usize, usize)],
743 code_spans: &[CodeSpan],
744 flavor: MarkdownFlavor,
745 ) -> Vec<ParsedLink> {
746 use crate::utils::skip_context::is_mkdocs_snippet_line;
747
748 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
753 let full_match = cap.get(0).unwrap();
754 let match_start = full_match.start();
755 let match_end = full_match.end();
756
757 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
759 continue;
760 }
761
762 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
764 continue;
765 }
766
767 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
769 continue;
770 }
771
772 if code_spans
774 .iter()
775 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
776 {
777 continue;
778 }
779
780 let line_idx = lines
783 .iter()
784 .position(|line| {
785 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
786 })
787 .unwrap_or(0);
788
789 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
790 continue;
791 }
792
793 let mut line_num = 1;
795 let mut col_start = match_start;
796 for (idx, line_info) in lines.iter().enumerate() {
797 if match_start >= line_info.byte_offset {
798 line_num = idx + 1;
799 col_start = match_start - line_info.byte_offset;
800 } else {
801 break;
802 }
803 }
804
805 let mut end_line_num = 1;
807 let mut col_end = match_end;
808 for (idx, line_info) in lines.iter().enumerate() {
809 if match_end > line_info.byte_offset {
810 end_line_num = idx + 1;
811 col_end = match_end - line_info.byte_offset;
812 } else {
813 break;
814 }
815 }
816
817 if line_num == end_line_num {
819 } else {
821 }
824
825 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
826
827 if let Some(inline_url) = cap.get(2) {
828 links.push(ParsedLink {
830 line: line_num,
831 start_col: col_start,
832 end_col: col_end,
833 byte_offset: match_start,
834 byte_end: match_end,
835 text,
836 url: inline_url.as_str().to_string(),
837 is_reference: false,
838 reference_id: None,
839 });
840 } else if let Some(ref_id) = cap.get(3) {
841 let ref_id_str = ref_id.as_str();
843 let normalized_ref = if ref_id_str.is_empty() {
844 text.to_lowercase() } else {
846 ref_id_str.to_lowercase()
847 };
848
849 links.push(ParsedLink {
850 line: line_num,
851 start_col: col_start,
852 end_col: col_end,
853 byte_offset: match_start,
854 byte_end: match_end,
855 text,
856 url: String::new(), is_reference: true,
858 reference_id: Some(normalized_ref),
859 });
860 }
861 }
862
863 links
864 }
865
866 fn parse_images(
868 content: &str,
869 lines: &[LineInfo],
870 code_blocks: &[(usize, usize)],
871 code_spans: &[CodeSpan],
872 ) -> Vec<ParsedImage> {
873 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
878 let full_match = cap.get(0).unwrap();
879 let match_start = full_match.start();
880 let match_end = full_match.end();
881
882 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
884 continue;
885 }
886
887 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
889 continue;
890 }
891
892 if code_spans
894 .iter()
895 .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
896 {
897 continue;
898 }
899
900 let mut line_num = 1;
902 let mut col_start = match_start;
903 for (idx, line_info) in lines.iter().enumerate() {
904 if match_start >= line_info.byte_offset {
905 line_num = idx + 1;
906 col_start = match_start - line_info.byte_offset;
907 } else {
908 break;
909 }
910 }
911
912 let mut end_line_num = 1;
914 let mut col_end = match_end;
915 for (idx, line_info) in lines.iter().enumerate() {
916 if match_end > line_info.byte_offset {
917 end_line_num = idx + 1;
918 col_end = match_end - line_info.byte_offset;
919 } else {
920 break;
921 }
922 }
923
924 if line_num == end_line_num {
926 } else {
928 }
931
932 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
933
934 if let Some(inline_url) = cap.get(2) {
935 images.push(ParsedImage {
937 line: line_num,
938 start_col: col_start,
939 end_col: col_end,
940 byte_offset: match_start,
941 byte_end: match_end,
942 alt_text,
943 url: inline_url.as_str().to_string(),
944 is_reference: false,
945 reference_id: None,
946 });
947 } else if let Some(ref_id) = cap.get(3) {
948 let ref_id_str = ref_id.as_str();
950 let normalized_ref = if ref_id_str.is_empty() {
951 alt_text.to_lowercase() } else {
953 ref_id_str.to_lowercase()
954 };
955
956 images.push(ParsedImage {
957 line: line_num,
958 start_col: col_start,
959 end_col: col_end,
960 byte_offset: match_start,
961 byte_end: match_end,
962 alt_text,
963 url: String::new(), is_reference: true,
965 reference_id: Some(normalized_ref),
966 });
967 }
968 }
969
970 images
971 }
972
973 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
975 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
979 if line_info.in_code_block {
981 continue;
982 }
983
984 let line = &line_info.content;
985 let line_num = line_idx + 1;
986
987 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
988 let id = cap.get(1).unwrap().as_str().to_lowercase();
989 let url = cap.get(2).unwrap().as_str().to_string();
990 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
991
992 refs.push(ReferenceDef {
993 line: line_num,
994 id,
995 url,
996 title,
997 });
998 }
999 }
1000
1001 refs
1002 }
1003
1004 fn compute_line_info(
1006 content: &str,
1007 line_offsets: &[usize],
1008 code_blocks: &[(usize, usize)],
1009 flavor: MarkdownFlavor,
1010 ) -> Vec<LineInfo> {
1011 lazy_static! {
1012 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1014 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1015
1016 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1018
1019 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1021 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1022
1023 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1025 }
1026
1027 let content_lines: Vec<&str> = content.lines().collect();
1028 let mut lines = Vec::with_capacity(content_lines.len());
1029
1030 let mut in_front_matter = false;
1032 let mut front_matter_end = 0;
1033 if content_lines.first().map(|l| l.trim()) == Some("---") {
1034 in_front_matter = true;
1035 for (idx, line) in content_lines.iter().enumerate().skip(1) {
1036 if line.trim() == "---" {
1037 front_matter_end = idx;
1038 break;
1039 }
1040 }
1041 }
1042
1043 for (i, line) in content_lines.iter().enumerate() {
1044 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1045 let indent = line.len() - line.trim_start().len();
1046 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1048 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1050 after_prefix.trim().is_empty()
1051 } else {
1052 line.trim().is_empty()
1053 };
1054 let in_code_block = code_blocks.iter().any(|&(start, end)| {
1057 let safe_start = if start > 0 && !content.is_char_boundary(start) {
1062 let mut boundary = start;
1064 while boundary > 0 && !content.is_char_boundary(boundary) {
1065 boundary -= 1;
1066 }
1067 boundary
1068 } else {
1069 start
1070 };
1071
1072 let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1073 let mut boundary = end;
1075 while boundary < content.len() && !content.is_char_boundary(boundary) {
1076 boundary += 1;
1077 }
1078 boundary
1079 } else {
1080 end.min(content.len())
1081 };
1082
1083 let block_content = &content[safe_start..safe_end];
1084 let is_multiline = block_content.contains('\n');
1085 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1086 let is_indented = !is_fenced
1087 && block_content
1088 .lines()
1089 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
1090
1091 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1092 });
1093
1094 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
1096 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1098 let prefix = caps.get(1).unwrap().as_str();
1099 let content = caps.get(2).unwrap().as_str();
1100 (content, prefix.len())
1101 } else {
1102 (&**line, 0)
1103 };
1104
1105 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1106 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1107 let marker = caps.get(2).map_or("", |m| m.as_str());
1108 let spacing = caps.get(3).map_or("", |m| m.as_str());
1109 let _content = caps.get(4).map_or("", |m| m.as_str());
1110 let marker_column = blockquote_prefix_len + leading_spaces.len();
1111 let content_column = marker_column + marker.len() + spacing.len();
1112
1113 if spacing.is_empty() {
1120 None
1121 } else {
1122 Some(ListItemInfo {
1123 marker: marker.to_string(),
1124 is_ordered: false,
1125 number: None,
1126 marker_column,
1127 content_column,
1128 })
1129 }
1130 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1131 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1132 let number_str = caps.get(2).map_or("", |m| m.as_str());
1133 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1134 let spacing = caps.get(4).map_or("", |m| m.as_str());
1135 let _content = caps.get(5).map_or("", |m| m.as_str());
1136 let marker = format!("{number_str}{delimiter}");
1137 let marker_column = blockquote_prefix_len + leading_spaces.len();
1138 let content_column = marker_column + marker.len() + spacing.len();
1139
1140 if spacing.is_empty() {
1143 None
1144 } else {
1145 Some(ListItemInfo {
1146 marker,
1147 is_ordered: true,
1148 number: number_str.parse().ok(),
1149 marker_column,
1150 content_column,
1151 })
1152 }
1153 } else {
1154 None
1155 }
1156 } else {
1157 None
1158 };
1159
1160 lines.push(LineInfo {
1161 content: line.to_string(),
1162 byte_offset,
1163 indent,
1164 is_blank,
1165 in_code_block,
1166 in_front_matter: in_front_matter && i <= front_matter_end,
1167 in_html_block: false, list_item,
1169 heading: None, blockquote: None, });
1172 }
1173
1174 for i in 0..content_lines.len() {
1176 if lines[i].in_code_block {
1177 continue;
1178 }
1179
1180 if in_front_matter && i <= front_matter_end {
1182 continue;
1183 }
1184
1185 let line = content_lines[i];
1186
1187 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1189 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1190 let markers = caps.get(2).map_or("", |m| m.as_str());
1191 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1192 let content = caps.get(4).map_or("", |m| m.as_str());
1193
1194 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1195 let marker_column = indent_str.len();
1196
1197 let prefix = format!("{indent_str}{markers}{spaces_after}");
1199
1200 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1202 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1204
1205 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1209
1210 lines[i].blockquote = Some(BlockquoteInfo {
1211 nesting_level,
1212 indent: indent_str.to_string(),
1213 marker_column,
1214 prefix,
1215 content: content.to_string(),
1216 has_no_space_after_marker: has_no_space,
1217 has_multiple_spaces_after_marker: has_multiple_spaces,
1218 needs_md028_fix,
1219 });
1220 }
1221
1222 if lines[i].is_blank {
1224 continue;
1225 }
1226
1227 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1230 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1231 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1232 } else {
1233 false
1234 };
1235
1236 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1237 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1239 continue;
1240 }
1241 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1242 let hashes = caps.get(2).map_or("", |m| m.as_str());
1243 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1244 let rest = caps.get(4).map_or("", |m| m.as_str());
1245
1246 let level = hashes.len() as u8;
1247 let marker_column = leading_spaces.len();
1248
1249 let (text, has_closing, closing_seq) = {
1251 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1253 if rest[id_start..].trim_end().ends_with('}') {
1255 (&rest[..id_start], &rest[id_start..])
1257 } else {
1258 (rest, "")
1259 }
1260 } else {
1261 (rest, "")
1262 };
1263
1264 let trimmed_rest = rest_without_id.trim_end();
1266 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1267 let mut start_of_hashes = last_hash_pos;
1269 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1270 start_of_hashes -= 1;
1271 }
1272
1273 let has_space_before = start_of_hashes == 0
1275 || trimmed_rest
1276 .chars()
1277 .nth(start_of_hashes - 1)
1278 .is_some_and(|c| c.is_whitespace());
1279
1280 let potential_closing = &trimmed_rest[start_of_hashes..];
1282 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1283
1284 if is_all_hashes && has_space_before {
1285 let closing_hashes = potential_closing.to_string();
1287 let text_part = if !custom_id_part.is_empty() {
1290 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1293 } else {
1294 rest_without_id[..start_of_hashes].trim_end().to_string()
1295 };
1296 (text_part, true, closing_hashes)
1297 } else {
1298 (rest.to_string(), false, String::new())
1300 }
1301 } else {
1302 (rest.to_string(), false, String::new())
1304 }
1305 };
1306
1307 let content_column = marker_column + hashes.len() + spaces_after.len();
1308
1309 let raw_text = text.trim().to_string();
1311 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1312
1313 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1315 let next_line = content_lines[i + 1];
1316 if !lines[i + 1].in_code_block
1317 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1318 && let Some(next_line_id) =
1319 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1320 {
1321 custom_id = Some(next_line_id);
1322 }
1323 }
1324
1325 lines[i].heading = Some(HeadingInfo {
1326 level,
1327 style: HeadingStyle::ATX,
1328 marker: hashes.to_string(),
1329 marker_column,
1330 content_column,
1331 text: clean_text,
1332 custom_id,
1333 raw_text,
1334 has_closing_sequence: has_closing,
1335 closing_sequence: closing_seq,
1336 });
1337 }
1338 else if i + 1 < content_lines.len() {
1340 let next_line = content_lines[i + 1];
1341 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1342 if in_front_matter && i < front_matter_end {
1344 continue;
1345 }
1346
1347 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1349 continue;
1350 }
1351
1352 let underline = next_line.trim();
1353
1354 if underline == "---" {
1357 continue;
1358 }
1359
1360 let current_line_trimmed = line.trim();
1362 if current_line_trimmed.contains(':')
1363 && !current_line_trimmed.starts_with('#')
1364 && !current_line_trimmed.contains('[')
1365 && !current_line_trimmed.contains("](")
1366 {
1367 continue;
1369 }
1370
1371 let level = if underline.starts_with('=') { 1 } else { 2 };
1372 let style = if level == 1 {
1373 HeadingStyle::Setext1
1374 } else {
1375 HeadingStyle::Setext2
1376 };
1377
1378 let raw_text = line.trim().to_string();
1380 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1381
1382 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1384 let attr_line = content_lines[i + 2];
1385 if !lines[i + 2].in_code_block
1386 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1387 && let Some(attr_line_id) =
1388 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1389 {
1390 custom_id = Some(attr_line_id);
1391 }
1392 }
1393
1394 lines[i].heading = Some(HeadingInfo {
1395 level,
1396 style,
1397 marker: underline.to_string(),
1398 marker_column: next_line.len() - next_line.trim_start().len(),
1399 content_column: lines[i].indent,
1400 text: clean_text,
1401 custom_id,
1402 raw_text,
1403 has_closing_sequence: false,
1404 closing_sequence: String::new(),
1405 });
1406 }
1407 }
1408 }
1409
1410 lines
1411 }
1412
1413 fn detect_html_blocks(lines: &mut [LineInfo]) {
1415 const BLOCK_ELEMENTS: &[&str] = &[
1417 "address",
1418 "article",
1419 "aside",
1420 "blockquote",
1421 "details",
1422 "dialog",
1423 "dd",
1424 "div",
1425 "dl",
1426 "dt",
1427 "fieldset",
1428 "figcaption",
1429 "figure",
1430 "footer",
1431 "form",
1432 "h1",
1433 "h2",
1434 "h3",
1435 "h4",
1436 "h5",
1437 "h6",
1438 "header",
1439 "hr",
1440 "li",
1441 "main",
1442 "nav",
1443 "ol",
1444 "p",
1445 "pre",
1446 "section",
1447 "table",
1448 "tbody",
1449 "td",
1450 "tfoot",
1451 "th",
1452 "thead",
1453 "tr",
1454 "ul",
1455 ];
1456
1457 let mut i = 0;
1458 while i < lines.len() {
1459 if lines[i].in_code_block || lines[i].in_front_matter {
1461 i += 1;
1462 continue;
1463 }
1464
1465 let trimmed = lines[i].content.trim_start();
1466
1467 if trimmed.starts_with('<') && trimmed.len() > 1 {
1469 let after_bracket = &trimmed[1..];
1471 let is_closing = after_bracket.starts_with('/');
1472 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1473
1474 let tag_name = tag_start
1476 .chars()
1477 .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1478 .collect::<String>()
1479 .to_lowercase();
1480
1481 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1483 lines[i].in_html_block = true;
1485
1486 if !is_closing {
1489 let closing_tag = format!("</{tag_name}>");
1490 let mut j = i + 1;
1491 while j < lines.len() && j < i + 100 {
1492 if lines[j].is_blank {
1495 break;
1496 }
1497
1498 lines[j].in_html_block = true;
1499
1500 if lines[j].content.contains(&closing_tag) {
1502 break;
1503 }
1504 j += 1;
1505 }
1506 }
1507 }
1508 }
1509
1510 i += 1;
1511 }
1512 }
1513
1514 fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1516 let mut code_spans = Vec::new();
1517
1518 if !content.contains('`') {
1520 return code_spans;
1521 }
1522
1523 fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1525 match node {
1526 Node::InlineCode(inline_code) => {
1527 if let Some(pos) = &inline_code.position {
1528 let start_pos = pos.start.offset;
1529 let end_pos = pos.end.offset;
1530
1531 let full_span = &content[start_pos..end_pos];
1533 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1534
1535 let content_start = start_pos + backtick_count;
1537 let content_end = end_pos - backtick_count;
1538 let span_content = if content_start < content_end {
1539 content[content_start..content_end].to_string()
1540 } else {
1541 String::new()
1542 };
1543
1544 let mut line_num = 1;
1546 let mut col_start = start_pos;
1547 for (idx, line_info) in lines.iter().enumerate() {
1548 if start_pos >= line_info.byte_offset {
1549 line_num = idx + 1;
1550 col_start = start_pos - line_info.byte_offset;
1551 } else {
1552 break;
1553 }
1554 }
1555
1556 let mut col_end = end_pos;
1558 for line_info in lines.iter() {
1559 if end_pos > line_info.byte_offset {
1560 col_end = end_pos - line_info.byte_offset;
1561 } else {
1562 break;
1563 }
1564 }
1565
1566 spans.push(CodeSpan {
1567 line: line_num,
1568 start_col: col_start,
1569 end_col: col_end,
1570 byte_offset: start_pos,
1571 byte_end: end_pos,
1572 backtick_count,
1573 content: span_content,
1574 });
1575 }
1576 }
1577 Node::Root(root) => {
1579 for child in &root.children {
1580 extract_code_spans(child, content, lines, spans);
1581 }
1582 }
1583 Node::Paragraph(para) => {
1584 for child in ¶.children {
1585 extract_code_spans(child, content, lines, spans);
1586 }
1587 }
1588 Node::Heading(heading) => {
1589 for child in &heading.children {
1590 extract_code_spans(child, content, lines, spans);
1591 }
1592 }
1593 Node::List(list) => {
1594 for child in &list.children {
1595 extract_code_spans(child, content, lines, spans);
1596 }
1597 }
1598 Node::ListItem(item) => {
1599 for child in &item.children {
1600 extract_code_spans(child, content, lines, spans);
1601 }
1602 }
1603 Node::Blockquote(blockquote) => {
1604 for child in &blockquote.children {
1605 extract_code_spans(child, content, lines, spans);
1606 }
1607 }
1608 Node::Table(table) => {
1609 for child in &table.children {
1610 extract_code_spans(child, content, lines, spans);
1611 }
1612 }
1613 Node::TableRow(row) => {
1614 for child in &row.children {
1615 extract_code_spans(child, content, lines, spans);
1616 }
1617 }
1618 Node::TableCell(cell) => {
1619 for child in &cell.children {
1620 extract_code_spans(child, content, lines, spans);
1621 }
1622 }
1623 Node::Emphasis(emphasis) => {
1624 for child in &emphasis.children {
1625 extract_code_spans(child, content, lines, spans);
1626 }
1627 }
1628 Node::Strong(strong) => {
1629 for child in &strong.children {
1630 extract_code_spans(child, content, lines, spans);
1631 }
1632 }
1633 Node::Link(link) => {
1634 for child in &link.children {
1635 extract_code_spans(child, content, lines, spans);
1636 }
1637 }
1638 Node::LinkReference(link_ref) => {
1639 for child in &link_ref.children {
1640 extract_code_spans(child, content, lines, spans);
1641 }
1642 }
1643 Node::FootnoteDefinition(footnote) => {
1644 for child in &footnote.children {
1645 extract_code_spans(child, content, lines, spans);
1646 }
1647 }
1648 Node::Delete(delete) => {
1649 for child in &delete.children {
1650 extract_code_spans(child, content, lines, spans);
1651 }
1652 }
1653 Node::Code(_)
1655 | Node::Text(_)
1656 | Node::Html(_)
1657 | Node::Image(_)
1658 | Node::ImageReference(_)
1659 | Node::FootnoteReference(_)
1660 | Node::Break(_)
1661 | Node::ThematicBreak(_)
1662 | Node::Definition(_)
1663 | Node::Yaml(_)
1664 | Node::Toml(_)
1665 | Node::Math(_)
1666 | Node::InlineMath(_)
1667 | Node::MdxJsxFlowElement(_)
1668 | Node::MdxFlowExpression(_)
1669 | Node::MdxJsxTextElement(_)
1670 | Node::MdxTextExpression(_)
1671 | Node::MdxjsEsm(_) => {
1672 }
1674 }
1675 }
1676
1677 extract_code_spans(ast, content, lines, &mut code_spans);
1679
1680 code_spans.sort_by_key(|span| span.byte_offset);
1682
1683 code_spans
1684 }
1685
1686 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1688 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1691 let mut last_list_item_line = 0;
1692 let mut current_indent_level = 0;
1693 let mut last_marker_width = 0;
1694
1695 for (line_idx, line_info) in lines.iter().enumerate() {
1696 let line_num = line_idx + 1;
1697
1698 if line_info.in_code_block {
1700 if let Some(ref mut block) = current_block {
1701 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1703
1704 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1706
1707 match context {
1708 CodeBlockContext::Indented => {
1709 block.end_line = line_num;
1711 continue;
1712 }
1713 CodeBlockContext::Standalone => {
1714 let completed_block = current_block.take().unwrap();
1716 list_blocks.push(completed_block);
1717 continue;
1718 }
1719 CodeBlockContext::Adjacent => {
1720 block.end_line = line_num;
1722 continue;
1723 }
1724 }
1725 } else {
1726 continue;
1728 }
1729 }
1730
1731 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1733 caps.get(0).unwrap().as_str().to_string()
1734 } else {
1735 String::new()
1736 };
1737
1738 if let Some(list_item) = &line_info.list_item {
1740 let item_indent = list_item.marker_column;
1742 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1745 let is_nested = nesting > block.nesting_level;
1749 let same_type =
1750 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1751 let same_context = block.blockquote_prefix == blockquote_prefix;
1752 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1756 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1757
1758 let has_non_list_content = {
1760 let mut found_non_list = false;
1761 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1763
1764 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1766 let last_line = &lines[block_last_item_line - 1];
1767 if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1768 log::debug!(
1769 "After problematic line {}: checking lines {} to {} for non-list content",
1770 block_last_item_line,
1771 block_last_item_line + 1,
1772 line_num
1773 );
1774 if line_num == block_last_item_line + 1 {
1776 log::debug!("Lines are consecutive, no content between");
1777 }
1778 }
1779 }
1780
1781 for check_line in (block_last_item_line + 1)..line_num {
1782 let check_idx = check_line - 1;
1783 if check_idx < lines.len() {
1784 let check_info = &lines[check_idx];
1785 let is_list_breaking_content = if check_info.in_code_block {
1787 let last_item_marker_width =
1789 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1790 lines[block_last_item_line - 1]
1791 .list_item
1792 .as_ref()
1793 .map(|li| {
1794 if li.is_ordered {
1795 li.marker.len() + 1 } else {
1797 li.marker.len()
1798 }
1799 })
1800 .unwrap_or(3) } else {
1802 3 };
1804
1805 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1806
1807 let context = CodeBlockUtils::analyze_code_block_context(
1809 lines,
1810 check_line - 1,
1811 min_continuation,
1812 );
1813
1814 matches!(context, CodeBlockContext::Standalone)
1816 } else if !check_info.is_blank && check_info.list_item.is_none() {
1817 let line_content = check_info.content.trim();
1819
1820 if check_info.heading.is_some()
1822 || line_content.starts_with("---")
1823 || line_content.starts_with("***")
1824 || line_content.starts_with("___")
1825 || (line_content.contains('|')
1826 && !line_content.contains("](")
1827 && !line_content.contains("http")
1828 && (line_content.matches('|').count() > 1
1829 || line_content.starts_with('|')
1830 || line_content.ends_with('|')))
1831 || line_content.starts_with(">")
1832 {
1833 true
1834 }
1835 else {
1837 let last_item_marker_width =
1838 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1839 lines[block_last_item_line - 1]
1840 .list_item
1841 .as_ref()
1842 .map(|li| {
1843 if li.is_ordered {
1844 li.marker.len() + 1 } else {
1846 li.marker.len()
1847 }
1848 })
1849 .unwrap_or(3) } else {
1851 3 };
1853
1854 let min_continuation =
1855 if block.is_ordered { last_item_marker_width } else { 2 };
1856 check_info.indent < min_continuation
1857 }
1858 } else {
1859 false
1860 };
1861
1862 if is_list_breaking_content {
1863 found_non_list = true;
1865 break;
1866 }
1867 }
1868 }
1869 found_non_list
1870 };
1871
1872 let mut continues_list = if is_nested {
1876 same_context && reasonable_distance && !has_non_list_content
1878 } else {
1879 let result = same_type
1881 && same_context
1882 && reasonable_distance
1883 && marker_compatible
1884 && !has_non_list_content;
1885
1886 if block.item_lines.last().is_some_and(|&last_line| {
1888 last_line > 0
1889 && last_line <= lines.len()
1890 && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1891 && lines[last_line - 1].content.contains(r"\`")
1892 }) {
1893 log::debug!(
1894 "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1895 );
1896 if line_num > 0 && line_num <= lines.len() {
1897 log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1898 }
1899 }
1900
1901 result
1902 };
1903
1904 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1907 if block.item_lines.contains(&(line_num - 1)) {
1909 continues_list = true;
1911 }
1912 }
1913
1914 if continues_list {
1915 block.end_line = line_num;
1917 block.item_lines.push(line_num);
1918
1919 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1921 list_item.marker.len() + 1
1922 } else {
1923 list_item.marker.len()
1924 });
1925
1926 if !block.is_ordered
1928 && block.marker.is_some()
1929 && block.marker.as_ref() != Some(&list_item.marker)
1930 {
1931 block.marker = None;
1933 }
1934 } else {
1935 list_blocks.push(block.clone());
1938
1939 *block = ListBlock {
1940 start_line: line_num,
1941 end_line: line_num,
1942 is_ordered: list_item.is_ordered,
1943 marker: if list_item.is_ordered {
1944 None
1945 } else {
1946 Some(list_item.marker.clone())
1947 },
1948 blockquote_prefix: blockquote_prefix.clone(),
1949 item_lines: vec![line_num],
1950 nesting_level: nesting,
1951 max_marker_width: if list_item.is_ordered {
1952 list_item.marker.len() + 1
1953 } else {
1954 list_item.marker.len()
1955 },
1956 };
1957 }
1958 } else {
1959 current_block = Some(ListBlock {
1961 start_line: line_num,
1962 end_line: line_num,
1963 is_ordered: list_item.is_ordered,
1964 marker: if list_item.is_ordered {
1965 None
1966 } else {
1967 Some(list_item.marker.clone())
1968 },
1969 blockquote_prefix,
1970 item_lines: vec![line_num],
1971 nesting_level: nesting,
1972 max_marker_width: list_item.marker.len(),
1973 });
1974 }
1975
1976 last_list_item_line = line_num;
1977 current_indent_level = item_indent;
1978 last_marker_width = if list_item.is_ordered {
1979 list_item.marker.len() + 1 } else {
1981 list_item.marker.len()
1982 };
1983 } else if let Some(ref mut block) = current_block {
1984 let min_continuation_indent = if block.is_ordered {
1995 current_indent_level + last_marker_width
1996 } else {
1997 current_indent_level + 2 };
1999
2000 if line_info.indent >= min_continuation_indent {
2001 block.end_line = line_num;
2003 } else if line_info.is_blank {
2004 let mut check_idx = line_idx + 1;
2007 let mut found_continuation = false;
2008
2009 while check_idx < lines.len() && lines[check_idx].is_blank {
2011 check_idx += 1;
2012 }
2013
2014 if check_idx < lines.len() {
2015 let next_line = &lines[check_idx];
2016 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2018 found_continuation = true;
2019 }
2020 else if !next_line.in_code_block
2022 && next_line.list_item.is_some()
2023 && let Some(item) = &next_line.list_item
2024 {
2025 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2026 .find(&next_line.content)
2027 .map_or(String::new(), |m| m.as_str().to_string());
2028 if item.marker_column == current_indent_level
2029 && item.is_ordered == block.is_ordered
2030 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2031 {
2032 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2035 if let Some(between_line) = lines.get(idx) {
2036 let trimmed = between_line.content.trim();
2037 if trimmed.is_empty() {
2039 return false;
2040 }
2041 let line_indent =
2043 between_line.content.len() - between_line.content.trim_start().len();
2044
2045 if trimmed.starts_with("```")
2047 || trimmed.starts_with("~~~")
2048 || trimmed.starts_with("---")
2049 || trimmed.starts_with("***")
2050 || trimmed.starts_with("___")
2051 || trimmed.starts_with(">")
2052 || trimmed.contains('|') || between_line.heading.is_some()
2054 {
2055 return true; }
2057
2058 line_indent >= min_continuation_indent
2060 } else {
2061 false
2062 }
2063 });
2064
2065 if block.is_ordered {
2066 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2069 if let Some(between_line) = lines.get(idx) {
2070 let trimmed = between_line.content.trim();
2071 if trimmed.is_empty() {
2072 return false;
2073 }
2074 trimmed.starts_with("```")
2076 || trimmed.starts_with("~~~")
2077 || trimmed.starts_with("---")
2078 || trimmed.starts_with("***")
2079 || trimmed.starts_with("___")
2080 || trimmed.starts_with(">")
2081 || trimmed.contains('|') || between_line.heading.is_some()
2083 } else {
2084 false
2085 }
2086 });
2087 found_continuation = !has_structural_separators;
2088 } else {
2089 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2091 if let Some(between_line) = lines.get(idx) {
2092 let trimmed = between_line.content.trim();
2093 if trimmed.is_empty() {
2094 return false;
2095 }
2096 trimmed.starts_with("```")
2098 || trimmed.starts_with("~~~")
2099 || trimmed.starts_with("---")
2100 || trimmed.starts_with("***")
2101 || trimmed.starts_with("___")
2102 || trimmed.starts_with(">")
2103 || trimmed.contains('|') || between_line.heading.is_some()
2105 } else {
2106 false
2107 }
2108 });
2109 found_continuation = !has_structural_separators;
2110 }
2111 }
2112 }
2113 }
2114
2115 if found_continuation {
2116 block.end_line = line_num;
2118 } else {
2119 list_blocks.push(block.clone());
2121 current_block = None;
2122 }
2123 } else {
2124 let min_required_indent = if block.is_ordered {
2127 current_indent_level + last_marker_width
2128 } else {
2129 current_indent_level + 2
2130 };
2131
2132 let line_content = line_info.content.trim();
2137 let is_structural_separator = line_info.heading.is_some()
2138 || line_content.starts_with("```")
2139 || line_content.starts_with("~~~")
2140 || line_content.starts_with("---")
2141 || line_content.starts_with("***")
2142 || line_content.starts_with("___")
2143 || line_content.starts_with(">")
2144 || (line_content.contains('|')
2145 && !line_content.contains("](")
2146 && !line_content.contains("http")
2147 && (line_content.matches('|').count() > 1
2148 || line_content.starts_with('|')
2149 || line_content.ends_with('|'))); let is_lazy_continuation = !is_structural_separator
2154 && !line_info.is_blank
2155 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2156
2157 if is_lazy_continuation {
2158 let content_to_check = if !blockquote_prefix.is_empty() {
2161 line_info
2163 .content
2164 .strip_prefix(&blockquote_prefix)
2165 .unwrap_or(&line_info.content)
2166 .trim()
2167 } else {
2168 line_info.content.trim()
2169 };
2170
2171 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2172
2173 if starts_with_uppercase && last_list_item_line > 0 {
2176 list_blocks.push(block.clone());
2178 current_block = None;
2179 } else {
2180 block.end_line = line_num;
2182 }
2183 } else {
2184 list_blocks.push(block.clone());
2186 current_block = None;
2187 }
2188 }
2189 }
2190 }
2191
2192 if let Some(block) = current_block {
2194 list_blocks.push(block);
2195 }
2196
2197 merge_adjacent_list_blocks(&mut list_blocks, lines);
2199
2200 list_blocks
2201 }
2202
2203 fn compute_char_frequency(content: &str) -> CharFrequency {
2205 let mut frequency = CharFrequency::default();
2206
2207 for ch in content.chars() {
2208 match ch {
2209 '#' => frequency.hash_count += 1,
2210 '*' => frequency.asterisk_count += 1,
2211 '_' => frequency.underscore_count += 1,
2212 '-' => frequency.hyphen_count += 1,
2213 '+' => frequency.plus_count += 1,
2214 '>' => frequency.gt_count += 1,
2215 '|' => frequency.pipe_count += 1,
2216 '[' => frequency.bracket_count += 1,
2217 '`' => frequency.backtick_count += 1,
2218 '<' => frequency.lt_count += 1,
2219 '!' => frequency.exclamation_count += 1,
2220 '\n' => frequency.newline_count += 1,
2221 _ => {}
2222 }
2223 }
2224
2225 frequency
2226 }
2227
2228 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2230 lazy_static! {
2231 static ref HTML_TAG_REGEX: regex::Regex =
2232 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2233 }
2234
2235 let mut html_tags = Vec::with_capacity(content.matches('<').count());
2236
2237 for cap in HTML_TAG_REGEX.captures_iter(content) {
2238 let full_match = cap.get(0).unwrap();
2239 let match_start = full_match.start();
2240 let match_end = full_match.end();
2241
2242 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2244 continue;
2245 }
2246
2247 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2248 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2249 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2250
2251 let mut line_num = 1;
2253 let mut col_start = match_start;
2254 let mut col_end = match_end;
2255 for (idx, line_info) in lines.iter().enumerate() {
2256 if match_start >= line_info.byte_offset {
2257 line_num = idx + 1;
2258 col_start = match_start - line_info.byte_offset;
2259 col_end = match_end - line_info.byte_offset;
2260 } else {
2261 break;
2262 }
2263 }
2264
2265 html_tags.push(HtmlTag {
2266 line: line_num,
2267 start_col: col_start,
2268 end_col: col_end,
2269 byte_offset: match_start,
2270 byte_end: match_end,
2271 tag_name,
2272 is_closing,
2273 is_self_closing,
2274 raw_content: full_match.as_str().to_string(),
2275 });
2276 }
2277
2278 html_tags
2279 }
2280
2281 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2283 lazy_static! {
2284 static ref EMPHASIS_REGEX: regex::Regex =
2285 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2286 }
2287
2288 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2289
2290 for cap in EMPHASIS_REGEX.captures_iter(content) {
2291 let full_match = cap.get(0).unwrap();
2292 let match_start = full_match.start();
2293 let match_end = full_match.end();
2294
2295 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2297 continue;
2298 }
2299
2300 let opening_markers = cap.get(1).unwrap().as_str();
2301 let content_part = cap.get(2).unwrap().as_str();
2302 let closing_markers = cap.get(3).unwrap().as_str();
2303
2304 if opening_markers.chars().next() != closing_markers.chars().next()
2306 || opening_markers.len() != closing_markers.len()
2307 {
2308 continue;
2309 }
2310
2311 let marker = opening_markers.chars().next().unwrap();
2312 let marker_count = opening_markers.len();
2313
2314 let mut line_num = 1;
2316 let mut col_start = match_start;
2317 let mut col_end = match_end;
2318 for (idx, line_info) in lines.iter().enumerate() {
2319 if match_start >= line_info.byte_offset {
2320 line_num = idx + 1;
2321 col_start = match_start - line_info.byte_offset;
2322 col_end = match_end - line_info.byte_offset;
2323 } else {
2324 break;
2325 }
2326 }
2327
2328 emphasis_spans.push(EmphasisSpan {
2329 line: line_num,
2330 start_col: col_start,
2331 end_col: col_end,
2332 byte_offset: match_start,
2333 byte_end: match_end,
2334 marker,
2335 marker_count,
2336 content: content_part.to_string(),
2337 });
2338 }
2339
2340 emphasis_spans
2341 }
2342
2343 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2345 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2346
2347 for (line_idx, line_info) in lines.iter().enumerate() {
2348 if line_info.in_code_block || line_info.is_blank {
2350 continue;
2351 }
2352
2353 let line = &line_info.content;
2354 let line_num = line_idx + 1;
2355
2356 if !line.contains('|') {
2358 continue;
2359 }
2360
2361 let parts: Vec<&str> = line.split('|').collect();
2363 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2364
2365 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2367 let mut column_alignments = Vec::new();
2368
2369 if is_separator {
2370 for part in &parts[1..parts.len() - 1] {
2371 let trimmed = part.trim();
2373 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2374 "center".to_string()
2375 } else if trimmed.ends_with(':') {
2376 "right".to_string()
2377 } else if trimmed.starts_with(':') {
2378 "left".to_string()
2379 } else {
2380 "none".to_string()
2381 };
2382 column_alignments.push(alignment);
2383 }
2384 }
2385
2386 table_rows.push(TableRow {
2387 line: line_num,
2388 is_separator,
2389 column_count,
2390 column_alignments,
2391 });
2392 }
2393
2394 table_rows
2395 }
2396
2397 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2399 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2400
2401 for cap in BARE_URL_PATTERN.captures_iter(content) {
2403 let full_match = cap.get(0).unwrap();
2404 let match_start = full_match.start();
2405 let match_end = full_match.end();
2406
2407 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2409 continue;
2410 }
2411
2412 let preceding_char = if match_start > 0 {
2414 content.chars().nth(match_start - 1)
2415 } else {
2416 None
2417 };
2418 let following_char = content.chars().nth(match_end);
2419
2420 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2421 continue;
2422 }
2423 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2424 continue;
2425 }
2426
2427 let url = full_match.as_str();
2428 let url_type = if url.starts_with("https://") {
2429 "https"
2430 } else if url.starts_with("http://") {
2431 "http"
2432 } else if url.starts_with("ftp://") {
2433 "ftp"
2434 } else {
2435 "other"
2436 };
2437
2438 let mut line_num = 1;
2440 let mut col_start = match_start;
2441 let mut col_end = match_end;
2442 for (idx, line_info) in lines.iter().enumerate() {
2443 if match_start >= line_info.byte_offset {
2444 line_num = idx + 1;
2445 col_start = match_start - line_info.byte_offset;
2446 col_end = match_end - line_info.byte_offset;
2447 } else {
2448 break;
2449 }
2450 }
2451
2452 bare_urls.push(BareUrl {
2453 line: line_num,
2454 start_col: col_start,
2455 end_col: col_end,
2456 byte_offset: match_start,
2457 byte_end: match_end,
2458 url: url.to_string(),
2459 url_type: url_type.to_string(),
2460 });
2461 }
2462
2463 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2465 let full_match = cap.get(0).unwrap();
2466 let match_start = full_match.start();
2467 let match_end = full_match.end();
2468
2469 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2471 continue;
2472 }
2473
2474 let preceding_char = if match_start > 0 {
2476 content.chars().nth(match_start - 1)
2477 } else {
2478 None
2479 };
2480 let following_char = content.chars().nth(match_end);
2481
2482 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2483 continue;
2484 }
2485 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2486 continue;
2487 }
2488
2489 let email = full_match.as_str();
2490
2491 let mut line_num = 1;
2493 let mut col_start = match_start;
2494 let mut col_end = match_end;
2495 for (idx, line_info) in lines.iter().enumerate() {
2496 if match_start >= line_info.byte_offset {
2497 line_num = idx + 1;
2498 col_start = match_start - line_info.byte_offset;
2499 col_end = match_end - line_info.byte_offset;
2500 } else {
2501 break;
2502 }
2503 }
2504
2505 bare_urls.push(BareUrl {
2506 line: line_num,
2507 start_col: col_start,
2508 end_col: col_end,
2509 byte_offset: match_start,
2510 byte_end: match_end,
2511 url: email.to_string(),
2512 url_type: "email".to_string(),
2513 });
2514 }
2515
2516 bare_urls
2517 }
2518}
2519
2520fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2522 if list_blocks.len() < 2 {
2523 return;
2524 }
2525
2526 let mut merger = ListBlockMerger::new(lines);
2527 *list_blocks = merger.merge(list_blocks);
2528}
2529
/// Decides which neighbouring list blocks belong together and merges them.
struct ListBlockMerger<'a> {
    /// Per-line information for the document; consulted to inspect the lines
    /// that lie between two candidate blocks.
    lines: &'a [LineInfo],
}
2534
2535impl<'a> ListBlockMerger<'a> {
2536 fn new(lines: &'a [LineInfo]) -> Self {
2537 Self { lines }
2538 }
2539
2540 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2541 let mut merged = Vec::with_capacity(list_blocks.len());
2542 let mut current = list_blocks[0].clone();
2543
2544 for next in list_blocks.iter().skip(1) {
2545 if self.should_merge_blocks(¤t, next) {
2546 current = self.merge_two_blocks(current, next);
2547 } else {
2548 merged.push(current);
2549 current = next.clone();
2550 }
2551 }
2552
2553 merged.push(current);
2554 merged
2555 }
2556
2557 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2559 if !self.blocks_are_compatible(current, next) {
2561 return false;
2562 }
2563
2564 let spacing = self.analyze_spacing_between(current, next);
2566 match spacing {
2567 BlockSpacing::Consecutive => true,
2568 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2569 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2570 self.can_merge_with_content_between(current, next)
2571 }
2572 }
2573 }
2574
2575 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2577 current.is_ordered == next.is_ordered
2578 && current.blockquote_prefix == next.blockquote_prefix
2579 && current.nesting_level == next.nesting_level
2580 }
2581
2582 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2584 let gap = next.start_line - current.end_line;
2585
2586 match gap {
2587 1 => BlockSpacing::Consecutive,
2588 2 => BlockSpacing::SingleBlank,
2589 _ if gap > 2 => {
2590 if self.has_only_blank_lines_between(current, next) {
2591 BlockSpacing::MultipleBlanks
2592 } else {
2593 BlockSpacing::ContentBetween
2594 }
2595 }
2596 _ => BlockSpacing::Consecutive, }
2598 }
2599
2600 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2602 if has_meaningful_content_between(current, next, self.lines) {
2605 return false; }
2607
2608 !current.is_ordered && current.marker == next.marker
2610 }
2611
2612 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2614 if has_meaningful_content_between(current, next, self.lines) {
2616 return false; }
2618
2619 current.is_ordered && next.is_ordered
2621 }
2622
2623 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2625 for line_num in (current.end_line + 1)..next.start_line {
2626 if let Some(line_info) = self.lines.get(line_num - 1)
2627 && !line_info.content.trim().is_empty()
2628 {
2629 return false;
2630 }
2631 }
2632 true
2633 }
2634
2635 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2637 current.end_line = next.end_line;
2638 current.item_lines.extend_from_slice(&next.item_lines);
2639
2640 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2642
2643 if !current.is_ordered && self.markers_differ(¤t, next) {
2645 current.marker = None; }
2647
2648 current
2649 }
2650
2651 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2653 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2654 }
2655}
2656
/// How far apart two neighbouring list blocks are, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts on the line right after the current one ends
    /// (also the fallback when the computed gap is smaller than one line).
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// Two or more lines lie between the blocks and all of them are empty
    /// after trimming.
    MultipleBlanks,
    /// Two or more lines lie between the blocks and at least one is not empty.
    ContentBetween,
}
2665
2666fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2668 for line_num in (current.end_line + 1)..next.start_line {
2670 if let Some(line_info) = lines.get(line_num - 1) {
2671 let trimmed = line_info.content.trim();
2673
2674 if trimmed.is_empty() {
2676 continue;
2677 }
2678
2679 if line_info.heading.is_some() {
2683 return true; }
2685
2686 if is_horizontal_rule(trimmed) {
2688 return true; }
2690
2691 if trimmed.contains('|') && trimmed.len() > 1 {
2694 if !trimmed.contains("](") && !trimmed.contains("http") {
2696 let pipe_count = trimmed.matches('|').count();
2698 if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2699 return true; }
2701 }
2702 }
2703
2704 if trimmed.starts_with('>') {
2706 return true; }
2708
2709 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2711 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2712
2713 let min_continuation_indent = if current.is_ordered {
2715 current.nesting_level + current.max_marker_width + 1 } else {
2717 current.nesting_level + 2
2718 };
2719
2720 if line_indent < min_continuation_indent {
2721 return true; }
2724 }
2725
2726 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2728
2729 let min_indent = if current.is_ordered {
2731 current.nesting_level + current.max_marker_width
2732 } else {
2733 current.nesting_level + 2
2734 };
2735
2736 if line_indent < min_indent {
2738 return true; }
2740
2741 }
2744 }
2745
2746 false
2748}
2749
/// Tests whether an already-trimmed line is a horizontal rule.
///
/// The line qualifies when it is at least three bytes long, begins with `-`, `*` or `_`,
/// contains nothing but that character plus spaces and tabs, and repeats that character
/// at least three times.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    // The first character fixes which rule character the rest of the line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    let only_rule_chars = trimmed
        .chars()
        .all(|ch| ch == marker || ch == ' ' || ch == '\t');

    only_rule_chars && trimmed.chars().filter(|&ch| ch == marker).count() >= 3
}
2773
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset (0) and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A document without a newline maps every offset onto line 1.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indentation and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces so that the `indent == 4`
        // assertions below are consistent with the fixture.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // The 1-based accessors must agree with the direct field reads above.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers, their columns and ordered/unordered classification.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet sits at column 2 (asserted below via `marker_column`);
        // the nested ordered item is indented to the content column of "1. ".
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Plain paragraph text after the blank line is not a list item.
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that land on newline characters and one past the end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }
}