1use crate::config::MarkdownFlavor;
2use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
3use lazy_static::lazy_static;
4use regex::Regex;
5
// Regular expressions shared by the parsers in this file. Each one is compiled
// once, on first use.
lazy_static! {
    /// Matches a link: bracketed text followed by either an inline `(url)` or
    /// a `[reference]`. Group 1 is the text, group 2 the inline URL, group 3
    /// the reference id. Verbose mode (`x`) lets the pattern carry comments.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    /// Same shape as `LINK_PATTERN` but requires the leading `!` of an image.
    /// Group 1 is the alt text, group 2 the inline URL, group 3 the reference id.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
 (?:
 \(([^)]*)\) # Inline URL in group 2 (can be empty)
 |
 \[([^\]]*)\] # Reference ID in group 3
 )"
    ).unwrap();

    /// Matches a reference definition line: up to three leading spaces,
    /// `[id]:`, a whitespace-free URL, and an optional title in double
    /// (group 3) or single (group 4) quotes.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    /// Matches a run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    /// Matches an `http`, `https` or `ftp` URL with optional port, path,
    /// query string and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    /// Matches an e-mail-like token: local part, `@`, domain, and a TLD of at
    /// least two ASCII letters.
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    /// Matches `<...>` wrapping either an `http`/`https`/`ftp` URL or an
    /// e-mail-like address. Group 1 is the text between the angle brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    /// Matches a blockquote prefix at the start of a line: optional
    /// whitespace, one or more `>`, then optional whitespace.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
60
/// Pre-computed facts about a single line of the document.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines` (no line terminator).
    pub content: String,
    /// Byte offset of the start of this line within the document.
    pub byte_offset: usize,
    /// Number of bytes of leading whitespace.
    pub indent: usize,
    /// True when the line is empty or whitespace-only; a line holding only a
    /// blockquote prefix also counts as blank.
    pub is_blank: bool,
    /// True when the start of the line falls inside a detected code block.
    pub in_code_block: bool,
    /// True when the line belongs to a leading `---` front-matter section.
    pub in_front_matter: bool,
    /// List marker details when the line starts a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a
    /// setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with `>` markers.
    pub blockquote: Option<BlockquoteInfo>,
}
83
/// Marker details for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*`, `+`, or digits plus delimiter (e.g. `1.`).
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// The parsed number of an ordered item; `None` for bullets or when the
    /// digits do not fit in `usize`.
    pub number: Option<usize>,
    /// Zero-based byte column of the marker (any blockquote prefix included).
    pub marker_column: usize,
    /// Zero-based byte column where the item content starts, i.e. after the
    /// marker and the spacing that follows it.
    pub content_column: usize,
}
98
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Setext heading whose underline starts with `=` (level 1).
    Setext1,
    /// Setext heading with any other underline, i.e. `-` (level 2).
    Setext2,
}
109
/// A link found in the document, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// One-based number of the line on which the link starts.
    pub line: usize,
    /// Zero-based byte offset of the link start within its line.
    pub start_col: usize,
    /// Byte offset of the link end, relative to the start of the line on
    /// which the link ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// The link text between the first pair of brackets.
    pub text: String,
    /// The inline URL; empty for reference-style links.
    pub url: String,
    /// True when the link uses `[text][id]` reference syntax.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased link text when
    /// the id is empty (`[text][]`). `None` for inline links.
    pub reference_id: Option<String>,
}
132
/// An image found in the document, either inline or reference-style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// One-based number of the line on which the image starts.
    pub line: usize,
    /// Zero-based byte offset of the image start (the `!`) within its line.
    pub start_col: usize,
    /// Byte offset of the image end, relative to the start of the line on
    /// which the image ends.
    pub end_col: usize,
    /// Byte offset of the image start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// The alt text between the first pair of brackets.
    pub alt_text: String,
    /// The inline URL; empty for reference-style images.
    pub url: String,
    /// True when the image uses `![alt][id]` reference syntax.
    pub is_reference: bool,
    /// Lower-cased reference id; falls back to the lower-cased alt text when
    /// the id is empty. `None` for inline images.
    pub reference_id: Option<String>,
}
155
/// A reference definition line such as `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// One-based line number of the definition.
    pub line: usize,
    /// The reference id, lower-cased.
    pub id: String,
    /// The target URL exactly as written.
    pub url: String,
    /// The optional title, without its surrounding quotes.
    pub title: Option<String>,
}
168
/// An inline code span delimited by matching backtick runs.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// One-based number of the line on which the span starts.
    pub line: usize,
    /// Zero-based byte offset of the opening backticks within their line.
    pub start_col: usize,
    /// Byte offset just past the closing backticks, relative to the start of
    /// the line on which the span ends.
    pub end_col: usize,
    /// Byte offset of the opening backticks within the document.
    pub byte_offset: usize,
    /// Byte offset just past the closing backticks within the document.
    pub byte_end: usize,
    /// Number of backticks in each delimiter.
    pub backtick_count: usize,
    /// The text between the delimiters, unmodified.
    pub content: String,
}
187
/// Details of a heading attached to a line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: the number of `#` for ATX, 1 or 2 for setext.
    pub level: u8,
    /// Which heading syntax was used.
    pub style: HeadingStyle,
    /// The marker text: the opening `#` run for ATX, or the trimmed underline
    /// for setext.
    pub marker: String,
    /// Byte column of the marker (for setext, the indent of the underline).
    pub marker_column: usize,
    /// Byte column where the heading text starts.
    pub content_column: usize,
    /// Heading text with any custom id removed by `extract_header_id`.
    pub text: String,
    /// Custom id taken from the heading itself or from a standalone attribute
    /// list on the following line.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction.
    pub raw_text: String,
    /// True when an ATX heading ends with a closing `#` run.
    pub has_closing_sequence: bool,
    /// The closing `#` run, or an empty string when there is none.
    pub closing_sequence: String,
}
212
/// Details of the blockquote prefix on a line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading marker run.
    pub nesting_level: usize,
    /// Whitespace that precedes the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (the length of `indent`).
    pub marker_column: usize,
    /// Indent, markers and the whitespace following them, concatenated.
    pub prefix: String,
    /// The rest of the line after `prefix`.
    pub content: String,
    /// True when content follows the markers with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace character, or a tab, follows the
    /// markers.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the quoted content is blank and nothing follows the markers.
    pub needs_md028_fix: bool,
}
233
/// A contiguous group of list items treated as one list.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// One-based first line of the block (inclusive).
    pub start_line: usize,
    /// One-based last line of the block (inclusive).
    pub end_line: usize,
    /// True when the block is an ordered list.
    pub is_ordered: bool,
    /// Bullet marker shared by the items, when there is one.
    /// NOTE(review): presumably `None` for ordered or mixed-marker lists; confirm.
    pub marker: Option<String>,
    /// Blockquote prefix the block's items share (empty outside blockquotes).
    pub blockquote_prefix: String,
    /// One-based line numbers of the list items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    /// NOTE(review): appears to be derived from `marker_column / 2`; confirm.
    pub nesting_level: usize,
    /// Widest marker seen in the block.
    /// NOTE(review): exact definition is computed outside this view; confirm.
    pub max_marker_width: usize,
}
254
255use std::sync::{Arc, Mutex};
256
/// Per-document counters for characters that matter to Markdown syntax.
///
/// NOTE(review): the counters are filled by `compute_char_frequency`, which is
/// not visible here; the character each field tracks is taken from the
/// mapping used by `has_char` / `char_count`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count associated with `#`.
    pub hash_count: usize,
    /// Count associated with `*`.
    pub asterisk_count: usize,
    /// Count associated with `_`.
    pub underscore_count: usize,
    /// Count associated with `-`.
    pub hyphen_count: usize,
    /// Count associated with `+`.
    pub plus_count: usize,
    /// Count associated with `>`.
    pub gt_count: usize,
    /// Count associated with `|`.
    pub pipe_count: usize,
    /// Count associated with `[`.
    pub bracket_count: usize,
    /// Count associated with the backtick character.
    pub backtick_count: usize,
    /// Count associated with `<`.
    pub lt_count: usize,
    /// Count associated with `!`.
    pub exclamation_count: usize,
    /// Count associated with the newline character.
    pub newline_count: usize,
}
285
/// An HTML tag found in the document.
///
/// NOTE(review): produced by `parse_html_tags`, which is not visible here; the
/// position fields presumably follow the same conventions as `ParsedLink`
/// (one-based `line`, byte-based columns and offsets). Confirm.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line on which the tag appears.
    pub line: usize,
    /// Column where the tag starts.
    pub start_col: usize,
    /// Column where the tag ends.
    pub end_col: usize,
    /// Offset of the tag start within the document.
    pub byte_offset: usize,
    /// Offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// True for a closing tag.
    pub is_closing: bool,
    /// True for a self-closing tag.
    pub is_self_closing: bool,
    /// The tag text as it appears in the document.
    pub raw_content: String,
}
308
/// An emphasis span found in the document.
///
/// NOTE(review): produced by `parse_emphasis_spans`, which is not visible
/// here; the position fields presumably follow the same conventions as
/// `ParsedLink`. Confirm.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line on which the span appears.
    pub line: usize,
    /// Column where the span starts.
    pub start_col: usize,
    /// Column where the span ends.
    pub end_col: usize,
    /// Offset of the span start within the document.
    pub byte_offset: usize,
    /// Offset of the span end within the document.
    pub byte_end: usize,
    /// The emphasis marker character (presumably `*` or `_`).
    pub marker: char,
    /// Number of marker characters in each delimiter.
    pub marker_count: usize,
    /// The text between the delimiters.
    pub content: String,
}
329
/// A table row found in the document.
///
/// NOTE(review): produced by `parse_table_rows`, which is not visible here.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line on which the row appears.
    pub line: usize,
    /// True when the row is a header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column.
    /// NOTE(review): the string values used for alignments are not visible here.
    pub column_alignments: Vec<String>,
}
342
/// A bare URL or e-mail address found in the document.
///
/// NOTE(review): produced by `parse_bare_urls`, which is not visible here; the
/// position fields presumably follow the same conventions as `ParsedLink`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line on which the URL appears.
    pub line: usize,
    /// Column where the URL starts.
    pub start_col: usize,
    /// Column where the URL ends.
    pub end_col: usize,
    /// Offset of the URL start within the document.
    pub byte_offset: usize,
    /// Offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched URL text.
    pub url: String,
    /// Kind of URL.
    /// NOTE(review): the set of values is defined outside this view.
    pub url_type: String,
}
361
/// Shared, pre-parsed view of a Markdown document handed to lint rules.
///
/// Cheap structures are computed eagerly in `new`; the more expensive ones are
/// computed on first access and kept behind a `Mutex`-guarded cache.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset of the start of every line (first entry is 0).
    pub line_offsets: Vec<usize>,
    /// Byte ranges `(start, end)` of detected code blocks.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, in document order.
    pub lines: Vec<LineInfo>,
    /// Links found outside code blocks.
    pub links: Vec<ParsedLink>,
    /// Images found outside code blocks.
    pub images: Vec<ParsedImage>,
    /// Reference definitions found outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    // Lazily computed inline code spans; see `code_spans()`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks in the document.
    pub list_blocks: Vec<ListBlock>,
    /// Character counters used for quick "can this rule apply" checks.
    pub char_frequency: CharFrequency,
    // Lazily computed HTML tags; see `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    // Lazily computed emphasis spans; see `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    // Lazily computed table rows; see `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    // Lazily computed bare URLs; see `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
379
380impl<'a> LintContext<'a> {
381 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
382 let mut line_offsets = vec![0];
383 for (i, c) in content.char_indices() {
384 if c == '\n' {
385 line_offsets.push(i + 1);
386 }
387 }
388
389 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
391
392 let lines = Self::compute_line_info(content, &line_offsets, &code_blocks);
394
395 let links = Self::parse_links(content, &lines, &code_blocks, flavor);
398 let images = Self::parse_images(content, &lines, &code_blocks);
399 let reference_defs = Self::parse_reference_defs(content, &lines);
400 let list_blocks = Self::parse_list_blocks(&lines);
401
402 let char_frequency = Self::compute_char_frequency(content);
404
405 Self {
406 content,
407 line_offsets,
408 code_blocks,
409 lines,
410 links,
411 images,
412 reference_defs,
413 code_spans_cache: Mutex::new(None),
414 list_blocks,
415 char_frequency,
416 html_tags_cache: Mutex::new(None),
417 emphasis_spans_cache: Mutex::new(None),
418 table_rows_cache: Mutex::new(None),
419 bare_urls_cache: Mutex::new(None),
420 flavor,
421 }
422 }
423
424 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
426 let mut cache = self.code_spans_cache.lock().unwrap();
427
428 if cache.is_none() {
430 let code_spans = Self::parse_code_spans(self.content, &self.lines);
431 *cache = Some(Arc::new(code_spans));
432 }
433
434 cache.as_ref().unwrap().clone()
436 }
437
438 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
440 let mut cache = self.html_tags_cache.lock().unwrap();
441
442 if cache.is_none() {
443 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
444 *cache = Some(Arc::new(html_tags));
445 }
446
447 cache.as_ref().unwrap().clone()
448 }
449
450 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
452 let mut cache = self.emphasis_spans_cache.lock().unwrap();
453
454 if cache.is_none() {
455 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
456 *cache = Some(Arc::new(emphasis_spans));
457 }
458
459 cache.as_ref().unwrap().clone()
460 }
461
462 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
464 let mut cache = self.table_rows_cache.lock().unwrap();
465
466 if cache.is_none() {
467 let table_rows = Self::parse_table_rows(&self.lines);
468 *cache = Some(Arc::new(table_rows));
469 }
470
471 cache.as_ref().unwrap().clone()
472 }
473
474 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
476 let mut cache = self.bare_urls_cache.lock().unwrap();
477
478 if cache.is_none() {
479 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
480 *cache = Some(Arc::new(bare_urls));
481 }
482
483 cache.as_ref().unwrap().clone()
484 }
485
486 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
488 match self.line_offsets.binary_search(&offset) {
489 Ok(line) => (line + 1, 1),
490 Err(line) => {
491 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
492 (line, offset - line_start + 1)
493 }
494 }
495 }
496
497 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
499 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
501 return true;
502 }
503
504 self.code_spans()
506 .iter()
507 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
508 }
509
510 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
512 if line_num > 0 {
513 self.lines.get(line_num - 1)
514 } else {
515 None
516 }
517 }
518
519 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
521 self.line_info(line_num).map(|info| info.byte_offset)
522 }
523
524 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
526 let normalized_id = ref_id.to_lowercase();
527 self.reference_defs
528 .iter()
529 .find(|def| def.id == normalized_id)
530 .map(|def| def.url.as_str())
531 }
532
533 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
535 self.links.iter().filter(|link| link.line == line_num).collect()
536 }
537
538 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
540 self.images.iter().filter(|img| img.line == line_num).collect()
541 }
542
543 pub fn is_in_list_block(&self, line_num: usize) -> bool {
545 self.list_blocks
546 .iter()
547 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
548 }
549
550 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
552 self.list_blocks
553 .iter()
554 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
555 }
556
557 pub fn has_char(&self, ch: char) -> bool {
559 match ch {
560 '#' => self.char_frequency.hash_count > 0,
561 '*' => self.char_frequency.asterisk_count > 0,
562 '_' => self.char_frequency.underscore_count > 0,
563 '-' => self.char_frequency.hyphen_count > 0,
564 '+' => self.char_frequency.plus_count > 0,
565 '>' => self.char_frequency.gt_count > 0,
566 '|' => self.char_frequency.pipe_count > 0,
567 '[' => self.char_frequency.bracket_count > 0,
568 '`' => self.char_frequency.backtick_count > 0,
569 '<' => self.char_frequency.lt_count > 0,
570 '!' => self.char_frequency.exclamation_count > 0,
571 '\n' => self.char_frequency.newline_count > 0,
572 _ => self.content.contains(ch), }
574 }
575
576 pub fn char_count(&self, ch: char) -> usize {
578 match ch {
579 '#' => self.char_frequency.hash_count,
580 '*' => self.char_frequency.asterisk_count,
581 '_' => self.char_frequency.underscore_count,
582 '-' => self.char_frequency.hyphen_count,
583 '+' => self.char_frequency.plus_count,
584 '>' => self.char_frequency.gt_count,
585 '|' => self.char_frequency.pipe_count,
586 '[' => self.char_frequency.bracket_count,
587 '`' => self.char_frequency.backtick_count,
588 '<' => self.char_frequency.lt_count,
589 '!' => self.char_frequency.exclamation_count,
590 '\n' => self.char_frequency.newline_count,
591 _ => self.content.matches(ch).count(), }
593 }
594
595 pub fn likely_has_headings(&self) -> bool {
597 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
599
600 pub fn likely_has_lists(&self) -> bool {
602 self.char_frequency.asterisk_count > 0
603 || self.char_frequency.hyphen_count > 0
604 || self.char_frequency.plus_count > 0
605 }
606
607 pub fn likely_has_emphasis(&self) -> bool {
609 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
610 }
611
612 pub fn likely_has_tables(&self) -> bool {
614 self.char_frequency.pipe_count > 2
615 }
616
617 pub fn likely_has_blockquotes(&self) -> bool {
619 self.char_frequency.gt_count > 0
620 }
621
622 pub fn likely_has_code(&self) -> bool {
624 self.char_frequency.backtick_count > 0
625 }
626
627 pub fn likely_has_links_or_images(&self) -> bool {
629 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
630 }
631
632 pub fn likely_has_html(&self) -> bool {
634 self.char_frequency.lt_count > 0
635 }
636
637 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
639 self.html_tags()
640 .iter()
641 .filter(|tag| tag.line == line_num)
642 .cloned()
643 .collect()
644 }
645
646 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
648 self.emphasis_spans()
649 .iter()
650 .filter(|span| span.line == line_num)
651 .cloned()
652 .collect()
653 }
654
655 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
657 self.table_rows()
658 .iter()
659 .filter(|row| row.line == line_num)
660 .cloned()
661 .collect()
662 }
663
664 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
666 self.bare_urls()
667 .iter()
668 .filter(|url| url.line == line_num)
669 .cloned()
670 .collect()
671 }
672
673 fn parse_links(
675 content: &str,
676 lines: &[LineInfo],
677 code_blocks: &[(usize, usize)],
678 flavor: MarkdownFlavor,
679 ) -> Vec<ParsedLink> {
680 use crate::utils::skip_context::is_mkdocs_snippet_line;
681
682 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
687 let full_match = cap.get(0).unwrap();
688 let match_start = full_match.start();
689 let match_end = full_match.end();
690
691 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
693 continue;
694 }
695
696 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
698 continue;
699 }
700
701 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
703 continue;
704 }
705
706 let line_idx = lines
709 .iter()
710 .position(|line| {
711 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
712 })
713 .unwrap_or(0);
714
715 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
716 continue;
717 }
718
719 let mut line_num = 1;
721 let mut col_start = match_start;
722 for (idx, line_info) in lines.iter().enumerate() {
723 if match_start >= line_info.byte_offset {
724 line_num = idx + 1;
725 col_start = match_start - line_info.byte_offset;
726 } else {
727 break;
728 }
729 }
730
731 let mut end_line_num = 1;
733 let mut col_end = match_end;
734 for (idx, line_info) in lines.iter().enumerate() {
735 if match_end > line_info.byte_offset {
736 end_line_num = idx + 1;
737 col_end = match_end - line_info.byte_offset;
738 } else {
739 break;
740 }
741 }
742
743 if line_num == end_line_num {
745 } else {
747 }
750
751 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
752
753 if let Some(inline_url) = cap.get(2) {
754 links.push(ParsedLink {
756 line: line_num,
757 start_col: col_start,
758 end_col: col_end,
759 byte_offset: match_start,
760 byte_end: match_end,
761 text,
762 url: inline_url.as_str().to_string(),
763 is_reference: false,
764 reference_id: None,
765 });
766 } else if let Some(ref_id) = cap.get(3) {
767 let ref_id_str = ref_id.as_str();
769 let normalized_ref = if ref_id_str.is_empty() {
770 text.to_lowercase() } else {
772 ref_id_str.to_lowercase()
773 };
774
775 links.push(ParsedLink {
776 line: line_num,
777 start_col: col_start,
778 end_col: col_end,
779 byte_offset: match_start,
780 byte_end: match_end,
781 text,
782 url: String::new(), is_reference: true,
784 reference_id: Some(normalized_ref),
785 });
786 }
787 }
788
789 links
790 }
791
792 fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
794 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
799 let full_match = cap.get(0).unwrap();
800 let match_start = full_match.start();
801 let match_end = full_match.end();
802
803 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
805 continue;
806 }
807
808 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
810 continue;
811 }
812
813 let mut line_num = 1;
815 let mut col_start = match_start;
816 for (idx, line_info) in lines.iter().enumerate() {
817 if match_start >= line_info.byte_offset {
818 line_num = idx + 1;
819 col_start = match_start - line_info.byte_offset;
820 } else {
821 break;
822 }
823 }
824
825 let mut end_line_num = 1;
827 let mut col_end = match_end;
828 for (idx, line_info) in lines.iter().enumerate() {
829 if match_end > line_info.byte_offset {
830 end_line_num = idx + 1;
831 col_end = match_end - line_info.byte_offset;
832 } else {
833 break;
834 }
835 }
836
837 if line_num == end_line_num {
839 } else {
841 }
844
845 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
846
847 if let Some(inline_url) = cap.get(2) {
848 images.push(ParsedImage {
850 line: line_num,
851 start_col: col_start,
852 end_col: col_end,
853 byte_offset: match_start,
854 byte_end: match_end,
855 alt_text,
856 url: inline_url.as_str().to_string(),
857 is_reference: false,
858 reference_id: None,
859 });
860 } else if let Some(ref_id) = cap.get(3) {
861 let ref_id_str = ref_id.as_str();
863 let normalized_ref = if ref_id_str.is_empty() {
864 alt_text.to_lowercase() } else {
866 ref_id_str.to_lowercase()
867 };
868
869 images.push(ParsedImage {
870 line: line_num,
871 start_col: col_start,
872 end_col: col_end,
873 byte_offset: match_start,
874 byte_end: match_end,
875 alt_text,
876 url: String::new(), is_reference: true,
878 reference_id: Some(normalized_ref),
879 });
880 }
881 }
882
883 images
884 }
885
886 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
888 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
892 if line_info.in_code_block {
894 continue;
895 }
896
897 let line = &line_info.content;
898 let line_num = line_idx + 1;
899
900 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
901 let id = cap.get(1).unwrap().as_str().to_lowercase();
902 let url = cap.get(2).unwrap().as_str().to_string();
903 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
904
905 refs.push(ReferenceDef {
906 line: line_num,
907 id,
908 url,
909 title,
910 });
911 }
912 }
913
914 refs
915 }
916
917 fn compute_line_info(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<LineInfo> {
919 lazy_static! {
920 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
922 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
923
924 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
926
927 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
929 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
930
931 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
933 }
934
935 let content_lines: Vec<&str> = content.lines().collect();
936 let mut lines = Vec::with_capacity(content_lines.len());
937
938 let mut in_front_matter = false;
940 let mut front_matter_end = 0;
941 if content_lines.first().map(|l| l.trim()) == Some("---") {
942 in_front_matter = true;
943 for (idx, line) in content_lines.iter().enumerate().skip(1) {
944 if line.trim() == "---" {
945 front_matter_end = idx;
946 break;
947 }
948 }
949 }
950
951 for (i, line) in content_lines.iter().enumerate() {
952 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
953 let indent = line.len() - line.trim_start().len();
954 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
956 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
958 after_prefix.trim().is_empty()
959 } else {
960 line.trim().is_empty()
961 };
962 let in_code_block = code_blocks.iter().any(|&(start, end)| {
965 let block_content = &content[start..end];
968 let is_multiline = block_content.contains('\n');
969 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
970 let is_indented = !is_fenced
971 && block_content
972 .lines()
973 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
974
975 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
976 });
977
978 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
980 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
982 let prefix = caps.get(1).unwrap().as_str();
983 let content = caps.get(2).unwrap().as_str();
984 (content, prefix.len())
985 } else {
986 (&**line, 0)
987 };
988
989 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
990 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
991 let marker = caps.get(2).map_or("", |m| m.as_str());
992 let spacing = caps.get(3).map_or("", |m| m.as_str());
993 let _content = caps.get(4).map_or("", |m| m.as_str());
994 let marker_column = blockquote_prefix_len + leading_spaces.len();
995 let content_column = marker_column + marker.len() + spacing.len();
996
997 if spacing.is_empty() {
1004 None
1005 } else {
1006 Some(ListItemInfo {
1007 marker: marker.to_string(),
1008 is_ordered: false,
1009 number: None,
1010 marker_column,
1011 content_column,
1012 })
1013 }
1014 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1015 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1016 let number_str = caps.get(2).map_or("", |m| m.as_str());
1017 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1018 let spacing = caps.get(4).map_or("", |m| m.as_str());
1019 let _content = caps.get(5).map_or("", |m| m.as_str());
1020 let marker = format!("{number_str}{delimiter}");
1021 let marker_column = blockquote_prefix_len + leading_spaces.len();
1022 let content_column = marker_column + marker.len() + spacing.len();
1023
1024 if spacing.is_empty() {
1027 None
1028 } else {
1029 Some(ListItemInfo {
1030 marker,
1031 is_ordered: true,
1032 number: number_str.parse().ok(),
1033 marker_column,
1034 content_column,
1035 })
1036 }
1037 } else {
1038 None
1039 }
1040 } else {
1041 None
1042 };
1043
1044 lines.push(LineInfo {
1045 content: line.to_string(),
1046 byte_offset,
1047 indent,
1048 is_blank,
1049 in_code_block,
1050 in_front_matter: in_front_matter && i <= front_matter_end,
1051 list_item,
1052 heading: None, blockquote: None, });
1055 }
1056
1057 for i in 0..content_lines.len() {
1059 if lines[i].in_code_block {
1060 continue;
1061 }
1062
1063 if in_front_matter && i <= front_matter_end {
1065 continue;
1066 }
1067
1068 let line = content_lines[i];
1069
1070 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1072 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1073 let markers = caps.get(2).map_or("", |m| m.as_str());
1074 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1075 let content = caps.get(4).map_or("", |m| m.as_str());
1076
1077 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1078 let marker_column = indent_str.len();
1079
1080 let prefix = format!("{indent_str}{markers}{spaces_after}");
1082
1083 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1085 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1087
1088 let needs_md028_fix = content.trim().is_empty() && spaces_after.is_empty();
1090
1091 lines[i].blockquote = Some(BlockquoteInfo {
1092 nesting_level,
1093 indent: indent_str.to_string(),
1094 marker_column,
1095 prefix,
1096 content: content.to_string(),
1097 has_no_space_after_marker: has_no_space,
1098 has_multiple_spaces_after_marker: has_multiple_spaces,
1099 needs_md028_fix,
1100 });
1101 }
1102
1103 if lines[i].is_blank {
1105 continue;
1106 }
1107
1108 if let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1110 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1111 let hashes = caps.get(2).map_or("", |m| m.as_str());
1112 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1113 let rest = caps.get(4).map_or("", |m| m.as_str());
1114
1115 let level = hashes.len() as u8;
1116 let marker_column = leading_spaces.len();
1117
1118 let (text, has_closing, closing_seq) = {
1120 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1122 if rest[id_start..].trim_end().ends_with('}') {
1124 (&rest[..id_start], &rest[id_start..])
1126 } else {
1127 (rest, "")
1128 }
1129 } else {
1130 (rest, "")
1131 };
1132
1133 let trimmed_rest = rest_without_id.trim_end();
1135 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1136 let mut start_of_hashes = last_hash_pos;
1138 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1139 start_of_hashes -= 1;
1140 }
1141
1142 let has_space_before = start_of_hashes == 0
1144 || trimmed_rest
1145 .chars()
1146 .nth(start_of_hashes - 1)
1147 .is_some_and(|c| c.is_whitespace());
1148
1149 let potential_closing = &trimmed_rest[start_of_hashes..];
1151 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1152
1153 if is_all_hashes && has_space_before {
1154 let closing_hashes = potential_closing.to_string();
1156 let text_part = if !custom_id_part.is_empty() {
1159 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1162 } else {
1163 rest_without_id[..start_of_hashes].trim_end().to_string()
1164 };
1165 (text_part, true, closing_hashes)
1166 } else {
1167 (rest.to_string(), false, String::new())
1169 }
1170 } else {
1171 (rest.to_string(), false, String::new())
1173 }
1174 };
1175
1176 let content_column = marker_column + hashes.len() + spaces_after.len();
1177
1178 let raw_text = text.trim().to_string();
1180 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1181
1182 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1184 let next_line = content_lines[i + 1];
1185 if !lines[i + 1].in_code_block
1186 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1187 && let Some(next_line_id) =
1188 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1189 {
1190 custom_id = Some(next_line_id);
1191 }
1192 }
1193
1194 lines[i].heading = Some(HeadingInfo {
1195 level,
1196 style: HeadingStyle::ATX,
1197 marker: hashes.to_string(),
1198 marker_column,
1199 content_column,
1200 text: clean_text,
1201 custom_id,
1202 raw_text,
1203 has_closing_sequence: has_closing,
1204 closing_sequence: closing_seq,
1205 });
1206 }
1207 else if i + 1 < content_lines.len() {
1209 let next_line = content_lines[i + 1];
1210 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1211 if in_front_matter && i < front_matter_end {
1213 continue;
1214 }
1215
1216 let underline = next_line.trim();
1217 let level = if underline.starts_with('=') { 1 } else { 2 };
1218 let style = if level == 1 {
1219 HeadingStyle::Setext1
1220 } else {
1221 HeadingStyle::Setext2
1222 };
1223
1224 let raw_text = line.trim().to_string();
1226 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1227
1228 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1230 let attr_line = content_lines[i + 2];
1231 if !lines[i + 2].in_code_block
1232 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1233 && let Some(attr_line_id) =
1234 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1235 {
1236 custom_id = Some(attr_line_id);
1237 }
1238 }
1239
1240 lines[i].heading = Some(HeadingInfo {
1241 level,
1242 style,
1243 marker: underline.to_string(),
1244 marker_column: next_line.len() - next_line.trim_start().len(),
1245 content_column: lines[i].indent,
1246 text: clean_text,
1247 custom_id,
1248 raw_text,
1249 has_closing_sequence: false,
1250 closing_sequence: String::new(),
1251 });
1252 }
1253 }
1254 }
1255
1256 lines
1257 }
1258
1259 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1261 let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); if !content.contains('`') {
1266 return code_spans;
1267 }
1268
1269 let mut pos = 0;
1270 let bytes = content.as_bytes();
1271
1272 while pos < bytes.len() {
1273 if let Some(backtick_start) = content[pos..].find('`') {
1275 let start_pos = pos + backtick_start;
1276
1277 let mut in_code_block = false;
1279 for (line_idx, line_info) in lines.iter().enumerate() {
1280 if start_pos >= line_info.byte_offset
1281 && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1282 {
1283 in_code_block = line_info.in_code_block;
1284 break;
1285 }
1286 }
1287
1288 if in_code_block {
1289 pos = start_pos + 1;
1290 continue;
1291 }
1292
1293 let mut backtick_count = 0;
1295 let mut i = start_pos;
1296 while i < bytes.len() && bytes[i] == b'`' {
1297 backtick_count += 1;
1298 i += 1;
1299 }
1300
1301 let search_start = start_pos + backtick_count;
1303 let closing_pattern = &content[start_pos..start_pos + backtick_count];
1304
1305 if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1306 let end_pos = search_start + rel_end;
1308 let check_pos = end_pos + backtick_count;
1309
1310 if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1312 let content_start = start_pos + backtick_count;
1314 let content_end = end_pos;
1315 let span_content = content[content_start..content_end].to_string();
1316
1317 let mut line_num = 1;
1319 let mut col_start = start_pos;
1320 for (idx, line_info) in lines.iter().enumerate() {
1321 if start_pos >= line_info.byte_offset {
1322 line_num = idx + 1;
1323 col_start = start_pos - line_info.byte_offset;
1324 } else {
1325 break;
1326 }
1327 }
1328
1329 let mut col_end = end_pos + backtick_count;
1331 for line_info in lines.iter() {
1332 if end_pos + backtick_count > line_info.byte_offset {
1333 col_end = end_pos + backtick_count - line_info.byte_offset;
1334 } else {
1335 break;
1336 }
1337 }
1338
1339 code_spans.push(CodeSpan {
1340 line: line_num,
1341 start_col: col_start,
1342 end_col: col_end,
1343 byte_offset: start_pos,
1344 byte_end: end_pos + backtick_count,
1345 backtick_count,
1346 content: span_content,
1347 });
1348
1349 pos = end_pos + backtick_count;
1351 continue;
1352 }
1353 }
1354
1355 pos = start_pos + backtick_count;
1357 } else {
1358 break;
1360 }
1361 }
1362
1363 code_spans
1364 }
1365
1366 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1368 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1371 let mut last_list_item_line = 0;
1372 let mut current_indent_level = 0;
1373 let mut last_marker_width = 0;
1374
1375 for (line_idx, line_info) in lines.iter().enumerate() {
1376 let line_num = line_idx + 1;
1377
1378 if line_info.in_code_block {
1380 if let Some(ref mut block) = current_block {
1381 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1383
1384 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1386
1387 match context {
1388 CodeBlockContext::Indented => {
1389 block.end_line = line_num;
1391 continue;
1392 }
1393 CodeBlockContext::Standalone => {
1394 let completed_block = current_block.take().unwrap();
1396 list_blocks.push(completed_block);
1397 continue;
1398 }
1399 CodeBlockContext::Adjacent => {
1400 block.end_line = line_num;
1402 continue;
1403 }
1404 }
1405 } else {
1406 continue;
1408 }
1409 }
1410
1411 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1413 caps.get(0).unwrap().as_str().to_string()
1414 } else {
1415 String::new()
1416 };
1417
1418 if let Some(list_item) = &line_info.list_item {
1420 let item_indent = list_item.marker_column;
1422 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1425 let is_nested = nesting > block.nesting_level;
1429 let same_type =
1430 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1431 let same_context = block.blockquote_prefix == blockquote_prefix;
1432 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1436 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1437
1438 let has_non_list_content = {
1440 let mut found_non_list = false;
1441 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1443 for check_line in (block_last_item_line + 1)..line_num {
1444 let check_idx = check_line - 1;
1445 if check_idx < lines.len() {
1446 let check_info = &lines[check_idx];
1447 let is_list_breaking_content = if check_info.in_code_block {
1449 let last_item_marker_width =
1451 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1452 lines[block_last_item_line - 1]
1453 .list_item
1454 .as_ref()
1455 .map(|li| {
1456 if li.is_ordered {
1457 li.marker.len() + 1 } else {
1459 li.marker.len()
1460 }
1461 })
1462 .unwrap_or(3) } else {
1464 3 };
1466
1467 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1468
1469 let context = CodeBlockUtils::analyze_code_block_context(
1471 lines,
1472 check_line - 1,
1473 min_continuation,
1474 );
1475
1476 matches!(context, CodeBlockContext::Standalone)
1478 } else if !check_info.is_blank && check_info.list_item.is_none() {
1479 let line_content = check_info.content.trim();
1481
1482 if check_info.heading.is_some()
1484 || line_content.starts_with("---")
1485 || line_content.starts_with("***")
1486 || line_content.starts_with("___")
1487 || line_content.contains('|')
1488 || line_content.starts_with(">")
1489 {
1490 true
1491 }
1492 else {
1494 let last_item_marker_width =
1495 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1496 lines[block_last_item_line - 1]
1497 .list_item
1498 .as_ref()
1499 .map(|li| {
1500 if li.is_ordered {
1501 li.marker.len() + 1 } else {
1503 li.marker.len()
1504 }
1505 })
1506 .unwrap_or(3) } else {
1508 3 };
1510
1511 let min_continuation =
1512 if block.is_ordered { last_item_marker_width } else { 2 };
1513 check_info.indent < min_continuation
1514 }
1515 } else {
1516 false
1517 };
1518
1519 if is_list_breaking_content {
1520 found_non_list = true;
1522 break;
1523 }
1524 }
1525 }
1526 found_non_list
1527 };
1528
1529 let continues_list = if is_nested {
1533 same_context && reasonable_distance && !has_non_list_content
1535 } else {
1536 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
1538 };
1539
1540 if continues_list {
1541 block.end_line = line_num;
1543 block.item_lines.push(line_num);
1544
1545 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1547 list_item.marker.len() + 1
1548 } else {
1549 list_item.marker.len()
1550 });
1551
1552 if !block.is_ordered
1554 && block.marker.is_some()
1555 && block.marker.as_ref() != Some(&list_item.marker)
1556 {
1557 block.marker = None;
1559 }
1560 } else {
1561 list_blocks.push(block.clone());
1563
1564 *block = ListBlock {
1565 start_line: line_num,
1566 end_line: line_num,
1567 is_ordered: list_item.is_ordered,
1568 marker: if list_item.is_ordered {
1569 None
1570 } else {
1571 Some(list_item.marker.clone())
1572 },
1573 blockquote_prefix: blockquote_prefix.clone(),
1574 item_lines: vec![line_num],
1575 nesting_level: nesting,
1576 max_marker_width: if list_item.is_ordered {
1577 list_item.marker.len() + 1
1578 } else {
1579 list_item.marker.len()
1580 },
1581 };
1582 }
1583 } else {
1584 current_block = Some(ListBlock {
1586 start_line: line_num,
1587 end_line: line_num,
1588 is_ordered: list_item.is_ordered,
1589 marker: if list_item.is_ordered {
1590 None
1591 } else {
1592 Some(list_item.marker.clone())
1593 },
1594 blockquote_prefix,
1595 item_lines: vec![line_num],
1596 nesting_level: nesting,
1597 max_marker_width: list_item.marker.len(),
1598 });
1599 }
1600
1601 last_list_item_line = line_num;
1602 current_indent_level = item_indent;
1603 last_marker_width = if list_item.is_ordered {
1604 list_item.marker.len() + 1 } else {
1606 list_item.marker.len()
1607 };
1608 } else if let Some(ref mut block) = current_block {
1609 let min_continuation_indent = if block.is_ordered {
1620 current_indent_level + last_marker_width
1621 } else {
1622 current_indent_level + 2 };
1624
1625 if line_info.indent >= min_continuation_indent {
1626 block.end_line = line_num;
1628 } else if line_info.is_blank {
1629 let mut check_idx = line_idx + 1;
1632 let mut found_continuation = false;
1633
1634 while check_idx < lines.len() && lines[check_idx].is_blank {
1636 check_idx += 1;
1637 }
1638
1639 if check_idx < lines.len() {
1640 let next_line = &lines[check_idx];
1641 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
1643 found_continuation = true;
1644 }
1645 else if !next_line.in_code_block
1647 && next_line.list_item.is_some()
1648 && let Some(item) = &next_line.list_item
1649 {
1650 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
1651 .find(&next_line.content)
1652 .map_or(String::new(), |m| m.as_str().to_string());
1653 if item.marker_column == current_indent_level
1654 && item.is_ordered == block.is_ordered
1655 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
1656 {
1657 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
1660 if let Some(between_line) = lines.get(idx) {
1661 let trimmed = between_line.content.trim();
1662 if trimmed.is_empty() {
1664 return false;
1665 }
1666 let line_indent =
1668 between_line.content.len() - between_line.content.trim_start().len();
1669
1670 if trimmed.starts_with("```")
1672 || trimmed.starts_with("~~~")
1673 || trimmed.starts_with("---")
1674 || trimmed.starts_with("***")
1675 || trimmed.starts_with("___")
1676 || trimmed.starts_with(">")
1677 || trimmed.contains('|') || between_line.heading.is_some()
1679 {
1680 return true; }
1682
1683 line_indent >= min_continuation_indent
1685 } else {
1686 false
1687 }
1688 });
1689
1690 if block.is_ordered {
1691 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1694 if let Some(between_line) = lines.get(idx) {
1695 let trimmed = between_line.content.trim();
1696 if trimmed.is_empty() {
1697 return false;
1698 }
1699 trimmed.starts_with("```")
1701 || trimmed.starts_with("~~~")
1702 || trimmed.starts_with("---")
1703 || trimmed.starts_with("***")
1704 || trimmed.starts_with("___")
1705 || trimmed.starts_with(">")
1706 || trimmed.contains('|') || between_line.heading.is_some()
1708 } else {
1709 false
1710 }
1711 });
1712 found_continuation = !has_structural_separators;
1713 } else {
1714 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1716 if let Some(between_line) = lines.get(idx) {
1717 let trimmed = between_line.content.trim();
1718 if trimmed.is_empty() {
1719 return false;
1720 }
1721 trimmed.starts_with("```")
1723 || trimmed.starts_with("~~~")
1724 || trimmed.starts_with("---")
1725 || trimmed.starts_with("***")
1726 || trimmed.starts_with("___")
1727 || trimmed.starts_with(">")
1728 || trimmed.contains('|') || between_line.heading.is_some()
1730 } else {
1731 false
1732 }
1733 });
1734 found_continuation = !has_structural_separators;
1735 }
1736 }
1737 }
1738 }
1739
1740 if found_continuation {
1741 block.end_line = line_num;
1743 } else {
1744 list_blocks.push(block.clone());
1746 current_block = None;
1747 }
1748 } else {
1749 let min_required_indent = if block.is_ordered {
1752 current_indent_level + last_marker_width
1753 } else {
1754 current_indent_level + 2
1755 };
1756
1757 let line_content = line_info.content.trim();
1762 let is_structural_separator = line_info.heading.is_some()
1763 || line_content.starts_with("```")
1764 || line_content.starts_with("~~~")
1765 || line_content.starts_with("---")
1766 || line_content.starts_with("***")
1767 || line_content.starts_with("___")
1768 || line_content.starts_with(">")
1769 || line_content.contains('|'); let is_lazy_continuation = last_list_item_line == line_num - 1
1772 && !is_structural_separator
1773 && !line_info.is_blank
1774 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
1775
1776 if is_lazy_continuation {
1777 let content_to_check = if !blockquote_prefix.is_empty() {
1780 line_info
1782 .content
1783 .strip_prefix(&blockquote_prefix)
1784 .unwrap_or(&line_info.content)
1785 .trim()
1786 } else {
1787 line_info.content.trim()
1788 };
1789
1790 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
1791
1792 if starts_with_uppercase && last_list_item_line > 0 {
1795 list_blocks.push(block.clone());
1797 current_block = None;
1798 } else {
1799 block.end_line = line_num;
1801 }
1802 } else {
1803 list_blocks.push(block.clone());
1805 current_block = None;
1806 }
1807 }
1808 }
1809 }
1810
1811 if let Some(block) = current_block {
1813 list_blocks.push(block);
1814 }
1815
1816 merge_adjacent_list_blocks(&mut list_blocks, lines);
1818
1819 list_blocks
1820 }
1821
1822 fn compute_char_frequency(content: &str) -> CharFrequency {
1824 let mut frequency = CharFrequency::default();
1825
1826 for ch in content.chars() {
1827 match ch {
1828 '#' => frequency.hash_count += 1,
1829 '*' => frequency.asterisk_count += 1,
1830 '_' => frequency.underscore_count += 1,
1831 '-' => frequency.hyphen_count += 1,
1832 '+' => frequency.plus_count += 1,
1833 '>' => frequency.gt_count += 1,
1834 '|' => frequency.pipe_count += 1,
1835 '[' => frequency.bracket_count += 1,
1836 '`' => frequency.backtick_count += 1,
1837 '<' => frequency.lt_count += 1,
1838 '!' => frequency.exclamation_count += 1,
1839 '\n' => frequency.newline_count += 1,
1840 _ => {}
1841 }
1842 }
1843
1844 frequency
1845 }
1846
1847 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1849 lazy_static! {
1850 static ref HTML_TAG_REGEX: regex::Regex =
1851 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*(/?)>").unwrap();
1852 }
1853
1854 let mut html_tags = Vec::with_capacity(content.matches('<').count());
1855
1856 for cap in HTML_TAG_REGEX.captures_iter(content) {
1857 let full_match = cap.get(0).unwrap();
1858 let match_start = full_match.start();
1859 let match_end = full_match.end();
1860
1861 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1863 continue;
1864 }
1865
1866 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1867 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1868 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1869
1870 let mut line_num = 1;
1872 let mut col_start = match_start;
1873 let mut col_end = match_end;
1874 for (idx, line_info) in lines.iter().enumerate() {
1875 if match_start >= line_info.byte_offset {
1876 line_num = idx + 1;
1877 col_start = match_start - line_info.byte_offset;
1878 col_end = match_end - line_info.byte_offset;
1879 } else {
1880 break;
1881 }
1882 }
1883
1884 html_tags.push(HtmlTag {
1885 line: line_num,
1886 start_col: col_start,
1887 end_col: col_end,
1888 byte_offset: match_start,
1889 byte_end: match_end,
1890 tag_name,
1891 is_closing,
1892 is_self_closing,
1893 raw_content: full_match.as_str().to_string(),
1894 });
1895 }
1896
1897 html_tags
1898 }
1899
1900 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1902 lazy_static! {
1903 static ref EMPHASIS_REGEX: regex::Regex =
1904 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1905 }
1906
1907 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1908
1909 for cap in EMPHASIS_REGEX.captures_iter(content) {
1910 let full_match = cap.get(0).unwrap();
1911 let match_start = full_match.start();
1912 let match_end = full_match.end();
1913
1914 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1916 continue;
1917 }
1918
1919 let opening_markers = cap.get(1).unwrap().as_str();
1920 let content_part = cap.get(2).unwrap().as_str();
1921 let closing_markers = cap.get(3).unwrap().as_str();
1922
1923 if opening_markers.chars().next() != closing_markers.chars().next()
1925 || opening_markers.len() != closing_markers.len()
1926 {
1927 continue;
1928 }
1929
1930 let marker = opening_markers.chars().next().unwrap();
1931 let marker_count = opening_markers.len();
1932
1933 let mut line_num = 1;
1935 let mut col_start = match_start;
1936 let mut col_end = match_end;
1937 for (idx, line_info) in lines.iter().enumerate() {
1938 if match_start >= line_info.byte_offset {
1939 line_num = idx + 1;
1940 col_start = match_start - line_info.byte_offset;
1941 col_end = match_end - line_info.byte_offset;
1942 } else {
1943 break;
1944 }
1945 }
1946
1947 emphasis_spans.push(EmphasisSpan {
1948 line: line_num,
1949 start_col: col_start,
1950 end_col: col_end,
1951 byte_offset: match_start,
1952 byte_end: match_end,
1953 marker,
1954 marker_count,
1955 content: content_part.to_string(),
1956 });
1957 }
1958
1959 emphasis_spans
1960 }
1961
1962 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
1964 let mut table_rows = Vec::with_capacity(lines.len() / 20);
1965
1966 for (line_idx, line_info) in lines.iter().enumerate() {
1967 if line_info.in_code_block || line_info.is_blank {
1969 continue;
1970 }
1971
1972 let line = &line_info.content;
1973 let line_num = line_idx + 1;
1974
1975 if !line.contains('|') {
1977 continue;
1978 }
1979
1980 let parts: Vec<&str> = line.split('|').collect();
1982 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
1983
1984 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
1986 let mut column_alignments = Vec::new();
1987
1988 if is_separator {
1989 for part in &parts[1..parts.len() - 1] {
1990 let trimmed = part.trim();
1992 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
1993 "center".to_string()
1994 } else if trimmed.ends_with(':') {
1995 "right".to_string()
1996 } else if trimmed.starts_with(':') {
1997 "left".to_string()
1998 } else {
1999 "none".to_string()
2000 };
2001 column_alignments.push(alignment);
2002 }
2003 }
2004
2005 table_rows.push(TableRow {
2006 line: line_num,
2007 is_separator,
2008 column_count,
2009 column_alignments,
2010 });
2011 }
2012
2013 table_rows
2014 }
2015
2016 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2018 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2019
2020 for cap in BARE_URL_PATTERN.captures_iter(content) {
2022 let full_match = cap.get(0).unwrap();
2023 let match_start = full_match.start();
2024 let match_end = full_match.end();
2025
2026 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2028 continue;
2029 }
2030
2031 let preceding_char = if match_start > 0 {
2033 content.chars().nth(match_start - 1)
2034 } else {
2035 None
2036 };
2037 let following_char = content.chars().nth(match_end);
2038
2039 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2040 continue;
2041 }
2042 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2043 continue;
2044 }
2045
2046 let url = full_match.as_str();
2047 let url_type = if url.starts_with("https://") {
2048 "https"
2049 } else if url.starts_with("http://") {
2050 "http"
2051 } else if url.starts_with("ftp://") {
2052 "ftp"
2053 } else {
2054 "other"
2055 };
2056
2057 let mut line_num = 1;
2059 let mut col_start = match_start;
2060 let mut col_end = match_end;
2061 for (idx, line_info) in lines.iter().enumerate() {
2062 if match_start >= line_info.byte_offset {
2063 line_num = idx + 1;
2064 col_start = match_start - line_info.byte_offset;
2065 col_end = match_end - line_info.byte_offset;
2066 } else {
2067 break;
2068 }
2069 }
2070
2071 bare_urls.push(BareUrl {
2072 line: line_num,
2073 start_col: col_start,
2074 end_col: col_end,
2075 byte_offset: match_start,
2076 byte_end: match_end,
2077 url: url.to_string(),
2078 url_type: url_type.to_string(),
2079 });
2080 }
2081
2082 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2084 let full_match = cap.get(0).unwrap();
2085 let match_start = full_match.start();
2086 let match_end = full_match.end();
2087
2088 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2090 continue;
2091 }
2092
2093 let preceding_char = if match_start > 0 {
2095 content.chars().nth(match_start - 1)
2096 } else {
2097 None
2098 };
2099 let following_char = content.chars().nth(match_end);
2100
2101 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2102 continue;
2103 }
2104 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2105 continue;
2106 }
2107
2108 let email = full_match.as_str();
2109
2110 let mut line_num = 1;
2112 let mut col_start = match_start;
2113 let mut col_end = match_end;
2114 for (idx, line_info) in lines.iter().enumerate() {
2115 if match_start >= line_info.byte_offset {
2116 line_num = idx + 1;
2117 col_start = match_start - line_info.byte_offset;
2118 col_end = match_end - line_info.byte_offset;
2119 } else {
2120 break;
2121 }
2122 }
2123
2124 bare_urls.push(BareUrl {
2125 line: line_num,
2126 start_col: col_start,
2127 end_col: col_end,
2128 byte_offset: match_start,
2129 byte_end: match_end,
2130 url: email.to_string(),
2131 url_type: "email".to_string(),
2132 });
2133 }
2134
2135 bare_urls
2136 }
2137}
2138
2139fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2141 if list_blocks.len() < 2 {
2142 return;
2143 }
2144
2145 let mut merger = ListBlockMerger::new(lines);
2146 *list_blocks = merger.merge(list_blocks);
2147}
2148
2149struct ListBlockMerger<'a> {
2151 lines: &'a [LineInfo],
2152}
2153
2154impl<'a> ListBlockMerger<'a> {
2155 fn new(lines: &'a [LineInfo]) -> Self {
2156 Self { lines }
2157 }
2158
2159 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2160 let mut merged = Vec::with_capacity(list_blocks.len());
2161 let mut current = list_blocks[0].clone();
2162
2163 for next in list_blocks.iter().skip(1) {
2164 if self.should_merge_blocks(¤t, next) {
2165 current = self.merge_two_blocks(current, next);
2166 } else {
2167 merged.push(current);
2168 current = next.clone();
2169 }
2170 }
2171
2172 merged.push(current);
2173 merged
2174 }
2175
2176 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2178 if !self.blocks_are_compatible(current, next) {
2180 return false;
2181 }
2182
2183 let spacing = self.analyze_spacing_between(current, next);
2185 match spacing {
2186 BlockSpacing::Consecutive => true,
2187 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2188 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2189 self.can_merge_with_content_between(current, next)
2190 }
2191 }
2192 }
2193
2194 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2196 current.is_ordered == next.is_ordered
2197 && current.blockquote_prefix == next.blockquote_prefix
2198 && current.nesting_level == next.nesting_level
2199 }
2200
2201 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2203 let gap = next.start_line - current.end_line;
2204
2205 match gap {
2206 1 => BlockSpacing::Consecutive,
2207 2 => BlockSpacing::SingleBlank,
2208 _ if gap > 2 => {
2209 if self.has_only_blank_lines_between(current, next) {
2210 BlockSpacing::MultipleBlanks
2211 } else {
2212 BlockSpacing::ContentBetween
2213 }
2214 }
2215 _ => BlockSpacing::Consecutive, }
2217 }
2218
2219 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2221 if has_meaningful_content_between(current, next, self.lines) {
2224 return false; }
2226
2227 !current.is_ordered && current.marker == next.marker
2229 }
2230
2231 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2233 if has_meaningful_content_between(current, next, self.lines) {
2235 return false; }
2237
2238 current.is_ordered && next.is_ordered
2240 }
2241
2242 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2244 for line_num in (current.end_line + 1)..next.start_line {
2245 if let Some(line_info) = self.lines.get(line_num - 1)
2246 && !line_info.content.trim().is_empty()
2247 {
2248 return false;
2249 }
2250 }
2251 true
2252 }
2253
2254 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2256 current.end_line = next.end_line;
2257 current.item_lines.extend_from_slice(&next.item_lines);
2258
2259 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2261
2262 if !current.is_ordered && self.markers_differ(¤t, next) {
2264 current.marker = None; }
2266
2267 current
2268 }
2269
2270 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2272 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2273 }
2274}
2275
/// How two neighbouring list blocks are separated, as classified by
/// `ListBlockMerger::analyze_spacing_between` from `next.start_line - current.end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts on the line right after the current block ends
    /// (also the fallback when the computed gap is 0).
    Consecutive,
    /// Exactly one line lies between the blocks (gap of 2); that line is not
    /// inspected when classifying.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
2284
2285fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2287 for line_num in (current.end_line + 1)..next.start_line {
2289 if let Some(line_info) = lines.get(line_num - 1) {
2290 let trimmed = line_info.content.trim();
2292
2293 if trimmed.is_empty() {
2295 continue;
2296 }
2297
2298 if line_info.heading.is_some() {
2302 return true; }
2304
2305 if is_horizontal_rule(trimmed) {
2307 return true; }
2309
2310 if trimmed.contains('|') && trimmed.len() > 1 {
2312 return true; }
2314
2315 if trimmed.starts_with('>') {
2317 return true; }
2319
2320 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2322 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2323
2324 let min_continuation_indent = if current.is_ordered {
2326 current.nesting_level + current.max_marker_width + 1 } else {
2328 current.nesting_level + 2
2329 };
2330
2331 if line_indent < min_continuation_indent {
2332 return true; }
2335 }
2336
2337 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2339
2340 let min_indent = if current.is_ordered {
2342 current.nesting_level + current.max_marker_width
2343 } else {
2344 current.nesting_level + 2
2345 };
2346
2347 if line_indent < min_indent {
2349 return true; }
2351
2352 }
2355 }
2356
2357 false
2359}
2360
/// Returns true when `trimmed` looks like a horizontal rule: it starts with `-`, `*`
/// or `_`, contains at least three of that same character, and otherwise consists only
/// of spaces and tabs.
///
/// The caller is expected to pass an already-trimmed line; inputs shorter than three
/// bytes are rejected up front.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let rule_char = match trimmed.chars().next() {
        Some(c) if matches!(c, '-' | '*' | '_') => c,
        _ => return false,
    };

    // Walk the characters directly instead of collecting them into a temporary Vec.
    let mut count = 0usize;
    for ch in trimmed.chars() {
        if ch == rule_char {
            count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Anything other than the rule character or blank padding disqualifies the line.
            return false;
        }
    }

    count >= 3
}
2384
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document still has one line offset but no line entries.
    #[test]
    fn test_empty_content() {
        let context = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(context.content, "");
        assert_eq!(context.line_offsets, vec![0]);
        assert_eq!(context.offset_to_line_col(0), (1, 1));
        assert!(context.lines.is_empty());
    }

    /// Offsets within a single line map to line 1 with 1-based columns.
    #[test]
    fn test_single_line() {
        let context = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(context.content, "# Hello");
        assert_eq!(context.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let text = "# Title\n\nSecond line\nThird line";
        let context = LintContext::new(text, MarkdownFlavor::Standard);
        assert_eq!(context.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let cases = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in cases {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indent, blank and code-block flags.
    #[test]
    fn test_line_info() {
        let text = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        assert_eq!(context.lines.len(), 7);

        let title = &context.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        let indented = &context.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        let blank = &context.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Accessor helpers take 1-based line numbers.
        assert_eq!(context.line_to_byte_offset(1), Some(0));
        assert_eq!(context.line_to_byte_offset(2), Some(8));
        assert_eq!(context.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(context.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered, nested and ordered items are detected; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let text = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        let dash_item = context.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        let star_item = context.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        let ordered_item = context.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        assert!(context.lines[5].list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let text = "a\nb\nc";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        // (byte offset, expected (line, column))
        let cases = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in cases {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }
}