1use crate::config::MarkdownFlavor;
2use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
3use lazy_static::lazy_static;
4use regex::Regex;
5
lazy_static! {
    /// Matches inline links `[text](url)` and reference links `[text][id]`.
    ///
    /// Capture groups: 1 = link text, 2 = inline URL (may be empty),
    /// 3 = reference ID (may be empty). The pattern is compiled with `(?sx)`,
    /// so `.` also matches newlines and whitespace / `#` comments inside the
    /// pattern are ignored. It does not look at the preceding character, so it
    /// also matches the bracket part of image syntax.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    /// Matches inline images `![alt](url)` and reference images `![alt][id]`.
    ///
    /// Capture groups: 1 = alt text, 2 = inline URL (may be empty),
    /// 3 = reference ID (may be empty). Same `(?sx)` flags as `LINK_PATTERN`.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    /// Matches a reference definition line `[id]: url "title"` with up to
    /// three leading spaces.
    ///
    /// Capture groups: 1 = id, 2 = URL, 3 = double-quoted title,
    /// 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    /// Matches a run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    /// Matches a bare `http`, `https` or `ftp` URL; group 1 is the scheme.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    /// Matches a bare e-mail address of the form `local@domain.tld`.
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    /// Matches a URL or e-mail address wrapped in angle brackets; group 1 is
    /// the text between `<` and `>`.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    /// Matches the blockquote prefix at the start of a line: optional
    /// whitespace, one or more `>`, and any whitespace that follows.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
60
/// Pre-computed facts about a single line of the document.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines` (no line terminator).
    pub content: String,
    /// Byte offset in the document at which the line starts.
    pub byte_offset: usize,
    /// Number of leading whitespace bytes on the line.
    pub indent: usize,
    /// True when the line is empty or whitespace only. For a line starting
    /// with a blockquote marker, true when only whitespace follows the prefix.
    pub is_blank: bool,
    /// True when the start of the line falls inside a detected code block.
    pub in_code_block: bool,
    /// True when the line belongs to the leading `---` front matter section.
    pub in_front_matter: bool,
    /// List marker details when the line starts a list item.
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a
    /// setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with `>` markers.
    pub blockquote: Option<BlockquoteInfo>,
}
83
/// Marker details for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*`, `+`, or a number plus its delimiter
    /// (for example `1.` or `1)`).
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// Parsed number of an ordered item; `None` for bullet items or when the
    /// digits do not fit in `usize`.
    pub number: Option<usize>,
    /// Byte column of the marker within the line (0-based), including any
    /// blockquote prefix that precedes it.
    pub marker_column: usize,
    /// Byte column at which the item content starts (marker column plus the
    /// marker and the whitespace after it).
    pub content_column: usize,
}
98
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Heading underlined with `=` (level 1).
    Setext1,
    /// Heading underlined with `-` (level 2).
    Setext2,
}
109
/// A link found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-indexed line on which the link starts.
    pub line: usize,
    /// Byte column of the opening `[` within its line (0-based).
    pub start_col: usize,
    /// Byte column just past the link, relative to the line on which the
    /// link ends.
    pub end_col: usize,
    /// Byte offset of the link start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the link in the document.
    pub byte_end: usize,
    /// Link text between the first pair of brackets.
    pub text: String,
    /// Inline destination; empty for reference links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lowercased reference id for reference links (the lowercased link text
    /// when the id brackets are empty); `None` for inline links.
    pub reference_id: Option<String>,
}
132
/// An image found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-indexed line on which the image starts.
    pub line: usize,
    /// Byte column of the leading `!` within its line (0-based).
    pub start_col: usize,
    /// Byte column just past the image, relative to the line on which the
    /// image ends.
    pub end_col: usize,
    /// Byte offset of the image start in the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the image in the document.
    pub byte_end: usize,
    /// Alt text between the first pair of brackets.
    pub alt_text: String,
    /// Inline source; empty for reference images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lowercased reference id for reference images (the lowercased alt text
    /// when the id brackets are empty); `None` for inline images.
    pub reference_id: Option<String>,
}
155
/// A reference definition of the form `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-indexed line of the definition.
    pub line: usize,
    /// Reference id, lowercased.
    pub id: String,
    /// Destination URL as written.
    pub url: String,
    /// Optional title (text between the double or single quotes).
    pub title: Option<String>,
}
168
/// An inline code span delimited by backtick runs.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-indexed line on which the span starts.
    pub line: usize,
    /// Byte column of the first opening backtick within its line (0-based).
    pub start_col: usize,
    /// Byte column just past the closing backticks, relative to the line on
    /// which the span ends.
    pub end_col: usize,
    /// Byte offset of the first opening backtick in the document.
    pub byte_offset: usize,
    /// Byte offset just past the closing backticks in the document.
    pub byte_end: usize,
    /// Number of backticks in the opening (and closing) delimiter.
    pub backtick_count: usize,
    /// Text between the delimiters, not trimmed.
    pub content: String,
}
187
/// Details of a heading attached to a line.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: number of `#` for ATX headings, 1 or 2 for setext.
    pub level: u8,
    /// Syntax used to write the heading.
    pub style: HeadingStyle,
    /// The `#` run for ATX headings, or the trimmed underline for setext.
    pub marker: String,
    /// Byte column of the marker: leading whitespace of the heading line for
    /// ATX, leading whitespace of the underline line for setext.
    pub marker_column: usize,
    /// Byte column at which the heading text starts.
    pub content_column: usize,
    /// Heading text as returned by `extract_header_id`
    /// (presumably with any custom id removed — confirm in `header_id_utils`).
    pub text: String,
    /// Custom id taken from the heading itself or from a standalone
    /// attribute-list line that follows it.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction; for ATX headings the
    /// closing `#` sequence has already been removed.
    pub raw_text: String,
    /// True when an ATX heading ends with a closing `#` sequence.
    pub has_closing_sequence: bool,
    /// The closing `#` sequence; empty when there is none.
    pub closing_sequence: String,
}
212
/// Details of the blockquote prefix found at the start of a line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the leading contiguous marker run.
    pub nesting_level: usize,
    /// Whitespace that precedes the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (length of `indent`).
    pub marker_column: usize,
    /// Indent, markers and the whitespace after them, concatenated.
    pub prefix: String,
    /// Text that follows the prefix.
    pub content: String,
    /// True when content follows the markers with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace character, or a tab, follows the
    /// markers.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the line holds only the markers (no whitespace and no
    /// content after them).
    pub needs_md028_fix: bool,
}
233
/// A run of lines forming one list.
///
/// NOTE(review): the code that builds these blocks is only partly visible
/// here; field notes marked "presumably" should be confirmed against
/// `parse_list_blocks`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (presumably 1-indexed, like `end_line`).
    pub start_line: usize,
    /// Last line of the block, 1-indexed and inclusive.
    pub end_line: usize,
    /// True when the block is an ordered list.
    pub is_ordered: bool,
    /// Marker of the block's items, if any (presumably `None` when the items
    /// use mixed markers — confirm).
    pub marker: Option<String>,
    /// Blockquote prefix shared by the lines of the block; empty outside
    /// blockquotes.
    pub blockquote_prefix: String,
    /// Line numbers of the list items in this block (presumably 1-indexed).
    pub item_lines: Vec<usize>,
    /// Nesting level of the block; compared against `marker_column / 2` of
    /// candidate items.
    pub nesting_level: usize,
    /// Widest marker seen in the block (presumably in bytes — confirm).
    pub max_marker_width: usize,
}
254
255use std::sync::{Arc, Mutex};
256
/// Occurrence counts of characters that matter for quick "does this document
/// contain X" checks. The character associated with each counter follows the
/// mapping used by `LintContext::has_char` and `LintContext::char_count`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for the newline character.
    pub newline_count: usize,
}
285
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here;
/// position conventions are presumably the same as for `ParsedLink`
/// (1-indexed line, byte-based columns) — confirm against `parse_html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line on which the tag was found.
    pub line: usize,
    /// Column at which the tag starts.
    pub start_col: usize,
    /// Column at which the tag ends.
    pub end_col: usize,
    /// Byte offset of the tag start in the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end in the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// True for a closing tag.
    pub is_closing: bool,
    /// True for a self-closing tag.
    pub is_self_closing: bool,
    /// The tag text as it appears in the document.
    pub raw_content: String,
}
308
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here;
/// position conventions are presumably the same as for `ParsedLink` —
/// confirm against `parse_emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line on which the span was found.
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start in the document.
    pub byte_offset: usize,
    /// Byte offset of the span end in the document.
    pub byte_end: usize,
    /// Marker character of the span (presumably `*` or `_`).
    pub marker: char,
    /// Number of marker characters in the delimiter.
    pub marker_count: usize,
    /// Text of the span.
    pub content: String,
}
329
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here —
/// confirm details against `parse_table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line of the row.
    pub line: usize,
    /// True when the row is the header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column as a string (exact values are defined by the
    /// parser — TODO confirm).
    pub column_alignments: Vec<String>,
}
342
/// A bare URL or e-mail address found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here;
/// position conventions are presumably the same as for `ParsedLink` —
/// confirm against `parse_bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line on which the URL was found.
    pub line: usize,
    /// Column at which the URL starts.
    pub start_col: usize,
    /// Column at which the URL ends.
    pub end_col: usize,
    /// Byte offset of the URL start in the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end in the document.
    pub byte_end: usize,
    /// The URL or address text.
    pub url: String,
    /// Kind of URL (exact values are defined by the parser — TODO confirm).
    pub url_type: String,
}
361
/// Parsed view of one markdown document, shared by lint rules.
///
/// Line info, links, images, reference definitions and list blocks are
/// computed in `new`; code spans, HTML tags, emphasis spans, table rows and
/// bare URLs are computed on first access and cached.
pub struct LintContext<'a> {
    /// The document text.
    pub content: &'a str,
    /// Byte offset at which each line starts: `0` followed by the position
    /// after every `\n`.
    pub line_offsets: Vec<usize>,
    /// `(start, end)` byte ranges of code blocks as reported by
    /// `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information, in document order.
    pub lines: Vec<LineInfo>,
    /// Links found outside code blocks.
    pub links: Vec<ParsedLink>,
    /// Images found outside code blocks.
    pub images: Vec<ParsedImage>,
    /// Reference definitions found outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    /// Lazily filled cache behind `code_spans()`.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// List blocks of the document.
    pub list_blocks: Vec<ListBlock>,
    /// Character counts used by the `has_char` / `likely_has_*` checks.
    pub char_frequency: CharFrequency,
    /// Lazily filled cache behind `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazily filled cache behind `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazily filled cache behind `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazily filled cache behind `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
379
380impl<'a> LintContext<'a> {
381 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
382 let mut line_offsets = vec![0];
383 for (i, c) in content.char_indices() {
384 if c == '\n' {
385 line_offsets.push(i + 1);
386 }
387 }
388
389 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
391
392 let lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
394
395 let links = Self::parse_links(content, &lines, &code_blocks, flavor);
398 let images = Self::parse_images(content, &lines, &code_blocks);
399 let reference_defs = Self::parse_reference_defs(content, &lines);
400 let list_blocks = Self::parse_list_blocks(&lines);
401
402 let char_frequency = Self::compute_char_frequency(content);
404
405 Self {
406 content,
407 line_offsets,
408 code_blocks,
409 lines,
410 links,
411 images,
412 reference_defs,
413 code_spans_cache: Mutex::new(None),
414 list_blocks,
415 char_frequency,
416 html_tags_cache: Mutex::new(None),
417 emphasis_spans_cache: Mutex::new(None),
418 table_rows_cache: Mutex::new(None),
419 bare_urls_cache: Mutex::new(None),
420 flavor,
421 }
422 }
423
424 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
426 let mut cache = self.code_spans_cache.lock().unwrap();
427
428 if cache.is_none() {
430 let code_spans = Self::parse_code_spans(self.content, &self.lines);
431 *cache = Some(Arc::new(code_spans));
432 }
433
434 cache.as_ref().unwrap().clone()
436 }
437
438 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
440 let mut cache = self.html_tags_cache.lock().unwrap();
441
442 if cache.is_none() {
443 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
444 *cache = Some(Arc::new(html_tags));
445 }
446
447 cache.as_ref().unwrap().clone()
448 }
449
450 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
452 let mut cache = self.emphasis_spans_cache.lock().unwrap();
453
454 if cache.is_none() {
455 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
456 *cache = Some(Arc::new(emphasis_spans));
457 }
458
459 cache.as_ref().unwrap().clone()
460 }
461
462 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
464 let mut cache = self.table_rows_cache.lock().unwrap();
465
466 if cache.is_none() {
467 let table_rows = Self::parse_table_rows(&self.lines);
468 *cache = Some(Arc::new(table_rows));
469 }
470
471 cache.as_ref().unwrap().clone()
472 }
473
474 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
476 let mut cache = self.bare_urls_cache.lock().unwrap();
477
478 if cache.is_none() {
479 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
480 *cache = Some(Arc::new(bare_urls));
481 }
482
483 cache.as_ref().unwrap().clone()
484 }
485
486 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
488 match self.line_offsets.binary_search(&offset) {
489 Ok(line) => (line + 1, 1),
490 Err(line) => {
491 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
492 (line, offset - line_start + 1)
493 }
494 }
495 }
496
497 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
499 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
501 return true;
502 }
503
504 self.code_spans()
506 .iter()
507 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
508 }
509
510 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
512 if line_num > 0 {
513 self.lines.get(line_num - 1)
514 } else {
515 None
516 }
517 }
518
519 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
521 self.line_info(line_num).map(|info| info.byte_offset)
522 }
523
524 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
526 let normalized_id = ref_id.to_lowercase();
527 self.reference_defs
528 .iter()
529 .find(|def| def.id == normalized_id)
530 .map(|def| def.url.as_str())
531 }
532
533 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
535 self.links.iter().filter(|link| link.line == line_num).collect()
536 }
537
538 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
540 self.images.iter().filter(|img| img.line == line_num).collect()
541 }
542
543 pub fn is_in_list_block(&self, line_num: usize) -> bool {
545 self.list_blocks
546 .iter()
547 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
548 }
549
550 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
552 self.list_blocks
553 .iter()
554 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
555 }
556
557 pub fn has_char(&self, ch: char) -> bool {
559 match ch {
560 '#' => self.char_frequency.hash_count > 0,
561 '*' => self.char_frequency.asterisk_count > 0,
562 '_' => self.char_frequency.underscore_count > 0,
563 '-' => self.char_frequency.hyphen_count > 0,
564 '+' => self.char_frequency.plus_count > 0,
565 '>' => self.char_frequency.gt_count > 0,
566 '|' => self.char_frequency.pipe_count > 0,
567 '[' => self.char_frequency.bracket_count > 0,
568 '`' => self.char_frequency.backtick_count > 0,
569 '<' => self.char_frequency.lt_count > 0,
570 '!' => self.char_frequency.exclamation_count > 0,
571 '\n' => self.char_frequency.newline_count > 0,
572 _ => self.content.contains(ch), }
574 }
575
576 pub fn char_count(&self, ch: char) -> usize {
578 match ch {
579 '#' => self.char_frequency.hash_count,
580 '*' => self.char_frequency.asterisk_count,
581 '_' => self.char_frequency.underscore_count,
582 '-' => self.char_frequency.hyphen_count,
583 '+' => self.char_frequency.plus_count,
584 '>' => self.char_frequency.gt_count,
585 '|' => self.char_frequency.pipe_count,
586 '[' => self.char_frequency.bracket_count,
587 '`' => self.char_frequency.backtick_count,
588 '<' => self.char_frequency.lt_count,
589 '!' => self.char_frequency.exclamation_count,
590 '\n' => self.char_frequency.newline_count,
591 _ => self.content.matches(ch).count(), }
593 }
594
595 pub fn likely_has_headings(&self) -> bool {
597 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
599
600 pub fn likely_has_lists(&self) -> bool {
602 self.char_frequency.asterisk_count > 0
603 || self.char_frequency.hyphen_count > 0
604 || self.char_frequency.plus_count > 0
605 }
606
607 pub fn likely_has_emphasis(&self) -> bool {
609 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
610 }
611
612 pub fn likely_has_tables(&self) -> bool {
614 self.char_frequency.pipe_count > 2
615 }
616
617 pub fn likely_has_blockquotes(&self) -> bool {
619 self.char_frequency.gt_count > 0
620 }
621
622 pub fn likely_has_code(&self) -> bool {
624 self.char_frequency.backtick_count > 0
625 }
626
627 pub fn likely_has_links_or_images(&self) -> bool {
629 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
630 }
631
632 pub fn likely_has_html(&self) -> bool {
634 self.char_frequency.lt_count > 0
635 }
636
637 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
639 self.html_tags()
640 .iter()
641 .filter(|tag| tag.line == line_num)
642 .cloned()
643 .collect()
644 }
645
646 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
648 self.emphasis_spans()
649 .iter()
650 .filter(|span| span.line == line_num)
651 .cloned()
652 .collect()
653 }
654
655 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
657 self.table_rows()
658 .iter()
659 .filter(|row| row.line == line_num)
660 .cloned()
661 .collect()
662 }
663
664 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
666 self.bare_urls()
667 .iter()
668 .filter(|url| url.line == line_num)
669 .cloned()
670 .collect()
671 }
672
673 fn parse_links(
675 content: &str,
676 lines: &[LineInfo],
677 code_blocks: &[(usize, usize)],
678 flavor: MarkdownFlavor,
679 ) -> Vec<ParsedLink> {
680 use crate::utils::skip_context::is_mkdocs_snippet_line;
681
682 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
687 let full_match = cap.get(0).unwrap();
688 let match_start = full_match.start();
689 let match_end = full_match.end();
690
691 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
693 continue;
694 }
695
696 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
698 continue;
699 }
700
701 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
703 continue;
704 }
705
706 let line_idx = lines
709 .iter()
710 .position(|line| {
711 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
712 })
713 .unwrap_or(0);
714
715 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
716 continue;
717 }
718
719 let mut line_num = 1;
721 let mut col_start = match_start;
722 for (idx, line_info) in lines.iter().enumerate() {
723 if match_start >= line_info.byte_offset {
724 line_num = idx + 1;
725 col_start = match_start - line_info.byte_offset;
726 } else {
727 break;
728 }
729 }
730
731 let mut end_line_num = 1;
733 let mut col_end = match_end;
734 for (idx, line_info) in lines.iter().enumerate() {
735 if match_end > line_info.byte_offset {
736 end_line_num = idx + 1;
737 col_end = match_end - line_info.byte_offset;
738 } else {
739 break;
740 }
741 }
742
743 if line_num == end_line_num {
745 } else {
747 }
750
751 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
752
753 if let Some(inline_url) = cap.get(2) {
754 links.push(ParsedLink {
756 line: line_num,
757 start_col: col_start,
758 end_col: col_end,
759 byte_offset: match_start,
760 byte_end: match_end,
761 text,
762 url: inline_url.as_str().to_string(),
763 is_reference: false,
764 reference_id: None,
765 });
766 } else if let Some(ref_id) = cap.get(3) {
767 let ref_id_str = ref_id.as_str();
769 let normalized_ref = if ref_id_str.is_empty() {
770 text.to_lowercase() } else {
772 ref_id_str.to_lowercase()
773 };
774
775 links.push(ParsedLink {
776 line: line_num,
777 start_col: col_start,
778 end_col: col_end,
779 byte_offset: match_start,
780 byte_end: match_end,
781 text,
782 url: String::new(), is_reference: true,
784 reference_id: Some(normalized_ref),
785 });
786 }
787 }
788
789 links
790 }
791
792 fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
794 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
799 let full_match = cap.get(0).unwrap();
800 let match_start = full_match.start();
801 let match_end = full_match.end();
802
803 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
805 continue;
806 }
807
808 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
810 continue;
811 }
812
813 let mut line_num = 1;
815 let mut col_start = match_start;
816 for (idx, line_info) in lines.iter().enumerate() {
817 if match_start >= line_info.byte_offset {
818 line_num = idx + 1;
819 col_start = match_start - line_info.byte_offset;
820 } else {
821 break;
822 }
823 }
824
825 let mut end_line_num = 1;
827 let mut col_end = match_end;
828 for (idx, line_info) in lines.iter().enumerate() {
829 if match_end > line_info.byte_offset {
830 end_line_num = idx + 1;
831 col_end = match_end - line_info.byte_offset;
832 } else {
833 break;
834 }
835 }
836
837 if line_num == end_line_num {
839 } else {
841 }
844
845 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
846
847 if let Some(inline_url) = cap.get(2) {
848 images.push(ParsedImage {
850 line: line_num,
851 start_col: col_start,
852 end_col: col_end,
853 byte_offset: match_start,
854 byte_end: match_end,
855 alt_text,
856 url: inline_url.as_str().to_string(),
857 is_reference: false,
858 reference_id: None,
859 });
860 } else if let Some(ref_id) = cap.get(3) {
861 let ref_id_str = ref_id.as_str();
863 let normalized_ref = if ref_id_str.is_empty() {
864 alt_text.to_lowercase() } else {
866 ref_id_str.to_lowercase()
867 };
868
869 images.push(ParsedImage {
870 line: line_num,
871 start_col: col_start,
872 end_col: col_end,
873 byte_offset: match_start,
874 byte_end: match_end,
875 alt_text,
876 url: String::new(), is_reference: true,
878 reference_id: Some(normalized_ref),
879 });
880 }
881 }
882
883 images
884 }
885
886 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
888 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
892 if line_info.in_code_block {
894 continue;
895 }
896
897 let line = &line_info.content;
898 let line_num = line_idx + 1;
899
900 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
901 let id = cap.get(1).unwrap().as_str().to_lowercase();
902 let url = cap.get(2).unwrap().as_str().to_string();
903 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
904
905 refs.push(ReferenceDef {
906 line: line_num,
907 id,
908 url,
909 title,
910 });
911 }
912 }
913
914 refs
915 }
916
917 fn compute_line_info(
919 content: &str,
920 line_offsets: &[usize],
921 code_blocks: &[(usize, usize)],
922 flavor: MarkdownFlavor,
923 ) -> Vec<LineInfo> {
924 lazy_static! {
925 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
927 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
928
929 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
931
932 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
934 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
935
936 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
938 }
939
940 let content_lines: Vec<&str> = content.lines().collect();
941 let mut lines = Vec::with_capacity(content_lines.len());
942
943 let mut in_front_matter = false;
945 let mut front_matter_end = 0;
946 if content_lines.first().map(|l| l.trim()) == Some("---") {
947 in_front_matter = true;
948 for (idx, line) in content_lines.iter().enumerate().skip(1) {
949 if line.trim() == "---" {
950 front_matter_end = idx;
951 break;
952 }
953 }
954 }
955
956 for (i, line) in content_lines.iter().enumerate() {
957 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
958 let indent = line.len() - line.trim_start().len();
959 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
961 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
963 after_prefix.trim().is_empty()
964 } else {
965 line.trim().is_empty()
966 };
967 let in_code_block = code_blocks.iter().any(|&(start, end)| {
970 let block_content = &content[start..end];
973 let is_multiline = block_content.contains('\n');
974 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
975 let is_indented = !is_fenced
976 && block_content
977 .lines()
978 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
979
980 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
981 });
982
983 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
985 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
987 let prefix = caps.get(1).unwrap().as_str();
988 let content = caps.get(2).unwrap().as_str();
989 (content, prefix.len())
990 } else {
991 (&**line, 0)
992 };
993
994 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
995 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
996 let marker = caps.get(2).map_or("", |m| m.as_str());
997 let spacing = caps.get(3).map_or("", |m| m.as_str());
998 let _content = caps.get(4).map_or("", |m| m.as_str());
999 let marker_column = blockquote_prefix_len + leading_spaces.len();
1000 let content_column = marker_column + marker.len() + spacing.len();
1001
1002 if spacing.is_empty() {
1009 None
1010 } else {
1011 Some(ListItemInfo {
1012 marker: marker.to_string(),
1013 is_ordered: false,
1014 number: None,
1015 marker_column,
1016 content_column,
1017 })
1018 }
1019 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1020 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1021 let number_str = caps.get(2).map_or("", |m| m.as_str());
1022 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1023 let spacing = caps.get(4).map_or("", |m| m.as_str());
1024 let _content = caps.get(5).map_or("", |m| m.as_str());
1025 let marker = format!("{number_str}{delimiter}");
1026 let marker_column = blockquote_prefix_len + leading_spaces.len();
1027 let content_column = marker_column + marker.len() + spacing.len();
1028
1029 if spacing.is_empty() {
1032 None
1033 } else {
1034 Some(ListItemInfo {
1035 marker,
1036 is_ordered: true,
1037 number: number_str.parse().ok(),
1038 marker_column,
1039 content_column,
1040 })
1041 }
1042 } else {
1043 None
1044 }
1045 } else {
1046 None
1047 };
1048
1049 lines.push(LineInfo {
1050 content: line.to_string(),
1051 byte_offset,
1052 indent,
1053 is_blank,
1054 in_code_block,
1055 in_front_matter: in_front_matter && i <= front_matter_end,
1056 list_item,
1057 heading: None, blockquote: None, });
1060 }
1061
1062 for i in 0..content_lines.len() {
1064 if lines[i].in_code_block {
1065 continue;
1066 }
1067
1068 if in_front_matter && i <= front_matter_end {
1070 continue;
1071 }
1072
1073 let line = content_lines[i];
1074
1075 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1077 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1078 let markers = caps.get(2).map_or("", |m| m.as_str());
1079 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1080 let content = caps.get(4).map_or("", |m| m.as_str());
1081
1082 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1083 let marker_column = indent_str.len();
1084
1085 let prefix = format!("{indent_str}{markers}{spaces_after}");
1087
1088 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1090 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1092
1093 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1097
1098 lines[i].blockquote = Some(BlockquoteInfo {
1099 nesting_level,
1100 indent: indent_str.to_string(),
1101 marker_column,
1102 prefix,
1103 content: content.to_string(),
1104 has_no_space_after_marker: has_no_space,
1105 has_multiple_spaces_after_marker: has_multiple_spaces,
1106 needs_md028_fix,
1107 });
1108 }
1109
1110 if lines[i].is_blank {
1112 continue;
1113 }
1114
1115 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1118 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1119 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1120 } else {
1121 false
1122 };
1123
1124 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1125 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1127 continue;
1128 }
1129 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1130 let hashes = caps.get(2).map_or("", |m| m.as_str());
1131 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1132 let rest = caps.get(4).map_or("", |m| m.as_str());
1133
1134 let level = hashes.len() as u8;
1135 let marker_column = leading_spaces.len();
1136
1137 let (text, has_closing, closing_seq) = {
1139 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1141 if rest[id_start..].trim_end().ends_with('}') {
1143 (&rest[..id_start], &rest[id_start..])
1145 } else {
1146 (rest, "")
1147 }
1148 } else {
1149 (rest, "")
1150 };
1151
1152 let trimmed_rest = rest_without_id.trim_end();
1154 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1155 let mut start_of_hashes = last_hash_pos;
1157 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1158 start_of_hashes -= 1;
1159 }
1160
1161 let has_space_before = start_of_hashes == 0
1163 || trimmed_rest
1164 .chars()
1165 .nth(start_of_hashes - 1)
1166 .is_some_and(|c| c.is_whitespace());
1167
1168 let potential_closing = &trimmed_rest[start_of_hashes..];
1170 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1171
1172 if is_all_hashes && has_space_before {
1173 let closing_hashes = potential_closing.to_string();
1175 let text_part = if !custom_id_part.is_empty() {
1178 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1181 } else {
1182 rest_without_id[..start_of_hashes].trim_end().to_string()
1183 };
1184 (text_part, true, closing_hashes)
1185 } else {
1186 (rest.to_string(), false, String::new())
1188 }
1189 } else {
1190 (rest.to_string(), false, String::new())
1192 }
1193 };
1194
1195 let content_column = marker_column + hashes.len() + spaces_after.len();
1196
1197 let raw_text = text.trim().to_string();
1199 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1200
1201 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1203 let next_line = content_lines[i + 1];
1204 if !lines[i + 1].in_code_block
1205 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1206 && let Some(next_line_id) =
1207 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1208 {
1209 custom_id = Some(next_line_id);
1210 }
1211 }
1212
1213 lines[i].heading = Some(HeadingInfo {
1214 level,
1215 style: HeadingStyle::ATX,
1216 marker: hashes.to_string(),
1217 marker_column,
1218 content_column,
1219 text: clean_text,
1220 custom_id,
1221 raw_text,
1222 has_closing_sequence: has_closing,
1223 closing_sequence: closing_seq,
1224 });
1225 }
1226 else if i + 1 < content_lines.len() {
1228 let next_line = content_lines[i + 1];
1229 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1230 if in_front_matter && i < front_matter_end {
1232 continue;
1233 }
1234
1235 if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1237 continue;
1238 }
1239
1240 let underline = next_line.trim();
1241
1242 if underline == "---" {
1245 continue;
1246 }
1247
1248 let current_line_trimmed = line.trim();
1250 if current_line_trimmed.contains(':')
1251 && !current_line_trimmed.starts_with('#')
1252 && !current_line_trimmed.contains('[')
1253 && !current_line_trimmed.contains("](")
1254 {
1255 continue;
1257 }
1258
1259 let level = if underline.starts_with('=') { 1 } else { 2 };
1260 let style = if level == 1 {
1261 HeadingStyle::Setext1
1262 } else {
1263 HeadingStyle::Setext2
1264 };
1265
1266 let raw_text = line.trim().to_string();
1268 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1269
1270 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1272 let attr_line = content_lines[i + 2];
1273 if !lines[i + 2].in_code_block
1274 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1275 && let Some(attr_line_id) =
1276 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1277 {
1278 custom_id = Some(attr_line_id);
1279 }
1280 }
1281
1282 lines[i].heading = Some(HeadingInfo {
1283 level,
1284 style,
1285 marker: underline.to_string(),
1286 marker_column: next_line.len() - next_line.trim_start().len(),
1287 content_column: lines[i].indent,
1288 text: clean_text,
1289 custom_id,
1290 raw_text,
1291 has_closing_sequence: false,
1292 closing_sequence: String::new(),
1293 });
1294 }
1295 }
1296 }
1297
1298 lines
1299 }
1300
1301 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1303 let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); if !content.contains('`') {
1308 return code_spans;
1309 }
1310
1311 let mut pos = 0;
1312 let bytes = content.as_bytes();
1313
1314 while pos < bytes.len() {
1315 if let Some(backtick_start) = content[pos..].find('`') {
1317 let start_pos = pos + backtick_start;
1318
1319 let mut in_code_block = false;
1321 for (line_idx, line_info) in lines.iter().enumerate() {
1322 if start_pos >= line_info.byte_offset
1323 && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1324 {
1325 in_code_block = line_info.in_code_block;
1326 break;
1327 }
1328 }
1329
1330 if in_code_block {
1331 pos = start_pos + 1;
1332 continue;
1333 }
1334
1335 let mut backtick_count = 0;
1337 let mut i = start_pos;
1338 while i < bytes.len() && bytes[i] == b'`' {
1339 backtick_count += 1;
1340 i += 1;
1341 }
1342
1343 let search_start = start_pos + backtick_count;
1345 let closing_pattern = &content[start_pos..start_pos + backtick_count];
1346
1347 if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1348 let end_pos = search_start + rel_end;
1350 let check_pos = end_pos + backtick_count;
1351
1352 if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1354 let content_start = start_pos + backtick_count;
1356 let content_end = end_pos;
1357 let span_content = content[content_start..content_end].to_string();
1358
1359 let mut line_num = 1;
1361 let mut col_start = start_pos;
1362 for (idx, line_info) in lines.iter().enumerate() {
1363 if start_pos >= line_info.byte_offset {
1364 line_num = idx + 1;
1365 col_start = start_pos - line_info.byte_offset;
1366 } else {
1367 break;
1368 }
1369 }
1370
1371 let mut col_end = end_pos + backtick_count;
1373 for line_info in lines.iter() {
1374 if end_pos + backtick_count > line_info.byte_offset {
1375 col_end = end_pos + backtick_count - line_info.byte_offset;
1376 } else {
1377 break;
1378 }
1379 }
1380
1381 code_spans.push(CodeSpan {
1382 line: line_num,
1383 start_col: col_start,
1384 end_col: col_end,
1385 byte_offset: start_pos,
1386 byte_end: end_pos + backtick_count,
1387 backtick_count,
1388 content: span_content,
1389 });
1390
1391 pos = end_pos + backtick_count;
1393 continue;
1394 }
1395 }
1396
1397 pos = start_pos + backtick_count;
1399 } else {
1400 break;
1402 }
1403 }
1404
1405 code_spans
1406 }
1407
1408 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1410 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1413 let mut last_list_item_line = 0;
1414 let mut current_indent_level = 0;
1415 let mut last_marker_width = 0;
1416
1417 for (line_idx, line_info) in lines.iter().enumerate() {
1418 let line_num = line_idx + 1;
1419
1420 if line_info.in_code_block {
1422 if let Some(ref mut block) = current_block {
1423 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1425
1426 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1428
1429 match context {
1430 CodeBlockContext::Indented => {
1431 block.end_line = line_num;
1433 continue;
1434 }
1435 CodeBlockContext::Standalone => {
1436 let completed_block = current_block.take().unwrap();
1438 list_blocks.push(completed_block);
1439 continue;
1440 }
1441 CodeBlockContext::Adjacent => {
1442 block.end_line = line_num;
1444 continue;
1445 }
1446 }
1447 } else {
1448 continue;
1450 }
1451 }
1452
1453 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1455 caps.get(0).unwrap().as_str().to_string()
1456 } else {
1457 String::new()
1458 };
1459
1460 if let Some(list_item) = &line_info.list_item {
1462 let item_indent = list_item.marker_column;
1464 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1467 let is_nested = nesting > block.nesting_level;
1471 let same_type =
1472 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1473 let same_context = block.blockquote_prefix == blockquote_prefix;
1474 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1478 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1479
1480 let has_non_list_content = {
1482 let mut found_non_list = false;
1483 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1485 for check_line in (block_last_item_line + 1)..line_num {
1486 let check_idx = check_line - 1;
1487 if check_idx < lines.len() {
1488 let check_info = &lines[check_idx];
1489 let is_list_breaking_content = if check_info.in_code_block {
1491 let last_item_marker_width =
1493 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1494 lines[block_last_item_line - 1]
1495 .list_item
1496 .as_ref()
1497 .map(|li| {
1498 if li.is_ordered {
1499 li.marker.len() + 1 } else {
1501 li.marker.len()
1502 }
1503 })
1504 .unwrap_or(3) } else {
1506 3 };
1508
1509 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1510
1511 let context = CodeBlockUtils::analyze_code_block_context(
1513 lines,
1514 check_line - 1,
1515 min_continuation,
1516 );
1517
1518 matches!(context, CodeBlockContext::Standalone)
1520 } else if !check_info.is_blank && check_info.list_item.is_none() {
1521 let line_content = check_info.content.trim();
1523
1524 if check_info.heading.is_some()
1526 || line_content.starts_with("---")
1527 || line_content.starts_with("***")
1528 || line_content.starts_with("___")
1529 || line_content.contains('|')
1530 || line_content.starts_with(">")
1531 {
1532 true
1533 }
1534 else {
1536 let last_item_marker_width =
1537 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1538 lines[block_last_item_line - 1]
1539 .list_item
1540 .as_ref()
1541 .map(|li| {
1542 if li.is_ordered {
1543 li.marker.len() + 1 } else {
1545 li.marker.len()
1546 }
1547 })
1548 .unwrap_or(3) } else {
1550 3 };
1552
1553 let min_continuation =
1554 if block.is_ordered { last_item_marker_width } else { 2 };
1555 check_info.indent < min_continuation
1556 }
1557 } else {
1558 false
1559 };
1560
1561 if is_list_breaking_content {
1562 found_non_list = true;
1564 break;
1565 }
1566 }
1567 }
1568 found_non_list
1569 };
1570
1571 let continues_list = if is_nested {
1575 same_context && reasonable_distance && !has_non_list_content
1577 } else {
1578 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
1580 };
1581
1582 if continues_list {
1583 block.end_line = line_num;
1585 block.item_lines.push(line_num);
1586
1587 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1589 list_item.marker.len() + 1
1590 } else {
1591 list_item.marker.len()
1592 });
1593
1594 if !block.is_ordered
1596 && block.marker.is_some()
1597 && block.marker.as_ref() != Some(&list_item.marker)
1598 {
1599 block.marker = None;
1601 }
1602 } else {
1603 list_blocks.push(block.clone());
1605
1606 *block = ListBlock {
1607 start_line: line_num,
1608 end_line: line_num,
1609 is_ordered: list_item.is_ordered,
1610 marker: if list_item.is_ordered {
1611 None
1612 } else {
1613 Some(list_item.marker.clone())
1614 },
1615 blockquote_prefix: blockquote_prefix.clone(),
1616 item_lines: vec![line_num],
1617 nesting_level: nesting,
1618 max_marker_width: if list_item.is_ordered {
1619 list_item.marker.len() + 1
1620 } else {
1621 list_item.marker.len()
1622 },
1623 };
1624 }
1625 } else {
1626 current_block = Some(ListBlock {
1628 start_line: line_num,
1629 end_line: line_num,
1630 is_ordered: list_item.is_ordered,
1631 marker: if list_item.is_ordered {
1632 None
1633 } else {
1634 Some(list_item.marker.clone())
1635 },
1636 blockquote_prefix,
1637 item_lines: vec![line_num],
1638 nesting_level: nesting,
1639 max_marker_width: list_item.marker.len(),
1640 });
1641 }
1642
1643 last_list_item_line = line_num;
1644 current_indent_level = item_indent;
1645 last_marker_width = if list_item.is_ordered {
1646 list_item.marker.len() + 1 } else {
1648 list_item.marker.len()
1649 };
1650 } else if let Some(ref mut block) = current_block {
1651 let min_continuation_indent = if block.is_ordered {
1662 current_indent_level + last_marker_width
1663 } else {
1664 current_indent_level + 2 };
1666
1667 if line_info.indent >= min_continuation_indent {
1668 block.end_line = line_num;
1670 } else if line_info.is_blank {
1671 let mut check_idx = line_idx + 1;
1674 let mut found_continuation = false;
1675
1676 while check_idx < lines.len() && lines[check_idx].is_blank {
1678 check_idx += 1;
1679 }
1680
1681 if check_idx < lines.len() {
1682 let next_line = &lines[check_idx];
1683 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
1685 found_continuation = true;
1686 }
1687 else if !next_line.in_code_block
1689 && next_line.list_item.is_some()
1690 && let Some(item) = &next_line.list_item
1691 {
1692 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
1693 .find(&next_line.content)
1694 .map_or(String::new(), |m| m.as_str().to_string());
1695 if item.marker_column == current_indent_level
1696 && item.is_ordered == block.is_ordered
1697 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
1698 {
1699 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
1702 if let Some(between_line) = lines.get(idx) {
1703 let trimmed = between_line.content.trim();
1704 if trimmed.is_empty() {
1706 return false;
1707 }
1708 let line_indent =
1710 between_line.content.len() - between_line.content.trim_start().len();
1711
1712 if trimmed.starts_with("```")
1714 || trimmed.starts_with("~~~")
1715 || trimmed.starts_with("---")
1716 || trimmed.starts_with("***")
1717 || trimmed.starts_with("___")
1718 || trimmed.starts_with(">")
1719 || trimmed.contains('|') || between_line.heading.is_some()
1721 {
1722 return true; }
1724
1725 line_indent >= min_continuation_indent
1727 } else {
1728 false
1729 }
1730 });
1731
1732 if block.is_ordered {
1733 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1736 if let Some(between_line) = lines.get(idx) {
1737 let trimmed = between_line.content.trim();
1738 if trimmed.is_empty() {
1739 return false;
1740 }
1741 trimmed.starts_with("```")
1743 || trimmed.starts_with("~~~")
1744 || trimmed.starts_with("---")
1745 || trimmed.starts_with("***")
1746 || trimmed.starts_with("___")
1747 || trimmed.starts_with(">")
1748 || trimmed.contains('|') || between_line.heading.is_some()
1750 } else {
1751 false
1752 }
1753 });
1754 found_continuation = !has_structural_separators;
1755 } else {
1756 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1758 if let Some(between_line) = lines.get(idx) {
1759 let trimmed = between_line.content.trim();
1760 if trimmed.is_empty() {
1761 return false;
1762 }
1763 trimmed.starts_with("```")
1765 || trimmed.starts_with("~~~")
1766 || trimmed.starts_with("---")
1767 || trimmed.starts_with("***")
1768 || trimmed.starts_with("___")
1769 || trimmed.starts_with(">")
1770 || trimmed.contains('|') || between_line.heading.is_some()
1772 } else {
1773 false
1774 }
1775 });
1776 found_continuation = !has_structural_separators;
1777 }
1778 }
1779 }
1780 }
1781
1782 if found_continuation {
1783 block.end_line = line_num;
1785 } else {
1786 list_blocks.push(block.clone());
1788 current_block = None;
1789 }
1790 } else {
1791 let min_required_indent = if block.is_ordered {
1794 current_indent_level + last_marker_width
1795 } else {
1796 current_indent_level + 2
1797 };
1798
1799 let line_content = line_info.content.trim();
1804 let is_structural_separator = line_info.heading.is_some()
1805 || line_content.starts_with("```")
1806 || line_content.starts_with("~~~")
1807 || line_content.starts_with("---")
1808 || line_content.starts_with("***")
1809 || line_content.starts_with("___")
1810 || line_content.starts_with(">")
1811 || line_content.contains('|'); let is_lazy_continuation = last_list_item_line == line_num - 1
1814 && !is_structural_separator
1815 && !line_info.is_blank
1816 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
1817
1818 if is_lazy_continuation {
1819 let content_to_check = if !blockquote_prefix.is_empty() {
1822 line_info
1824 .content
1825 .strip_prefix(&blockquote_prefix)
1826 .unwrap_or(&line_info.content)
1827 .trim()
1828 } else {
1829 line_info.content.trim()
1830 };
1831
1832 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
1833
1834 if starts_with_uppercase && last_list_item_line > 0 {
1837 list_blocks.push(block.clone());
1839 current_block = None;
1840 } else {
1841 block.end_line = line_num;
1843 }
1844 } else {
1845 list_blocks.push(block.clone());
1847 current_block = None;
1848 }
1849 }
1850 }
1851 }
1852
1853 if let Some(block) = current_block {
1855 list_blocks.push(block);
1856 }
1857
1858 merge_adjacent_list_blocks(&mut list_blocks, lines);
1860
1861 list_blocks
1862 }
1863
1864 fn compute_char_frequency(content: &str) -> CharFrequency {
1866 let mut frequency = CharFrequency::default();
1867
1868 for ch in content.chars() {
1869 match ch {
1870 '#' => frequency.hash_count += 1,
1871 '*' => frequency.asterisk_count += 1,
1872 '_' => frequency.underscore_count += 1,
1873 '-' => frequency.hyphen_count += 1,
1874 '+' => frequency.plus_count += 1,
1875 '>' => frequency.gt_count += 1,
1876 '|' => frequency.pipe_count += 1,
1877 '[' => frequency.bracket_count += 1,
1878 '`' => frequency.backtick_count += 1,
1879 '<' => frequency.lt_count += 1,
1880 '!' => frequency.exclamation_count += 1,
1881 '\n' => frequency.newline_count += 1,
1882 _ => {}
1883 }
1884 }
1885
1886 frequency
1887 }
1888
1889 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1891 lazy_static! {
1892 static ref HTML_TAG_REGEX: regex::Regex =
1893 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
1894 }
1895
1896 let mut html_tags = Vec::with_capacity(content.matches('<').count());
1897
1898 for cap in HTML_TAG_REGEX.captures_iter(content) {
1899 let full_match = cap.get(0).unwrap();
1900 let match_start = full_match.start();
1901 let match_end = full_match.end();
1902
1903 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1905 continue;
1906 }
1907
1908 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1909 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1910 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1911
1912 let mut line_num = 1;
1914 let mut col_start = match_start;
1915 let mut col_end = match_end;
1916 for (idx, line_info) in lines.iter().enumerate() {
1917 if match_start >= line_info.byte_offset {
1918 line_num = idx + 1;
1919 col_start = match_start - line_info.byte_offset;
1920 col_end = match_end - line_info.byte_offset;
1921 } else {
1922 break;
1923 }
1924 }
1925
1926 html_tags.push(HtmlTag {
1927 line: line_num,
1928 start_col: col_start,
1929 end_col: col_end,
1930 byte_offset: match_start,
1931 byte_end: match_end,
1932 tag_name,
1933 is_closing,
1934 is_self_closing,
1935 raw_content: full_match.as_str().to_string(),
1936 });
1937 }
1938
1939 html_tags
1940 }
1941
1942 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1944 lazy_static! {
1945 static ref EMPHASIS_REGEX: regex::Regex =
1946 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1947 }
1948
1949 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1950
1951 for cap in EMPHASIS_REGEX.captures_iter(content) {
1952 let full_match = cap.get(0).unwrap();
1953 let match_start = full_match.start();
1954 let match_end = full_match.end();
1955
1956 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1958 continue;
1959 }
1960
1961 let opening_markers = cap.get(1).unwrap().as_str();
1962 let content_part = cap.get(2).unwrap().as_str();
1963 let closing_markers = cap.get(3).unwrap().as_str();
1964
1965 if opening_markers.chars().next() != closing_markers.chars().next()
1967 || opening_markers.len() != closing_markers.len()
1968 {
1969 continue;
1970 }
1971
1972 let marker = opening_markers.chars().next().unwrap();
1973 let marker_count = opening_markers.len();
1974
1975 let mut line_num = 1;
1977 let mut col_start = match_start;
1978 let mut col_end = match_end;
1979 for (idx, line_info) in lines.iter().enumerate() {
1980 if match_start >= line_info.byte_offset {
1981 line_num = idx + 1;
1982 col_start = match_start - line_info.byte_offset;
1983 col_end = match_end - line_info.byte_offset;
1984 } else {
1985 break;
1986 }
1987 }
1988
1989 emphasis_spans.push(EmphasisSpan {
1990 line: line_num,
1991 start_col: col_start,
1992 end_col: col_end,
1993 byte_offset: match_start,
1994 byte_end: match_end,
1995 marker,
1996 marker_count,
1997 content: content_part.to_string(),
1998 });
1999 }
2000
2001 emphasis_spans
2002 }
2003
2004 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2006 let mut table_rows = Vec::with_capacity(lines.len() / 20);
2007
2008 for (line_idx, line_info) in lines.iter().enumerate() {
2009 if line_info.in_code_block || line_info.is_blank {
2011 continue;
2012 }
2013
2014 let line = &line_info.content;
2015 let line_num = line_idx + 1;
2016
2017 if !line.contains('|') {
2019 continue;
2020 }
2021
2022 let parts: Vec<&str> = line.split('|').collect();
2024 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2025
2026 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2028 let mut column_alignments = Vec::new();
2029
2030 if is_separator {
2031 for part in &parts[1..parts.len() - 1] {
2032 let trimmed = part.trim();
2034 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2035 "center".to_string()
2036 } else if trimmed.ends_with(':') {
2037 "right".to_string()
2038 } else if trimmed.starts_with(':') {
2039 "left".to_string()
2040 } else {
2041 "none".to_string()
2042 };
2043 column_alignments.push(alignment);
2044 }
2045 }
2046
2047 table_rows.push(TableRow {
2048 line: line_num,
2049 is_separator,
2050 column_count,
2051 column_alignments,
2052 });
2053 }
2054
2055 table_rows
2056 }
2057
2058 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2060 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2061
2062 for cap in BARE_URL_PATTERN.captures_iter(content) {
2064 let full_match = cap.get(0).unwrap();
2065 let match_start = full_match.start();
2066 let match_end = full_match.end();
2067
2068 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2070 continue;
2071 }
2072
2073 let preceding_char = if match_start > 0 {
2075 content.chars().nth(match_start - 1)
2076 } else {
2077 None
2078 };
2079 let following_char = content.chars().nth(match_end);
2080
2081 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2082 continue;
2083 }
2084 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2085 continue;
2086 }
2087
2088 let url = full_match.as_str();
2089 let url_type = if url.starts_with("https://") {
2090 "https"
2091 } else if url.starts_with("http://") {
2092 "http"
2093 } else if url.starts_with("ftp://") {
2094 "ftp"
2095 } else {
2096 "other"
2097 };
2098
2099 let mut line_num = 1;
2101 let mut col_start = match_start;
2102 let mut col_end = match_end;
2103 for (idx, line_info) in lines.iter().enumerate() {
2104 if match_start >= line_info.byte_offset {
2105 line_num = idx + 1;
2106 col_start = match_start - line_info.byte_offset;
2107 col_end = match_end - line_info.byte_offset;
2108 } else {
2109 break;
2110 }
2111 }
2112
2113 bare_urls.push(BareUrl {
2114 line: line_num,
2115 start_col: col_start,
2116 end_col: col_end,
2117 byte_offset: match_start,
2118 byte_end: match_end,
2119 url: url.to_string(),
2120 url_type: url_type.to_string(),
2121 });
2122 }
2123
2124 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2126 let full_match = cap.get(0).unwrap();
2127 let match_start = full_match.start();
2128 let match_end = full_match.end();
2129
2130 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2132 continue;
2133 }
2134
2135 let preceding_char = if match_start > 0 {
2137 content.chars().nth(match_start - 1)
2138 } else {
2139 None
2140 };
2141 let following_char = content.chars().nth(match_end);
2142
2143 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2144 continue;
2145 }
2146 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2147 continue;
2148 }
2149
2150 let email = full_match.as_str();
2151
2152 let mut line_num = 1;
2154 let mut col_start = match_start;
2155 let mut col_end = match_end;
2156 for (idx, line_info) in lines.iter().enumerate() {
2157 if match_start >= line_info.byte_offset {
2158 line_num = idx + 1;
2159 col_start = match_start - line_info.byte_offset;
2160 col_end = match_end - line_info.byte_offset;
2161 } else {
2162 break;
2163 }
2164 }
2165
2166 bare_urls.push(BareUrl {
2167 line: line_num,
2168 start_col: col_start,
2169 end_col: col_end,
2170 byte_offset: match_start,
2171 byte_end: match_end,
2172 url: email.to_string(),
2173 url_type: "email".to_string(),
2174 });
2175 }
2176
2177 bare_urls
2178 }
2179}
2180
2181fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2183 if list_blocks.len() < 2 {
2184 return;
2185 }
2186
2187 let mut merger = ListBlockMerger::new(lines);
2188 *list_blocks = merger.merge(list_blocks);
2189}
2190
/// Helper that decides whether neighbouring `ListBlock`s should be merged and merges them.
struct ListBlockMerger<'a> {
    /// Lines of the document; looked up by 1-based line number minus one when
    /// inspecting the gap between two blocks.
    lines: &'a [LineInfo],
}
2195
2196impl<'a> ListBlockMerger<'a> {
2197 fn new(lines: &'a [LineInfo]) -> Self {
2198 Self { lines }
2199 }
2200
2201 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2202 let mut merged = Vec::with_capacity(list_blocks.len());
2203 let mut current = list_blocks[0].clone();
2204
2205 for next in list_blocks.iter().skip(1) {
2206 if self.should_merge_blocks(¤t, next) {
2207 current = self.merge_two_blocks(current, next);
2208 } else {
2209 merged.push(current);
2210 current = next.clone();
2211 }
2212 }
2213
2214 merged.push(current);
2215 merged
2216 }
2217
2218 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2220 if !self.blocks_are_compatible(current, next) {
2222 return false;
2223 }
2224
2225 let spacing = self.analyze_spacing_between(current, next);
2227 match spacing {
2228 BlockSpacing::Consecutive => true,
2229 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2230 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2231 self.can_merge_with_content_between(current, next)
2232 }
2233 }
2234 }
2235
2236 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2238 current.is_ordered == next.is_ordered
2239 && current.blockquote_prefix == next.blockquote_prefix
2240 && current.nesting_level == next.nesting_level
2241 }
2242
2243 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2245 let gap = next.start_line - current.end_line;
2246
2247 match gap {
2248 1 => BlockSpacing::Consecutive,
2249 2 => BlockSpacing::SingleBlank,
2250 _ if gap > 2 => {
2251 if self.has_only_blank_lines_between(current, next) {
2252 BlockSpacing::MultipleBlanks
2253 } else {
2254 BlockSpacing::ContentBetween
2255 }
2256 }
2257 _ => BlockSpacing::Consecutive, }
2259 }
2260
2261 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2263 if has_meaningful_content_between(current, next, self.lines) {
2266 return false; }
2268
2269 !current.is_ordered && current.marker == next.marker
2271 }
2272
2273 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2275 if has_meaningful_content_between(current, next, self.lines) {
2277 return false; }
2279
2280 current.is_ordered && next.is_ordered
2282 }
2283
2284 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2286 for line_num in (current.end_line + 1)..next.start_line {
2287 if let Some(line_info) = self.lines.get(line_num - 1)
2288 && !line_info.content.trim().is_empty()
2289 {
2290 return false;
2291 }
2292 }
2293 true
2294 }
2295
2296 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2298 current.end_line = next.end_line;
2299 current.item_lines.extend_from_slice(&next.item_lines);
2300
2301 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2303
2304 if !current.is_ordered && self.markers_differ(¤t, next) {
2306 current.marker = None; }
2308
2309 current
2310 }
2311
2312 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2314 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2315 }
2316}
2317
/// How far apart two neighbouring list blocks are, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The next block starts on the line right after the current block ends.
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
2326
2327fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2329 for line_num in (current.end_line + 1)..next.start_line {
2331 if let Some(line_info) = lines.get(line_num - 1) {
2332 let trimmed = line_info.content.trim();
2334
2335 if trimmed.is_empty() {
2337 continue;
2338 }
2339
2340 if line_info.heading.is_some() {
2344 return true; }
2346
2347 if is_horizontal_rule(trimmed) {
2349 return true; }
2351
2352 if trimmed.contains('|') && trimmed.len() > 1 {
2354 return true; }
2356
2357 if trimmed.starts_with('>') {
2359 return true; }
2361
2362 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2364 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2365
2366 let min_continuation_indent = if current.is_ordered {
2368 current.nesting_level + current.max_marker_width + 1 } else {
2370 current.nesting_level + 2
2371 };
2372
2373 if line_indent < min_continuation_indent {
2374 return true; }
2377 }
2378
2379 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2381
2382 let min_indent = if current.is_ordered {
2384 current.nesting_level + current.max_marker_width
2385 } else {
2386 current.nesting_level + 2
2387 };
2388
2389 if line_indent < min_indent {
2391 return true; }
2393
2394 }
2397 }
2398
2399 false
2401}
2402
/// Returns `true` when `trimmed` looks like a Markdown horizontal rule.
///
/// The text must start with `-`, `*` or `_`, contain at least three occurrences of that
/// same character, and otherwise consist only of spaces and tabs. The caller is expected
/// to pass text that has already been trimmed.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    // Scan the text directly instead of collecting it into a temporary `Vec<char>`.
    let mut count = 0usize;
    for ch in trimmed.chars() {
        if ch == rule_char {
            count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Anything other than the rule character or inline whitespace disqualifies.
            return false;
        }
    }

    count >= 3
}
2426
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document still has one line offset but no line infos.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert!(ctx.content.is_empty());
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert!(ctx.lines.is_empty());
    }

    /// Offsets within a single line map to line 1 with a 1-based column.
    #[test]
    fn test_single_line() {
        let source = "# Hello";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        assert_eq!(ctx.content, source);
        assert_eq!(ctx.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line", MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indentation, blank and code-block flags.
    #[test]
    fn test_line_info() {
        let ctx = LintContext::new(
            "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```",
            MarkdownFlavor::Standard,
        );

        assert_eq!(ctx.lines.len(), 7);

        let (title, indented, blank) = (&ctx.lines[0], &ctx.lines[1], &ctx.lines[2]);

        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // The accessor helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered, nested and ordered items are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let ctx = LintContext::new(
            "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list",
            MarkdownFlavor::Standard,
        );

        let dash_item = ctx.lines[0].list_item.as_ref();
        assert!(dash_item.is_some());
        let dash_item = dash_item.unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        let star_item = ctx.lines[1].list_item.as_ref();
        assert!(star_item.is_some());
        let star_item = star_item.unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        let ordered_item = ctx.lines[2].list_item.as_ref();
        assert!(ordered_item.is_some());
        let ordered_item = ordered_item.unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc", MarkdownFlavor::Standard);

        let expectations = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }
}