1use crate::config::MarkdownFlavor;
2use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
3use lazy_static::lazy_static;
4use regex::Regex;
5
6lazy_static! {
7 static ref LINK_PATTERN: Regex = Regex::new(
10 r"(?sx)
11 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
12 (?:
13 \(([^)]*)\) # Inline URL in group 2 (can be empty)
14 |
15 \[([^\]]*)\] # Reference ID in group 3
16 )"
17 ).unwrap();
18
19 static ref IMAGE_PATTERN: Regex = Regex::new(
22 r"(?sx)
23 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
24 (?:
25 \(([^)]*)\) # Inline URL in group 2 (can be empty)
26 |
27 \[([^\]]*)\] # Reference ID in group 3
28 )"
29 ).unwrap();
30
31 static ref REF_DEF_PATTERN: Regex = Regex::new(
33 r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
34 ).unwrap();
35
36 static ref CODE_SPAN_PATTERN: Regex = Regex::new(
39 r"`+"
40 ).unwrap();
41
42 static ref BARE_URL_PATTERN: Regex = Regex::new(
44 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
45 ).unwrap();
46
47 static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
49 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
50 ).unwrap();
51
52 static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
54 r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
55 ).unwrap();
56
57 static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
59}
60
61#[derive(Debug, Clone)]
63pub struct LineInfo {
64 pub content: String,
66 pub byte_offset: usize,
68 pub indent: usize,
70 pub is_blank: bool,
72 pub in_code_block: bool,
74 pub in_front_matter: bool,
76 pub list_item: Option<ListItemInfo>,
78 pub heading: Option<HeadingInfo>,
80 pub blockquote: Option<BlockquoteInfo>,
82}
83
/// Describes the list marker found at the start of a list item line.
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// Marker text: `-`, `*` or `+` for unordered items, digits plus delimiter
    /// (for example `1.` or `2)`) for ordered items.
    pub marker: String,
    /// True for numbered items.
    pub is_ordered: bool,
    /// Parsed item number; `None` for unordered items or when the digits do not
    /// fit in `usize`.
    pub number: Option<usize>,
    /// Byte column (0-based) of the marker, counting any blockquote prefix.
    pub marker_column: usize,
    /// Byte column (0-based) where the item content starts, i.e. after the
    /// marker and the spacing that follows it.
    pub content_column: usize,
}
98
/// Syntax used to write a heading.
///
/// `Eq` is derived alongside `PartialEq` because equality on a fieldless enum
/// is total; this lets the type be used wherever an `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// Heading introduced by one to six `#` characters.
    ATX,
    /// Heading underlined with `=` (level 1).
    Setext1,
    /// Heading underlined with `-` (level 2).
    Setext2,
}
109
/// A link found in the document, either inline or reference style.
#[derive(Clone, Debug)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte column (0-based) of the opening `[` within its line.
    pub start_col: usize,
    /// Byte column just past the end of the link, relative to the line on which
    /// the link ends.
    pub end_col: usize,
    /// Byte offset of the start of the link within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the link within the document.
    pub byte_end: usize,
    /// Text between the square brackets.
    pub text: String,
    /// Inline destination; empty for reference links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lower-cased reference id; when the id is empty (`[text][]`) the
    /// lower-cased link text is used instead. `None` for inline links.
    pub reference_id: Option<String>,
}
132
/// An image found in the document, either inline or reference style.
#[derive(Clone, Debug)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte column (0-based) of the leading `!` within its line.
    pub start_col: usize,
    /// Byte column just past the end of the image, relative to the line on
    /// which the image ends.
    pub end_col: usize,
    /// Byte offset of the start of the image within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the image within the document.
    pub byte_end: usize,
    /// Text between the square brackets.
    pub alt_text: String,
    /// Inline source; empty for reference images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lower-cased reference id; when the id is empty (`![alt][]`) the
    /// lower-cased alt text is used instead. `None` for inline images.
    pub reference_id: Option<String>,
}
155
/// A reference definition of the form `[id]: url "title"`.
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-based number of the line holding the definition.
    pub line: usize,
    /// Lower-cased reference id.
    pub id: String,
    /// Destination exactly as written.
    pub url: String,
    /// Optional title, without its surrounding quotes.
    pub title: Option<String>,
}
168
/// An inline code span delimited by matching backtick runs.
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// 1-based number of the line on which the span starts.
    pub line: usize,
    /// Byte column (0-based) of the first opening backtick within its line.
    pub start_col: usize,
    /// Byte column just past the closing backticks, relative to the line on
    /// which the span ends.
    pub end_col: usize,
    /// Byte offset of the first opening backtick within the document.
    pub byte_offset: usize,
    /// Byte offset just past the closing backticks within the document.
    pub byte_end: usize,
    /// Number of backticks in each delimiter run.
    pub backtick_count: usize,
    /// Text between the opening and closing delimiter runs.
    pub content: String,
}
187
188#[derive(Debug, Clone)]
190pub struct HeadingInfo {
191 pub level: u8,
193 pub style: HeadingStyle,
195 pub marker: String,
197 pub marker_column: usize,
199 pub content_column: usize,
201 pub text: String,
203 pub custom_id: Option<String>,
205 pub raw_text: String,
207 pub has_closing_sequence: bool,
209 pub closing_sequence: String,
211}
212
/// Details of the blockquote prefix found at the start of a line.
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Number of `>` markers in the prefix.
    pub nesting_level: usize,
    /// Whitespace preceding the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (the length of `indent`).
    pub marker_column: usize,
    /// Indentation, markers and the whitespace after them, concatenated.
    pub prefix: String,
    /// Text that follows the prefix.
    pub content: String,
    /// True when content follows the markers with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace character, or a tab, follows the markers.
    pub has_multiple_spaces_after_marker: bool,
    /// True when nothing at all follows the markers (no whitespace, no content).
    pub needs_md028_fix: bool,
}
233
/// A group of consecutive list items treated as one list.
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// 1-based number of the first line of the block.
    pub start_line: usize,
    /// 1-based number of the last line of the block (inclusive).
    pub end_line: usize,
    /// True when the block is an ordered list.
    pub is_ordered: bool,
    /// Marker shared by the items, when there is one.
    pub marker: Option<String>,
    /// Blockquote prefix shared by the lines of the block; empty outside a
    /// blockquote.
    pub blockquote_prefix: String,
    /// 1-based numbers of the lines that start a list item.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Presumably the widest item marker in the block; the code that sets it is
    /// outside this view.
    pub max_marker_width: usize,
}
254
255use std::sync::{Arc, Mutex};
256
/// Occurrence counts for the characters that drive Markdown syntax.
///
/// Rules use these counts as a cheap pre-check before doing real parsing.
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Count reported for `#`.
    pub hash_count: usize,
    /// Count reported for `*`.
    pub asterisk_count: usize,
    /// Count reported for `_`.
    pub underscore_count: usize,
    /// Count reported for `-`.
    pub hyphen_count: usize,
    /// Count reported for `+`.
    pub plus_count: usize,
    /// Count reported for `>`.
    pub gt_count: usize,
    /// Count reported for `|`.
    pub pipe_count: usize,
    /// Count reported for `[`.
    pub bracket_count: usize,
    /// Count reported for a backtick.
    pub backtick_count: usize,
    /// Count reported for `<`.
    pub lt_count: usize,
    /// Count reported for `!`.
    pub exclamation_count: usize,
    /// Count reported for line feeds.
    pub newline_count: usize,
}
285
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here; the
/// positional fields presumably follow the same conventions as `ParsedLink`
/// (1-based line, byte-based columns and offsets). Confirm against the parser.
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// Line number of the tag; compared directly against caller-supplied line
    /// numbers.
    pub line: usize,
    /// Column where the tag starts.
    pub start_col: usize,
    /// Column where the tag ends.
    pub end_col: usize,
    /// Offset of the start of the tag within the document.
    pub byte_offset: usize,
    /// Offset of the end of the tag within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether this is a self-closing tag.
    pub is_self_closing: bool,
    /// Source text of the tag.
    pub raw_content: String,
}
308
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here; the
/// positional fields presumably follow the same conventions as `ParsedLink`
/// (1-based line, byte-based columns and offsets). Confirm against the parser.
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// Line number of the span; compared directly against caller-supplied line
    /// numbers.
    pub line: usize,
    /// Column where the span starts.
    pub start_col: usize,
    /// Column where the span ends.
    pub end_col: usize,
    /// Offset of the start of the span within the document.
    pub byte_offset: usize,
    /// Offset of the end of the span within the document.
    pub byte_end: usize,
    /// Delimiter character of the span.
    pub marker: char,
    /// Number of delimiter characters on each side.
    pub marker_count: usize,
    /// Text enclosed by the delimiters.
    pub content: String,
}
329
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible here; field
/// meanings below are taken from the names and should be confirmed.
#[derive(Clone, Debug)]
pub struct TableRow {
    /// Line number of the row; compared directly against caller-supplied line
    /// numbers.
    pub line: usize,
    /// Whether the row is a header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column.
    pub column_alignments: Vec<String>,
}
342
/// A URL or e-mail address written without link markup.
///
/// NOTE(review): the parser that fills this struct is not visible here; the
/// positional fields presumably follow the same conventions as `ParsedLink`
/// (1-based line, byte-based columns and offsets). Confirm against the parser.
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// Line number of the URL; compared directly against caller-supplied line
    /// numbers.
    pub line: usize,
    /// Column where the URL starts.
    pub start_col: usize,
    /// Column where the URL ends.
    pub end_col: usize,
    /// Offset of the start of the URL within the document.
    pub byte_offset: usize,
    /// Offset of the end of the URL within the document.
    pub byte_end: usize,
    /// The matched text.
    pub url: String,
    /// Kind of match; the set of values is defined by the parser.
    pub url_type: String,
}
361
362pub struct LintContext<'a> {
363 pub content: &'a str,
364 pub line_offsets: Vec<usize>,
365 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink>, pub images: Vec<ParsedImage>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, pub flavor: MarkdownFlavor, }
379
380impl<'a> LintContext<'a> {
381 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
382 let mut line_offsets = vec![0];
383 for (i, c) in content.char_indices() {
384 if c == '\n' {
385 line_offsets.push(i + 1);
386 }
387 }
388
389 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
391
392 let lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
394
395 let links = Self::parse_links(content, &lines, &code_blocks, flavor);
398 let images = Self::parse_images(content, &lines, &code_blocks);
399 let reference_defs = Self::parse_reference_defs(content, &lines);
400 let list_blocks = Self::parse_list_blocks(&lines);
401
402 let char_frequency = Self::compute_char_frequency(content);
404
405 Self {
406 content,
407 line_offsets,
408 code_blocks,
409 lines,
410 links,
411 images,
412 reference_defs,
413 code_spans_cache: Mutex::new(None),
414 list_blocks,
415 char_frequency,
416 html_tags_cache: Mutex::new(None),
417 emphasis_spans_cache: Mutex::new(None),
418 table_rows_cache: Mutex::new(None),
419 bare_urls_cache: Mutex::new(None),
420 flavor,
421 }
422 }
423
424 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
426 let mut cache = self.code_spans_cache.lock().unwrap();
427
428 if cache.is_none() {
430 let code_spans = Self::parse_code_spans(self.content, &self.lines);
431 *cache = Some(Arc::new(code_spans));
432 }
433
434 cache.as_ref().unwrap().clone()
436 }
437
438 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
440 let mut cache = self.html_tags_cache.lock().unwrap();
441
442 if cache.is_none() {
443 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
444 *cache = Some(Arc::new(html_tags));
445 }
446
447 cache.as_ref().unwrap().clone()
448 }
449
450 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
452 let mut cache = self.emphasis_spans_cache.lock().unwrap();
453
454 if cache.is_none() {
455 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
456 *cache = Some(Arc::new(emphasis_spans));
457 }
458
459 cache.as_ref().unwrap().clone()
460 }
461
462 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
464 let mut cache = self.table_rows_cache.lock().unwrap();
465
466 if cache.is_none() {
467 let table_rows = Self::parse_table_rows(&self.lines);
468 *cache = Some(Arc::new(table_rows));
469 }
470
471 cache.as_ref().unwrap().clone()
472 }
473
474 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
476 let mut cache = self.bare_urls_cache.lock().unwrap();
477
478 if cache.is_none() {
479 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
480 *cache = Some(Arc::new(bare_urls));
481 }
482
483 cache.as_ref().unwrap().clone()
484 }
485
486 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
488 match self.line_offsets.binary_search(&offset) {
489 Ok(line) => (line + 1, 1),
490 Err(line) => {
491 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
492 (line, offset - line_start + 1)
493 }
494 }
495 }
496
497 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
499 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
501 return true;
502 }
503
504 self.code_spans()
506 .iter()
507 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
508 }
509
510 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
512 if line_num > 0 {
513 self.lines.get(line_num - 1)
514 } else {
515 None
516 }
517 }
518
519 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
521 self.line_info(line_num).map(|info| info.byte_offset)
522 }
523
524 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
526 let normalized_id = ref_id.to_lowercase();
527 self.reference_defs
528 .iter()
529 .find(|def| def.id == normalized_id)
530 .map(|def| def.url.as_str())
531 }
532
533 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
535 self.links.iter().filter(|link| link.line == line_num).collect()
536 }
537
538 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
540 self.images.iter().filter(|img| img.line == line_num).collect()
541 }
542
543 pub fn is_in_list_block(&self, line_num: usize) -> bool {
545 self.list_blocks
546 .iter()
547 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
548 }
549
550 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
552 self.list_blocks
553 .iter()
554 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
555 }
556
557 pub fn has_char(&self, ch: char) -> bool {
559 match ch {
560 '#' => self.char_frequency.hash_count > 0,
561 '*' => self.char_frequency.asterisk_count > 0,
562 '_' => self.char_frequency.underscore_count > 0,
563 '-' => self.char_frequency.hyphen_count > 0,
564 '+' => self.char_frequency.plus_count > 0,
565 '>' => self.char_frequency.gt_count > 0,
566 '|' => self.char_frequency.pipe_count > 0,
567 '[' => self.char_frequency.bracket_count > 0,
568 '`' => self.char_frequency.backtick_count > 0,
569 '<' => self.char_frequency.lt_count > 0,
570 '!' => self.char_frequency.exclamation_count > 0,
571 '\n' => self.char_frequency.newline_count > 0,
572 _ => self.content.contains(ch), }
574 }
575
576 pub fn char_count(&self, ch: char) -> usize {
578 match ch {
579 '#' => self.char_frequency.hash_count,
580 '*' => self.char_frequency.asterisk_count,
581 '_' => self.char_frequency.underscore_count,
582 '-' => self.char_frequency.hyphen_count,
583 '+' => self.char_frequency.plus_count,
584 '>' => self.char_frequency.gt_count,
585 '|' => self.char_frequency.pipe_count,
586 '[' => self.char_frequency.bracket_count,
587 '`' => self.char_frequency.backtick_count,
588 '<' => self.char_frequency.lt_count,
589 '!' => self.char_frequency.exclamation_count,
590 '\n' => self.char_frequency.newline_count,
591 _ => self.content.matches(ch).count(), }
593 }
594
595 pub fn likely_has_headings(&self) -> bool {
597 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
599
600 pub fn likely_has_lists(&self) -> bool {
602 self.char_frequency.asterisk_count > 0
603 || self.char_frequency.hyphen_count > 0
604 || self.char_frequency.plus_count > 0
605 }
606
607 pub fn likely_has_emphasis(&self) -> bool {
609 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
610 }
611
612 pub fn likely_has_tables(&self) -> bool {
614 self.char_frequency.pipe_count > 2
615 }
616
617 pub fn likely_has_blockquotes(&self) -> bool {
619 self.char_frequency.gt_count > 0
620 }
621
622 pub fn likely_has_code(&self) -> bool {
624 self.char_frequency.backtick_count > 0
625 }
626
627 pub fn likely_has_links_or_images(&self) -> bool {
629 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
630 }
631
632 pub fn likely_has_html(&self) -> bool {
634 self.char_frequency.lt_count > 0
635 }
636
637 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
639 self.html_tags()
640 .iter()
641 .filter(|tag| tag.line == line_num)
642 .cloned()
643 .collect()
644 }
645
646 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
648 self.emphasis_spans()
649 .iter()
650 .filter(|span| span.line == line_num)
651 .cloned()
652 .collect()
653 }
654
655 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
657 self.table_rows()
658 .iter()
659 .filter(|row| row.line == line_num)
660 .cloned()
661 .collect()
662 }
663
664 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
666 self.bare_urls()
667 .iter()
668 .filter(|url| url.line == line_num)
669 .cloned()
670 .collect()
671 }
672
673 fn parse_links(
675 content: &str,
676 lines: &[LineInfo],
677 code_blocks: &[(usize, usize)],
678 flavor: MarkdownFlavor,
679 ) -> Vec<ParsedLink> {
680 use crate::utils::skip_context::is_mkdocs_snippet_line;
681
682 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
687 let full_match = cap.get(0).unwrap();
688 let match_start = full_match.start();
689 let match_end = full_match.end();
690
691 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
693 continue;
694 }
695
696 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
698 continue;
699 }
700
701 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
703 continue;
704 }
705
706 let line_idx = lines
709 .iter()
710 .position(|line| {
711 match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
712 })
713 .unwrap_or(0);
714
715 if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
716 continue;
717 }
718
719 let mut line_num = 1;
721 let mut col_start = match_start;
722 for (idx, line_info) in lines.iter().enumerate() {
723 if match_start >= line_info.byte_offset {
724 line_num = idx + 1;
725 col_start = match_start - line_info.byte_offset;
726 } else {
727 break;
728 }
729 }
730
731 let mut end_line_num = 1;
733 let mut col_end = match_end;
734 for (idx, line_info) in lines.iter().enumerate() {
735 if match_end > line_info.byte_offset {
736 end_line_num = idx + 1;
737 col_end = match_end - line_info.byte_offset;
738 } else {
739 break;
740 }
741 }
742
743 if line_num == end_line_num {
745 } else {
747 }
750
751 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
752
753 if let Some(inline_url) = cap.get(2) {
754 links.push(ParsedLink {
756 line: line_num,
757 start_col: col_start,
758 end_col: col_end,
759 byte_offset: match_start,
760 byte_end: match_end,
761 text,
762 url: inline_url.as_str().to_string(),
763 is_reference: false,
764 reference_id: None,
765 });
766 } else if let Some(ref_id) = cap.get(3) {
767 let ref_id_str = ref_id.as_str();
769 let normalized_ref = if ref_id_str.is_empty() {
770 text.to_lowercase() } else {
772 ref_id_str.to_lowercase()
773 };
774
775 links.push(ParsedLink {
776 line: line_num,
777 start_col: col_start,
778 end_col: col_end,
779 byte_offset: match_start,
780 byte_end: match_end,
781 text,
782 url: String::new(), is_reference: true,
784 reference_id: Some(normalized_ref),
785 });
786 }
787 }
788
789 links
790 }
791
792 fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
794 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
799 let full_match = cap.get(0).unwrap();
800 let match_start = full_match.start();
801 let match_end = full_match.end();
802
803 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
805 continue;
806 }
807
808 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
810 continue;
811 }
812
813 let mut line_num = 1;
815 let mut col_start = match_start;
816 for (idx, line_info) in lines.iter().enumerate() {
817 if match_start >= line_info.byte_offset {
818 line_num = idx + 1;
819 col_start = match_start - line_info.byte_offset;
820 } else {
821 break;
822 }
823 }
824
825 let mut end_line_num = 1;
827 let mut col_end = match_end;
828 for (idx, line_info) in lines.iter().enumerate() {
829 if match_end > line_info.byte_offset {
830 end_line_num = idx + 1;
831 col_end = match_end - line_info.byte_offset;
832 } else {
833 break;
834 }
835 }
836
837 if line_num == end_line_num {
839 } else {
841 }
844
845 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
846
847 if let Some(inline_url) = cap.get(2) {
848 images.push(ParsedImage {
850 line: line_num,
851 start_col: col_start,
852 end_col: col_end,
853 byte_offset: match_start,
854 byte_end: match_end,
855 alt_text,
856 url: inline_url.as_str().to_string(),
857 is_reference: false,
858 reference_id: None,
859 });
860 } else if let Some(ref_id) = cap.get(3) {
861 let ref_id_str = ref_id.as_str();
863 let normalized_ref = if ref_id_str.is_empty() {
864 alt_text.to_lowercase() } else {
866 ref_id_str.to_lowercase()
867 };
868
869 images.push(ParsedImage {
870 line: line_num,
871 start_col: col_start,
872 end_col: col_end,
873 byte_offset: match_start,
874 byte_end: match_end,
875 alt_text,
876 url: String::new(), is_reference: true,
878 reference_id: Some(normalized_ref),
879 });
880 }
881 }
882
883 images
884 }
885
886 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
888 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
892 if line_info.in_code_block {
894 continue;
895 }
896
897 let line = &line_info.content;
898 let line_num = line_idx + 1;
899
900 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
901 let id = cap.get(1).unwrap().as_str().to_lowercase();
902 let url = cap.get(2).unwrap().as_str().to_string();
903 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
904
905 refs.push(ReferenceDef {
906 line: line_num,
907 id,
908 url,
909 title,
910 });
911 }
912 }
913
914 refs
915 }
916
917 fn compute_line_info(
919 content: &str,
920 line_offsets: &[usize],
921 code_blocks: &[(usize, usize)],
922 flavor: MarkdownFlavor,
923 ) -> Vec<LineInfo> {
924 lazy_static! {
925 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
927 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
928
929 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
931
932 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
934 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
935
936 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
938 }
939
940 let content_lines: Vec<&str> = content.lines().collect();
941 let mut lines = Vec::with_capacity(content_lines.len());
942
943 let mut in_front_matter = false;
945 let mut front_matter_end = 0;
946 if content_lines.first().map(|l| l.trim()) == Some("---") {
947 in_front_matter = true;
948 for (idx, line) in content_lines.iter().enumerate().skip(1) {
949 if line.trim() == "---" {
950 front_matter_end = idx;
951 break;
952 }
953 }
954 }
955
956 for (i, line) in content_lines.iter().enumerate() {
957 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
958 let indent = line.len() - line.trim_start().len();
959 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
961 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
963 after_prefix.trim().is_empty()
964 } else {
965 line.trim().is_empty()
966 };
967 let in_code_block = code_blocks.iter().any(|&(start, end)| {
970 let block_content = &content[start..end];
973 let is_multiline = block_content.contains('\n');
974 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
975 let is_indented = !is_fenced
976 && block_content
977 .lines()
978 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
979
980 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
981 });
982
983 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
985 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
987 let prefix = caps.get(1).unwrap().as_str();
988 let content = caps.get(2).unwrap().as_str();
989 (content, prefix.len())
990 } else {
991 (&**line, 0)
992 };
993
994 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
995 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
996 let marker = caps.get(2).map_or("", |m| m.as_str());
997 let spacing = caps.get(3).map_or("", |m| m.as_str());
998 let _content = caps.get(4).map_or("", |m| m.as_str());
999 let marker_column = blockquote_prefix_len + leading_spaces.len();
1000 let content_column = marker_column + marker.len() + spacing.len();
1001
1002 if spacing.is_empty() {
1009 None
1010 } else {
1011 Some(ListItemInfo {
1012 marker: marker.to_string(),
1013 is_ordered: false,
1014 number: None,
1015 marker_column,
1016 content_column,
1017 })
1018 }
1019 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1020 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1021 let number_str = caps.get(2).map_or("", |m| m.as_str());
1022 let delimiter = caps.get(3).map_or("", |m| m.as_str());
1023 let spacing = caps.get(4).map_or("", |m| m.as_str());
1024 let _content = caps.get(5).map_or("", |m| m.as_str());
1025 let marker = format!("{number_str}{delimiter}");
1026 let marker_column = blockquote_prefix_len + leading_spaces.len();
1027 let content_column = marker_column + marker.len() + spacing.len();
1028
1029 if spacing.is_empty() {
1032 None
1033 } else {
1034 Some(ListItemInfo {
1035 marker,
1036 is_ordered: true,
1037 number: number_str.parse().ok(),
1038 marker_column,
1039 content_column,
1040 })
1041 }
1042 } else {
1043 None
1044 }
1045 } else {
1046 None
1047 };
1048
1049 lines.push(LineInfo {
1050 content: line.to_string(),
1051 byte_offset,
1052 indent,
1053 is_blank,
1054 in_code_block,
1055 in_front_matter: in_front_matter && i <= front_matter_end,
1056 list_item,
1057 heading: None, blockquote: None, });
1060 }
1061
1062 for i in 0..content_lines.len() {
1064 if lines[i].in_code_block {
1065 continue;
1066 }
1067
1068 if in_front_matter && i <= front_matter_end {
1070 continue;
1071 }
1072
1073 let line = content_lines[i];
1074
1075 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1077 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1078 let markers = caps.get(2).map_or("", |m| m.as_str());
1079 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1080 let content = caps.get(4).map_or("", |m| m.as_str());
1081
1082 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1083 let marker_column = indent_str.len();
1084
1085 let prefix = format!("{indent_str}{markers}{spaces_after}");
1087
1088 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1090 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1092
1093 let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1097
1098 lines[i].blockquote = Some(BlockquoteInfo {
1099 nesting_level,
1100 indent: indent_str.to_string(),
1101 marker_column,
1102 prefix,
1103 content: content.to_string(),
1104 has_no_space_after_marker: has_no_space,
1105 has_multiple_spaces_after_marker: has_multiple_spaces,
1106 needs_md028_fix,
1107 });
1108 }
1109
1110 if lines[i].is_blank {
1112 continue;
1113 }
1114
1115 let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1118 crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1119 || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1120 } else {
1121 false
1122 };
1123
1124 if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1125 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1126 let hashes = caps.get(2).map_or("", |m| m.as_str());
1127 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1128 let rest = caps.get(4).map_or("", |m| m.as_str());
1129
1130 let level = hashes.len() as u8;
1131 let marker_column = leading_spaces.len();
1132
1133 let (text, has_closing, closing_seq) = {
1135 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1137 if rest[id_start..].trim_end().ends_with('}') {
1139 (&rest[..id_start], &rest[id_start..])
1141 } else {
1142 (rest, "")
1143 }
1144 } else {
1145 (rest, "")
1146 };
1147
1148 let trimmed_rest = rest_without_id.trim_end();
1150 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1151 let mut start_of_hashes = last_hash_pos;
1153 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1154 start_of_hashes -= 1;
1155 }
1156
1157 let has_space_before = start_of_hashes == 0
1159 || trimmed_rest
1160 .chars()
1161 .nth(start_of_hashes - 1)
1162 .is_some_and(|c| c.is_whitespace());
1163
1164 let potential_closing = &trimmed_rest[start_of_hashes..];
1166 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1167
1168 if is_all_hashes && has_space_before {
1169 let closing_hashes = potential_closing.to_string();
1171 let text_part = if !custom_id_part.is_empty() {
1174 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1177 } else {
1178 rest_without_id[..start_of_hashes].trim_end().to_string()
1179 };
1180 (text_part, true, closing_hashes)
1181 } else {
1182 (rest.to_string(), false, String::new())
1184 }
1185 } else {
1186 (rest.to_string(), false, String::new())
1188 }
1189 };
1190
1191 let content_column = marker_column + hashes.len() + spaces_after.len();
1192
1193 let raw_text = text.trim().to_string();
1195 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1196
1197 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1199 let next_line = content_lines[i + 1];
1200 if !lines[i + 1].in_code_block
1201 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1202 && let Some(next_line_id) =
1203 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1204 {
1205 custom_id = Some(next_line_id);
1206 }
1207 }
1208
1209 lines[i].heading = Some(HeadingInfo {
1210 level,
1211 style: HeadingStyle::ATX,
1212 marker: hashes.to_string(),
1213 marker_column,
1214 content_column,
1215 text: clean_text,
1216 custom_id,
1217 raw_text,
1218 has_closing_sequence: has_closing,
1219 closing_sequence: closing_seq,
1220 });
1221 }
1222 else if i + 1 < content_lines.len() {
1224 let next_line = content_lines[i + 1];
1225 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1226 if in_front_matter && i < front_matter_end {
1228 continue;
1229 }
1230
1231 let underline = next_line.trim();
1232 let level = if underline.starts_with('=') { 1 } else { 2 };
1233 let style = if level == 1 {
1234 HeadingStyle::Setext1
1235 } else {
1236 HeadingStyle::Setext2
1237 };
1238
1239 let raw_text = line.trim().to_string();
1241 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1242
1243 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1245 let attr_line = content_lines[i + 2];
1246 if !lines[i + 2].in_code_block
1247 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1248 && let Some(attr_line_id) =
1249 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1250 {
1251 custom_id = Some(attr_line_id);
1252 }
1253 }
1254
1255 lines[i].heading = Some(HeadingInfo {
1256 level,
1257 style,
1258 marker: underline.to_string(),
1259 marker_column: next_line.len() - next_line.trim_start().len(),
1260 content_column: lines[i].indent,
1261 text: clean_text,
1262 custom_id,
1263 raw_text,
1264 has_closing_sequence: false,
1265 closing_sequence: String::new(),
1266 });
1267 }
1268 }
1269 }
1270
1271 lines
1272 }
1273
1274 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1276 let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); if !content.contains('`') {
1281 return code_spans;
1282 }
1283
1284 let mut pos = 0;
1285 let bytes = content.as_bytes();
1286
1287 while pos < bytes.len() {
1288 if let Some(backtick_start) = content[pos..].find('`') {
1290 let start_pos = pos + backtick_start;
1291
1292 let mut in_code_block = false;
1294 for (line_idx, line_info) in lines.iter().enumerate() {
1295 if start_pos >= line_info.byte_offset
1296 && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1297 {
1298 in_code_block = line_info.in_code_block;
1299 break;
1300 }
1301 }
1302
1303 if in_code_block {
1304 pos = start_pos + 1;
1305 continue;
1306 }
1307
1308 let mut backtick_count = 0;
1310 let mut i = start_pos;
1311 while i < bytes.len() && bytes[i] == b'`' {
1312 backtick_count += 1;
1313 i += 1;
1314 }
1315
1316 let search_start = start_pos + backtick_count;
1318 let closing_pattern = &content[start_pos..start_pos + backtick_count];
1319
1320 if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1321 let end_pos = search_start + rel_end;
1323 let check_pos = end_pos + backtick_count;
1324
1325 if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1327 let content_start = start_pos + backtick_count;
1329 let content_end = end_pos;
1330 let span_content = content[content_start..content_end].to_string();
1331
1332 let mut line_num = 1;
1334 let mut col_start = start_pos;
1335 for (idx, line_info) in lines.iter().enumerate() {
1336 if start_pos >= line_info.byte_offset {
1337 line_num = idx + 1;
1338 col_start = start_pos - line_info.byte_offset;
1339 } else {
1340 break;
1341 }
1342 }
1343
1344 let mut col_end = end_pos + backtick_count;
1346 for line_info in lines.iter() {
1347 if end_pos + backtick_count > line_info.byte_offset {
1348 col_end = end_pos + backtick_count - line_info.byte_offset;
1349 } else {
1350 break;
1351 }
1352 }
1353
1354 code_spans.push(CodeSpan {
1355 line: line_num,
1356 start_col: col_start,
1357 end_col: col_end,
1358 byte_offset: start_pos,
1359 byte_end: end_pos + backtick_count,
1360 backtick_count,
1361 content: span_content,
1362 });
1363
1364 pos = end_pos + backtick_count;
1366 continue;
1367 }
1368 }
1369
1370 pos = start_pos + backtick_count;
1372 } else {
1373 break;
1375 }
1376 }
1377
1378 code_spans
1379 }
1380
    /// Groups list-item lines and their continuation lines into `ListBlock`s.
    ///
    /// Walks `lines` once while keeping at most one "open" block in
    /// `current_block`. Each line either extends the open block, closes it
    /// (pushing it onto the result), or starts a new one. Before returning,
    /// the collected blocks are passed through `merge_adjacent_list_blocks`.
    ///
    /// Line numbers stored in the blocks are 1-based (`line_idx + 1`).
    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
        // Capacity is only a guess: one block per ten lines.
        let mut list_blocks = Vec::with_capacity(lines.len() / 10);
        let mut current_block: Option<ListBlock> = None;
        // 1-based line of the most recent list item (0 until one is seen).
        let mut last_list_item_line = 0;
        // Marker column of the most recent list item.
        let mut current_indent_level = 0;
        // Marker width of the most recent list item (ordered markers count one extra column).
        let mut last_marker_width = 0;

        for (line_idx, line_info) in lines.iter().enumerate() {
            let line_num = line_idx + 1;

            // Lines inside code blocks never start or continue items themselves;
            // they only decide whether the open block survives.
            if line_info.in_code_block {
                if let Some(ref mut block) = current_block {
                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);

                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);

                    match context {
                        CodeBlockContext::Indented => {
                            // Code block counts as part of the list: extend the block.
                            block.end_line = line_num;
                            continue;
                        }
                        CodeBlockContext::Standalone => {
                            // Code block separates lists: close the open block here.
                            let completed_block = current_block.take().unwrap();
                            list_blocks.push(completed_block);
                            continue;
                        }
                        CodeBlockContext::Adjacent => {
                            // Treated the same as `Indented`: extend the block.
                            block.end_line = line_num;
                            continue;
                        }
                    }
                } else {
                    // No open block: code-block lines are simply skipped.
                    continue;
                }
            }

            // Text matched by BLOCKQUOTE_PREFIX_REGEX at the start of the line, or "" if none.
            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
                caps.get(0).unwrap().as_str().to_string()
            } else {
                String::new()
            };

            if let Some(list_item) = &line_info.list_item {
                let item_indent = list_item.marker_column;
                // Nesting level is approximated as marker column divided by two.
                let nesting = item_indent / 2;
                if let Some(ref mut block) = current_block {
                    // Decide whether this item continues the open block or starts a new one.
                    let is_nested = nesting > block.nesting_level;
                    let same_type =
                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
                    let same_context = block.blockquote_prefix == blockquote_prefix;
                    // At most one line may sit between this item and the previous item.
                    let reasonable_distance = line_num <= last_list_item_line + 2;
                    // Ordered blocks accept any marker; unordered blocks need the same
                    // marker unless the block's marker has already been cleared to `None`.
                    let marker_compatible =
                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);

                    // Scan the lines between the block's last item and this item for
                    // content that breaks the list.
                    let has_non_list_content = {
                        let mut found_non_list = false;
                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
                        for check_line in (block_last_item_line + 1)..line_num {
                            let check_idx = check_line - 1;
                            if check_idx < lines.len() {
                                let check_info = &lines[check_idx];
                                let is_list_breaking_content = if check_info.in_code_block {
                                    // Width of the last item's marker; falls back to 3 when
                                    // that line cannot be resolved to a list item.
                                    let last_item_marker_width =
                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                            lines[block_last_item_line - 1]
                                                .list_item
                                                .as_ref()
                                                .map(|li| {
                                                    if li.is_ordered {
                                                        li.marker.len() + 1
                                                    } else {
                                                        li.marker.len()
                                                    }
                                                })
                                                .unwrap_or(3)
                                        } else {
                                            3
                                        };

                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };

                                    let context = CodeBlockUtils::analyze_code_block_context(
                                        lines,
                                        check_line - 1,
                                        min_continuation,
                                    );

                                    // Only a standalone code block breaks the list.
                                    matches!(context, CodeBlockContext::Standalone)
                                } else if !check_info.is_blank && check_info.list_item.is_none() {
                                    let line_content = check_info.content.trim();

                                    // Headings, rule-like lines, anything containing a pipe,
                                    // and blockquote lines always break the list.
                                    if check_info.heading.is_some()
                                        || line_content.starts_with("---")
                                        || line_content.starts_with("***")
                                        || line_content.starts_with("___")
                                        || line_content.contains('|')
                                        || line_content.starts_with(">")
                                    {
                                        true
                                    } else {
                                        // Other text breaks the list only when it is indented
                                        // less than the continuation threshold.
                                        let last_item_marker_width =
                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
                                                lines[block_last_item_line - 1]
                                                    .list_item
                                                    .as_ref()
                                                    .map(|li| {
                                                        if li.is_ordered {
                                                            li.marker.len() + 1
                                                        } else {
                                                            li.marker.len()
                                                        }
                                                    })
                                                    .unwrap_or(3)
                                            } else {
                                                3
                                            };

                                        let min_continuation =
                                            if block.is_ordered { last_item_marker_width } else { 2 };
                                        check_info.indent < min_continuation
                                    }
                                } else {
                                    // Blank lines and list-item lines never break the list here.
                                    false
                                };

                                if is_list_breaking_content {
                                    found_non_list = true;
                                    break;
                                }
                            }
                        }
                        found_non_list
                    };

                    // Nested items may change list type and marker; same-level items may not.
                    let continues_list = if is_nested {
                        same_context && reasonable_distance && !has_non_list_content
                    } else {
                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
                    };

                    if continues_list {
                        block.end_line = line_num;
                        block.item_lines.push(line_num);

                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
                            list_item.marker.len() + 1
                        } else {
                            list_item.marker.len()
                        });

                        // Mixed markers in an unordered block: forget the block's marker.
                        if !block.is_ordered
                            && block.marker.is_some()
                            && block.marker.as_ref() != Some(&list_item.marker)
                        {
                            block.marker = None;
                        }
                    } else {
                        // Close the open block and start a fresh one at this item, in place.
                        list_blocks.push(block.clone());

                        *block = ListBlock {
                            start_line: line_num,
                            end_line: line_num,
                            is_ordered: list_item.is_ordered,
                            marker: if list_item.is_ordered {
                                None
                            } else {
                                Some(list_item.marker.clone())
                            },
                            blockquote_prefix: blockquote_prefix.clone(),
                            item_lines: vec![line_num],
                            nesting_level: nesting,
                            max_marker_width: if list_item.is_ordered {
                                list_item.marker.len() + 1
                            } else {
                                list_item.marker.len()
                            },
                        };
                    }
                } else {
                    // No open block: this item starts one.
                    current_block = Some(ListBlock {
                        start_line: line_num,
                        end_line: line_num,
                        is_ordered: list_item.is_ordered,
                        marker: if list_item.is_ordered {
                            None
                        } else {
                            Some(list_item.marker.clone())
                        },
                        blockquote_prefix,
                        item_lines: vec![line_num],
                        nesting_level: nesting,
                        // NOTE(review): unlike the other two places that set this field,
                        // no `+ 1` is added for ordered markers here - confirm intended.
                        max_marker_width: list_item.marker.len(),
                    });
                }

                // Remember this item for the continuation checks on following lines.
                last_list_item_line = line_num;
                current_indent_level = item_indent;
                last_marker_width = if list_item.is_ordered {
                    list_item.marker.len() + 1
                } else {
                    list_item.marker.len()
                };
            } else if let Some(ref mut block) = current_block {
                // Non-item line while a block is open: continuation, blank gap, or terminator.
                let min_continuation_indent = if block.is_ordered {
                    current_indent_level + last_marker_width
                } else {
                    current_indent_level + 2
                };

                if line_info.indent >= min_continuation_indent {
                    // Indented far enough to belong to the last item.
                    block.end_line = line_num;
                } else if line_info.is_blank {
                    // Look past the run of blank lines to see what follows.
                    let mut check_idx = line_idx + 1;
                    let mut found_continuation = false;

                    while check_idx < lines.len() && lines[check_idx].is_blank {
                        check_idx += 1;
                    }

                    if check_idx < lines.len() {
                        let next_line = &lines[check_idx];
                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
                            // Indented content after the gap keeps the block open.
                            found_continuation = true;
                        } else if !next_line.in_code_block
                            && next_line.list_item.is_some()
                            && let Some(item) = &next_line.list_item
                        {
                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
                                .find(&next_line.content)
                                .map_or(String::new(), |m| m.as_str().to_string());
                            // A sibling item: same marker column, same list type, and the
                            // same blockquote prefix (compared after trimming).
                            if item.marker_column == current_indent_level
                                && item.is_ordered == block.is_ordered
                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
                            {
                                // NOTE(review): this value is computed but never read
                                // (underscore-prefixed binding).
                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
                                    if let Some(between_line) = lines.get(idx) {
                                        let trimmed = between_line.content.trim();
                                        if trimmed.is_empty() {
                                            return false;
                                        }
                                        let line_indent =
                                            between_line.content.len() - between_line.content.trim_start().len();

                                        if trimmed.starts_with("```")
                                            || trimmed.starts_with("~~~")
                                            || trimmed.starts_with("---")
                                            || trimmed.starts_with("***")
                                            || trimmed.starts_with("___")
                                            || trimmed.starts_with(">")
                                            || trimmed.contains('|')
                                            || between_line.heading.is_some()
                                        {
                                            return true;
                                        }

                                        line_indent >= min_continuation_indent
                                    } else {
                                        false
                                    }
                                });

                                // NOTE(review): both branches below run the same check. The
                                // scanned range holds only lines the `while` loop above
                                // skipped as `is_blank`, so the check presumably never
                                // fires - confirm.
                                if block.is_ordered {
                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                        if let Some(between_line) = lines.get(idx) {
                                            let trimmed = between_line.content.trim();
                                            if trimmed.is_empty() {
                                                return false;
                                            }
                                            trimmed.starts_with("```")
                                                || trimmed.starts_with("~~~")
                                                || trimmed.starts_with("---")
                                                || trimmed.starts_with("***")
                                                || trimmed.starts_with("___")
                                                || trimmed.starts_with(">")
                                                || trimmed.contains('|')
                                                || between_line.heading.is_some()
                                        } else {
                                            false
                                        }
                                    });
                                    found_continuation = !has_structural_separators;
                                } else {
                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
                                        if let Some(between_line) = lines.get(idx) {
                                            let trimmed = between_line.content.trim();
                                            if trimmed.is_empty() {
                                                return false;
                                            }
                                            trimmed.starts_with("```")
                                                || trimmed.starts_with("~~~")
                                                || trimmed.starts_with("---")
                                                || trimmed.starts_with("***")
                                                || trimmed.starts_with("___")
                                                || trimmed.starts_with(">")
                                                || trimmed.contains('|')
                                                || between_line.heading.is_some()
                                        } else {
                                            false
                                        }
                                    });
                                    found_continuation = !has_structural_separators;
                                }
                            }
                        }
                    }

                    if found_continuation {
                        // The blank line stays inside the block.
                        block.end_line = line_num;
                    } else {
                        list_blocks.push(block.clone());
                        current_block = None;
                    }
                } else {
                    // Non-blank, under-indented text: either a lazy continuation or the end.
                    // NOTE(review): same formula as `min_continuation_indent` above, which
                    // this line already failed, so the `>=` clause below looks unreachable.
                    let min_required_indent = if block.is_ordered {
                        current_indent_level + last_marker_width
                    } else {
                        current_indent_level + 2
                    };

                    let line_content = line_info.content.trim();
                    let is_structural_separator = line_info.heading.is_some()
                        || line_content.starts_with("```")
                        || line_content.starts_with("~~~")
                        || line_content.starts_with("---")
                        || line_content.starts_with("***")
                        || line_content.starts_with("___")
                        || line_content.starts_with(">")
                        || line_content.contains('|');
                    // Lazy continuation: directly after a list item, not a separator, and
                    // either unindented or indented to the continuation threshold.
                    let is_lazy_continuation = last_list_item_line == line_num - 1
                        && !is_structural_separator
                        && !line_info.is_blank
                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);

                    if is_lazy_continuation {
                        // Strip the blockquote prefix (when present) before inspecting the text.
                        let content_to_check = if !blockquote_prefix.is_empty() {
                            line_info
                                .content
                                .strip_prefix(&blockquote_prefix)
                                .unwrap_or(&line_info.content)
                                .trim()
                        } else {
                            line_info.content.trim()
                        };

                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());

                        // A line starting with an uppercase letter ends the block
                        // instead of continuing the item.
                        if starts_with_uppercase && last_list_item_line > 0 {
                            list_blocks.push(block.clone());
                            current_block = None;
                        } else {
                            block.end_line = line_num;
                        }
                    } else {
                        list_blocks.push(block.clone());
                        current_block = None;
                    }
                }
            }
        }

        // Flush the block that is still open at end of input.
        if let Some(block) = current_block {
            list_blocks.push(block);
        }

        merge_adjacent_list_blocks(&mut list_blocks, lines);

        list_blocks
    }
1836
1837 fn compute_char_frequency(content: &str) -> CharFrequency {
1839 let mut frequency = CharFrequency::default();
1840
1841 for ch in content.chars() {
1842 match ch {
1843 '#' => frequency.hash_count += 1,
1844 '*' => frequency.asterisk_count += 1,
1845 '_' => frequency.underscore_count += 1,
1846 '-' => frequency.hyphen_count += 1,
1847 '+' => frequency.plus_count += 1,
1848 '>' => frequency.gt_count += 1,
1849 '|' => frequency.pipe_count += 1,
1850 '[' => frequency.bracket_count += 1,
1851 '`' => frequency.backtick_count += 1,
1852 '<' => frequency.lt_count += 1,
1853 '!' => frequency.exclamation_count += 1,
1854 '\n' => frequency.newline_count += 1,
1855 _ => {}
1856 }
1857 }
1858
1859 frequency
1860 }
1861
1862 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1864 lazy_static! {
1865 static ref HTML_TAG_REGEX: regex::Regex =
1866 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*(/?)>").unwrap();
1867 }
1868
1869 let mut html_tags = Vec::with_capacity(content.matches('<').count());
1870
1871 for cap in HTML_TAG_REGEX.captures_iter(content) {
1872 let full_match = cap.get(0).unwrap();
1873 let match_start = full_match.start();
1874 let match_end = full_match.end();
1875
1876 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1878 continue;
1879 }
1880
1881 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1882 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1883 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1884
1885 let mut line_num = 1;
1887 let mut col_start = match_start;
1888 let mut col_end = match_end;
1889 for (idx, line_info) in lines.iter().enumerate() {
1890 if match_start >= line_info.byte_offset {
1891 line_num = idx + 1;
1892 col_start = match_start - line_info.byte_offset;
1893 col_end = match_end - line_info.byte_offset;
1894 } else {
1895 break;
1896 }
1897 }
1898
1899 html_tags.push(HtmlTag {
1900 line: line_num,
1901 start_col: col_start,
1902 end_col: col_end,
1903 byte_offset: match_start,
1904 byte_end: match_end,
1905 tag_name,
1906 is_closing,
1907 is_self_closing,
1908 raw_content: full_match.as_str().to_string(),
1909 });
1910 }
1911
1912 html_tags
1913 }
1914
1915 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1917 lazy_static! {
1918 static ref EMPHASIS_REGEX: regex::Regex =
1919 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1920 }
1921
1922 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1923
1924 for cap in EMPHASIS_REGEX.captures_iter(content) {
1925 let full_match = cap.get(0).unwrap();
1926 let match_start = full_match.start();
1927 let match_end = full_match.end();
1928
1929 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1931 continue;
1932 }
1933
1934 let opening_markers = cap.get(1).unwrap().as_str();
1935 let content_part = cap.get(2).unwrap().as_str();
1936 let closing_markers = cap.get(3).unwrap().as_str();
1937
1938 if opening_markers.chars().next() != closing_markers.chars().next()
1940 || opening_markers.len() != closing_markers.len()
1941 {
1942 continue;
1943 }
1944
1945 let marker = opening_markers.chars().next().unwrap();
1946 let marker_count = opening_markers.len();
1947
1948 let mut line_num = 1;
1950 let mut col_start = match_start;
1951 let mut col_end = match_end;
1952 for (idx, line_info) in lines.iter().enumerate() {
1953 if match_start >= line_info.byte_offset {
1954 line_num = idx + 1;
1955 col_start = match_start - line_info.byte_offset;
1956 col_end = match_end - line_info.byte_offset;
1957 } else {
1958 break;
1959 }
1960 }
1961
1962 emphasis_spans.push(EmphasisSpan {
1963 line: line_num,
1964 start_col: col_start,
1965 end_col: col_end,
1966 byte_offset: match_start,
1967 byte_end: match_end,
1968 marker,
1969 marker_count,
1970 content: content_part.to_string(),
1971 });
1972 }
1973
1974 emphasis_spans
1975 }
1976
1977 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
1979 let mut table_rows = Vec::with_capacity(lines.len() / 20);
1980
1981 for (line_idx, line_info) in lines.iter().enumerate() {
1982 if line_info.in_code_block || line_info.is_blank {
1984 continue;
1985 }
1986
1987 let line = &line_info.content;
1988 let line_num = line_idx + 1;
1989
1990 if !line.contains('|') {
1992 continue;
1993 }
1994
1995 let parts: Vec<&str> = line.split('|').collect();
1997 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
1998
1999 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2001 let mut column_alignments = Vec::new();
2002
2003 if is_separator {
2004 for part in &parts[1..parts.len() - 1] {
2005 let trimmed = part.trim();
2007 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2008 "center".to_string()
2009 } else if trimmed.ends_with(':') {
2010 "right".to_string()
2011 } else if trimmed.starts_with(':') {
2012 "left".to_string()
2013 } else {
2014 "none".to_string()
2015 };
2016 column_alignments.push(alignment);
2017 }
2018 }
2019
2020 table_rows.push(TableRow {
2021 line: line_num,
2022 is_separator,
2023 column_count,
2024 column_alignments,
2025 });
2026 }
2027
2028 table_rows
2029 }
2030
2031 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2033 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2034
2035 for cap in BARE_URL_PATTERN.captures_iter(content) {
2037 let full_match = cap.get(0).unwrap();
2038 let match_start = full_match.start();
2039 let match_end = full_match.end();
2040
2041 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2043 continue;
2044 }
2045
2046 let preceding_char = if match_start > 0 {
2048 content.chars().nth(match_start - 1)
2049 } else {
2050 None
2051 };
2052 let following_char = content.chars().nth(match_end);
2053
2054 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2055 continue;
2056 }
2057 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2058 continue;
2059 }
2060
2061 let url = full_match.as_str();
2062 let url_type = if url.starts_with("https://") {
2063 "https"
2064 } else if url.starts_with("http://") {
2065 "http"
2066 } else if url.starts_with("ftp://") {
2067 "ftp"
2068 } else {
2069 "other"
2070 };
2071
2072 let mut line_num = 1;
2074 let mut col_start = match_start;
2075 let mut col_end = match_end;
2076 for (idx, line_info) in lines.iter().enumerate() {
2077 if match_start >= line_info.byte_offset {
2078 line_num = idx + 1;
2079 col_start = match_start - line_info.byte_offset;
2080 col_end = match_end - line_info.byte_offset;
2081 } else {
2082 break;
2083 }
2084 }
2085
2086 bare_urls.push(BareUrl {
2087 line: line_num,
2088 start_col: col_start,
2089 end_col: col_end,
2090 byte_offset: match_start,
2091 byte_end: match_end,
2092 url: url.to_string(),
2093 url_type: url_type.to_string(),
2094 });
2095 }
2096
2097 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2099 let full_match = cap.get(0).unwrap();
2100 let match_start = full_match.start();
2101 let match_end = full_match.end();
2102
2103 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2105 continue;
2106 }
2107
2108 let preceding_char = if match_start > 0 {
2110 content.chars().nth(match_start - 1)
2111 } else {
2112 None
2113 };
2114 let following_char = content.chars().nth(match_end);
2115
2116 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2117 continue;
2118 }
2119 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2120 continue;
2121 }
2122
2123 let email = full_match.as_str();
2124
2125 let mut line_num = 1;
2127 let mut col_start = match_start;
2128 let mut col_end = match_end;
2129 for (idx, line_info) in lines.iter().enumerate() {
2130 if match_start >= line_info.byte_offset {
2131 line_num = idx + 1;
2132 col_start = match_start - line_info.byte_offset;
2133 col_end = match_end - line_info.byte_offset;
2134 } else {
2135 break;
2136 }
2137 }
2138
2139 bare_urls.push(BareUrl {
2140 line: line_num,
2141 start_col: col_start,
2142 end_col: col_end,
2143 byte_offset: match_start,
2144 byte_end: match_end,
2145 url: email.to_string(),
2146 url_type: "email".to_string(),
2147 });
2148 }
2149
2150 bare_urls
2151 }
2152}
2153
2154fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2156 if list_blocks.len() < 2 {
2157 return;
2158 }
2159
2160 let mut merger = ListBlockMerger::new(lines);
2161 *list_blocks = merger.merge(list_blocks);
2162}
2163
2164struct ListBlockMerger<'a> {
2166 lines: &'a [LineInfo],
2167}
2168
2169impl<'a> ListBlockMerger<'a> {
2170 fn new(lines: &'a [LineInfo]) -> Self {
2171 Self { lines }
2172 }
2173
2174 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2175 let mut merged = Vec::with_capacity(list_blocks.len());
2176 let mut current = list_blocks[0].clone();
2177
2178 for next in list_blocks.iter().skip(1) {
2179 if self.should_merge_blocks(¤t, next) {
2180 current = self.merge_two_blocks(current, next);
2181 } else {
2182 merged.push(current);
2183 current = next.clone();
2184 }
2185 }
2186
2187 merged.push(current);
2188 merged
2189 }
2190
2191 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2193 if !self.blocks_are_compatible(current, next) {
2195 return false;
2196 }
2197
2198 let spacing = self.analyze_spacing_between(current, next);
2200 match spacing {
2201 BlockSpacing::Consecutive => true,
2202 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2203 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2204 self.can_merge_with_content_between(current, next)
2205 }
2206 }
2207 }
2208
2209 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2211 current.is_ordered == next.is_ordered
2212 && current.blockquote_prefix == next.blockquote_prefix
2213 && current.nesting_level == next.nesting_level
2214 }
2215
2216 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2218 let gap = next.start_line - current.end_line;
2219
2220 match gap {
2221 1 => BlockSpacing::Consecutive,
2222 2 => BlockSpacing::SingleBlank,
2223 _ if gap > 2 => {
2224 if self.has_only_blank_lines_between(current, next) {
2225 BlockSpacing::MultipleBlanks
2226 } else {
2227 BlockSpacing::ContentBetween
2228 }
2229 }
2230 _ => BlockSpacing::Consecutive, }
2232 }
2233
2234 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2236 if has_meaningful_content_between(current, next, self.lines) {
2239 return false; }
2241
2242 !current.is_ordered && current.marker == next.marker
2244 }
2245
2246 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2248 if has_meaningful_content_between(current, next, self.lines) {
2250 return false; }
2252
2253 current.is_ordered && next.is_ordered
2255 }
2256
2257 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2259 for line_num in (current.end_line + 1)..next.start_line {
2260 if let Some(line_info) = self.lines.get(line_num - 1)
2261 && !line_info.content.trim().is_empty()
2262 {
2263 return false;
2264 }
2265 }
2266 true
2267 }
2268
2269 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2271 current.end_line = next.end_line;
2272 current.item_lines.extend_from_slice(&next.item_lines);
2273
2274 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2276
2277 if !current.is_ordered && self.markers_differ(¤t, next) {
2279 current.marker = None; }
2281
2282 current
2283 }
2284
2285 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2287 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2288 }
2289}
2290
/// How far apart two neighbouring list blocks are, as classified by
/// `ListBlockMerger::analyze_spacing_between`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first ends
    /// (also used when the computed gap is zero).
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
2299
2300fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2302 for line_num in (current.end_line + 1)..next.start_line {
2304 if let Some(line_info) = lines.get(line_num - 1) {
2305 let trimmed = line_info.content.trim();
2307
2308 if trimmed.is_empty() {
2310 continue;
2311 }
2312
2313 if line_info.heading.is_some() {
2317 return true; }
2319
2320 if is_horizontal_rule(trimmed) {
2322 return true; }
2324
2325 if trimmed.contains('|') && trimmed.len() > 1 {
2327 return true; }
2329
2330 if trimmed.starts_with('>') {
2332 return true; }
2334
2335 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2337 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2338
2339 let min_continuation_indent = if current.is_ordered {
2341 current.nesting_level + current.max_marker_width + 1 } else {
2343 current.nesting_level + 2
2344 };
2345
2346 if line_indent < min_continuation_indent {
2347 return true; }
2350 }
2351
2352 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2354
2355 let min_indent = if current.is_ordered {
2357 current.nesting_level + current.max_marker_width
2358 } else {
2359 current.nesting_level + 2
2360 };
2361
2362 if line_indent < min_indent {
2364 return true; }
2366
2367 }
2370 }
2371
2372 false
2374}
2375
/// Returns true when `trimmed` looks like a Markdown horizontal rule.
///
/// The text must be at least three bytes long, start with `-`, `*` or `_`,
/// contain at least three occurrences of that first character, and contain
/// nothing else except spaces and tabs.
///
/// Callers are expected to pass text with surrounding whitespace already
/// removed; a leading space makes the function return false.
fn is_horizontal_rule(trimmed: &str) -> bool {
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Single pass over the characters, with no intermediate `Vec<char>`.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Any other character disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
2399
/// Unit tests for `LintContext` construction, offset mapping and per-line metadata.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: no lines, a single offset entry, and offset 0 maps to (1, 1).
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indent, blank flag.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces, matching the
        // `indent == 4` assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List-item detection for unordered, nested and ordered items.
    #[test]
    fn test_list_item_detection() {
        // The nested unordered item sits at column 2, matching the
        // `marker_column == 2` assertion below. The nested ordered item is
        // placed under the content of "1. ".
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets at line starts, on newline characters, and at end of content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }
}