1use crate::config::MarkdownFlavor;
2use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
3use lazy_static::lazy_static;
4use regex::Regex;
5
// Regular expressions shared by the parsing routines in this file. They are compiled once,
// on first use.
lazy_static! {
    // Inline link `[text](url)` or reference link `[text][id]`.
    // Group 1: link text, group 2: inline URL, group 3: reference id.
    // Written in verbose (`x`) mode, so whitespace and `# ...` comments inside the
    // pattern are ignored by the regex engine.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    // Same shape as LINK_PATTERN but requires the leading `!` of image syntax.
    // Group 1: alt text, group 2: inline URL, group 3: reference id.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
        (?:
        \(([^)]*)\) # Inline URL in group 2 (can be empty)
        |
        \[([^\]]*)\] # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition `[id]: url "title"` with up to three leading spaces.
    // Group 1: id, group 2: URL, group 3: double-quoted title, group 4: single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // A run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare `http`, `https` or `ftp` URL with optional port, path, query and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare e-mail address such as `user@example.com`.
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // URL or e-mail address wrapped in angle brackets (`<https://...>`, `<a@b.c>`).
    // Group 1 is the text between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Leading blockquote prefix: optional whitespace, one or more `>`, optional whitespace.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
60
/// Pre-computed facts about a single line of the document.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Text of the line as yielded by `str::lines` (no line terminator).
    pub content: String,
    /// Byte offset of the start of this line within the document.
    pub byte_offset: usize,
    /// Number of leading whitespace bytes on the line.
    pub indent: usize,
    /// `true` when the line is empty after trimming; for a line starting with a `>`
    /// blockquote prefix, `true` when the text after that prefix is empty after trimming.
    pub is_blank: bool,
    /// `true` when the start of the line falls inside a detected code block.
    pub in_code_block: bool,
    /// `true` for lines belonging to the leading `---` front matter section.
    pub in_front_matter: bool,
    /// List marker details when the line starts a list item (marker followed by whitespace).
    pub list_item: Option<ListItemInfo>,
    /// Heading details when the line is an ATX heading or the text line of a Setext heading.
    pub heading: Option<HeadingInfo>,
    /// Blockquote details when the line starts with `>` (after optional whitespace).
    pub blockquote: Option<BlockquoteInfo>,
}
83
/// Marker information for a line that starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker text: `-`, `*` or `+` for unordered items, digits plus `.` or `)`
    /// (for example `1.`) for ordered items.
    pub marker: String,
    /// `true` for numbered items.
    pub is_ordered: bool,
    /// Parsed item number for ordered items (`None` for unordered items or if the
    /// digits do not fit in `usize`).
    pub number: Option<usize>,
    /// 0-based byte column of the marker, including any blockquote prefix before it.
    pub marker_column: usize,
    /// 0-based byte column where the item text starts (after marker and following spaces/tabs).
    pub content_column: usize,
}
98
/// Syntax used to write a heading.
///
/// The enum has no payload, so equality is total and `Eq` is derived alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading, e.g. `## Title`.
    ATX,
    /// Text underlined with `=` (level 1).
    Setext1,
    /// Text underlined with `-` (level 2).
    Setext2,
}
109
/// A link found in the document, either inline (`[text](url)`) or reference style (`[text][id]`).
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start, relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the link end, relative to the start of the line on which the link ends.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the link within the document.
    pub byte_end: usize,
    /// Text between the first pair of square brackets.
    pub text: String,
    /// Inline URL; empty for reference links.
    pub url: String,
    /// `true` for reference-style links.
    pub is_reference: bool,
    /// Lowercased reference id for reference links; when the id is empty (`[text][]`)
    /// the lowercased link text is used instead. `None` for inline links.
    pub reference_id: Option<String>,
}
132
/// An image found in the document, either inline (`![alt](url)`) or reference style (`![alt][id]`).
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start (the `!`), relative to the start of its line (0-based).
    pub start_col: usize,
    /// Byte offset of the image end, relative to the start of the line on which the image ends.
    pub end_col: usize,
    /// Byte offset of the image start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the end of the image within the document.
    pub byte_end: usize,
    /// Text between the first pair of square brackets.
    pub alt_text: String,
    /// Inline URL; empty for reference images.
    pub url: String,
    /// `true` for reference-style images.
    pub is_reference: bool,
    /// Lowercased reference id for reference images; when the id is empty (`![alt][]`)
    /// the lowercased alt text is used instead. `None` for inline images.
    pub reference_id: Option<String>,
}
155
/// A reference definition line of the form `[id]: url "optional title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based line number of the definition.
    pub line: usize,
    /// Reference id, lowercased.
    pub id: String,
    /// Destination URL exactly as written.
    pub url: String,
    /// Title given in double or single quotes after the URL, if any.
    pub title: Option<String>,
}
168
/// An inline code span delimited by backtick runs of equal length.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based number of the line on which the opening backticks start.
    pub line: usize,
    /// Byte offset of the opening backticks, relative to the start of their line (0-based).
    pub start_col: usize,
    /// Byte offset just past the closing backticks, relative to the start of the line
    /// on which the span ends.
    pub end_col: usize,
    /// Byte offset of the first opening backtick within the document.
    pub byte_offset: usize,
    /// Byte offset just past the last closing backtick within the document.
    pub byte_end: usize,
    /// Number of backticks in the opening (and closing) delimiter.
    pub backtick_count: usize,
    /// Text between the delimiters, untrimmed.
    pub content: String,
}
187
/// Details of a heading attached to the line that carries the heading text.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level: the number of `#` characters for ATX headings, 1 (`=` underline)
    /// or 2 (`-` underline) for Setext headings.
    pub level: u8,
    /// Syntax used to write the heading.
    pub style: HeadingStyle,
    /// The opening `#` run for ATX headings, or the trimmed underline for Setext headings.
    pub marker: String,
    /// 0-based byte column of the marker: leading whitespace of the heading line (ATX)
    /// or of the underline line (Setext).
    pub marker_column: usize,
    /// 0-based byte column where the heading text starts.
    pub content_column: usize,
    /// Heading text as returned by `header_id_utils::extract_header_id` for `raw_text`
    /// (presumably with any inline custom id removed; confirm against that helper).
    pub text: String,
    /// Custom id taken from the heading text, or from a standalone attribute-list line
    /// that directly follows the heading.
    pub custom_id: Option<String>,
    /// Trimmed heading text before custom-id extraction; for ATX headings any closing
    /// `#` sequence has already been removed.
    pub raw_text: String,
    /// `true` when an ATX heading ends with a closing `#` sequence.
    pub has_closing_sequence: bool,
    /// The closing `#` sequence, or empty if there is none.
    pub closing_sequence: String,
}
212
/// Details of the blockquote prefix on a line that starts with `>`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of consecutive `>` characters in the marker run.
    pub nesting_level: usize,
    /// Whitespace before the first `>`.
    pub indent: String,
    /// 0-based byte column of the first `>` (the length of `indent`).
    pub marker_column: usize,
    /// Full prefix: indent, `>` run and the whitespace that follows it.
    pub prefix: String,
    /// Text after the prefix.
    pub content: String,
    /// `true` when text follows the `>` run with no whitespace in between.
    pub has_no_space_after_marker: bool,
    /// `true` when more than one whitespace character, or a tab, follows the `>` run.
    pub has_multiple_spaces_after_marker: bool,
    /// `true` when the line has no whitespace after the `>` run and the remaining text is blank.
    pub needs_md028_fix: bool,
}
233
/// A group of consecutive list items treated as one list.
///
/// NOTE(review): the builder (`parse_list_blocks`) is only partially visible from this
/// chunk; the field notes below reflect how the fields are read here and should be confirmed.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// First line of the block (compared against 1-based line numbers).
    pub start_line: usize,
    /// Last line of the block, inclusive (compared against 1-based line numbers).
    pub end_line: usize,
    /// `true` when the block is an ordered list.
    pub is_ordered: bool,
    /// Marker shared by the items of the block, if any; compared against item markers
    /// when deciding whether an unordered item belongs to the block.
    pub marker: Option<String>,
    /// Blockquote prefix the block lives in; empty when not inside a blockquote.
    pub blockquote_prefix: String,
    /// Line numbers of the list items in the block.
    pub item_lines: Vec<usize>,
    /// Nesting depth; compared against `marker_column / 2` of candidate items.
    pub nesting_level: usize,
    /// Presumably the widest marker seen in the block; TODO confirm against the builder.
    pub max_marker_width: usize,
}
254
255use std::sync::{Arc, Mutex};
256
/// Per-character counters used for cheap "does this document contain X" checks.
///
/// Each field is the value `LintContext::char_count` reports for the character named in
/// its comment. The counters are filled by `compute_char_frequency`, which is not visible
/// here; presumably they are whole-document occurrence counts (confirm there).
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Counter for `#`.
    pub hash_count: usize,
    /// Counter for `*`.
    pub asterisk_count: usize,
    /// Counter for `_`.
    pub underscore_count: usize,
    /// Counter for `-`.
    pub hyphen_count: usize,
    /// Counter for `+`.
    pub plus_count: usize,
    /// Counter for `>`.
    pub gt_count: usize,
    /// Counter for `|`.
    pub pipe_count: usize,
    /// Counter for `[`.
    pub bracket_count: usize,
    /// Counter for the backtick character.
    pub backtick_count: usize,
    /// Counter for `<`.
    pub lt_count: usize,
    /// Counter for `!`.
    pub exclamation_count: usize,
    /// Counter for the newline character.
    pub newline_count: usize,
}
285
/// An HTML tag found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible in this chunk. The
/// position fields presumably follow the same conventions as `ParsedLink` (1-based line,
/// byte-based columns and offsets); confirm against `parse_html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag; compared against the `line_num` passed to `html_tags_on_line`.
    pub line: usize,
    /// Start column of the tag within its line.
    pub start_col: usize,
    /// End column of the tag within its line.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// Whether this is a closing tag.
    pub is_closing: bool,
    /// Whether the tag is self-closing.
    pub is_self_closing: bool,
    /// Source text of the tag.
    pub raw_content: String,
}
308
/// An emphasis span found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible in this chunk. The
/// position fields presumably follow the same conventions as `ParsedLink` (1-based line,
/// byte-based columns and offsets); confirm against `parse_emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span; compared against the `line_num` passed to `emphasis_spans_on_line`.
    pub line: usize,
    /// Start column of the span within its line.
    pub start_col: usize,
    /// End column of the span within its line.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Delimiter character (presumably `*` or `_`).
    pub marker: char,
    /// Number of delimiter characters on each side (presumably 1 for emphasis, 2 for strong).
    pub marker_count: usize,
    /// Text inside the delimiters.
    pub content: String,
}
329
/// A table row found in the document.
///
/// NOTE(review): the parser that fills this struct (`parse_table_rows`) is not visible in
/// this chunk; the field notes are inferred from names and should be confirmed.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row; compared against the `line_num` passed to `table_rows_on_line`.
    pub line: usize,
    /// Whether this row is the header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment of each column, as strings; the exact values are defined by the parser.
    pub column_alignments: Vec<String>,
}
342
/// A bare URL or e-mail address found in the document.
///
/// NOTE(review): the parser that fills this struct is not visible in this chunk. The
/// position fields presumably follow the same conventions as `ParsedLink` (1-based line,
/// byte-based columns and offsets); confirm against `parse_bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL; compared against the `line_num` passed to `bare_urls_on_line`.
    pub line: usize,
    /// Start column of the URL within its line.
    pub start_col: usize,
    /// End column of the URL within its line.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched URL or address text.
    pub url: String,
    /// Kind of match as a string; the exact values are defined by the parser.
    pub url_type: String,
}
361
/// Pre-parsed view of a Markdown document that lint rules can query.
///
/// The public collections are computed eagerly by `LintContext::new`. The private
/// `*_cache` fields start empty and are filled on the first call to the accessor with
/// the matching name (`code_spans`, `html_tags`, `emphasis_spans`, `table_rows`, `bare_urls`).
pub struct LintContext<'a> {
    /// The raw document text.
    pub content: &'a str,
    /// Byte offset at which each line starts; the first entry is always 0.
    pub line_offsets: Vec<usize>,
    /// Byte ranges `(start, end)` of code blocks, as returned by
    /// `CodeBlockUtils::detect_code_blocks`.
    pub code_blocks: Vec<(usize, usize)>,
    /// Per-line information; index `n - 1` holds line `n`.
    pub lines: Vec<LineInfo>,
    /// Links found outside code blocks.
    pub links: Vec<ParsedLink>,
    /// Images found outside code blocks.
    pub images: Vec<ParsedImage>,
    /// Reference definitions (`[id]: url`) found outside code blocks.
    pub reference_defs: Vec<ReferenceDef>,
    /// Lazily computed inline code spans.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Groups of list items.
    pub list_blocks: Vec<ListBlock>,
    /// Counters for frequently queried characters.
    pub char_frequency: CharFrequency,
    /// Lazily computed HTML tags.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazily computed emphasis spans.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazily computed table rows.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazily computed bare URLs.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Markdown flavor the document is linted as.
    pub flavor: MarkdownFlavor,
}
379
380impl<'a> LintContext<'a> {
381 pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
382 let mut line_offsets = vec![0];
383 for (i, c) in content.char_indices() {
384 if c == '\n' {
385 line_offsets.push(i + 1);
386 }
387 }
388
389 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
391
392 let lines = Self::compute_line_info(content, &line_offsets, &code_blocks);
394
395 let links = Self::parse_links(content, &lines, &code_blocks);
398 let images = Self::parse_images(content, &lines, &code_blocks);
399 let reference_defs = Self::parse_reference_defs(content, &lines);
400 let list_blocks = Self::parse_list_blocks(&lines);
401
402 let char_frequency = Self::compute_char_frequency(content);
404
405 Self {
406 content,
407 line_offsets,
408 code_blocks,
409 lines,
410 links,
411 images,
412 reference_defs,
413 code_spans_cache: Mutex::new(None),
414 list_blocks,
415 char_frequency,
416 html_tags_cache: Mutex::new(None),
417 emphasis_spans_cache: Mutex::new(None),
418 table_rows_cache: Mutex::new(None),
419 bare_urls_cache: Mutex::new(None),
420 flavor,
421 }
422 }
423
424 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
426 let mut cache = self.code_spans_cache.lock().unwrap();
427
428 if cache.is_none() {
430 let code_spans = Self::parse_code_spans(self.content, &self.lines);
431 *cache = Some(Arc::new(code_spans));
432 }
433
434 cache.as_ref().unwrap().clone()
436 }
437
438 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
440 let mut cache = self.html_tags_cache.lock().unwrap();
441
442 if cache.is_none() {
443 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
444 *cache = Some(Arc::new(html_tags));
445 }
446
447 cache.as_ref().unwrap().clone()
448 }
449
450 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
452 let mut cache = self.emphasis_spans_cache.lock().unwrap();
453
454 if cache.is_none() {
455 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
456 *cache = Some(Arc::new(emphasis_spans));
457 }
458
459 cache.as_ref().unwrap().clone()
460 }
461
462 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
464 let mut cache = self.table_rows_cache.lock().unwrap();
465
466 if cache.is_none() {
467 let table_rows = Self::parse_table_rows(&self.lines);
468 *cache = Some(Arc::new(table_rows));
469 }
470
471 cache.as_ref().unwrap().clone()
472 }
473
474 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
476 let mut cache = self.bare_urls_cache.lock().unwrap();
477
478 if cache.is_none() {
479 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
480 *cache = Some(Arc::new(bare_urls));
481 }
482
483 cache.as_ref().unwrap().clone()
484 }
485
486 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
488 match self.line_offsets.binary_search(&offset) {
489 Ok(line) => (line + 1, 1),
490 Err(line) => {
491 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
492 (line, offset - line_start + 1)
493 }
494 }
495 }
496
497 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
499 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
501 return true;
502 }
503
504 self.code_spans()
506 .iter()
507 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
508 }
509
510 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
512 if line_num > 0 {
513 self.lines.get(line_num - 1)
514 } else {
515 None
516 }
517 }
518
519 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
521 self.line_info(line_num).map(|info| info.byte_offset)
522 }
523
524 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
526 let normalized_id = ref_id.to_lowercase();
527 self.reference_defs
528 .iter()
529 .find(|def| def.id == normalized_id)
530 .map(|def| def.url.as_str())
531 }
532
533 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
535 self.links.iter().filter(|link| link.line == line_num).collect()
536 }
537
538 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
540 self.images.iter().filter(|img| img.line == line_num).collect()
541 }
542
543 pub fn is_in_list_block(&self, line_num: usize) -> bool {
545 self.list_blocks
546 .iter()
547 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
548 }
549
550 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
552 self.list_blocks
553 .iter()
554 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
555 }
556
557 pub fn has_char(&self, ch: char) -> bool {
559 match ch {
560 '#' => self.char_frequency.hash_count > 0,
561 '*' => self.char_frequency.asterisk_count > 0,
562 '_' => self.char_frequency.underscore_count > 0,
563 '-' => self.char_frequency.hyphen_count > 0,
564 '+' => self.char_frequency.plus_count > 0,
565 '>' => self.char_frequency.gt_count > 0,
566 '|' => self.char_frequency.pipe_count > 0,
567 '[' => self.char_frequency.bracket_count > 0,
568 '`' => self.char_frequency.backtick_count > 0,
569 '<' => self.char_frequency.lt_count > 0,
570 '!' => self.char_frequency.exclamation_count > 0,
571 '\n' => self.char_frequency.newline_count > 0,
572 _ => self.content.contains(ch), }
574 }
575
576 pub fn char_count(&self, ch: char) -> usize {
578 match ch {
579 '#' => self.char_frequency.hash_count,
580 '*' => self.char_frequency.asterisk_count,
581 '_' => self.char_frequency.underscore_count,
582 '-' => self.char_frequency.hyphen_count,
583 '+' => self.char_frequency.plus_count,
584 '>' => self.char_frequency.gt_count,
585 '|' => self.char_frequency.pipe_count,
586 '[' => self.char_frequency.bracket_count,
587 '`' => self.char_frequency.backtick_count,
588 '<' => self.char_frequency.lt_count,
589 '!' => self.char_frequency.exclamation_count,
590 '\n' => self.char_frequency.newline_count,
591 _ => self.content.matches(ch).count(), }
593 }
594
595 pub fn likely_has_headings(&self) -> bool {
597 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
599
600 pub fn likely_has_lists(&self) -> bool {
602 self.char_frequency.asterisk_count > 0
603 || self.char_frequency.hyphen_count > 0
604 || self.char_frequency.plus_count > 0
605 }
606
607 pub fn likely_has_emphasis(&self) -> bool {
609 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
610 }
611
612 pub fn likely_has_tables(&self) -> bool {
614 self.char_frequency.pipe_count > 2
615 }
616
617 pub fn likely_has_blockquotes(&self) -> bool {
619 self.char_frequency.gt_count > 0
620 }
621
622 pub fn likely_has_code(&self) -> bool {
624 self.char_frequency.backtick_count > 0
625 }
626
627 pub fn likely_has_links_or_images(&self) -> bool {
629 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
630 }
631
632 pub fn likely_has_html(&self) -> bool {
634 self.char_frequency.lt_count > 0
635 }
636
637 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
639 self.html_tags()
640 .iter()
641 .filter(|tag| tag.line == line_num)
642 .cloned()
643 .collect()
644 }
645
646 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
648 self.emphasis_spans()
649 .iter()
650 .filter(|span| span.line == line_num)
651 .cloned()
652 .collect()
653 }
654
655 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
657 self.table_rows()
658 .iter()
659 .filter(|row| row.line == line_num)
660 .cloned()
661 .collect()
662 }
663
664 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
666 self.bare_urls()
667 .iter()
668 .filter(|url| url.line == line_num)
669 .cloned()
670 .collect()
671 }
672
673 fn parse_links(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedLink> {
675 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
680 let full_match = cap.get(0).unwrap();
681 let match_start = full_match.start();
682 let match_end = full_match.end();
683
684 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
686 continue;
687 }
688
689 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
691 continue;
692 }
693
694 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
696 continue;
697 }
698
699 let mut line_num = 1;
701 let mut col_start = match_start;
702 for (idx, line_info) in lines.iter().enumerate() {
703 if match_start >= line_info.byte_offset {
704 line_num = idx + 1;
705 col_start = match_start - line_info.byte_offset;
706 } else {
707 break;
708 }
709 }
710
711 let mut end_line_num = 1;
713 let mut col_end = match_end;
714 for (idx, line_info) in lines.iter().enumerate() {
715 if match_end > line_info.byte_offset {
716 end_line_num = idx + 1;
717 col_end = match_end - line_info.byte_offset;
718 } else {
719 break;
720 }
721 }
722
723 if line_num == end_line_num {
725 } else {
727 }
730
731 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
732
733 if let Some(inline_url) = cap.get(2) {
734 links.push(ParsedLink {
736 line: line_num,
737 start_col: col_start,
738 end_col: col_end,
739 byte_offset: match_start,
740 byte_end: match_end,
741 text,
742 url: inline_url.as_str().to_string(),
743 is_reference: false,
744 reference_id: None,
745 });
746 } else if let Some(ref_id) = cap.get(3) {
747 let ref_id_str = ref_id.as_str();
749 let normalized_ref = if ref_id_str.is_empty() {
750 text.to_lowercase() } else {
752 ref_id_str.to_lowercase()
753 };
754
755 links.push(ParsedLink {
756 line: line_num,
757 start_col: col_start,
758 end_col: col_end,
759 byte_offset: match_start,
760 byte_end: match_end,
761 text,
762 url: String::new(), is_reference: true,
764 reference_id: Some(normalized_ref),
765 });
766 }
767 }
768
769 links
770 }
771
772 fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
774 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
779 let full_match = cap.get(0).unwrap();
780 let match_start = full_match.start();
781 let match_end = full_match.end();
782
783 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
785 continue;
786 }
787
788 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
790 continue;
791 }
792
793 let mut line_num = 1;
795 let mut col_start = match_start;
796 for (idx, line_info) in lines.iter().enumerate() {
797 if match_start >= line_info.byte_offset {
798 line_num = idx + 1;
799 col_start = match_start - line_info.byte_offset;
800 } else {
801 break;
802 }
803 }
804
805 let mut end_line_num = 1;
807 let mut col_end = match_end;
808 for (idx, line_info) in lines.iter().enumerate() {
809 if match_end > line_info.byte_offset {
810 end_line_num = idx + 1;
811 col_end = match_end - line_info.byte_offset;
812 } else {
813 break;
814 }
815 }
816
817 if line_num == end_line_num {
819 } else {
821 }
824
825 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
826
827 if let Some(inline_url) = cap.get(2) {
828 images.push(ParsedImage {
830 line: line_num,
831 start_col: col_start,
832 end_col: col_end,
833 byte_offset: match_start,
834 byte_end: match_end,
835 alt_text,
836 url: inline_url.as_str().to_string(),
837 is_reference: false,
838 reference_id: None,
839 });
840 } else if let Some(ref_id) = cap.get(3) {
841 let ref_id_str = ref_id.as_str();
843 let normalized_ref = if ref_id_str.is_empty() {
844 alt_text.to_lowercase() } else {
846 ref_id_str.to_lowercase()
847 };
848
849 images.push(ParsedImage {
850 line: line_num,
851 start_col: col_start,
852 end_col: col_end,
853 byte_offset: match_start,
854 byte_end: match_end,
855 alt_text,
856 url: String::new(), is_reference: true,
858 reference_id: Some(normalized_ref),
859 });
860 }
861 }
862
863 images
864 }
865
866 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
868 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
872 if line_info.in_code_block {
874 continue;
875 }
876
877 let line = &line_info.content;
878 let line_num = line_idx + 1;
879
880 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
881 let id = cap.get(1).unwrap().as_str().to_lowercase();
882 let url = cap.get(2).unwrap().as_str().to_string();
883 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
884
885 refs.push(ReferenceDef {
886 line: line_num,
887 id,
888 url,
889 title,
890 });
891 }
892 }
893
894 refs
895 }
896
897 fn compute_line_info(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<LineInfo> {
899 lazy_static! {
900 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
902 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
903
904 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
906
907 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
909 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
910
911 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
913 }
914
915 let content_lines: Vec<&str> = content.lines().collect();
916 let mut lines = Vec::with_capacity(content_lines.len());
917
918 let mut in_front_matter = false;
920 let mut front_matter_end = 0;
921 if content_lines.first().map(|l| l.trim()) == Some("---") {
922 in_front_matter = true;
923 for (idx, line) in content_lines.iter().enumerate().skip(1) {
924 if line.trim() == "---" {
925 front_matter_end = idx;
926 break;
927 }
928 }
929 }
930
931 for (i, line) in content_lines.iter().enumerate() {
932 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
933 let indent = line.len() - line.trim_start().len();
934 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
936 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
938 after_prefix.trim().is_empty()
939 } else {
940 line.trim().is_empty()
941 };
942 let in_code_block = code_blocks.iter().any(|&(start, end)| {
945 let block_content = &content[start..end];
948 let is_multiline = block_content.contains('\n');
949 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
950 let is_indented = !is_fenced
951 && block_content
952 .lines()
953 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
954
955 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
956 });
957
958 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
960 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
962 let prefix = caps.get(1).unwrap().as_str();
963 let content = caps.get(2).unwrap().as_str();
964 (content, prefix.len())
965 } else {
966 (&**line, 0)
967 };
968
969 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
970 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
971 let marker = caps.get(2).map_or("", |m| m.as_str());
972 let spacing = caps.get(3).map_or("", |m| m.as_str());
973 let _content = caps.get(4).map_or("", |m| m.as_str());
974 let marker_column = blockquote_prefix_len + leading_spaces.len();
975 let content_column = marker_column + marker.len() + spacing.len();
976
977 if spacing.is_empty() {
984 None
985 } else {
986 Some(ListItemInfo {
987 marker: marker.to_string(),
988 is_ordered: false,
989 number: None,
990 marker_column,
991 content_column,
992 })
993 }
994 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
995 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
996 let number_str = caps.get(2).map_or("", |m| m.as_str());
997 let delimiter = caps.get(3).map_or("", |m| m.as_str());
998 let spacing = caps.get(4).map_or("", |m| m.as_str());
999 let _content = caps.get(5).map_or("", |m| m.as_str());
1000 let marker = format!("{number_str}{delimiter}");
1001 let marker_column = blockquote_prefix_len + leading_spaces.len();
1002 let content_column = marker_column + marker.len() + spacing.len();
1003
1004 if spacing.is_empty() {
1007 None
1008 } else {
1009 Some(ListItemInfo {
1010 marker,
1011 is_ordered: true,
1012 number: number_str.parse().ok(),
1013 marker_column,
1014 content_column,
1015 })
1016 }
1017 } else {
1018 None
1019 }
1020 } else {
1021 None
1022 };
1023
1024 lines.push(LineInfo {
1025 content: line.to_string(),
1026 byte_offset,
1027 indent,
1028 is_blank,
1029 in_code_block,
1030 in_front_matter: in_front_matter && i <= front_matter_end,
1031 list_item,
1032 heading: None, blockquote: None, });
1035 }
1036
1037 for i in 0..content_lines.len() {
1039 if lines[i].in_code_block {
1040 continue;
1041 }
1042
1043 if in_front_matter && i <= front_matter_end {
1045 continue;
1046 }
1047
1048 let line = content_lines[i];
1049
1050 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1052 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1053 let markers = caps.get(2).map_or("", |m| m.as_str());
1054 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1055 let content = caps.get(4).map_or("", |m| m.as_str());
1056
1057 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1058 let marker_column = indent_str.len();
1059
1060 let prefix = format!("{indent_str}{markers}{spaces_after}");
1062
1063 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1065 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1067
1068 let needs_md028_fix = content.trim().is_empty() && spaces_after.is_empty();
1070
1071 lines[i].blockquote = Some(BlockquoteInfo {
1072 nesting_level,
1073 indent: indent_str.to_string(),
1074 marker_column,
1075 prefix,
1076 content: content.to_string(),
1077 has_no_space_after_marker: has_no_space,
1078 has_multiple_spaces_after_marker: has_multiple_spaces,
1079 needs_md028_fix,
1080 });
1081 }
1082
1083 if lines[i].is_blank {
1085 continue;
1086 }
1087
1088 if let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1090 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1091 let hashes = caps.get(2).map_or("", |m| m.as_str());
1092 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1093 let rest = caps.get(4).map_or("", |m| m.as_str());
1094
1095 let level = hashes.len() as u8;
1096 let marker_column = leading_spaces.len();
1097
1098 let (text, has_closing, closing_seq) = {
1100 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1102 if rest[id_start..].trim_end().ends_with('}') {
1104 (&rest[..id_start], &rest[id_start..])
1106 } else {
1107 (rest, "")
1108 }
1109 } else {
1110 (rest, "")
1111 };
1112
1113 let trimmed_rest = rest_without_id.trim_end();
1115 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1116 let mut start_of_hashes = last_hash_pos;
1118 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1119 start_of_hashes -= 1;
1120 }
1121
1122 let has_space_before = start_of_hashes == 0
1124 || trimmed_rest
1125 .chars()
1126 .nth(start_of_hashes - 1)
1127 .is_some_and(|c| c.is_whitespace());
1128
1129 let potential_closing = &trimmed_rest[start_of_hashes..];
1131 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1132
1133 if is_all_hashes && has_space_before {
1134 let closing_hashes = potential_closing.to_string();
1136 let text_part = if !custom_id_part.is_empty() {
1139 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1142 } else {
1143 rest_without_id[..start_of_hashes].trim_end().to_string()
1144 };
1145 (text_part, true, closing_hashes)
1146 } else {
1147 (rest.to_string(), false, String::new())
1149 }
1150 } else {
1151 (rest.to_string(), false, String::new())
1153 }
1154 };
1155
1156 let content_column = marker_column + hashes.len() + spaces_after.len();
1157
1158 let raw_text = text.trim().to_string();
1160 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1161
1162 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1164 let next_line = content_lines[i + 1];
1165 if !lines[i + 1].in_code_block
1166 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1167 && let Some(next_line_id) =
1168 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1169 {
1170 custom_id = Some(next_line_id);
1171 }
1172 }
1173
1174 lines[i].heading = Some(HeadingInfo {
1175 level,
1176 style: HeadingStyle::ATX,
1177 marker: hashes.to_string(),
1178 marker_column,
1179 content_column,
1180 text: clean_text,
1181 custom_id,
1182 raw_text,
1183 has_closing_sequence: has_closing,
1184 closing_sequence: closing_seq,
1185 });
1186 }
1187 else if i + 1 < content_lines.len() {
1189 let next_line = content_lines[i + 1];
1190 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1191 if in_front_matter && i < front_matter_end {
1193 continue;
1194 }
1195
1196 let underline = next_line.trim();
1197 let level = if underline.starts_with('=') { 1 } else { 2 };
1198 let style = if level == 1 {
1199 HeadingStyle::Setext1
1200 } else {
1201 HeadingStyle::Setext2
1202 };
1203
1204 let raw_text = line.trim().to_string();
1206 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1207
1208 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1210 let attr_line = content_lines[i + 2];
1211 if !lines[i + 2].in_code_block
1212 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1213 && let Some(attr_line_id) =
1214 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1215 {
1216 custom_id = Some(attr_line_id);
1217 }
1218 }
1219
1220 lines[i].heading = Some(HeadingInfo {
1221 level,
1222 style,
1223 marker: underline.to_string(),
1224 marker_column: next_line.len() - next_line.trim_start().len(),
1225 content_column: lines[i].indent,
1226 text: clean_text,
1227 custom_id,
1228 raw_text,
1229 has_closing_sequence: false,
1230 closing_sequence: String::new(),
1231 });
1232 }
1233 }
1234 }
1235
1236 lines
1237 }
1238
1239 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1241 let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); if !content.contains('`') {
1246 return code_spans;
1247 }
1248
1249 let mut pos = 0;
1250 let bytes = content.as_bytes();
1251
1252 while pos < bytes.len() {
1253 if let Some(backtick_start) = content[pos..].find('`') {
1255 let start_pos = pos + backtick_start;
1256
1257 let mut in_code_block = false;
1259 for (line_idx, line_info) in lines.iter().enumerate() {
1260 if start_pos >= line_info.byte_offset
1261 && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1262 {
1263 in_code_block = line_info.in_code_block;
1264 break;
1265 }
1266 }
1267
1268 if in_code_block {
1269 pos = start_pos + 1;
1270 continue;
1271 }
1272
1273 let mut backtick_count = 0;
1275 let mut i = start_pos;
1276 while i < bytes.len() && bytes[i] == b'`' {
1277 backtick_count += 1;
1278 i += 1;
1279 }
1280
1281 let search_start = start_pos + backtick_count;
1283 let closing_pattern = &content[start_pos..start_pos + backtick_count];
1284
1285 if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1286 let end_pos = search_start + rel_end;
1288 let check_pos = end_pos + backtick_count;
1289
1290 if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1292 let content_start = start_pos + backtick_count;
1294 let content_end = end_pos;
1295 let span_content = content[content_start..content_end].to_string();
1296
1297 let mut line_num = 1;
1299 let mut col_start = start_pos;
1300 for (idx, line_info) in lines.iter().enumerate() {
1301 if start_pos >= line_info.byte_offset {
1302 line_num = idx + 1;
1303 col_start = start_pos - line_info.byte_offset;
1304 } else {
1305 break;
1306 }
1307 }
1308
1309 let mut col_end = end_pos + backtick_count;
1311 for line_info in lines.iter() {
1312 if end_pos + backtick_count > line_info.byte_offset {
1313 col_end = end_pos + backtick_count - line_info.byte_offset;
1314 } else {
1315 break;
1316 }
1317 }
1318
1319 code_spans.push(CodeSpan {
1320 line: line_num,
1321 start_col: col_start,
1322 end_col: col_end,
1323 byte_offset: start_pos,
1324 byte_end: end_pos + backtick_count,
1325 backtick_count,
1326 content: span_content,
1327 });
1328
1329 pos = end_pos + backtick_count;
1331 continue;
1332 }
1333 }
1334
1335 pos = start_pos + backtick_count;
1337 } else {
1338 break;
1340 }
1341 }
1342
1343 code_spans
1344 }
1345
1346 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1348 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1351 let mut last_list_item_line = 0;
1352 let mut current_indent_level = 0;
1353 let mut last_marker_width = 0;
1354
1355 for (line_idx, line_info) in lines.iter().enumerate() {
1356 let line_num = line_idx + 1;
1357
1358 if line_info.in_code_block {
1360 if let Some(ref mut block) = current_block {
1361 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1363
1364 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1366
1367 match context {
1368 CodeBlockContext::Indented => {
1369 block.end_line = line_num;
1371 continue;
1372 }
1373 CodeBlockContext::Standalone => {
1374 let completed_block = current_block.take().unwrap();
1376 list_blocks.push(completed_block);
1377 continue;
1378 }
1379 CodeBlockContext::Adjacent => {
1380 block.end_line = line_num;
1382 continue;
1383 }
1384 }
1385 } else {
1386 continue;
1388 }
1389 }
1390
1391 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1393 caps.get(0).unwrap().as_str().to_string()
1394 } else {
1395 String::new()
1396 };
1397
1398 if let Some(list_item) = &line_info.list_item {
1400 let item_indent = list_item.marker_column;
1402 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1405 let is_nested = nesting > block.nesting_level;
1409 let same_type =
1410 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1411 let same_context = block.blockquote_prefix == blockquote_prefix;
1412 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1416 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1417
1418 let has_non_list_content = {
1420 let mut found_non_list = false;
1421 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1423 for check_line in (block_last_item_line + 1)..line_num {
1424 let check_idx = check_line - 1;
1425 if check_idx < lines.len() {
1426 let check_info = &lines[check_idx];
1427 let is_list_breaking_content = if check_info.in_code_block {
1429 let last_item_marker_width =
1431 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1432 lines[block_last_item_line - 1]
1433 .list_item
1434 .as_ref()
1435 .map(|li| {
1436 if li.is_ordered {
1437 li.marker.len() + 1 } else {
1439 li.marker.len()
1440 }
1441 })
1442 .unwrap_or(3) } else {
1444 3 };
1446
1447 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1448
1449 let context = CodeBlockUtils::analyze_code_block_context(
1451 lines,
1452 check_line - 1,
1453 min_continuation,
1454 );
1455
1456 matches!(context, CodeBlockContext::Standalone)
1458 } else if !check_info.is_blank && check_info.list_item.is_none() {
1459 let line_content = check_info.content.trim();
1461
1462 if check_info.heading.is_some()
1464 || line_content.starts_with("---")
1465 || line_content.starts_with("***")
1466 || line_content.starts_with("___")
1467 || line_content.contains('|')
1468 || line_content.starts_with(">")
1469 {
1470 true
1471 }
1472 else {
1474 let last_item_marker_width =
1475 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1476 lines[block_last_item_line - 1]
1477 .list_item
1478 .as_ref()
1479 .map(|li| {
1480 if li.is_ordered {
1481 li.marker.len() + 1 } else {
1483 li.marker.len()
1484 }
1485 })
1486 .unwrap_or(3) } else {
1488 3 };
1490
1491 let min_continuation =
1492 if block.is_ordered { last_item_marker_width } else { 2 };
1493 check_info.indent < min_continuation
1494 }
1495 } else {
1496 false
1497 };
1498
1499 if is_list_breaking_content {
1500 found_non_list = true;
1502 break;
1503 }
1504 }
1505 }
1506 found_non_list
1507 };
1508
1509 let continues_list = if is_nested {
1513 same_context && reasonable_distance && !has_non_list_content
1515 } else {
1516 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
1518 };
1519
1520 if continues_list {
1521 block.end_line = line_num;
1523 block.item_lines.push(line_num);
1524
1525 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1527 list_item.marker.len() + 1
1528 } else {
1529 list_item.marker.len()
1530 });
1531
1532 if !block.is_ordered
1534 && block.marker.is_some()
1535 && block.marker.as_ref() != Some(&list_item.marker)
1536 {
1537 block.marker = None;
1539 }
1540 } else {
1541 list_blocks.push(block.clone());
1543
1544 *block = ListBlock {
1545 start_line: line_num,
1546 end_line: line_num,
1547 is_ordered: list_item.is_ordered,
1548 marker: if list_item.is_ordered {
1549 None
1550 } else {
1551 Some(list_item.marker.clone())
1552 },
1553 blockquote_prefix: blockquote_prefix.clone(),
1554 item_lines: vec![line_num],
1555 nesting_level: nesting,
1556 max_marker_width: if list_item.is_ordered {
1557 list_item.marker.len() + 1
1558 } else {
1559 list_item.marker.len()
1560 },
1561 };
1562 }
1563 } else {
1564 current_block = Some(ListBlock {
1566 start_line: line_num,
1567 end_line: line_num,
1568 is_ordered: list_item.is_ordered,
1569 marker: if list_item.is_ordered {
1570 None
1571 } else {
1572 Some(list_item.marker.clone())
1573 },
1574 blockquote_prefix,
1575 item_lines: vec![line_num],
1576 nesting_level: nesting,
1577 max_marker_width: list_item.marker.len(),
1578 });
1579 }
1580
1581 last_list_item_line = line_num;
1582 current_indent_level = item_indent;
1583 last_marker_width = if list_item.is_ordered {
1584 list_item.marker.len() + 1 } else {
1586 list_item.marker.len()
1587 };
1588 } else if let Some(ref mut block) = current_block {
1589 let min_continuation_indent = if block.is_ordered {
1600 current_indent_level + last_marker_width
1601 } else {
1602 current_indent_level + 2 };
1604
1605 if line_info.indent >= min_continuation_indent {
1606 block.end_line = line_num;
1608 } else if line_info.is_blank {
1609 let mut check_idx = line_idx + 1;
1612 let mut found_continuation = false;
1613
1614 while check_idx < lines.len() && lines[check_idx].is_blank {
1616 check_idx += 1;
1617 }
1618
1619 if check_idx < lines.len() {
1620 let next_line = &lines[check_idx];
1621 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
1623 found_continuation = true;
1624 }
1625 else if !next_line.in_code_block
1627 && next_line.list_item.is_some()
1628 && let Some(item) = &next_line.list_item
1629 {
1630 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
1631 .find(&next_line.content)
1632 .map_or(String::new(), |m| m.as_str().to_string());
1633 if item.marker_column == current_indent_level
1634 && item.is_ordered == block.is_ordered
1635 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
1636 {
1637 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
1640 if let Some(between_line) = lines.get(idx) {
1641 let trimmed = between_line.content.trim();
1642 if trimmed.is_empty() {
1644 return false;
1645 }
1646 let line_indent =
1648 between_line.content.len() - between_line.content.trim_start().len();
1649
1650 if trimmed.starts_with("```")
1652 || trimmed.starts_with("~~~")
1653 || trimmed.starts_with("---")
1654 || trimmed.starts_with("***")
1655 || trimmed.starts_with("___")
1656 || trimmed.starts_with(">")
1657 || trimmed.contains('|') || between_line.heading.is_some()
1659 {
1660 return true; }
1662
1663 line_indent >= min_continuation_indent
1665 } else {
1666 false
1667 }
1668 });
1669
1670 if block.is_ordered {
1671 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1674 if let Some(between_line) = lines.get(idx) {
1675 let trimmed = between_line.content.trim();
1676 if trimmed.is_empty() {
1677 return false;
1678 }
1679 trimmed.starts_with("```")
1681 || trimmed.starts_with("~~~")
1682 || trimmed.starts_with("---")
1683 || trimmed.starts_with("***")
1684 || trimmed.starts_with("___")
1685 || trimmed.starts_with(">")
1686 || trimmed.contains('|') || between_line.heading.is_some()
1688 } else {
1689 false
1690 }
1691 });
1692 found_continuation = !has_structural_separators;
1693 } else {
1694 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1696 if let Some(between_line) = lines.get(idx) {
1697 let trimmed = between_line.content.trim();
1698 if trimmed.is_empty() {
1699 return false;
1700 }
1701 trimmed.starts_with("```")
1703 || trimmed.starts_with("~~~")
1704 || trimmed.starts_with("---")
1705 || trimmed.starts_with("***")
1706 || trimmed.starts_with("___")
1707 || trimmed.starts_with(">")
1708 || trimmed.contains('|') || between_line.heading.is_some()
1710 } else {
1711 false
1712 }
1713 });
1714 found_continuation = !has_structural_separators;
1715 }
1716 }
1717 }
1718 }
1719
1720 if found_continuation {
1721 block.end_line = line_num;
1723 } else {
1724 list_blocks.push(block.clone());
1726 current_block = None;
1727 }
1728 } else {
1729 let min_required_indent = if block.is_ordered {
1732 current_indent_level + last_marker_width
1733 } else {
1734 current_indent_level + 2
1735 };
1736
1737 let line_content = line_info.content.trim();
1742 let is_structural_separator = line_info.heading.is_some()
1743 || line_content.starts_with("```")
1744 || line_content.starts_with("~~~")
1745 || line_content.starts_with("---")
1746 || line_content.starts_with("***")
1747 || line_content.starts_with("___")
1748 || line_content.starts_with(">")
1749 || line_content.contains('|'); let is_lazy_continuation = last_list_item_line == line_num - 1
1752 && !is_structural_separator
1753 && !line_info.is_blank
1754 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
1755
1756 if is_lazy_continuation {
1757 let content_to_check = if !blockquote_prefix.is_empty() {
1760 line_info
1762 .content
1763 .strip_prefix(&blockquote_prefix)
1764 .unwrap_or(&line_info.content)
1765 .trim()
1766 } else {
1767 line_info.content.trim()
1768 };
1769
1770 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
1771
1772 if starts_with_uppercase && last_list_item_line > 0 {
1775 list_blocks.push(block.clone());
1777 current_block = None;
1778 } else {
1779 block.end_line = line_num;
1781 }
1782 } else {
1783 list_blocks.push(block.clone());
1785 current_block = None;
1786 }
1787 }
1788 }
1789 }
1790
1791 if let Some(block) = current_block {
1793 list_blocks.push(block);
1794 }
1795
1796 merge_adjacent_list_blocks(&mut list_blocks, lines);
1798
1799 list_blocks
1800 }
1801
1802 fn compute_char_frequency(content: &str) -> CharFrequency {
1804 let mut frequency = CharFrequency::default();
1805
1806 for ch in content.chars() {
1807 match ch {
1808 '#' => frequency.hash_count += 1,
1809 '*' => frequency.asterisk_count += 1,
1810 '_' => frequency.underscore_count += 1,
1811 '-' => frequency.hyphen_count += 1,
1812 '+' => frequency.plus_count += 1,
1813 '>' => frequency.gt_count += 1,
1814 '|' => frequency.pipe_count += 1,
1815 '[' => frequency.bracket_count += 1,
1816 '`' => frequency.backtick_count += 1,
1817 '<' => frequency.lt_count += 1,
1818 '!' => frequency.exclamation_count += 1,
1819 '\n' => frequency.newline_count += 1,
1820 _ => {}
1821 }
1822 }
1823
1824 frequency
1825 }
1826
1827 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1829 lazy_static! {
1830 static ref HTML_TAG_REGEX: regex::Regex =
1831 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*(/?)>").unwrap();
1832 }
1833
1834 let mut html_tags = Vec::with_capacity(content.matches('<').count());
1835
1836 for cap in HTML_TAG_REGEX.captures_iter(content) {
1837 let full_match = cap.get(0).unwrap();
1838 let match_start = full_match.start();
1839 let match_end = full_match.end();
1840
1841 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1843 continue;
1844 }
1845
1846 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1847 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1848 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1849
1850 let mut line_num = 1;
1852 let mut col_start = match_start;
1853 let mut col_end = match_end;
1854 for (idx, line_info) in lines.iter().enumerate() {
1855 if match_start >= line_info.byte_offset {
1856 line_num = idx + 1;
1857 col_start = match_start - line_info.byte_offset;
1858 col_end = match_end - line_info.byte_offset;
1859 } else {
1860 break;
1861 }
1862 }
1863
1864 html_tags.push(HtmlTag {
1865 line: line_num,
1866 start_col: col_start,
1867 end_col: col_end,
1868 byte_offset: match_start,
1869 byte_end: match_end,
1870 tag_name,
1871 is_closing,
1872 is_self_closing,
1873 raw_content: full_match.as_str().to_string(),
1874 });
1875 }
1876
1877 html_tags
1878 }
1879
1880 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1882 lazy_static! {
1883 static ref EMPHASIS_REGEX: regex::Regex =
1884 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1885 }
1886
1887 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1888
1889 for cap in EMPHASIS_REGEX.captures_iter(content) {
1890 let full_match = cap.get(0).unwrap();
1891 let match_start = full_match.start();
1892 let match_end = full_match.end();
1893
1894 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1896 continue;
1897 }
1898
1899 let opening_markers = cap.get(1).unwrap().as_str();
1900 let content_part = cap.get(2).unwrap().as_str();
1901 let closing_markers = cap.get(3).unwrap().as_str();
1902
1903 if opening_markers.chars().next() != closing_markers.chars().next()
1905 || opening_markers.len() != closing_markers.len()
1906 {
1907 continue;
1908 }
1909
1910 let marker = opening_markers.chars().next().unwrap();
1911 let marker_count = opening_markers.len();
1912
1913 let mut line_num = 1;
1915 let mut col_start = match_start;
1916 let mut col_end = match_end;
1917 for (idx, line_info) in lines.iter().enumerate() {
1918 if match_start >= line_info.byte_offset {
1919 line_num = idx + 1;
1920 col_start = match_start - line_info.byte_offset;
1921 col_end = match_end - line_info.byte_offset;
1922 } else {
1923 break;
1924 }
1925 }
1926
1927 emphasis_spans.push(EmphasisSpan {
1928 line: line_num,
1929 start_col: col_start,
1930 end_col: col_end,
1931 byte_offset: match_start,
1932 byte_end: match_end,
1933 marker,
1934 marker_count,
1935 content: content_part.to_string(),
1936 });
1937 }
1938
1939 emphasis_spans
1940 }
1941
1942 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
1944 let mut table_rows = Vec::with_capacity(lines.len() / 20);
1945
1946 for (line_idx, line_info) in lines.iter().enumerate() {
1947 if line_info.in_code_block || line_info.is_blank {
1949 continue;
1950 }
1951
1952 let line = &line_info.content;
1953 let line_num = line_idx + 1;
1954
1955 if !line.contains('|') {
1957 continue;
1958 }
1959
1960 let parts: Vec<&str> = line.split('|').collect();
1962 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
1963
1964 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
1966 let mut column_alignments = Vec::new();
1967
1968 if is_separator {
1969 for part in &parts[1..parts.len() - 1] {
1970 let trimmed = part.trim();
1972 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
1973 "center".to_string()
1974 } else if trimmed.ends_with(':') {
1975 "right".to_string()
1976 } else if trimmed.starts_with(':') {
1977 "left".to_string()
1978 } else {
1979 "none".to_string()
1980 };
1981 column_alignments.push(alignment);
1982 }
1983 }
1984
1985 table_rows.push(TableRow {
1986 line: line_num,
1987 is_separator,
1988 column_count,
1989 column_alignments,
1990 });
1991 }
1992
1993 table_rows
1994 }
1995
1996 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
1998 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
1999
2000 for cap in BARE_URL_PATTERN.captures_iter(content) {
2002 let full_match = cap.get(0).unwrap();
2003 let match_start = full_match.start();
2004 let match_end = full_match.end();
2005
2006 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2008 continue;
2009 }
2010
2011 let preceding_char = if match_start > 0 {
2013 content.chars().nth(match_start - 1)
2014 } else {
2015 None
2016 };
2017 let following_char = content.chars().nth(match_end);
2018
2019 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2020 continue;
2021 }
2022 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2023 continue;
2024 }
2025
2026 let url = full_match.as_str();
2027 let url_type = if url.starts_with("https://") {
2028 "https"
2029 } else if url.starts_with("http://") {
2030 "http"
2031 } else if url.starts_with("ftp://") {
2032 "ftp"
2033 } else {
2034 "other"
2035 };
2036
2037 let mut line_num = 1;
2039 let mut col_start = match_start;
2040 let mut col_end = match_end;
2041 for (idx, line_info) in lines.iter().enumerate() {
2042 if match_start >= line_info.byte_offset {
2043 line_num = idx + 1;
2044 col_start = match_start - line_info.byte_offset;
2045 col_end = match_end - line_info.byte_offset;
2046 } else {
2047 break;
2048 }
2049 }
2050
2051 bare_urls.push(BareUrl {
2052 line: line_num,
2053 start_col: col_start,
2054 end_col: col_end,
2055 byte_offset: match_start,
2056 byte_end: match_end,
2057 url: url.to_string(),
2058 url_type: url_type.to_string(),
2059 });
2060 }
2061
2062 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2064 let full_match = cap.get(0).unwrap();
2065 let match_start = full_match.start();
2066 let match_end = full_match.end();
2067
2068 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2070 continue;
2071 }
2072
2073 let preceding_char = if match_start > 0 {
2075 content.chars().nth(match_start - 1)
2076 } else {
2077 None
2078 };
2079 let following_char = content.chars().nth(match_end);
2080
2081 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2082 continue;
2083 }
2084 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2085 continue;
2086 }
2087
2088 let email = full_match.as_str();
2089
2090 let mut line_num = 1;
2092 let mut col_start = match_start;
2093 let mut col_end = match_end;
2094 for (idx, line_info) in lines.iter().enumerate() {
2095 if match_start >= line_info.byte_offset {
2096 line_num = idx + 1;
2097 col_start = match_start - line_info.byte_offset;
2098 col_end = match_end - line_info.byte_offset;
2099 } else {
2100 break;
2101 }
2102 }
2103
2104 bare_urls.push(BareUrl {
2105 line: line_num,
2106 start_col: col_start,
2107 end_col: col_end,
2108 byte_offset: match_start,
2109 byte_end: match_end,
2110 url: email.to_string(),
2111 url_type: "email".to_string(),
2112 });
2113 }
2114
2115 bare_urls
2116 }
2117}
2118
2119fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2121 if list_blocks.len() < 2 {
2122 return;
2123 }
2124
2125 let mut merger = ListBlockMerger::new(lines);
2126 *list_blocks = merger.merge(list_blocks);
2127}
2128
2129struct ListBlockMerger<'a> {
2131 lines: &'a [LineInfo],
2132}
2133
2134impl<'a> ListBlockMerger<'a> {
2135 fn new(lines: &'a [LineInfo]) -> Self {
2136 Self { lines }
2137 }
2138
2139 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2140 let mut merged = Vec::with_capacity(list_blocks.len());
2141 let mut current = list_blocks[0].clone();
2142
2143 for next in list_blocks.iter().skip(1) {
2144 if self.should_merge_blocks(¤t, next) {
2145 current = self.merge_two_blocks(current, next);
2146 } else {
2147 merged.push(current);
2148 current = next.clone();
2149 }
2150 }
2151
2152 merged.push(current);
2153 merged
2154 }
2155
2156 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2158 if !self.blocks_are_compatible(current, next) {
2160 return false;
2161 }
2162
2163 let spacing = self.analyze_spacing_between(current, next);
2165 match spacing {
2166 BlockSpacing::Consecutive => true,
2167 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2168 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2169 self.can_merge_with_content_between(current, next)
2170 }
2171 }
2172 }
2173
2174 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2176 current.is_ordered == next.is_ordered
2177 && current.blockquote_prefix == next.blockquote_prefix
2178 && current.nesting_level == next.nesting_level
2179 }
2180
2181 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2183 let gap = next.start_line - current.end_line;
2184
2185 match gap {
2186 1 => BlockSpacing::Consecutive,
2187 2 => BlockSpacing::SingleBlank,
2188 _ if gap > 2 => {
2189 if self.has_only_blank_lines_between(current, next) {
2190 BlockSpacing::MultipleBlanks
2191 } else {
2192 BlockSpacing::ContentBetween
2193 }
2194 }
2195 _ => BlockSpacing::Consecutive, }
2197 }
2198
2199 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2201 if has_meaningful_content_between(current, next, self.lines) {
2204 return false; }
2206
2207 !current.is_ordered && current.marker == next.marker
2209 }
2210
2211 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2213 if has_meaningful_content_between(current, next, self.lines) {
2215 return false; }
2217
2218 current.is_ordered && next.is_ordered
2220 }
2221
2222 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2224 for line_num in (current.end_line + 1)..next.start_line {
2225 if let Some(line_info) = self.lines.get(line_num - 1)
2226 && !line_info.content.trim().is_empty()
2227 {
2228 return false;
2229 }
2230 }
2231 true
2232 }
2233
2234 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2236 current.end_line = next.end_line;
2237 current.item_lines.extend_from_slice(&next.item_lines);
2238
2239 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2241
2242 if !current.is_ordered && self.markers_differ(¤t, next) {
2244 current.marker = None; }
2246
2247 current
2248 }
2249
2250 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2252 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2253 }
2254}
2255
/// How two list blocks are separated, measured in lines between the end of the
/// first block and the start of the second.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends.
    Consecutive,
    /// Exactly one line lies between the blocks.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
2264
2265fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2267 for line_num in (current.end_line + 1)..next.start_line {
2269 if let Some(line_info) = lines.get(line_num - 1) {
2270 let trimmed = line_info.content.trim();
2272
2273 if trimmed.is_empty() {
2275 continue;
2276 }
2277
2278 if line_info.heading.is_some() {
2282 return true; }
2284
2285 if is_horizontal_rule(trimmed) {
2287 return true; }
2289
2290 if trimmed.contains('|') && trimmed.len() > 1 {
2292 return true; }
2294
2295 if trimmed.starts_with('>') {
2297 return true; }
2299
2300 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2302 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2303
2304 let min_continuation_indent = if current.is_ordered {
2306 current.nesting_level + current.max_marker_width + 1 } else {
2308 current.nesting_level + 2
2309 };
2310
2311 if line_indent < min_continuation_indent {
2312 return true; }
2315 }
2316
2317 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2319
2320 let min_indent = if current.is_ordered {
2322 current.nesting_level + current.max_marker_width
2323 } else {
2324 current.nesting_level + 2
2325 };
2326
2327 if line_indent < min_indent {
2329 return true; }
2331
2332 }
2335 }
2336
2337 false
2339}
2340
/// Returns `true` when `trimmed` looks like a horizontal rule.
///
/// The text must start with `-`, `*` or `_`, contain that same character at least
/// three times, and otherwise consist only of spaces and tabs. The caller is
/// expected to pass already-trimmed text: leading whitespace makes the first
/// character check fail.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Cheap early exit: fewer than three bytes cannot hold three rule characters.
    if trimmed.len() < 3 {
        return false;
    }

    let rule_char = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Single pass without collecting the characters into a temporary vector.
    let mut rule_char_count = 0usize;
    for ch in trimmed.chars() {
        if ch == rule_char {
            rule_char_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    rule_char_count >= 3
}
2364
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Byte offsets map to 1-based (line, column) pairs across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(8), (2, 1));
        assert_eq!(ctx.offset_to_line_col(9), (3, 1));
        assert_eq!(ctx.offset_to_line_col(15), (3, 7));
        assert_eq!(ctx.offset_to_line_col(21), (4, 1));
    }

    /// Per-line metadata: content, byte offset, indent and blank detection.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces, matching the `indent == 4`
        // assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers, ordering and marker columns are detected per line.
    #[test]
    fn test_list_item_detection() {
        // The `*` item is indented by two spaces, matching `marker_column == 2` below.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n 2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets on newline characters and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(1), (1, 2));
        assert_eq!(ctx.offset_to_line_col(2), (2, 1));
        assert_eq!(ctx.offset_to_line_col(3), (2, 2));
        assert_eq!(ctx.offset_to_line_col(4), (3, 1));
        assert_eq!(ctx.offset_to_line_col(5), (3, 2));
    }
}