1use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
2use lazy_static::lazy_static;
3use regex::Regex;
4
5lazy_static! {
6 static ref LINK_PATTERN: Regex = Regex::new(
9 r"(?sx)
10 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text in group 1 (handles nested brackets)
11 (?:
12 \(([^)]*)\) # Inline URL in group 2 (can be empty)
13 |
14 \[([^\]]*)\] # Reference ID in group 3
15 )"
16 ).unwrap();
17
18 static ref IMAGE_PATTERN: Regex = Regex::new(
21 r"(?sx)
22 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text in group 1 (handles nested brackets)
23 (?:
24 \(([^)]*)\) # Inline URL in group 2 (can be empty)
25 |
26 \[([^\]]*)\] # Reference ID in group 3
27 )"
28 ).unwrap();
29
30 static ref REF_DEF_PATTERN: Regex = Regex::new(
32 r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
33 ).unwrap();
34
35 static ref CODE_SPAN_PATTERN: Regex = Regex::new(
38 r"`+"
39 ).unwrap();
40
41 static ref BARE_URL_PATTERN: Regex = Regex::new(
43 r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
44 ).unwrap();
45
46 static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
48 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
49 ).unwrap();
50
51 static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
53 r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
54 ).unwrap();
55
56 static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
58}
59
60#[derive(Debug, Clone)]
62pub struct LineInfo {
63 pub content: String,
65 pub byte_offset: usize,
67 pub indent: usize,
69 pub is_blank: bool,
71 pub in_code_block: bool,
73 pub in_front_matter: bool,
75 pub list_item: Option<ListItemInfo>,
77 pub heading: Option<HeadingInfo>,
79 pub blockquote: Option<BlockquoteInfo>,
81}
82
/// Description of the list marker found at the start of a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// Marker text: `-`, `*`, `+`, or a number followed by its delimiter (e.g. `1.`).
    pub marker: String,
    /// True for numbered items, false for bullet items.
    pub is_ordered: bool,
    /// Parsed item number for ordered items (`None` for bullets or unparsable numbers).
    pub number: Option<usize>,
    /// Byte column of the marker, counting any blockquote prefix and leading whitespace.
    pub marker_column: usize,
    /// Byte column where the item content starts (after marker and spacing).
    pub content_column: usize,
}
97
/// Syntax used to write a heading.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// `#`-prefixed heading.
    ATX,
    /// Heading underlined with `=` (level 1).
    Setext1,
    /// Heading underlined with `-` (level 2).
    Setext2,
}
108
/// A link found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// 1-based number of the line on which the link starts.
    pub line: usize,
    /// Byte offset of the link start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the link end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the link start within the document.
    pub byte_offset: usize,
    /// Byte offset just past the link within the document.
    pub byte_end: usize,
    /// Link text (the part between the first pair of brackets).
    pub text: String,
    /// Inline URL; empty for reference links.
    pub url: String,
    /// True for `[text][id]` style links.
    pub is_reference: bool,
    /// Lowercased reference id; falls back to the lowercased text when the id is empty.
    pub reference_id: Option<String>,
}
131
/// An image found in the document, either inline or reference style.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// 1-based number of the line on which the image starts.
    pub line: usize,
    /// Byte offset of the image start relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the image end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the image start (the `!`) within the document.
    pub byte_offset: usize,
    /// Byte offset just past the image within the document.
    pub byte_end: usize,
    /// Alternative text (the part between the first pair of brackets).
    pub alt_text: String,
    /// Inline URL; empty for reference images.
    pub url: String,
    /// True for `![alt][id]` style images.
    pub is_reference: bool,
    /// Lowercased reference id; falls back to the lowercased alt text when the id is empty.
    pub reference_id: Option<String>,
}
154
/// A reference definition of the form `[id]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-based number of the line holding the definition.
    pub line: usize,
    /// Lowercased reference id.
    pub id: String,
    /// Target URL.
    pub url: String,
    /// Optional title (text between single or double quotes).
    pub title: Option<String>,
}
167
/// An inline code span delimited by backtick runs.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based number of the line on which the span starts.
    pub line: usize,
    /// Byte offset of the opening delimiter relative to the start of its line.
    pub start_col: usize,
    /// Byte offset of the span end relative to the start of the line containing the end.
    pub end_col: usize,
    /// Byte offset of the opening delimiter within the document.
    pub byte_offset: usize,
    /// Byte offset just past the closing delimiter within the document.
    pub byte_end: usize,
    /// Number of backticks in each delimiter.
    pub backtick_count: usize,
    /// Text between the delimiters, unmodified.
    pub content: String,
}
186
187#[derive(Debug, Clone)]
189pub struct HeadingInfo {
190 pub level: u8,
192 pub style: HeadingStyle,
194 pub marker: String,
196 pub marker_column: usize,
198 pub content_column: usize,
200 pub text: String,
202 pub custom_id: Option<String>,
204 pub raw_text: String,
206 pub has_closing_sequence: bool,
208 pub closing_sequence: String,
210}
211
/// Details of the blockquote prefix found on a line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Number of `>` characters in the contiguous marker run.
    pub nesting_level: usize,
    /// Whitespace preceding the first `>`.
    pub indent: String,
    /// Byte column of the first `>` (length of `indent`).
    pub marker_column: usize,
    /// Indentation, markers and the whitespace that follows them.
    pub prefix: String,
    /// Text following the prefix.
    pub content: String,
    /// True when non-empty content follows the marker without any whitespace.
    pub has_no_space_after_marker: bool,
    /// True when more than one whitespace byte, or a tab, follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// True when the content is blank and no whitespace follows the marker.
    pub needs_md028_fix: bool,
}
232
/// A run of lines that together form one list.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// 1-based number of the first line of the block (inclusive).
    pub start_line: usize,
    /// 1-based number of the last line of the block (inclusive).
    pub end_line: usize,
    /// True when the block is an ordered list.
    pub is_ordered: bool,
    /// Marker associated with the block, if any.
    pub marker: Option<String>,
    /// Blockquote prefix the block lives under; empty outside blockquotes.
    pub blockquote_prefix: String,
    /// 1-based numbers of the lines that start a list item.
    pub item_lines: Vec<usize>,
    /// Nesting depth of the block.
    pub nesting_level: usize,
    /// Widest marker recorded for the block.
    pub max_marker_width: usize,
}
253
254use std::sync::{Arc, Mutex};
255
/// Per-document counters for characters that matter to Markdown syntax.
///
/// All counters start at zero via `Default`.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Counter for `#`.
    pub hash_count: usize,
    /// Counter for `*`.
    pub asterisk_count: usize,
    /// Counter for `_`.
    pub underscore_count: usize,
    /// Counter for `-`.
    pub hyphen_count: usize,
    /// Counter for `+`.
    pub plus_count: usize,
    /// Counter for `>`.
    pub gt_count: usize,
    /// Counter for `|`.
    pub pipe_count: usize,
    /// Counter consulted for `[`.
    pub bracket_count: usize,
    /// Counter for `` ` ``.
    pub backtick_count: usize,
    /// Counter for `<`.
    pub lt_count: usize,
    /// Counter for `!`.
    pub exclamation_count: usize,
    /// Counter for line feeds.
    pub newline_count: usize,
}
284
/// An HTML tag located in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number of the tag (the value matched by `html_tags_on_line`).
    pub line: usize,
    /// Column at which the tag starts.
    pub start_col: usize,
    /// Column at which the tag ends.
    pub end_col: usize,
    /// Byte offset of the tag start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag end within the document.
    pub byte_end: usize,
    /// Name of the tag.
    pub tag_name: String,
    /// True for closing tags.
    pub is_closing: bool,
    /// True for self-closing tags.
    pub is_self_closing: bool,
    /// Source text of the tag.
    pub raw_content: String,
}
307
/// An emphasis span located in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number of the span (the value matched by `emphasis_spans_on_line`).
    pub line: usize,
    /// Column at which the span starts.
    pub start_col: usize,
    /// Column at which the span ends.
    pub end_col: usize,
    /// Byte offset of the span start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span end within the document.
    pub byte_end: usize,
    /// Character used as the emphasis marker.
    pub marker: char,
    /// Number of marker characters in the delimiter.
    pub marker_count: usize,
    /// Text of the span.
    pub content: String,
}
328
/// A table row located in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number of the row (the value matched by `table_rows_on_line`).
    pub line: usize,
    /// True when the row is a header separator row.
    pub is_separator: bool,
    /// Number of columns in the row.
    pub column_count: usize,
    /// Alignment recorded for each column.
    pub column_alignments: Vec<String>,
}
341
/// A bare URL or e-mail address located in the document.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number of the URL (the value matched by `bare_urls_on_line`).
    pub line: usize,
    /// Column at which the URL starts.
    pub start_col: usize,
    /// Column at which the URL ends.
    pub end_col: usize,
    /// Byte offset of the URL start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL end within the document.
    pub byte_end: usize,
    /// The matched URL text.
    pub url: String,
    /// Kind of URL that was matched.
    pub url_type: String,
}
360
361pub struct LintContext<'a> {
362 pub content: &'a str,
363 pub line_offsets: Vec<usize>,
364 pub code_blocks: Vec<(usize, usize)>, pub lines: Vec<LineInfo>, pub links: Vec<ParsedLink>, pub images: Vec<ParsedImage>, pub reference_defs: Vec<ReferenceDef>, code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, pub list_blocks: Vec<ListBlock>, pub char_frequency: CharFrequency, html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, }
377
378impl<'a> LintContext<'a> {
379 pub fn new(content: &'a str) -> Self {
380 let mut line_offsets = vec![0];
381 for (i, c) in content.char_indices() {
382 if c == '\n' {
383 line_offsets.push(i + 1);
384 }
385 }
386
387 let code_blocks = CodeBlockUtils::detect_code_blocks(content);
389
390 let lines = Self::compute_line_info(content, &line_offsets, &code_blocks);
392
393 let links = Self::parse_links(content, &lines, &code_blocks);
396 let images = Self::parse_images(content, &lines, &code_blocks);
397 let reference_defs = Self::parse_reference_defs(content, &lines);
398 let list_blocks = Self::parse_list_blocks(&lines);
399
400 let char_frequency = Self::compute_char_frequency(content);
402
403 Self {
404 content,
405 line_offsets,
406 code_blocks,
407 lines,
408 links,
409 images,
410 reference_defs,
411 code_spans_cache: Mutex::new(None),
412 list_blocks,
413 char_frequency,
414 html_tags_cache: Mutex::new(None),
415 emphasis_spans_cache: Mutex::new(None),
416 table_rows_cache: Mutex::new(None),
417 bare_urls_cache: Mutex::new(None),
418 }
419 }
420
421 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
423 let mut cache = self.code_spans_cache.lock().unwrap();
424
425 if cache.is_none() {
427 let code_spans = Self::parse_code_spans(self.content, &self.lines);
428 *cache = Some(Arc::new(code_spans));
429 }
430
431 cache.as_ref().unwrap().clone()
433 }
434
435 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
437 let mut cache = self.html_tags_cache.lock().unwrap();
438
439 if cache.is_none() {
440 let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
441 *cache = Some(Arc::new(html_tags));
442 }
443
444 cache.as_ref().unwrap().clone()
445 }
446
447 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
449 let mut cache = self.emphasis_spans_cache.lock().unwrap();
450
451 if cache.is_none() {
452 let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
453 *cache = Some(Arc::new(emphasis_spans));
454 }
455
456 cache.as_ref().unwrap().clone()
457 }
458
459 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
461 let mut cache = self.table_rows_cache.lock().unwrap();
462
463 if cache.is_none() {
464 let table_rows = Self::parse_table_rows(&self.lines);
465 *cache = Some(Arc::new(table_rows));
466 }
467
468 cache.as_ref().unwrap().clone()
469 }
470
471 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
473 let mut cache = self.bare_urls_cache.lock().unwrap();
474
475 if cache.is_none() {
476 let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
477 *cache = Some(Arc::new(bare_urls));
478 }
479
480 cache.as_ref().unwrap().clone()
481 }
482
483 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
485 match self.line_offsets.binary_search(&offset) {
486 Ok(line) => (line + 1, 1),
487 Err(line) => {
488 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
489 (line, offset - line_start + 1)
490 }
491 }
492 }
493
494 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
496 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
498 return true;
499 }
500
501 self.code_spans()
503 .iter()
504 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
505 }
506
507 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
509 if line_num > 0 {
510 self.lines.get(line_num - 1)
511 } else {
512 None
513 }
514 }
515
516 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
518 self.line_info(line_num).map(|info| info.byte_offset)
519 }
520
521 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
523 let normalized_id = ref_id.to_lowercase();
524 self.reference_defs
525 .iter()
526 .find(|def| def.id == normalized_id)
527 .map(|def| def.url.as_str())
528 }
529
530 pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
532 self.links.iter().filter(|link| link.line == line_num).collect()
533 }
534
535 pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
537 self.images.iter().filter(|img| img.line == line_num).collect()
538 }
539
540 pub fn is_in_list_block(&self, line_num: usize) -> bool {
542 self.list_blocks
543 .iter()
544 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
545 }
546
547 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
549 self.list_blocks
550 .iter()
551 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
552 }
553
554 pub fn has_char(&self, ch: char) -> bool {
556 match ch {
557 '#' => self.char_frequency.hash_count > 0,
558 '*' => self.char_frequency.asterisk_count > 0,
559 '_' => self.char_frequency.underscore_count > 0,
560 '-' => self.char_frequency.hyphen_count > 0,
561 '+' => self.char_frequency.plus_count > 0,
562 '>' => self.char_frequency.gt_count > 0,
563 '|' => self.char_frequency.pipe_count > 0,
564 '[' => self.char_frequency.bracket_count > 0,
565 '`' => self.char_frequency.backtick_count > 0,
566 '<' => self.char_frequency.lt_count > 0,
567 '!' => self.char_frequency.exclamation_count > 0,
568 '\n' => self.char_frequency.newline_count > 0,
569 _ => self.content.contains(ch), }
571 }
572
573 pub fn char_count(&self, ch: char) -> usize {
575 match ch {
576 '#' => self.char_frequency.hash_count,
577 '*' => self.char_frequency.asterisk_count,
578 '_' => self.char_frequency.underscore_count,
579 '-' => self.char_frequency.hyphen_count,
580 '+' => self.char_frequency.plus_count,
581 '>' => self.char_frequency.gt_count,
582 '|' => self.char_frequency.pipe_count,
583 '[' => self.char_frequency.bracket_count,
584 '`' => self.char_frequency.backtick_count,
585 '<' => self.char_frequency.lt_count,
586 '!' => self.char_frequency.exclamation_count,
587 '\n' => self.char_frequency.newline_count,
588 _ => self.content.matches(ch).count(), }
590 }
591
592 pub fn likely_has_headings(&self) -> bool {
594 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 }
596
597 pub fn likely_has_lists(&self) -> bool {
599 self.char_frequency.asterisk_count > 0
600 || self.char_frequency.hyphen_count > 0
601 || self.char_frequency.plus_count > 0
602 }
603
604 pub fn likely_has_emphasis(&self) -> bool {
606 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
607 }
608
609 pub fn likely_has_tables(&self) -> bool {
611 self.char_frequency.pipe_count > 2
612 }
613
614 pub fn likely_has_blockquotes(&self) -> bool {
616 self.char_frequency.gt_count > 0
617 }
618
619 pub fn likely_has_code(&self) -> bool {
621 self.char_frequency.backtick_count > 0
622 }
623
624 pub fn likely_has_links_or_images(&self) -> bool {
626 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
627 }
628
629 pub fn likely_has_html(&self) -> bool {
631 self.char_frequency.lt_count > 0
632 }
633
634 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
636 self.html_tags()
637 .iter()
638 .filter(|tag| tag.line == line_num)
639 .cloned()
640 .collect()
641 }
642
643 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
645 self.emphasis_spans()
646 .iter()
647 .filter(|span| span.line == line_num)
648 .cloned()
649 .collect()
650 }
651
652 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
654 self.table_rows()
655 .iter()
656 .filter(|row| row.line == line_num)
657 .cloned()
658 .collect()
659 }
660
661 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
663 self.bare_urls()
664 .iter()
665 .filter(|url| url.line == line_num)
666 .cloned()
667 .collect()
668 }
669
670 fn parse_links(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedLink> {
672 let mut links = Vec::with_capacity(content.len() / 500); for cap in LINK_PATTERN.captures_iter(content) {
677 let full_match = cap.get(0).unwrap();
678 let match_start = full_match.start();
679 let match_end = full_match.end();
680
681 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
683 continue;
684 }
685
686 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
688 continue;
689 }
690
691 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
693 continue;
694 }
695
696 let mut line_num = 1;
698 let mut col_start = match_start;
699 for (idx, line_info) in lines.iter().enumerate() {
700 if match_start >= line_info.byte_offset {
701 line_num = idx + 1;
702 col_start = match_start - line_info.byte_offset;
703 } else {
704 break;
705 }
706 }
707
708 let mut end_line_num = 1;
710 let mut col_end = match_end;
711 for (idx, line_info) in lines.iter().enumerate() {
712 if match_end > line_info.byte_offset {
713 end_line_num = idx + 1;
714 col_end = match_end - line_info.byte_offset;
715 } else {
716 break;
717 }
718 }
719
720 if line_num == end_line_num {
722 } else {
724 }
727
728 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
729
730 if let Some(inline_url) = cap.get(2) {
731 links.push(ParsedLink {
733 line: line_num,
734 start_col: col_start,
735 end_col: col_end,
736 byte_offset: match_start,
737 byte_end: match_end,
738 text,
739 url: inline_url.as_str().to_string(),
740 is_reference: false,
741 reference_id: None,
742 });
743 } else if let Some(ref_id) = cap.get(3) {
744 let ref_id_str = ref_id.as_str();
746 let normalized_ref = if ref_id_str.is_empty() {
747 text.to_lowercase() } else {
749 ref_id_str.to_lowercase()
750 };
751
752 links.push(ParsedLink {
753 line: line_num,
754 start_col: col_start,
755 end_col: col_end,
756 byte_offset: match_start,
757 byte_end: match_end,
758 text,
759 url: String::new(), is_reference: true,
761 reference_id: Some(normalized_ref),
762 });
763 }
764 }
765
766 links
767 }
768
769 fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
771 let mut images = Vec::with_capacity(content.len() / 1000); for cap in IMAGE_PATTERN.captures_iter(content) {
776 let full_match = cap.get(0).unwrap();
777 let match_start = full_match.start();
778 let match_end = full_match.end();
779
780 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
782 continue;
783 }
784
785 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
787 continue;
788 }
789
790 let mut line_num = 1;
792 let mut col_start = match_start;
793 for (idx, line_info) in lines.iter().enumerate() {
794 if match_start >= line_info.byte_offset {
795 line_num = idx + 1;
796 col_start = match_start - line_info.byte_offset;
797 } else {
798 break;
799 }
800 }
801
802 let mut end_line_num = 1;
804 let mut col_end = match_end;
805 for (idx, line_info) in lines.iter().enumerate() {
806 if match_end > line_info.byte_offset {
807 end_line_num = idx + 1;
808 col_end = match_end - line_info.byte_offset;
809 } else {
810 break;
811 }
812 }
813
814 if line_num == end_line_num {
816 } else {
818 }
821
822 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
823
824 if let Some(inline_url) = cap.get(2) {
825 images.push(ParsedImage {
827 line: line_num,
828 start_col: col_start,
829 end_col: col_end,
830 byte_offset: match_start,
831 byte_end: match_end,
832 alt_text,
833 url: inline_url.as_str().to_string(),
834 is_reference: false,
835 reference_id: None,
836 });
837 } else if let Some(ref_id) = cap.get(3) {
838 let ref_id_str = ref_id.as_str();
840 let normalized_ref = if ref_id_str.is_empty() {
841 alt_text.to_lowercase() } else {
843 ref_id_str.to_lowercase()
844 };
845
846 images.push(ParsedImage {
847 line: line_num,
848 start_col: col_start,
849 end_col: col_end,
850 byte_offset: match_start,
851 byte_end: match_end,
852 alt_text,
853 url: String::new(), is_reference: true,
855 reference_id: Some(normalized_ref),
856 });
857 }
858 }
859
860 images
861 }
862
863 fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
865 let mut refs = Vec::with_capacity(lines.len() / 20); for (line_idx, line_info) in lines.iter().enumerate() {
869 if line_info.in_code_block {
871 continue;
872 }
873
874 let line = &line_info.content;
875 let line_num = line_idx + 1;
876
877 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
878 let id = cap.get(1).unwrap().as_str().to_lowercase();
879 let url = cap.get(2).unwrap().as_str().to_string();
880 let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
881
882 refs.push(ReferenceDef {
883 line: line_num,
884 id,
885 url,
886 title,
887 });
888 }
889 }
890
891 refs
892 }
893
894 fn compute_line_info(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<LineInfo> {
896 lazy_static! {
897 static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
899 static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
900
901 static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
903
904 static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
906 static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
907
908 static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
910 }
911
912 let content_lines: Vec<&str> = content.lines().collect();
913 let mut lines = Vec::with_capacity(content_lines.len());
914
915 let mut in_front_matter = false;
917 let mut front_matter_end = 0;
918 if content_lines.first().map(|l| l.trim()) == Some("---") {
919 in_front_matter = true;
920 for (idx, line) in content_lines.iter().enumerate().skip(1) {
921 if line.trim() == "---" {
922 front_matter_end = idx;
923 break;
924 }
925 }
926 }
927
928 for (i, line) in content_lines.iter().enumerate() {
929 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
930 let indent = line.len() - line.trim_start().len();
931 let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
933 let after_prefix = caps.get(2).map_or("", |m| m.as_str());
935 after_prefix.trim().is_empty()
936 } else {
937 line.trim().is_empty()
938 };
939 let in_code_block = code_blocks.iter().any(|&(start, end)| {
942 let block_content = &content[start..end];
945 let is_multiline = block_content.contains('\n');
946 let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
947 let is_indented = !is_fenced
948 && block_content
949 .lines()
950 .all(|l| l.starts_with(" ") || l.starts_with("\t") || l.trim().is_empty());
951
952 byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
953 });
954
955 let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
957 let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
959 let prefix = caps.get(1).unwrap().as_str();
960 let content = caps.get(2).unwrap().as_str();
961 (content, prefix.len())
962 } else {
963 (&**line, 0)
964 };
965
966 if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
967 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
968 let marker = caps.get(2).map_or("", |m| m.as_str());
969 let spacing = caps.get(3).map_or("", |m| m.as_str());
970 let _content = caps.get(4).map_or("", |m| m.as_str());
971 let marker_column = blockquote_prefix_len + leading_spaces.len();
972 let content_column = marker_column + marker.len() + spacing.len();
973
974 if spacing.is_empty() {
981 None
982 } else {
983 Some(ListItemInfo {
984 marker: marker.to_string(),
985 is_ordered: false,
986 number: None,
987 marker_column,
988 content_column,
989 })
990 }
991 } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
992 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
993 let number_str = caps.get(2).map_or("", |m| m.as_str());
994 let delimiter = caps.get(3).map_or("", |m| m.as_str());
995 let spacing = caps.get(4).map_or("", |m| m.as_str());
996 let _content = caps.get(5).map_or("", |m| m.as_str());
997 let marker = format!("{number_str}{delimiter}");
998 let marker_column = blockquote_prefix_len + leading_spaces.len();
999 let content_column = marker_column + marker.len() + spacing.len();
1000
1001 if spacing.is_empty() {
1004 None
1005 } else {
1006 Some(ListItemInfo {
1007 marker,
1008 is_ordered: true,
1009 number: number_str.parse().ok(),
1010 marker_column,
1011 content_column,
1012 })
1013 }
1014 } else {
1015 None
1016 }
1017 } else {
1018 None
1019 };
1020
1021 lines.push(LineInfo {
1022 content: line.to_string(),
1023 byte_offset,
1024 indent,
1025 is_blank,
1026 in_code_block,
1027 in_front_matter: in_front_matter && i <= front_matter_end,
1028 list_item,
1029 heading: None, blockquote: None, });
1032 }
1033
1034 for i in 0..content_lines.len() {
1036 if lines[i].in_code_block {
1037 continue;
1038 }
1039
1040 if in_front_matter && i <= front_matter_end {
1042 continue;
1043 }
1044
1045 let line = content_lines[i];
1046
1047 if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1049 let indent_str = caps.get(1).map_or("", |m| m.as_str());
1050 let markers = caps.get(2).map_or("", |m| m.as_str());
1051 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1052 let content = caps.get(4).map_or("", |m| m.as_str());
1053
1054 let nesting_level = markers.chars().filter(|&c| c == '>').count();
1055 let marker_column = indent_str.len();
1056
1057 let prefix = format!("{indent_str}{markers}{spaces_after}");
1059
1060 let has_no_space = spaces_after.is_empty() && !content.is_empty();
1062 let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1064
1065 let needs_md028_fix = content.trim().is_empty() && spaces_after.is_empty();
1067
1068 lines[i].blockquote = Some(BlockquoteInfo {
1069 nesting_level,
1070 indent: indent_str.to_string(),
1071 marker_column,
1072 prefix,
1073 content: content.to_string(),
1074 has_no_space_after_marker: has_no_space,
1075 has_multiple_spaces_after_marker: has_multiple_spaces,
1076 needs_md028_fix,
1077 });
1078 }
1079
1080 if lines[i].is_blank {
1082 continue;
1083 }
1084
1085 if let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1087 let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1088 let hashes = caps.get(2).map_or("", |m| m.as_str());
1089 let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1090 let rest = caps.get(4).map_or("", |m| m.as_str());
1091
1092 let level = hashes.len() as u8;
1093 let marker_column = leading_spaces.len();
1094
1095 let (text, has_closing, closing_seq) = {
1097 let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1099 if rest[id_start..].trim_end().ends_with('}') {
1101 (&rest[..id_start], &rest[id_start..])
1103 } else {
1104 (rest, "")
1105 }
1106 } else {
1107 (rest, "")
1108 };
1109
1110 let trimmed_rest = rest_without_id.trim_end();
1112 if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1113 let mut start_of_hashes = last_hash_pos;
1115 while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1116 start_of_hashes -= 1;
1117 }
1118
1119 let has_space_before = start_of_hashes == 0
1121 || trimmed_rest
1122 .chars()
1123 .nth(start_of_hashes - 1)
1124 .is_some_and(|c| c.is_whitespace());
1125
1126 let potential_closing = &trimmed_rest[start_of_hashes..];
1128 let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1129
1130 if is_all_hashes && has_space_before {
1131 let closing_hashes = potential_closing.to_string();
1133 let text_part = if !custom_id_part.is_empty() {
1136 format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1139 } else {
1140 rest_without_id[..start_of_hashes].trim_end().to_string()
1141 };
1142 (text_part, true, closing_hashes)
1143 } else {
1144 (rest.to_string(), false, String::new())
1146 }
1147 } else {
1148 (rest.to_string(), false, String::new())
1150 }
1151 };
1152
1153 let content_column = marker_column + hashes.len() + spaces_after.len();
1154
1155 let raw_text = text.trim().to_string();
1157 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1158
1159 if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1161 let next_line = content_lines[i + 1];
1162 if !lines[i + 1].in_code_block
1163 && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1164 && let Some(next_line_id) =
1165 crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1166 {
1167 custom_id = Some(next_line_id);
1168 }
1169 }
1170
1171 lines[i].heading = Some(HeadingInfo {
1172 level,
1173 style: HeadingStyle::ATX,
1174 marker: hashes.to_string(),
1175 marker_column,
1176 content_column,
1177 text: clean_text,
1178 custom_id,
1179 raw_text,
1180 has_closing_sequence: has_closing,
1181 closing_sequence: closing_seq,
1182 });
1183 }
1184 else if i + 1 < content_lines.len() {
1186 let next_line = content_lines[i + 1];
1187 if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1188 if in_front_matter && i < front_matter_end {
1190 continue;
1191 }
1192
1193 let underline = next_line.trim();
1194 let level = if underline.starts_with('=') { 1 } else { 2 };
1195 let style = if level == 1 {
1196 HeadingStyle::Setext1
1197 } else {
1198 HeadingStyle::Setext2
1199 };
1200
1201 let raw_text = line.trim().to_string();
1203 let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1204
1205 if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1207 let attr_line = content_lines[i + 2];
1208 if !lines[i + 2].in_code_block
1209 && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1210 && let Some(attr_line_id) =
1211 crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1212 {
1213 custom_id = Some(attr_line_id);
1214 }
1215 }
1216
1217 lines[i].heading = Some(HeadingInfo {
1218 level,
1219 style,
1220 marker: underline.to_string(),
1221 marker_column: next_line.len() - next_line.trim_start().len(),
1222 content_column: lines[i].indent,
1223 text: clean_text,
1224 custom_id,
1225 raw_text,
1226 has_closing_sequence: false,
1227 closing_sequence: String::new(),
1228 });
1229 }
1230 }
1231 }
1232
1233 lines
1234 }
1235
1236 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1238 let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); if !content.contains('`') {
1243 return code_spans;
1244 }
1245
1246 let mut pos = 0;
1247 let bytes = content.as_bytes();
1248
1249 while pos < bytes.len() {
1250 if let Some(backtick_start) = content[pos..].find('`') {
1252 let start_pos = pos + backtick_start;
1253
1254 let mut in_code_block = false;
1256 for (line_idx, line_info) in lines.iter().enumerate() {
1257 if start_pos >= line_info.byte_offset
1258 && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1259 {
1260 in_code_block = line_info.in_code_block;
1261 break;
1262 }
1263 }
1264
1265 if in_code_block {
1266 pos = start_pos + 1;
1267 continue;
1268 }
1269
1270 let mut backtick_count = 0;
1272 let mut i = start_pos;
1273 while i < bytes.len() && bytes[i] == b'`' {
1274 backtick_count += 1;
1275 i += 1;
1276 }
1277
1278 let search_start = start_pos + backtick_count;
1280 let closing_pattern = &content[start_pos..start_pos + backtick_count];
1281
1282 if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1283 let end_pos = search_start + rel_end;
1285 let check_pos = end_pos + backtick_count;
1286
1287 if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1289 let content_start = start_pos + backtick_count;
1291 let content_end = end_pos;
1292 let span_content = content[content_start..content_end].to_string();
1293
1294 let mut line_num = 1;
1296 let mut col_start = start_pos;
1297 for (idx, line_info) in lines.iter().enumerate() {
1298 if start_pos >= line_info.byte_offset {
1299 line_num = idx + 1;
1300 col_start = start_pos - line_info.byte_offset;
1301 } else {
1302 break;
1303 }
1304 }
1305
1306 let mut col_end = end_pos + backtick_count;
1308 for line_info in lines.iter() {
1309 if end_pos + backtick_count > line_info.byte_offset {
1310 col_end = end_pos + backtick_count - line_info.byte_offset;
1311 } else {
1312 break;
1313 }
1314 }
1315
1316 code_spans.push(CodeSpan {
1317 line: line_num,
1318 start_col: col_start,
1319 end_col: col_end,
1320 byte_offset: start_pos,
1321 byte_end: end_pos + backtick_count,
1322 backtick_count,
1323 content: span_content,
1324 });
1325
1326 pos = end_pos + backtick_count;
1328 continue;
1329 }
1330 }
1331
1332 pos = start_pos + backtick_count;
1334 } else {
1335 break;
1337 }
1338 }
1339
1340 code_spans
1341 }
1342
1343 fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1345 let mut list_blocks = Vec::with_capacity(lines.len() / 10); let mut current_block: Option<ListBlock> = None;
1348 let mut last_list_item_line = 0;
1349 let mut current_indent_level = 0;
1350 let mut last_marker_width = 0;
1351
1352 for (line_idx, line_info) in lines.iter().enumerate() {
1353 let line_num = line_idx + 1;
1354
1355 if line_info.in_code_block {
1357 if let Some(ref mut block) = current_block {
1358 let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1360
1361 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1363
1364 match context {
1365 CodeBlockContext::Indented => {
1366 block.end_line = line_num;
1368 continue;
1369 }
1370 CodeBlockContext::Standalone => {
1371 let completed_block = current_block.take().unwrap();
1373 list_blocks.push(completed_block);
1374 continue;
1375 }
1376 CodeBlockContext::Adjacent => {
1377 block.end_line = line_num;
1379 continue;
1380 }
1381 }
1382 } else {
1383 continue;
1385 }
1386 }
1387
1388 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1390 caps.get(0).unwrap().as_str().to_string()
1391 } else {
1392 String::new()
1393 };
1394
1395 if let Some(list_item) = &line_info.list_item {
1397 let item_indent = list_item.marker_column;
1399 let nesting = item_indent / 2; if let Some(ref mut block) = current_block {
1402 let is_nested = nesting > block.nesting_level;
1406 let same_type =
1407 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1408 let same_context = block.blockquote_prefix == blockquote_prefix;
1409 let reasonable_distance = line_num <= last_list_item_line + 2; let marker_compatible =
1413 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1414
1415 let has_non_list_content = {
1417 let mut found_non_list = false;
1418 let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1420 for check_line in (block_last_item_line + 1)..line_num {
1421 let check_idx = check_line - 1;
1422 if check_idx < lines.len() {
1423 let check_info = &lines[check_idx];
1424 let is_list_breaking_content = if check_info.in_code_block {
1426 let last_item_marker_width =
1428 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1429 lines[block_last_item_line - 1]
1430 .list_item
1431 .as_ref()
1432 .map(|li| {
1433 if li.is_ordered {
1434 li.marker.len() + 1 } else {
1436 li.marker.len()
1437 }
1438 })
1439 .unwrap_or(3) } else {
1441 3 };
1443
1444 let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1445
1446 let context = CodeBlockUtils::analyze_code_block_context(
1448 lines,
1449 check_line - 1,
1450 min_continuation,
1451 );
1452
1453 matches!(context, CodeBlockContext::Standalone)
1455 } else if !check_info.is_blank && check_info.list_item.is_none() {
1456 let line_content = check_info.content.trim();
1458
1459 if check_info.heading.is_some()
1461 || line_content.starts_with("---")
1462 || line_content.starts_with("***")
1463 || line_content.starts_with("___")
1464 || line_content.contains('|')
1465 || line_content.starts_with(">")
1466 {
1467 true
1468 }
1469 else {
1471 let last_item_marker_width =
1472 if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1473 lines[block_last_item_line - 1]
1474 .list_item
1475 .as_ref()
1476 .map(|li| {
1477 if li.is_ordered {
1478 li.marker.len() + 1 } else {
1480 li.marker.len()
1481 }
1482 })
1483 .unwrap_or(3) } else {
1485 3 };
1487
1488 let min_continuation =
1489 if block.is_ordered { last_item_marker_width } else { 2 };
1490 check_info.indent < min_continuation
1491 }
1492 } else {
1493 false
1494 };
1495
1496 if is_list_breaking_content {
1497 found_non_list = true;
1499 break;
1500 }
1501 }
1502 }
1503 found_non_list
1504 };
1505
1506 let continues_list = if is_nested {
1510 same_context && reasonable_distance && !has_non_list_content
1512 } else {
1513 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
1515 };
1516
1517 if continues_list {
1518 block.end_line = line_num;
1520 block.item_lines.push(line_num);
1521
1522 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1524 list_item.marker.len() + 1
1525 } else {
1526 list_item.marker.len()
1527 });
1528
1529 if !block.is_ordered
1531 && block.marker.is_some()
1532 && block.marker.as_ref() != Some(&list_item.marker)
1533 {
1534 block.marker = None;
1536 }
1537 } else {
1538 list_blocks.push(block.clone());
1540
1541 *block = ListBlock {
1542 start_line: line_num,
1543 end_line: line_num,
1544 is_ordered: list_item.is_ordered,
1545 marker: if list_item.is_ordered {
1546 None
1547 } else {
1548 Some(list_item.marker.clone())
1549 },
1550 blockquote_prefix: blockquote_prefix.clone(),
1551 item_lines: vec![line_num],
1552 nesting_level: nesting,
1553 max_marker_width: if list_item.is_ordered {
1554 list_item.marker.len() + 1
1555 } else {
1556 list_item.marker.len()
1557 },
1558 };
1559 }
1560 } else {
1561 current_block = Some(ListBlock {
1563 start_line: line_num,
1564 end_line: line_num,
1565 is_ordered: list_item.is_ordered,
1566 marker: if list_item.is_ordered {
1567 None
1568 } else {
1569 Some(list_item.marker.clone())
1570 },
1571 blockquote_prefix,
1572 item_lines: vec![line_num],
1573 nesting_level: nesting,
1574 max_marker_width: list_item.marker.len(),
1575 });
1576 }
1577
1578 last_list_item_line = line_num;
1579 current_indent_level = item_indent;
1580 last_marker_width = if list_item.is_ordered {
1581 list_item.marker.len() + 1 } else {
1583 list_item.marker.len()
1584 };
1585 } else if let Some(ref mut block) = current_block {
1586 let min_continuation_indent = if block.is_ordered {
1597 current_indent_level + last_marker_width
1598 } else {
1599 current_indent_level + 2 };
1601
1602 if line_info.indent >= min_continuation_indent {
1603 block.end_line = line_num;
1605 } else if line_info.is_blank {
1606 let mut check_idx = line_idx + 1;
1609 let mut found_continuation = false;
1610
1611 while check_idx < lines.len() && lines[check_idx].is_blank {
1613 check_idx += 1;
1614 }
1615
1616 if check_idx < lines.len() {
1617 let next_line = &lines[check_idx];
1618 if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
1620 found_continuation = true;
1621 }
1622 else if !next_line.in_code_block
1624 && next_line.list_item.is_some()
1625 && let Some(item) = &next_line.list_item
1626 {
1627 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
1628 .find(&next_line.content)
1629 .map_or(String::new(), |m| m.as_str().to_string());
1630 if item.marker_column == current_indent_level
1631 && item.is_ordered == block.is_ordered
1632 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
1633 {
1634 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
1637 if let Some(between_line) = lines.get(idx) {
1638 let trimmed = between_line.content.trim();
1639 if trimmed.is_empty() {
1641 return false;
1642 }
1643 let line_indent =
1645 between_line.content.len() - between_line.content.trim_start().len();
1646
1647 if trimmed.starts_with("```")
1649 || trimmed.starts_with("~~~")
1650 || trimmed.starts_with("---")
1651 || trimmed.starts_with("***")
1652 || trimmed.starts_with("___")
1653 || trimmed.starts_with(">")
1654 || trimmed.contains('|') || between_line.heading.is_some()
1656 {
1657 return true; }
1659
1660 line_indent >= min_continuation_indent
1662 } else {
1663 false
1664 }
1665 });
1666
1667 if block.is_ordered {
1668 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1671 if let Some(between_line) = lines.get(idx) {
1672 let trimmed = between_line.content.trim();
1673 if trimmed.is_empty() {
1674 return false;
1675 }
1676 trimmed.starts_with("```")
1678 || trimmed.starts_with("~~~")
1679 || trimmed.starts_with("---")
1680 || trimmed.starts_with("***")
1681 || trimmed.starts_with("___")
1682 || trimmed.starts_with(">")
1683 || trimmed.contains('|') || between_line.heading.is_some()
1685 } else {
1686 false
1687 }
1688 });
1689 found_continuation = !has_structural_separators;
1690 } else {
1691 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1693 if let Some(between_line) = lines.get(idx) {
1694 let trimmed = between_line.content.trim();
1695 if trimmed.is_empty() {
1696 return false;
1697 }
1698 trimmed.starts_with("```")
1700 || trimmed.starts_with("~~~")
1701 || trimmed.starts_with("---")
1702 || trimmed.starts_with("***")
1703 || trimmed.starts_with("___")
1704 || trimmed.starts_with(">")
1705 || trimmed.contains('|') || between_line.heading.is_some()
1707 } else {
1708 false
1709 }
1710 });
1711 found_continuation = !has_structural_separators;
1712 }
1713 }
1714 }
1715 }
1716
1717 if found_continuation {
1718 block.end_line = line_num;
1720 } else {
1721 list_blocks.push(block.clone());
1723 current_block = None;
1724 }
1725 } else {
1726 let min_required_indent = if block.is_ordered {
1729 current_indent_level + last_marker_width
1730 } else {
1731 current_indent_level + 2
1732 };
1733
1734 let line_content = line_info.content.trim();
1739 let is_structural_separator = line_info.heading.is_some()
1740 || line_content.starts_with("```")
1741 || line_content.starts_with("~~~")
1742 || line_content.starts_with("---")
1743 || line_content.starts_with("***")
1744 || line_content.starts_with("___")
1745 || line_content.starts_with(">")
1746 || line_content.contains('|'); let is_lazy_continuation = last_list_item_line == line_num - 1
1749 && !is_structural_separator
1750 && !line_info.is_blank
1751 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
1752
1753 if is_lazy_continuation {
1754 let content_to_check = if !blockquote_prefix.is_empty() {
1757 line_info
1759 .content
1760 .strip_prefix(&blockquote_prefix)
1761 .unwrap_or(&line_info.content)
1762 .trim()
1763 } else {
1764 line_info.content.trim()
1765 };
1766
1767 let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
1768
1769 if starts_with_uppercase && last_list_item_line > 0 {
1772 list_blocks.push(block.clone());
1774 current_block = None;
1775 } else {
1776 block.end_line = line_num;
1778 }
1779 } else {
1780 list_blocks.push(block.clone());
1782 current_block = None;
1783 }
1784 }
1785 }
1786 }
1787
1788 if let Some(block) = current_block {
1790 list_blocks.push(block);
1791 }
1792
1793 merge_adjacent_list_blocks(&mut list_blocks, lines);
1795
1796 list_blocks
1797 }
1798
1799 fn compute_char_frequency(content: &str) -> CharFrequency {
1801 let mut frequency = CharFrequency::default();
1802
1803 for ch in content.chars() {
1804 match ch {
1805 '#' => frequency.hash_count += 1,
1806 '*' => frequency.asterisk_count += 1,
1807 '_' => frequency.underscore_count += 1,
1808 '-' => frequency.hyphen_count += 1,
1809 '+' => frequency.plus_count += 1,
1810 '>' => frequency.gt_count += 1,
1811 '|' => frequency.pipe_count += 1,
1812 '[' => frequency.bracket_count += 1,
1813 '`' => frequency.backtick_count += 1,
1814 '<' => frequency.lt_count += 1,
1815 '!' => frequency.exclamation_count += 1,
1816 '\n' => frequency.newline_count += 1,
1817 _ => {}
1818 }
1819 }
1820
1821 frequency
1822 }
1823
1824 fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1826 lazy_static! {
1827 static ref HTML_TAG_REGEX: regex::Regex =
1828 regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*(/?)>").unwrap();
1829 }
1830
1831 let mut html_tags = Vec::with_capacity(content.matches('<').count());
1832
1833 for cap in HTML_TAG_REGEX.captures_iter(content) {
1834 let full_match = cap.get(0).unwrap();
1835 let match_start = full_match.start();
1836 let match_end = full_match.end();
1837
1838 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1840 continue;
1841 }
1842
1843 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1844 let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1845 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1846
1847 let mut line_num = 1;
1849 let mut col_start = match_start;
1850 let mut col_end = match_end;
1851 for (idx, line_info) in lines.iter().enumerate() {
1852 if match_start >= line_info.byte_offset {
1853 line_num = idx + 1;
1854 col_start = match_start - line_info.byte_offset;
1855 col_end = match_end - line_info.byte_offset;
1856 } else {
1857 break;
1858 }
1859 }
1860
1861 html_tags.push(HtmlTag {
1862 line: line_num,
1863 start_col: col_start,
1864 end_col: col_end,
1865 byte_offset: match_start,
1866 byte_end: match_end,
1867 tag_name,
1868 is_closing,
1869 is_self_closing,
1870 raw_content: full_match.as_str().to_string(),
1871 });
1872 }
1873
1874 html_tags
1875 }
1876
1877 fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1879 lazy_static! {
1880 static ref EMPHASIS_REGEX: regex::Regex =
1881 regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1882 }
1883
1884 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1885
1886 for cap in EMPHASIS_REGEX.captures_iter(content) {
1887 let full_match = cap.get(0).unwrap();
1888 let match_start = full_match.start();
1889 let match_end = full_match.end();
1890
1891 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1893 continue;
1894 }
1895
1896 let opening_markers = cap.get(1).unwrap().as_str();
1897 let content_part = cap.get(2).unwrap().as_str();
1898 let closing_markers = cap.get(3).unwrap().as_str();
1899
1900 if opening_markers.chars().next() != closing_markers.chars().next()
1902 || opening_markers.len() != closing_markers.len()
1903 {
1904 continue;
1905 }
1906
1907 let marker = opening_markers.chars().next().unwrap();
1908 let marker_count = opening_markers.len();
1909
1910 let mut line_num = 1;
1912 let mut col_start = match_start;
1913 let mut col_end = match_end;
1914 for (idx, line_info) in lines.iter().enumerate() {
1915 if match_start >= line_info.byte_offset {
1916 line_num = idx + 1;
1917 col_start = match_start - line_info.byte_offset;
1918 col_end = match_end - line_info.byte_offset;
1919 } else {
1920 break;
1921 }
1922 }
1923
1924 emphasis_spans.push(EmphasisSpan {
1925 line: line_num,
1926 start_col: col_start,
1927 end_col: col_end,
1928 byte_offset: match_start,
1929 byte_end: match_end,
1930 marker,
1931 marker_count,
1932 content: content_part.to_string(),
1933 });
1934 }
1935
1936 emphasis_spans
1937 }
1938
1939 fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
1941 let mut table_rows = Vec::with_capacity(lines.len() / 20);
1942
1943 for (line_idx, line_info) in lines.iter().enumerate() {
1944 if line_info.in_code_block || line_info.is_blank {
1946 continue;
1947 }
1948
1949 let line = &line_info.content;
1950 let line_num = line_idx + 1;
1951
1952 if !line.contains('|') {
1954 continue;
1955 }
1956
1957 let parts: Vec<&str> = line.split('|').collect();
1959 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
1960
1961 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
1963 let mut column_alignments = Vec::new();
1964
1965 if is_separator {
1966 for part in &parts[1..parts.len() - 1] {
1967 let trimmed = part.trim();
1969 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
1970 "center".to_string()
1971 } else if trimmed.ends_with(':') {
1972 "right".to_string()
1973 } else if trimmed.starts_with(':') {
1974 "left".to_string()
1975 } else {
1976 "none".to_string()
1977 };
1978 column_alignments.push(alignment);
1979 }
1980 }
1981
1982 table_rows.push(TableRow {
1983 line: line_num,
1984 is_separator,
1985 column_count,
1986 column_alignments,
1987 });
1988 }
1989
1990 table_rows
1991 }
1992
1993 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
1995 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
1996
1997 for cap in BARE_URL_PATTERN.captures_iter(content) {
1999 let full_match = cap.get(0).unwrap();
2000 let match_start = full_match.start();
2001 let match_end = full_match.end();
2002
2003 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2005 continue;
2006 }
2007
2008 let preceding_char = if match_start > 0 {
2010 content.chars().nth(match_start - 1)
2011 } else {
2012 None
2013 };
2014 let following_char = content.chars().nth(match_end);
2015
2016 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2017 continue;
2018 }
2019 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2020 continue;
2021 }
2022
2023 let url = full_match.as_str();
2024 let url_type = if url.starts_with("https://") {
2025 "https"
2026 } else if url.starts_with("http://") {
2027 "http"
2028 } else if url.starts_with("ftp://") {
2029 "ftp"
2030 } else {
2031 "other"
2032 };
2033
2034 let mut line_num = 1;
2036 let mut col_start = match_start;
2037 let mut col_end = match_end;
2038 for (idx, line_info) in lines.iter().enumerate() {
2039 if match_start >= line_info.byte_offset {
2040 line_num = idx + 1;
2041 col_start = match_start - line_info.byte_offset;
2042 col_end = match_end - line_info.byte_offset;
2043 } else {
2044 break;
2045 }
2046 }
2047
2048 bare_urls.push(BareUrl {
2049 line: line_num,
2050 start_col: col_start,
2051 end_col: col_end,
2052 byte_offset: match_start,
2053 byte_end: match_end,
2054 url: url.to_string(),
2055 url_type: url_type.to_string(),
2056 });
2057 }
2058
2059 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2061 let full_match = cap.get(0).unwrap();
2062 let match_start = full_match.start();
2063 let match_end = full_match.end();
2064
2065 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2067 continue;
2068 }
2069
2070 let preceding_char = if match_start > 0 {
2072 content.chars().nth(match_start - 1)
2073 } else {
2074 None
2075 };
2076 let following_char = content.chars().nth(match_end);
2077
2078 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2079 continue;
2080 }
2081 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2082 continue;
2083 }
2084
2085 let email = full_match.as_str();
2086
2087 let mut line_num = 1;
2089 let mut col_start = match_start;
2090 let mut col_end = match_end;
2091 for (idx, line_info) in lines.iter().enumerate() {
2092 if match_start >= line_info.byte_offset {
2093 line_num = idx + 1;
2094 col_start = match_start - line_info.byte_offset;
2095 col_end = match_end - line_info.byte_offset;
2096 } else {
2097 break;
2098 }
2099 }
2100
2101 bare_urls.push(BareUrl {
2102 line: line_num,
2103 start_col: col_start,
2104 end_col: col_end,
2105 byte_offset: match_start,
2106 byte_end: match_end,
2107 url: email.to_string(),
2108 url_type: "email".to_string(),
2109 });
2110 }
2111
2112 bare_urls
2113 }
2114}
2115
2116fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2118 if list_blocks.len() < 2 {
2119 return;
2120 }
2121
2122 let mut merger = ListBlockMerger::new(lines);
2123 *list_blocks = merger.merge(list_blocks);
2124}
2125
2126struct ListBlockMerger<'a> {
2128 lines: &'a [LineInfo],
2129}
2130
2131impl<'a> ListBlockMerger<'a> {
2132 fn new(lines: &'a [LineInfo]) -> Self {
2133 Self { lines }
2134 }
2135
2136 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2137 let mut merged = Vec::with_capacity(list_blocks.len());
2138 let mut current = list_blocks[0].clone();
2139
2140 for next in list_blocks.iter().skip(1) {
2141 if self.should_merge_blocks(¤t, next) {
2142 current = self.merge_two_blocks(current, next);
2143 } else {
2144 merged.push(current);
2145 current = next.clone();
2146 }
2147 }
2148
2149 merged.push(current);
2150 merged
2151 }
2152
2153 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2155 if !self.blocks_are_compatible(current, next) {
2157 return false;
2158 }
2159
2160 let spacing = self.analyze_spacing_between(current, next);
2162 match spacing {
2163 BlockSpacing::Consecutive => true,
2164 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2165 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2166 self.can_merge_with_content_between(current, next)
2167 }
2168 }
2169 }
2170
2171 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2173 current.is_ordered == next.is_ordered
2174 && current.blockquote_prefix == next.blockquote_prefix
2175 && current.nesting_level == next.nesting_level
2176 }
2177
2178 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2180 let gap = next.start_line - current.end_line;
2181
2182 match gap {
2183 1 => BlockSpacing::Consecutive,
2184 2 => BlockSpacing::SingleBlank,
2185 _ if gap > 2 => {
2186 if self.has_only_blank_lines_between(current, next) {
2187 BlockSpacing::MultipleBlanks
2188 } else {
2189 BlockSpacing::ContentBetween
2190 }
2191 }
2192 _ => BlockSpacing::Consecutive, }
2194 }
2195
2196 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2198 if has_meaningful_content_between(current, next, self.lines) {
2201 return false; }
2203
2204 !current.is_ordered && current.marker == next.marker
2206 }
2207
2208 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2210 if has_meaningful_content_between(current, next, self.lines) {
2212 return false; }
2214
2215 current.is_ordered && next.is_ordered
2217 }
2218
2219 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2221 for line_num in (current.end_line + 1)..next.start_line {
2222 if let Some(line_info) = self.lines.get(line_num - 1)
2223 && !line_info.content.trim().is_empty()
2224 {
2225 return false;
2226 }
2227 }
2228 true
2229 }
2230
2231 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2233 current.end_line = next.end_line;
2234 current.item_lines.extend_from_slice(&next.item_lines);
2235
2236 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2238
2239 if !current.is_ordered && self.markers_differ(¤t, next) {
2241 current.marker = None; }
2243
2244 current
2245 }
2246
2247 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2249 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2250 }
2251}
2252
/// How two neighbouring list blocks are separated, based on the difference
/// between the second block's start line and the first block's end line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// The second block starts on the line right after the first one ends.
    Consecutive,
    /// Exactly one line lies between the two blocks.
    SingleBlank,
    /// Several lines lie between the blocks and all of them are blank.
    MultipleBlanks,
    /// Several lines lie between the blocks and at least one is not blank.
    ContentBetween,
}
2261
2262fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2264 for line_num in (current.end_line + 1)..next.start_line {
2266 if let Some(line_info) = lines.get(line_num - 1) {
2267 let trimmed = line_info.content.trim();
2269
2270 if trimmed.is_empty() {
2272 continue;
2273 }
2274
2275 if line_info.heading.is_some() {
2279 return true; }
2281
2282 if is_horizontal_rule(trimmed) {
2284 return true; }
2286
2287 if trimmed.contains('|') && trimmed.len() > 1 {
2289 return true; }
2291
2292 if trimmed.starts_with('>') {
2294 return true; }
2296
2297 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2299 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2300
2301 let min_continuation_indent = if current.is_ordered {
2303 current.nesting_level + current.max_marker_width + 1 } else {
2305 current.nesting_level + 2
2306 };
2307
2308 if line_indent < min_continuation_indent {
2309 return true; }
2312 }
2313
2314 let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2316
2317 let min_indent = if current.is_ordered {
2319 current.nesting_level + current.max_marker_width
2320 } else {
2321 current.nesting_level + 2
2322 };
2323
2324 if line_indent < min_indent {
2326 return true; }
2328
2329 }
2332 }
2333
2334 false
2336}
2337
/// Returns `true` when `trimmed` looks like a horizontal rule: it starts with
/// `-`, `*` or `_`, contains at least three of that character, and every
/// other character is a space or a tab.
fn is_horizontal_rule(trimmed: &str) -> bool {
    let marker = match trimmed.chars().next() {
        Some(first @ ('-' | '*' | '_')) => first,
        _ => return false,
    };

    // Single pass without an intermediate buffer: count the marker characters
    // and bail out on anything that is neither the marker nor blank space.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
2361
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("");
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Offsets within a single line map to 1-based columns on line 1.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello");
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);

        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),
            (8, (2, 1)),
            (9, (3, 1)),
            (15, (3, 7)),
            (21, (4, 1)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata: content, byte offset, indent, blank and code-block flags.
    #[test]
    fn test_line_info() {
        // The second line is indented by four spaces, matching the `indent == 4`
        // assertions below.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content);

        assert_eq!(ctx.lines.len(), 7);

        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List markers are recognised for bullet and ordered items, nested or not.
    #[test]
    fn test_list_item_detection() {
        // The nested bullet is indented by two spaces, matching `marker_column == 2`.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content);

        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets on and around newline characters, including the end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content);

        let expectations = [
            (0, (1, 1)),
            (1, (1, 2)),
            (2, (2, 1)),
            (3, (2, 2)),
            (4, (3, 1)),
            (5, (3, 2)),
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }
}