1use crate::rules::heading_utils::HeadingStyle;
2use fancy_regex::Regex as FancyRegex;
3use lazy_static::lazy_static;
4use regex::Regex;
5
/// Pre-computed structural index of a Markdown document.
///
/// An instance is built by [`DocumentStructure::new`], which runs all detection
/// passes once. Line numbers stored in the fields are 1-based unless noted;
/// the per-line bitmaps are indexed by `line number - 1`.
#[derive(Debug, Clone)]
pub struct DocumentStructure {
    /// Fenced and indented code blocks found by the code-block scan.
    pub code_blocks: Vec<CodeBlock>,
    /// `true` when `code_blocks` is non-empty.
    pub has_code_blocks: bool,
    /// 1-based line number of each detected heading (for setext headings, the text line).
    pub heading_lines: Vec<usize>,
    /// Heading level for each entry of `heading_lines` (same order).
    pub heading_levels: Vec<usize>,
    /// `(first_line, last_line)` of each heading; setext headings span two lines.
    pub heading_regions: Vec<(usize, usize)>,
    /// 1-based line numbers of lines that start a list item.
    pub list_lines: Vec<usize>,
    /// `true` when a `---` ... `---` front matter block opens the document.
    pub has_front_matter: bool,
    /// Inclusive 1-based `(start, end)` lines of the front matter, delimiters included.
    pub front_matter_range: Option<(usize, usize)>,
    /// Set when `crate::utils::early_returns::has_urls` reports URLs in the content.
    pub has_urls: bool,
    /// Set when a line starts with `<` + letter and the content contains `</` or `/>`.
    pub has_html: bool,
    /// One flag per line: `true` for lines inside a code block. For fenced
    /// blocks only the lines between the two fence lines are flagged.
    pub in_code_block: Vec<bool>,
    /// `start_line` of every fenced code block.
    pub fenced_code_block_starts: Vec<usize>,
    /// `end_line` of every fenced code block (same order as the starts).
    pub fenced_code_block_ends: Vec<usize>,
    /// Style of the first heading found. `Some(HeadingStyle::Atx)` when the
    /// heading scan ran but found nothing; `None` when the scan never ran.
    pub first_heading_style: Option<HeadingStyle>,
    /// Inline code spans found outside code blocks.
    pub code_spans: Vec<CodeSpan>,
    /// Per line, one flag per byte offset (`line.len() + 1` entries): `true`
    /// for offsets covered by a code span, delimiters included.
    pub in_code_span: Vec<Vec<bool>>,
    /// Inline and reference links found outside code blocks and code spans.
    pub links: Vec<Link>,
    /// Inline and reference images found outside code blocks and code spans.
    pub images: Vec<Image>,
    /// Parsed list items, one per entry of `list_lines`.
    pub list_items: Vec<ListItem>,
    /// Runs of blockquote lines.
    pub blockquotes: Vec<BlockquoteRange>,
    /// One flag per line: `true` for lines that start with a `>` marker.
    pub in_blockquote: Vec<bool>,
    /// One flag per line: `true` for lines that belong to a detected HTML block.
    pub in_html_block: Vec<bool>,
    /// 1-based line numbers of horizontal rules (setext underlines excluded).
    pub horizontal_rule_lines: Vec<usize>,
}
57
/// Description of a front matter block.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// type; the field meanings below are inferred from their names — confirm
/// against the callers.
#[derive(Debug, Clone)]
pub struct FrontMatter {
    /// First line of the block (presumably 1-based — TODO confirm).
    pub start_line: usize,
    /// Last line of the block (presumably 1-based, inclusive — TODO confirm).
    pub end_line: usize,
    /// Text of the front matter block.
    pub content: String,
}
65
/// A single heading together with the text it was parsed from.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// type; the field meanings below are inferred from their names — confirm
/// against the callers.
#[derive(Debug, Clone, PartialEq)]
pub struct Heading {
    /// Heading text (presumably without markers — TODO confirm).
    pub text: String,
    /// Heading level.
    pub level: u32,
    /// Line the heading was found on (presumably 1-based — TODO confirm).
    pub line_number: usize,
    /// The heading as it appeared in the source.
    pub original_text: String,
    /// Leading whitespace that preceded the heading.
    pub indentation: String,
}
75
/// A fenced or indented code block located in the document.
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// 1-based line where the block starts (the opening fence for fenced blocks).
    pub start_line: usize,
    /// 1-based line where the block ends (the closing fence for fenced blocks,
    /// or the last line of the document when the fence is never closed).
    pub end_line: usize,
    /// Info-string word following the opening fence; `None` when absent and
    /// always `None` for indented blocks.
    pub language: Option<String>,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
}
88
/// The two kinds of code block the scanner distinguishes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Delimited by a run of three or more backticks or tildes.
    Fenced,
    /// Formed by lines starting with a tab or at least four spaces.
    Indented,
}
97
/// A list item parsed from a single line.
#[derive(Debug, Clone)]
pub struct ListItem {
    /// 1-based line the item starts on.
    pub line_number: usize,
    /// Length in bytes of the leading spaces/tabs before the marker.
    pub indentation: usize,
    /// The marker text itself, e.g. `-`, `*`, `+` or `12.`.
    pub marker: String,
    /// Kind of list the marker belongs to.
    pub marker_type: ListMarkerType,
    /// Text after the marker and its trailing whitespace (for task items,
    /// the text after the `[ ]` / `[x]` checkbox).
    pub content: String,
}
107
/// Kind of list a [`ListItem`] marker belongs to.
///
/// Equality on this enum is total, so it derives `Eq` alongside `PartialEq`
/// (matching `CodeBlockType`); this lets the type be used wherever an `Eq`
/// bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Bullet item introduced by `*`, `+` or `-`.
    Unordered,
    /// Numbered item introduced by digits followed by `.`.
    Ordered,
    /// Bullet item whose text starts with a `[ ]` / `[x]` checkbox.
    Task,
}
115
/// A run of blockquote lines.
#[derive(Debug, Clone)]
pub struct BlockquoteRange {
    /// 1-based line of the first `>` line in the run.
    pub start_line: usize,
    /// 1-based line of the last line in the run (inclusive).
    pub end_line: usize,
}
122
/// An inline code span on a single line.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based line the span is on.
    pub line: usize,
    /// 1-based byte column of the opening backtick.
    pub start_col: usize,
    /// 1-based byte column of the closing backtick.
    pub end_col: usize,
    /// Text between the delimiters.
    pub content: String,
}
135
/// An inline (`[text](url)`) or reference (`[text][id]`) link.
#[derive(Debug, Clone)]
pub struct Link {
    /// 1-based line the link is on.
    pub line: usize,
    /// 1-based byte column where the link was recorded as starting.
    pub start_col: usize,
    /// 1-based byte column of the last character of the link.
    pub end_col: usize,
    /// Text between the first pair of brackets.
    pub text: String,
    /// Destination: the parenthesised URL for inline links, or the URL of the
    /// matching link definition for reference links (empty if none matches).
    pub url: String,
    /// `true` for reference-style links.
    pub is_reference: bool,
    /// Reference label for reference links (the link text when the label is
    /// empty); `None` for inline links.
    pub reference_id: Option<String>,
}
154
/// An inline (`![alt](src)`) or reference (`![alt][id]`) image.
#[derive(Debug, Clone)]
pub struct Image {
    /// 1-based line the image is on.
    pub line: usize,
    /// 1-based byte column where the image was recorded as starting.
    pub start_col: usize,
    /// 1-based byte column of the last character of the image.
    pub end_col: usize,
    /// Text between the first pair of brackets.
    pub alt_text: String,
    /// Source: the parenthesised URL for inline images, or the URL of the
    /// matching link definition for reference images (empty if none matches).
    pub src: String,
    /// `true` for reference-style images.
    pub is_reference: bool,
    /// Reference label for reference images (the alt text when the label is
    /// empty); `None` for inline images.
    pub reference_id: Option<String>,
}
173
// Cheap whole-document pre-checks. `DocumentStructure::analyze` uses them to
// decide whether the corresponding (more expensive) detection pass has to run
// at all. All of them work in multi-line mode, so `^` matches at every line start.
lazy_static! {
    // A line that begins (after optional whitespace) with one to six `#`.
    static ref CONTAINS_ATX_HEADING: Regex = Regex::new(r"(?m)^(\s*)#{1,6}").unwrap();
    // A line consisting only of `=` or only of `-` (a possible setext underline).
    static ref CONTAINS_SETEXT_UNDERLINE: Regex = Regex::new(r"(?m)^(\s*)(=+|-+)\s*$").unwrap();
    // A line that begins with a bullet character or with digits followed by `.`.
    static ref CONTAINS_LIST_MARKERS: Regex = Regex::new(r"(?m)^(\s*)([*+-]|\d+\.)").unwrap();
    // A line that begins with `>`.
    static ref CONTAINS_BLOCKQUOTE: Regex = Regex::new(r"(?m)^(\s*)>").unwrap();
    // A line that begins with `<` followed by an ASCII letter.
    static ref CONTAINS_HTML_BLOCK: Regex = Regex::new(r"(?m)^(\s*)<[a-zA-Z]").unwrap();
}
183
184impl DocumentStructure {
185 pub fn new(content: &str) -> Self {
187 let mut structure = DocumentStructure {
189 code_blocks: Vec::new(),
190 has_code_blocks: false,
191 heading_lines: Vec::new(),
192 heading_levels: Vec::new(),
193 heading_regions: Vec::new(),
194 list_lines: Vec::new(),
195 has_front_matter: false,
196 front_matter_range: None,
197 has_urls: false,
198 has_html: false,
199 in_code_block: Vec::new(),
200 fenced_code_block_starts: Vec::new(),
201 fenced_code_block_ends: Vec::new(),
202 first_heading_style: None,
203 code_spans: Vec::new(),
205 in_code_span: Vec::new(),
206 links: Vec::new(),
207 images: Vec::new(),
208 list_items: Vec::new(),
209 blockquotes: Vec::new(),
210 in_blockquote: Vec::new(),
211 in_html_block: Vec::new(),
212 horizontal_rule_lines: Vec::new(),
213 };
214
215 structure.analyze(content);
217 structure
218 }
219
220 fn analyze(&mut self, content: &str) {
222 if content.is_empty() {
224 return;
225 }
226
227 let lines: Vec<&str> = content.lines().collect();
229 self.in_code_span = vec![Vec::new(); lines.len()];
230 for (i, line) in lines.iter().enumerate() {
231 self.in_code_span[i] = vec![false; line.len() + 1]; }
233 self.in_blockquote = vec![false; lines.len()];
234 self.in_html_block = vec![false; lines.len()];
235
236 self.detect_front_matter(content);
238
239 let has_blockquote_markers = CONTAINS_BLOCKQUOTE.is_match(content);
241 let has_html_blocks = CONTAINS_HTML_BLOCK.is_match(content);
242
243 if has_html_blocks {
245 self.detect_html_blocks(content);
246 }
247
248 self.code_blocks = self.compute_code_blocks(content);
250 self.has_code_blocks = !self.code_blocks.is_empty();
251
252 self.compute_code_block_bitmap(content);
254
255 self.populate_fenced_code_blocks();
257 let has_backticks = content.contains('`');
258 let has_brackets = content.contains('[');
259 let has_headings = CONTAINS_ATX_HEADING.is_match(content) || CONTAINS_SETEXT_UNDERLINE.is_match(content);
260 let has_list_markers = CONTAINS_LIST_MARKERS.is_match(content)
262 || content.contains("- ")
263 || content.contains("* ")
264 || content.contains("+ ")
265 || content.contains("1. ")
266 || content.contains("2. ")
267 || content.contains("3. ")
268 || content.contains("4. ")
269 || content.contains("5. ")
270 || content.contains("6. ")
271 || content.contains("7. ")
272 || content.contains("8. ")
273 || content.contains("9. ")
274 || content.contains("10. ")
275 || content.contains("11. ")
276 || content.contains("12. ");
277
278 if has_blockquote_markers {
280 self.detect_blockquotes(content);
281 }
282
283 if has_backticks {
285 self.detect_code_spans(content);
286 }
287
288 if has_brackets {
290 self.detect_links_and_images(content);
291 }
292
293 if has_headings {
295 self.detect_headings(content);
296 }
297
298 if has_list_markers {
300 self.detect_list_items(content);
301 }
302
303 let has_potential_hrs = content.contains("---")
305 || content.contains("***")
306 || content.contains("___")
307 || content.contains("- -")
308 || content.contains("* *")
309 || content.contains("_ _");
310 if has_potential_hrs {
311 self.detect_horizontal_rules(content);
312 }
313
314 if crate::utils::early_returns::has_urls(content) {
316 self.has_urls = true;
317 }
318
319 if has_html_blocks && (content.contains("</") || content.contains("/>")) {
321 self.has_html = true;
322 }
323 }
324
325 fn compute_code_block_bitmap(&mut self, content: &str) {
327 let line_count = content.lines().count();
328 self.in_code_block = vec![false; line_count];
329
330 for block in &self.code_blocks {
331 let start = block.start_line.saturating_sub(1); let end = block.end_line.min(line_count); if let CodeBlockType::Fenced = block.block_type {
336 if end > start + 1 {
338 for i in (start + 1)..(end - 1) {
339 if i < self.in_code_block.len() {
340 self.in_code_block[i] = true;
341 }
342 }
343 }
344 } else {
345 for i in start..end {
347 if i < self.in_code_block.len() {
348 self.in_code_block[i] = true;
349 }
350 }
351 }
352 }
353 }
354
355 pub fn is_in_code_block(&self, line_num: usize) -> bool {
357 if line_num == 0 || line_num > self.in_code_block.len() {
358 return false;
359 }
360 self.in_code_block[line_num - 1] }
362
363 fn detect_headings(&mut self, content: &str) {
365 lazy_static! {
366 static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+|[^\s#])").unwrap();
367 static ref SETEXT_HEADING_UNDERLINE: Regex = Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
368 }
369
370 self.heading_lines.clear();
372 self.heading_levels.clear();
373 self.heading_regions.clear();
374 self.first_heading_style = None;
375
376 let lines: Vec<&str> = content.lines().collect();
377
378 for (i, line) in lines.iter().enumerate() {
379 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
381 continue;
382 }
383
384 if line.trim().is_empty() {
386 continue;
387 }
388
389 if let Some(captures) = ATX_HEADING.captures(line) {
391 let level = captures[2].len();
392 let mut chars = line.trim().chars();
394 while chars.next() == Some('#') {}
395 let heading_text = chars.as_str().trim();
396 if heading_text.is_empty() {
397 continue; }
399 self.heading_lines.push(i + 1);
400 self.heading_levels.push(level);
401 self.heading_regions.push((i + 1, i + 1)); if self.first_heading_style.is_none() {
405 if line.trim().ends_with('#') {
407 self.first_heading_style = Some(HeadingStyle::AtxClosed);
408 } else {
409 self.first_heading_style = Some(HeadingStyle::Atx);
410 }
411 }
412 continue;
413 }
414
415 if i > 0 && !lines[i - 1].trim().is_empty() &&
417 !self.is_in_front_matter(i) && SETEXT_HEADING_UNDERLINE.is_match(line)
419 {
420 let content_line = lines[i - 1].trim();
421 if content_line.is_empty() {
422 continue; }
424 let level = if line.trim().starts_with('=') { 1 } else { 2 };
425 self.heading_lines.push(i); self.heading_levels.push(level);
427 self.heading_regions.push((i, i + 1)); if self.first_heading_style.is_none() {
431 if level == 1 {
432 self.first_heading_style = Some(HeadingStyle::Setext1);
433 } else {
434 self.first_heading_style = Some(HeadingStyle::Setext2);
435 }
436 }
437 }
438 }
439
440 if self.heading_lines.is_empty() {
442 self.first_heading_style = Some(HeadingStyle::Atx);
443 }
444 }
445
446 fn detect_front_matter(&mut self, content: &str) {
448 let lines: Vec<&str> = content.lines().collect();
449
450 self.has_front_matter = false;
452 self.front_matter_range = None;
453
454 if !lines.is_empty() && lines[0] == "---" {
456 for (i, line) in lines.iter().enumerate().skip(1) {
458 if *line == "---" {
459 self.has_front_matter = true;
460 self.front_matter_range = Some((1, i + 1));
461 break;
462 }
463 }
464 }
465 }
466
    /// Scans `content` line by line and returns every fenced and indented code block.
    ///
    /// The scan is a two-state machine: outside a fenced block it looks for an
    /// opening fence or the start of an indented block; inside a fenced block it
    /// only looks for a matching closing fence. A fence that is never closed
    /// produces a block that runs to the last line of the document.
    /// All line numbers in the returned blocks are 1-based and inclusive.
    fn compute_code_blocks(&self, content: &str) -> Vec<CodeBlock> {
        lazy_static! {
            // Opening fence: up to three spaces of indent, three or more backticks
            // or tildes, then an optional info-string word (captured as group 3).
            static ref FENCED_START: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*([^`\s]*)").unwrap();
            // Closing fence: same shape, but nothing except whitespace may follow.
            static ref FENCED_END: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*$").unwrap();
        }

        let mut code_blocks = Vec::new();
        let mut in_code_block = false;
        // State describing the currently open fence (only meaningful while
        // `in_code_block` is true).
        let mut current_block_start = 0;
        let mut current_language = None;
        let mut current_fence_char = ' ';
        let mut current_fence_length = 0;
        let mut current_fence_indent = 0;
        let lines: Vec<&str> = content.lines().collect();

        let mut i = 0;
        while i < lines.len() {
            let line = lines[i];

            if !in_code_block {
                if let Some(captures) = FENCED_START.captures(line) {
                    // Opening fence: remember its shape so the closing fence can be matched.
                    in_code_block = true;
                    current_block_start = i + 1;
                    let indent = captures.get(1).map_or("", |m| m.as_str());
                    current_fence_indent = indent.len();
                    let fence = captures.get(2).map_or("```", |m| m.as_str());
                    current_fence_char = fence.chars().next().unwrap();
                    current_fence_length = fence.len();

                    // An empty info string is stored as `None`.
                    let lang = captures.get(3).map(|m| m.as_str().to_string());
                    current_language = lang.filter(|l| !l.is_empty());
                }
                else if Self::is_indented_code_line(line) && !line.trim().is_empty() && !self.is_in_html_block(i + 1)
                {
                    // Start of an indented block; extend `end_line` (0-based) as far
                    // as the block continues.
                    let mut end_line = i;

                    while end_line + 1 < lines.len() {
                        let next_line = lines[end_line + 1];

                        if Self::is_indented_code_line(next_line)
                            && !next_line.trim().is_empty()
                            && !self.is_in_html_block(end_line + 2)
                        {
                            end_line += 1;
                        } else if next_line.trim().is_empty() {
                            // A blank line only belongs to the block if more indented
                            // code follows after the run of blank lines.
                            let mut lookahead = end_line + 2;
                            let mut found_indented = false;

                            while lookahead < lines.len() {
                                let lookahead_line = lines[lookahead];
                                if Self::is_indented_code_line(lookahead_line)
                                    && !lookahead_line.trim().is_empty()
                                    && !self.is_in_html_block(lookahead + 1)
                                {
                                    found_indented = true;
                                    break;
                                } else if !lookahead_line.trim().is_empty() {
                                    // Non-indented text ends the block before the blank lines.
                                    break;
                                }
                                lookahead += 1;
                            }

                            if found_indented {
                                end_line += 1;
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }

                    code_blocks.push(CodeBlock {
                        start_line: i + 1,
                        end_line: end_line + 1,
                        language: None,
                        block_type: CodeBlockType::Indented,
                    });

                    // Resume after the block; the `i += 1` below steps past its last line.
                    i = end_line;
                }
            } else {
                if let Some(captures) = FENCED_END.captures(line) {
                    let indent = captures.get(1).map_or("", |m| m.as_str());
                    let fence = captures.get(2).map_or("", |m| m.as_str());

                    // A fence closes the block only if it uses the same character,
                    // is at least as long as the opener and is not indented deeper.
                    if fence.starts_with(current_fence_char)
                        && fence.len() >= current_fence_length
                        && indent.len() <= current_fence_indent
                    {
                        code_blocks.push(CodeBlock {
                            start_line: current_block_start,
                            end_line: i + 1,
                            language: current_language.clone(),
                            block_type: CodeBlockType::Fenced,
                        });

                        in_code_block = false;
                        current_language = None;
                        current_fence_char = ' ';
                        current_fence_length = 0;
                        current_fence_indent = 0;
                    }
                }
            }

            i += 1;
        }

        // Unclosed fence: the block extends to the end of the document.
        if in_code_block {
            code_blocks.push(CodeBlock {
                start_line: current_block_start,
                end_line: lines.len(),
                language: current_language,
                block_type: CodeBlockType::Fenced,
            });
        }

        code_blocks
    }
608
609 fn populate_fenced_code_blocks(&mut self) {
611 self.fenced_code_block_starts.clear();
612 self.fenced_code_block_ends.clear();
613
614 for block in &self.code_blocks {
615 if let CodeBlockType::Fenced = block.block_type {
616 self.fenced_code_block_starts.push(block.start_line);
617 self.fenced_code_block_ends.push(block.end_line);
618 }
619 }
620 }
621
622 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
624 if let Some((start, end)) = self.front_matter_range {
625 line_num >= start && line_num <= end
626 } else {
627 false
628 }
629 }
630
631 #[inline]
636 pub fn count_trailing_spaces(line: &str) -> usize {
637 let content = line.strip_suffix('\n').unwrap_or(line);
639
640 let mut space_count = 0;
642 for c in content.chars().rev() {
643 if c == ' ' {
644 space_count += 1;
645 } else {
646 break;
647 }
648 }
649
650 space_count
651 }
652
653 #[inline]
658 pub fn has_trailing_spaces(line: &str) -> bool {
659 Self::count_trailing_spaces(line) > 0
660 }
661
662 #[inline]
668 fn is_indented_code_line(line: &str) -> bool {
669 if line.starts_with('\t') {
670 return true;
671 }
672
673 let mut space_count = 0;
675 for c in line.chars() {
676 if c == ' ' {
677 space_count += 1;
678 } else {
679 break;
680 }
681 }
682
683 space_count >= 4
684 }
685
686 pub fn get_list_start_indices(&self) -> Vec<usize> {
689 if self.list_lines.is_empty() {
690 return Vec::new();
691 }
692
693 let mut list_starts = Vec::new();
694 let mut prev_line = 0;
695
696 for (i, &line_num) in self.list_lines.iter().enumerate() {
697 if i == 0 || line_num > prev_line + 1 {
700 list_starts.push(line_num - 1); }
702 prev_line = line_num;
703 }
704
705 list_starts
706 }
707
708 pub fn get_list_end_indices(&self) -> Vec<usize> {
711 if self.list_lines.is_empty() {
712 return Vec::new();
713 }
714
715 let mut list_ends = Vec::new();
716 let list_lines = &self.list_lines;
717
718 for (i, &line_num) in list_lines.iter().enumerate() {
719 if i == list_lines.len() - 1 || list_lines[i + 1] > line_num + 1 {
722 list_ends.push(line_num - 1); }
724 }
725
726 list_ends
727 }
728
729 fn detect_code_spans(&mut self, content: &str) {
731 self.code_spans.clear();
733
734 let lines: Vec<&str> = content.lines().collect();
735
736 for (line_num, line) in lines.iter().enumerate() {
739 if self.is_in_code_block(line_num + 1) {
741 continue;
742 }
743
744 if line.is_empty() {
746 continue;
747 }
748
749 let mut i = 0;
750 while i < line.len() {
751 if let Some(start_pos) = line[i..].find('`') {
753 let start_idx = i + start_pos;
754
755 if let Some(end_pos) = line[start_idx + 1..].find('`') {
757 let end_idx = start_idx + 1 + end_pos;
758
759 let content = line[start_idx + 1..end_idx].to_string();
761
762 self.code_spans.push(CodeSpan {
764 line: line_num + 1, start_col: start_idx + 1, end_col: end_idx + 1, content,
768 });
769
770 for col in start_idx..=end_idx {
772 if col < self.in_code_span[line_num].len() {
773 self.in_code_span[line_num][col] = true;
774 }
775 }
776
777 i = end_idx + 1;
779 } else {
780 i = start_idx + 1;
782 }
783 } else {
784 break;
786 }
787 }
788 }
789 }
790
791 fn detect_links_and_images(&mut self, content: &str) {
793 lazy_static! {
794 static ref INLINE_LINK: FancyRegex = FancyRegex::new(r"(?x)
796 (?<!\\) # Not preceded by backslash
797 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text (handles nested brackets and escapes)
798 \(([^)]*)\) # URL in parentheses
799 ").unwrap();
800 static ref REFERENCE_LINK: FancyRegex = FancyRegex::new(r"(?x)
802 (?<!\\) # Not preceded by backslash
803 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text (handles nested brackets and escapes)
804 \[([^\]]*)\] # Reference ID
805 ").unwrap();
806 static ref SHORTCUT_LINK: FancyRegex = FancyRegex::new(r"(?x)
808 (?<!\\) # Not preceded by backslash
809 \[([^\]]+)\] # Link text
810 (?!\(|\[) # Not followed by ( or [
811 ").unwrap();
812 static ref LINK_DEFINITION: Regex = Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap();
814 static ref INLINE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
816 (?<!\\) # Not preceded by backslash
817 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
818 \(([^)]*)\) # Source URL
819 ").unwrap();
820 static ref REFERENCE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
822 (?<!\\) # Not preceded by backslash
823 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
824 \[([^\]]*)\] # Reference ID
825 ").unwrap();
826 }
827
828 self.links.clear();
830 self.images.clear();
831
832 let lines: Vec<&str> = content.lines().collect();
833
834 let mut link_defs = std::collections::HashMap::new();
836 for (line_num, line) in lines.iter().enumerate() {
837 if self.is_in_code_block(line_num + 1) {
839 continue;
840 }
841
842 if let Some(cap) = LINK_DEFINITION.captures(line) {
844 let id = cap.get(1).map_or("", |m| m.as_str()).to_string();
845 let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
846 link_defs.insert(id.to_lowercase(), url);
847 }
848 }
849
850 for (line_num, line) in lines.iter().enumerate() {
852 if self.is_in_code_block(line_num + 1) {
854 continue;
855 }
856
857 if line.is_empty() {
859 continue;
860 }
861
862 if !line.contains('[') && !line.contains('!') {
864 continue;
865 }
866
867 let mut i = 0;
869 while i < line.len() {
870 if i < self.in_code_span[line_num].len() && self.in_code_span[line_num][i] {
872 i += 1;
873 continue;
874 }
875
876 if let Some(rest) = line.get(i..) {
878 if rest.starts_with('[') {
879 let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
881 let is_escaped_image =
882 i > 1 && line.chars().nth(i - 2) == Some('\\') && line.chars().nth(i - 1) == Some('!');
883 if !is_escaped && !is_escaped_image {
884 if let Ok(Some(cap)) = INLINE_LINK.captures(rest) {
885 let whole_match = cap.get(0).unwrap();
886 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
887 let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
888
889 let is_in_span = (i..i + whole_match.end()).any(|pos| {
891 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
892 });
893
894 if !is_in_span {
895 self.links.push(Link {
896 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), text,
900 url,
901 is_reference: false,
902 reference_id: None,
903 });
904 }
905
906 i += whole_match.end();
908 } else if let Ok(Some(cap)) = REFERENCE_LINK.captures(rest) {
909 let whole_match = cap.get(0).unwrap();
910 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
911 let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
912
913 let ref_id = if id.is_empty() { text.clone() } else { id };
915
916 let url = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
918
919 let is_in_span = (i..i + whole_match.end()).any(|pos| {
921 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
922 });
923
924 if !is_in_span {
925 self.links.push(Link {
926 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), text,
930 url,
931 is_reference: true,
932 reference_id: Some(ref_id),
933 });
934 }
935
936 i += whole_match.end();
938 } else {
939 i += 1;
941 }
942 } else {
943 i += 1;
945 }
946 } else if rest.starts_with("![") {
947 let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
949 if !is_escaped {
950 if let Ok(Some(cap)) = INLINE_IMAGE.captures(rest) {
951 let whole_match = cap.get(0).unwrap();
952 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
953 let src = cap.get(2).map_or("", |m| m.as_str()).to_string();
954
955 let is_in_span = (i..i + whole_match.end()).any(|pos| {
957 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
958 });
959
960 if !is_in_span {
961 self.images.push(Image {
962 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), alt_text,
966 src,
967 is_reference: false,
968 reference_id: None,
969 });
970 }
971
972 i += whole_match.end();
974 } else if let Ok(Some(cap)) = REFERENCE_IMAGE.captures(rest) {
975 let whole_match = cap.get(0).unwrap();
976 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
977 let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
978
979 let ref_id = if id.is_empty() { alt_text.clone() } else { id };
981
982 let src = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
984
985 let is_in_span = (i..i + whole_match.end()).any(|pos| {
987 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
988 });
989
990 if !is_in_span {
991 self.images.push(Image {
992 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), alt_text,
996 src,
997 is_reference: true,
998 reference_id: Some(ref_id),
999 });
1000 }
1001
1002 i += whole_match.end();
1004 } else {
1005 i += 1;
1007 }
1008 } else {
1009 i += 1;
1011 }
1012 } else {
1013 i += 1;
1015 }
1016 } else {
1017 break;
1019 }
1020 }
1021 }
1022 }
1023
1024 fn detect_list_items(&mut self, content: &str) {
1026 lazy_static! {
1033 static ref UL_MARKER: FancyRegex =
1034 FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1035 static ref OL_MARKER: FancyRegex =
1036 FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1037 static ref TASK_MARKER: FancyRegex = FancyRegex::new(
1038 r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)\[(?P<checked>[ xX])\](?P<content>.*)$"
1039 )
1040 .unwrap();
1041 }
1042 self.list_items.clear();
1043 self.list_lines.clear();
1044 let lines: Vec<&str> = content.lines().collect();
1045 for (line_num, line) in lines.iter().enumerate() {
1046 if self.is_in_code_block(line_num + 1) || self.is_in_front_matter(line_num + 1) {
1047 continue;
1048 }
1049 if line.trim().is_empty() {
1050 continue;
1051 }
1052 if let Ok(Some(cap)) = TASK_MARKER.captures(line) {
1054 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1055 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1056 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1057 self.list_lines.push(line_num + 1);
1058 self.list_items.push(ListItem {
1059 line_number: line_num + 1,
1060 indentation,
1061 marker: marker.clone(),
1062 marker_type: ListMarkerType::Task,
1063 content,
1064 });
1065 continue;
1066 }
1067 if let Ok(Some(cap)) = UL_MARKER.captures(line) {
1068 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1069 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1070 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1071 self.list_lines.push(line_num + 1);
1072 self.list_items.push(ListItem {
1073 line_number: line_num + 1,
1074 indentation,
1075 marker: marker.clone(),
1076 marker_type: ListMarkerType::Unordered,
1077 content,
1078 });
1079 continue;
1080 }
1081 if let Ok(Some(cap)) = OL_MARKER.captures(line) {
1082 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1083 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1084 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1085 self.list_lines.push(line_num + 1);
1086 self.list_items.push(ListItem {
1087 line_number: line_num + 1,
1088 indentation,
1089 marker: marker.clone(),
1090 marker_type: ListMarkerType::Ordered,
1091 content,
1092 });
1093 continue;
1094 }
1095 }
1096 }
1097
1098 fn detect_blockquotes(&mut self, content: &str) {
1100 lazy_static! {
1101 static ref BLOCKQUOTE_MARKER: Regex = Regex::new(r"^\s*>(.*)$").unwrap();
1102 }
1103
1104 self.blockquotes.clear();
1106
1107 let lines: Vec<&str> = content.lines().collect();
1108
1109 let mut in_blockquote = false;
1112 let mut start_line = 0;
1113
1114 for (i, line) in lines.iter().enumerate() {
1115 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1117 continue;
1118 }
1119
1120 let is_blockquote_line = BLOCKQUOTE_MARKER.is_match(line);
1121
1122 if is_blockquote_line {
1123 self.in_blockquote[i] = true;
1125
1126 if !in_blockquote {
1127 in_blockquote = true;
1129 start_line = i + 1; }
1131 } else if in_blockquote {
1132 self.blockquotes.push(BlockquoteRange {
1134 start_line,
1135 end_line: i, });
1137
1138 in_blockquote = false;
1139 }
1140 }
1141
1142 if in_blockquote {
1144 self.blockquotes.push(BlockquoteRange {
1145 start_line,
1146 end_line: lines.len(), });
1148 }
1149 }
1150
1151 fn detect_horizontal_rules(&mut self, content: &str) {
1153 lazy_static! {
1154 static ref HR_HYPHEN: Regex = Regex::new(r"^[ \t]*-[ \t]*-[ \t]*-[ \t-]*$").unwrap();
1156 static ref HR_ASTERISK: Regex = Regex::new(r"^[ \t]*\*[ \t]*\*[ \t]*\*[ \t\*]*$").unwrap();
1157 static ref HR_UNDERSCORE: Regex = Regex::new(r"^[ \t]*_[ \t]*_[ \t]*_[ \t_]*$").unwrap();
1158 }
1159
1160 self.horizontal_rule_lines.clear();
1162
1163 let lines: Vec<&str> = content.lines().collect();
1164
1165 for (i, line) in lines.iter().enumerate() {
1166 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1168 continue;
1169 }
1170
1171 if HR_HYPHEN.is_match(line) || HR_ASTERISK.is_match(line) || HR_UNDERSCORE.is_match(line) {
1173 let is_setext_marker = if i > 0 {
1176 let prev_line = lines[i - 1].trim();
1177 !prev_line.is_empty()
1178 && !self.is_in_code_block(i)
1179 && !self.is_in_front_matter(i)
1180 && line.trim().chars().all(|c| c == '-' || c == ' ')
1181 } else {
1182 false
1183 };
1184
1185 if !is_setext_marker {
1186 self.horizontal_rule_lines.push(i + 1); }
1188 }
1189 }
1190 }
1191
1192 fn detect_html_blocks(&mut self, content: &str) {
1194 let lines: Vec<&str> = content.lines().collect();
1195 let mut i = 0;
1198 while i < lines.len() {
1199 let line = lines[i];
1200 let trimmed = line.trim_start();
1201
1202 if self.is_in_code_block(i + 1) {
1204 i += 1;
1205 continue;
1206 }
1207
1208 if self.is_html_block_start(trimmed) {
1210 let start_line = i;
1211
1212 let end_line = self.find_html_block_end(&lines, start_line);
1214
1215 for line_idx in start_line..=end_line {
1217 if line_idx < self.in_html_block.len() {
1218 self.in_html_block[line_idx] = true;
1219 }
1220 }
1221
1222 i = end_line + 1;
1224 } else {
1225 i += 1;
1226 }
1227 }
1228 }
1229
1230 fn is_html_block_start(&self, trimmed: &str) -> bool {
1232 if trimmed.is_empty() || !trimmed.starts_with('<') {
1233 return false;
1234 }
1235
1236 let mut chars = trimmed[1..].chars();
1238 let mut tag_name = String::new();
1239
1240 let is_closing = chars.as_str().starts_with('/');
1242 if is_closing {
1243 chars.next(); }
1245
1246 for ch in chars {
1248 if ch.is_ascii_alphabetic() || ch == '-' {
1249 tag_name.push(ch);
1250 } else {
1251 break;
1252 }
1253 }
1254
1255 if tag_name.is_empty() {
1256 return false;
1257 }
1258
1259 const BLOCK_ELEMENTS: &[&str] = &[
1261 "address",
1262 "article",
1263 "aside",
1264 "base",
1265 "basefont",
1266 "blockquote",
1267 "body",
1268 "caption",
1269 "center",
1270 "col",
1271 "colgroup",
1272 "dd",
1273 "details",
1274 "dialog",
1275 "dir",
1276 "div",
1277 "dl",
1278 "dt",
1279 "fieldset",
1280 "figcaption",
1281 "figure",
1282 "footer",
1283 "form",
1284 "frame",
1285 "frameset",
1286 "h1",
1287 "h2",
1288 "h3",
1289 "h4",
1290 "h5",
1291 "h6",
1292 "head",
1293 "header",
1294 "hr",
1295 "html",
1296 "iframe",
1297 "legend",
1298 "li",
1299 "link",
1300 "main",
1301 "menu",
1302 "menuitem",
1303 "nav",
1304 "noframes",
1305 "ol",
1306 "optgroup",
1307 "option",
1308 "p",
1309 "param",
1310 "section",
1311 "source",
1312 "summary",
1313 "table",
1314 "tbody",
1315 "td",
1316 "tfoot",
1317 "th",
1318 "thead",
1319 "title",
1320 "tr",
1321 "track",
1322 "ul",
1323 "img",
1324 "picture",
1325 ];
1326
1327 BLOCK_ELEMENTS.contains(&tag_name.to_ascii_lowercase().as_str())
1328 }
1329
1330 fn find_html_block_end(&self, lines: &[&str], start_line: usize) -> usize {
1332 let start_trimmed = lines[start_line].trim_start();
1333
1334 let tag_name = self.extract_tag_name(start_trimmed);
1336
1337 for (i, line) in lines.iter().enumerate().skip(start_line + 1) {
1339 let trimmed = line.trim();
1340
1341 if trimmed.is_empty() {
1343 return i - 1; }
1345
1346 if let Some(ref tag) = tag_name {
1348 let closing_tag = format!("</{tag}");
1349 if trimmed.contains(&closing_tag) {
1350 return i;
1351 }
1352 }
1353 }
1354
1355 lines.len() - 1
1357 }
1358
1359 fn extract_tag_name(&self, trimmed: &str) -> Option<String> {
1361 if !trimmed.starts_with('<') {
1362 return None;
1363 }
1364
1365 let mut chars = trimmed[1..].chars();
1366
1367 if chars.as_str().starts_with('/') {
1369 chars.next();
1370 }
1371
1372 let mut tag_name = String::new();
1373 for ch in chars {
1374 if ch.is_ascii_alphabetic() || ch == '-' {
1375 tag_name.push(ch);
1376 } else {
1377 break;
1378 }
1379 }
1380
1381 if tag_name.is_empty() {
1382 None
1383 } else {
1384 Some(tag_name.to_ascii_lowercase())
1385 }
1386 }
1387
1388 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1390 if line_num == 0 || line_num > self.in_code_span.len() {
1391 return false;
1392 }
1393
1394 let line_idx = line_num - 1; if col == 0 || col > self.in_code_span[line_idx].len() {
1397 return false;
1398 }
1399
1400 self.in_code_span[line_idx][col - 1] }
1402
1403 pub fn is_in_blockquote(&self, line_num: usize) -> bool {
1405 if line_num == 0 || line_num > self.in_blockquote.len() {
1406 return false;
1407 }
1408
1409 self.in_blockquote[line_num - 1] }
1411
1412 pub fn get_list_item_at_line(&self, line_num: usize) -> Option<&ListItem> {
1414 self.list_items.iter().find(|item| item.line_number == line_num)
1415 }
1416
1417 pub fn get_list_items_by_type(&self, marker_type: ListMarkerType) -> Vec<&ListItem> {
1419 self.list_items
1420 .iter()
1421 .filter(|item| item.marker_type == marker_type)
1422 .collect()
1423 }
1424
1425 pub fn get_empty_links(&self) -> Vec<&Link> {
1427 self.links
1428 .iter()
1429 .filter(|link| link.text.trim().is_empty() || link.url.trim().is_empty())
1430 .collect()
1431 }
1432
1433 pub fn get_images_without_alt_text(&self) -> Vec<&Image> {
1435 self.images
1436 .iter()
1437 .filter(|img| img.alt_text.trim().is_empty())
1438 .collect()
1439 }
1440
1441 pub fn is_in_html_block(&self, line_num: usize) -> bool {
1443 if line_num == 0 || line_num > self.in_html_block.len() {
1444 return false;
1445 }
1446 self.in_html_block[line_num - 1]
1447 }
1448}
1449
/// Hooks for code that consumes a [`DocumentStructure`].
///
/// Both methods have default implementations, so implementors only override
/// what they need.
pub trait DocumentStructureExtensions {
    /// Whether the 1-based `line_num` should be processed.
    ///
    /// The default processes every line that is not flagged as code-block content.
    fn should_process_line(&self, line_num: usize, doc_structure: &DocumentStructure) -> bool {
        !doc_structure.is_in_code_block(line_num)
    }

    /// Whether the document contains anything relevant to the implementor.
    ///
    /// The default ignores both arguments and always returns `true`.
    fn has_relevant_elements(
        &self,
        _ctx: &crate::lint_context::LintContext,
        _doc_structure: &DocumentStructure,
    ) -> bool {
        true
    }
}
1468
/// Builds a [`DocumentStructure`] for `content`.
///
/// Free-function shorthand for [`DocumentStructure::new`].
pub fn document_structure_from_str(content: &str) -> DocumentStructure {
    DocumentStructure::new(content)
}
1473
1474#[cfg(test)]
1475mod tests {
1476 use super::*;
1477
    /// Two ATX headings and one fenced code block are all picked up.
    #[test]
    fn test_document_structure_creation() {
        let content = "# Heading 1\n\nSome text.\n\n## Heading 2\n\nMore text.\n\n```\nCode block\n```\n";
        let structure = DocumentStructure::new(content);

        // One entry per heading in both parallel vectors.
        assert_eq!(structure.heading_lines.len(), 2);
        assert_eq!(structure.heading_levels.len(), 2);
        assert!(structure.has_code_blocks);
        assert_eq!(structure.code_blocks.len(), 1);
    }
1488
// Checks that fence markers appearing inside an outer fenced block do not
// split it into several blocks.
#[test]
fn test_nested_code_blocks() {
    // Outer ```markdown fence wrapping a list whose first item contains an
    // inner ```python fence.
    // NOTE(review): the inner fence lines carry a single leading space here;
    // confirm that this is the intended list-continuation indent.
    let content = r#"```markdown
1. First item

 ```python
 code_in_list()
 ```

2. Second item
```"#;

    let structure = DocumentStructure::new(content);

    // Only one block is expected, running from the first fixture line to the last.
    assert_eq!(structure.code_blocks.len(), 1);
    assert_eq!(structure.code_blocks[0].start_line, 1);
    assert_eq!(structure.code_blocks[0].end_line, 9);

    // Every line strictly between the two outer fences must report as code.
    for line in 2..=8 {
        assert!(structure.is_in_code_block(line), "Line {line} should be in code block");
    }
}
1513
#[test]
fn test_document_with_front_matter() {
    // A `---` delimited header, then one heading and a paragraph; no code fences.
    let structure = DocumentStructure::new(
        "---\ntitle: Test Document\ndate: 2021-01-01\n---\n\n# Heading 1\n\nSome text.\n",
    );

    // The header must be both flagged and located.
    let front_matter_flagged = structure.has_front_matter;
    let front_matter_located = structure.front_matter_range.is_some();
    assert!(front_matter_flagged);
    assert!(front_matter_located);

    // Only the single `# Heading 1` line is expected in the heading table.
    let heading_count = structure.heading_lines.len();
    assert_eq!(heading_count, 1);

    assert!(!structure.has_code_blocks);
}
1524
#[test]
fn test_is_in_code_block() {
    // Fence opens on line 5 and closes on line 8; lines 6 and 7 are its content.
    let structure = DocumentStructure::new(
        "# Heading\n\nText.\n\n```\ncode line 1\ncode line 2\n```\n\nMore text.\n",
    );

    // (1-based line, expected answer). The two fence lines themselves are
    // expected to report `false`; only the lines between them are inside.
    let expectations = [
        (1, false),
        (3, false),
        (5, false),
        (6, true),
        (7, true),
        (8, false),
        (10, false),
    ];

    for &(line, expected) in expectations.iter() {
        assert_eq!(
            structure.is_in_code_block(line),
            expected,
            "unexpected code-block state for line {}",
            line
        );
    }
}
1538
// Exercises heading detection on a series of small fixtures, checking both
// the recorded 1-based heading lines and the matching heading levels.
#[test]
fn test_headings_edge_cases() {
    /// Parses `content` and checks the heading tables against the expected
    /// lines and levels, naming the fixture when an assertion fails.
    fn assert_headings(content: &str, expected_lines: &[usize], expected_levels: &[usize]) {
        let structure = DocumentStructure::new(content);
        assert_eq!(
            structure.heading_lines, expected_lines,
            "heading lines for fixture {:?}",
            content
        );
        assert_eq!(
            structure.heading_levels, expected_levels,
            "heading levels for fixture {:?}",
            content
        );
    }

    // Mix of ATX, closed ATX and both setext underline styles.
    assert_headings(
        " # ATX Heading\n# Closed ATX Heading #\nSetext H1\n=======\nSetext H2\n-------\n\n# ATX Again\n",
        &[1, 2, 3, 5, 8],
        &[1, 1, 1, 2, 1],
    );

    // Front matter and fenced code must both be skipped.
    assert_headings(
        "---\ntitle: Test\n---\n# Heading 1\n\n```\n# Not a heading\n```\n# Heading 2\n",
        &[4, 9],
        &[1, 1],
    );

    // Marker-only lines carry no heading text and are not recorded.
    assert_headings("#\n## \n### \n# Not Empty\n", &[4], &[1]);

    // Trailing whitespace after the heading text does not matter.
    assert_headings("# Heading \n# Heading\n", &[1, 2], &[1, 1]);

    // Indentation boundary: a lightly indented `#` line is still a heading,
    // whereas four leading spaces turn the line into indented code. The second
    // line therefore needs four spaces; with the same indent as the first line
    // the expectation below could never hold.
    assert_headings(
        " # Indented\n    # Not a heading (too much indent)\n# Valid\n",
        &[1, 3],
        &[1, 1],
    );

    // Repeated heading text is recorded once per occurrence.
    assert_headings("# Dup\n# Dup\n# Unique\n# Dup\n", &[1, 2, 3, 4], &[1, 1, 1, 1]);

    // A `#` line inside a fenced block is code, not a heading.
    assert_headings("```\n# Not a heading\n```\n# Real Heading\n", &[4], &[1]);

    // A heading directly after front matter keeps its document line number.
    assert_headings("---\ntitle: Test\n---\n# Heading\n", &[4], &[1]);

    // Setext headings are reported on their text line, not the underline.
    assert_headings("\nSetext\n=======\n\nSetext2\n-------\n", &[2, 5], &[1, 2]);

    // Punctuation in the heading text does not disturb detection.
    assert_headings("# Heading!@#$%^&*()\nSetext Special\n=======\n", &[1, 2], &[1, 1]);
}
1601
#[test]
fn test_horizontal_rule_detection() {
    // Each entry: (document, expected rule lines, expected heading lines when
    // the case also checks headings).
    let cases: Vec<(&str, Vec<usize>, Option<Vec<usize>>)> = vec![
        // The three compact rule spellings.
        (
            "Text\n\n---\n\nMore text\n\n***\n\nFinal\n\n___\n\nEnd",
            vec![3, 7, 11],
            None,
        ),
        // The same three markers separated by spaces.
        ("Text\n\n- - -\n\n* * *\n\n_ _ _\n\nEnd", vec![3, 5, 7], None),
        // Setext underlines belong to their headings; only the free-standing
        // `---` on line 6 is a rule.
        (
            "# ATX\n\nSetext\n------\n\n---\n\nAnother\n======\n",
            vec![6],
            Some(vec![1, 3, 8]),
        ),
        // Rule-like lines inside a fenced block are ignored.
        ("Text\n\n```\n---\n***\n```\n\n---\n\nEnd", vec![8], None),
        // Front matter delimiters are not rules; the later `---` is.
        ("---\ntitle: Test\n---\n\n---\n\nContent", vec![5], None),
    ];

    for (content, expected_rules, expected_headings) in cases {
        let structure = DocumentStructure::new(content);
        assert_eq!(structure.horizontal_rule_lines, expected_rules);
        if let Some(headings) = expected_headings {
            assert_eq!(structure.heading_lines, headings);
        }
    }
}
1630}