1use crate::rules::heading_utils::HeadingStyle;
2use fancy_regex::Regex as FancyRegex;
3use lazy_static::lazy_static;
4use regex::Regex;
5
/// Pre-computed structural facts about one markdown document.
///
/// Built once by [`DocumentStructure::new`]; all line numbers stored here are
/// 1-based unless a field says otherwise.
#[derive(Debug, Clone)]
pub struct DocumentStructure {
    /// Every code block found (fenced and indented), in document order.
    pub code_blocks: Vec<CodeBlock>,
    /// `true` when `code_blocks` is non-empty.
    pub has_code_blocks: bool,
    /// 1-based line number of each heading (for setext headings, the text line).
    pub heading_lines: Vec<usize>,
    /// Heading level for the entry at the same index in `heading_lines`.
    pub heading_levels: Vec<usize>,
    /// `(first_line, last_line)` per heading; setext headings span text + underline.
    pub heading_regions: Vec<(usize, usize)>,
    /// 1-based line numbers of lines recognised as list items.
    pub list_lines: Vec<usize>,
    /// `true` when a `---` ... `---` front matter block opens the document.
    pub has_front_matter: bool,
    /// 1-based inclusive `(start, end)` lines of the front matter, delimiters included.
    pub front_matter_range: Option<(usize, usize)>,
    /// Set when `crate::utils::early_returns::has_urls` reports a URL in the content.
    pub has_urls: bool,
    /// Set when a line starts with `<` + letter and the content contains `</` or `/>`.
    pub has_html: bool,
    /// One flag per line (index = line number - 1); for fenced blocks only the
    /// lines between the fences are flagged.
    pub in_code_block: Vec<bool>,
    /// `start_line` of every fenced code block.
    pub fenced_code_block_starts: Vec<usize>,
    /// `end_line` of every fenced code block, parallel to `fenced_code_block_starts`.
    pub fenced_code_block_ends: Vec<usize>,
    /// Style of the first heading; `Some(Atx)` when heading detection ran but
    /// found none, `None` when heading detection was skipped.
    pub first_heading_style: Option<HeadingStyle>,
    /// Inline code spans found outside code blocks.
    pub code_spans: Vec<CodeSpan>,
    /// Per line, one flag per byte offset (plus one extra slot) marking code-span bytes.
    pub in_code_span: Vec<Vec<bool>>,
    /// Inline and reference links found outside code blocks and code spans.
    pub links: Vec<Link>,
    /// Inline and reference images found outside code blocks and code spans.
    pub images: Vec<Image>,
    /// Parsed list items, in document order.
    pub list_items: Vec<ListItem>,
    /// Runs of blockquote lines.
    pub blockquotes: Vec<BlockquoteRange>,
    /// One flag per line (index = line number - 1) marking blockquote lines.
    pub in_blockquote: Vec<bool>,
    /// One flag per line (index = line number - 1) marking HTML block lines.
    pub in_html_block: Vec<bool>,
    /// 1-based line numbers of horizontal rules.
    pub horizontal_rule_lines: Vec<usize>,
}
57
/// A front matter block: its line span and its text.
///
/// NOTE(review): nothing in this part of the file constructs this type; the
/// field meanings below are read from their names — confirm against callers.
#[derive(Debug, Clone)]
pub struct FrontMatter {
    /// Line on which the front matter begins.
    pub start_line: usize,
    /// Line on which the front matter ends.
    pub end_line: usize,
    /// Text of the front matter block.
    pub content: String,
}
65
/// A single heading with its text, level and position.
///
/// NOTE(review): nothing in this part of the file constructs this type; the
/// field meanings below are read from their names — confirm against callers.
#[derive(Debug, Clone, PartialEq)]
pub struct Heading {
    /// Heading text.
    pub text: String,
    /// Heading level.
    pub level: u32,
    /// Line number of the heading.
    pub line_number: usize,
    /// The heading line as it appears in the source.
    pub original_text: String,
    /// Leading whitespace of the heading line.
    pub indentation: String,
}
75
/// One code block located by `DocumentStructure::compute_code_blocks`.
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// 1-based first line: the opening fence, or the first indented line.
    pub start_line: usize,
    /// 1-based last line: the closing fence, the last indented line, or the
    /// final line of the document when a fence is never closed.
    pub end_line: usize,
    /// First word after the opening fence; `None` when absent or for indented blocks.
    pub language: Option<String>,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
}
88
/// How a code block is delimited in the source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Delimited by backtick or tilde fences of three or more characters.
    Fenced,
    /// Formed by lines indented with a tab or at least four spaces.
    Indented,
}
97
/// One list item line recognised by `DocumentStructure::detect_list_items`.
#[derive(Debug, Clone)]
pub struct ListItem {
    /// 1-based line number of the item.
    pub line_number: usize,
    /// Byte length of the leading spaces/tabs before the marker.
    pub indentation: usize,
    /// The marker text itself, e.g. `-`, `*`, `+` or `1.`.
    pub marker: String,
    /// Kind of list item the line was matched as.
    pub marker_type: ListMarkerType,
    /// Remainder of the line after the marker (and after the checkbox for tasks).
    pub content: String,
}
107
/// Classification of a list item by its marker.
#[derive(Debug, Clone, PartialEq)]
pub enum ListMarkerType {
    /// Bullet item introduced by `*`, `+` or `-`.
    Unordered,
    /// Numbered item introduced by digits followed by `.`.
    Ordered,
    /// Bullet item whose text starts with a `[ ]`, `[x]` or `[X]` checkbox.
    Task,
}
115
/// A run of blockquote lines recorded by `DocumentStructure::detect_blockquotes`.
#[derive(Debug, Clone)]
pub struct BlockquoteRange {
    /// 1-based line number of the first `>` line in the run.
    pub start_line: usize,
    /// 1-based line number of the line just before the first following
    /// non-blockquote line (or the last document line if the run never ends).
    pub end_line: usize,
}
122
/// An inline code span recorded by `DocumentStructure::detect_code_spans`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-based line number containing the span.
    pub line: usize,
    /// 1-based byte column of the opening backtick.
    pub start_col: usize,
    /// 1-based byte column of the closing backtick.
    pub end_col: usize,
    /// Text between the backtick delimiters.
    pub content: String,
}
135
/// A link recorded by `DocumentStructure::detect_links_and_images`.
#[derive(Debug, Clone)]
pub struct Link {
    /// 1-based line number containing the link.
    pub line: usize,
    /// 1-based byte column of the scan position at which the link was recognised.
    pub start_col: usize,
    /// Scan position plus the end offset of the regex match (1-based byte column).
    pub end_col: usize,
    /// Text captured between the square brackets.
    pub text: String,
    /// Inline destination, or the resolved definition target for reference
    /// links (empty when no definition matches).
    pub url: String,
    /// `true` for `[text][id]` style links.
    pub is_reference: bool,
    /// Reference label for reference links (the link text when `[text][]` is used).
    pub reference_id: Option<String>,
}
154
/// An image recorded by `DocumentStructure::detect_links_and_images`.
#[derive(Debug, Clone)]
pub struct Image {
    /// 1-based line number containing the image.
    pub line: usize,
    /// 1-based byte column of the scan position at which the image was recognised.
    pub start_col: usize,
    /// Scan position plus the end offset of the regex match (1-based byte column).
    pub end_col: usize,
    /// Text captured between the square brackets.
    pub alt_text: String,
    /// Inline source, or the resolved definition target for reference images
    /// (empty when no definition matches).
    pub src: String,
    /// `true` for `![alt][id]` style images.
    pub is_reference: bool,
    /// Reference label for reference images (the alt text when `![alt][]` is used).
    pub reference_id: Option<String>,
}
173
// Cheap whole-document pre-filters. Each one runs in multi-line mode against the
// full content so `analyze` can skip a detection pass when no line could match.
lazy_static! {
    // Some line starts with optional whitespace followed by 1-6 `#` characters.
    static ref CONTAINS_ATX_HEADING: Regex = Regex::new(r"(?m)^(\s*)#{1,6}").unwrap();
    // Some line consists solely of `=` or `-` characters (setext underline candidate).
    static ref CONTAINS_SETEXT_UNDERLINE: Regex = Regex::new(r"(?m)^(\s*)(=+|-+)\s*$").unwrap();
    // Some line starts with a bullet character or with digits followed by `.`.
    static ref CONTAINS_LIST_MARKERS: Regex = Regex::new(r"(?m)^(\s*)([*+-]|\d+\.)").unwrap();
    // Some line starts with `>`.
    static ref CONTAINS_BLOCKQUOTE: Regex = Regex::new(r"(?m)^(\s*)>").unwrap();
    // Some line starts with `<` followed by an ASCII letter.
    static ref CONTAINS_HTML_BLOCK: Regex = Regex::new(r"(?m)^(\s*)<[a-zA-Z]").unwrap();
}
183
184impl DocumentStructure {
185 pub fn new(content: &str) -> Self {
187 let mut structure = DocumentStructure {
189 code_blocks: Vec::new(),
190 has_code_blocks: false,
191 heading_lines: Vec::new(),
192 heading_levels: Vec::new(),
193 heading_regions: Vec::new(),
194 list_lines: Vec::new(),
195 has_front_matter: false,
196 front_matter_range: None,
197 has_urls: false,
198 has_html: false,
199 in_code_block: Vec::new(),
200 fenced_code_block_starts: Vec::new(),
201 fenced_code_block_ends: Vec::new(),
202 first_heading_style: None,
203 code_spans: Vec::new(),
205 in_code_span: Vec::new(),
206 links: Vec::new(),
207 images: Vec::new(),
208 list_items: Vec::new(),
209 blockquotes: Vec::new(),
210 in_blockquote: Vec::new(),
211 in_html_block: Vec::new(),
212 horizontal_rule_lines: Vec::new(),
213 };
214
215 structure.analyze(content);
217 structure
218 }
219
220 fn analyze(&mut self, content: &str) {
222 if content.is_empty() {
224 return;
225 }
226
227 let lines: Vec<&str> = content.lines().collect();
229 self.in_code_span = vec![Vec::new(); lines.len()];
230 for (i, line) in lines.iter().enumerate() {
231 self.in_code_span[i] = vec![false; line.len() + 1]; }
233 self.in_blockquote = vec![false; lines.len()];
234 self.in_html_block = vec![false; lines.len()];
235
236 self.detect_front_matter(content);
238
239 let has_blockquote_markers = CONTAINS_BLOCKQUOTE.is_match(content);
241 let has_html_blocks = CONTAINS_HTML_BLOCK.is_match(content);
242
243 if has_html_blocks {
245 self.detect_html_blocks(content);
246 }
247
248 self.code_blocks = self.compute_code_blocks(content);
250 self.has_code_blocks = !self.code_blocks.is_empty();
251
252 self.compute_code_block_bitmap(content);
254
255 self.populate_fenced_code_blocks();
257 let has_backticks = content.contains('`');
258 let has_brackets = content.contains('[');
259 let has_headings = CONTAINS_ATX_HEADING.is_match(content) || CONTAINS_SETEXT_UNDERLINE.is_match(content);
260 let has_list_markers = CONTAINS_LIST_MARKERS.is_match(content)
262 || content.contains("- ")
263 || content.contains("* ")
264 || content.contains("+ ")
265 || content.contains("1. ")
266 || content.contains("2. ")
267 || content.contains("3. ")
268 || content.contains("4. ")
269 || content.contains("5. ")
270 || content.contains("6. ")
271 || content.contains("7. ")
272 || content.contains("8. ")
273 || content.contains("9. ")
274 || content.contains("10. ")
275 || content.contains("11. ")
276 || content.contains("12. ");
277
278 if has_blockquote_markers {
280 self.detect_blockquotes(content);
281 }
282
283 if has_backticks {
285 self.detect_code_spans(content);
286 }
287
288 if has_brackets {
290 self.detect_links_and_images(content);
291 }
292
293 if has_headings {
295 self.detect_headings(content);
296 }
297
298 if has_list_markers {
300 self.detect_list_items(content);
301 }
302
303 let has_potential_hrs = content.contains("---")
305 || content.contains("***")
306 || content.contains("___")
307 || content.contains("- -")
308 || content.contains("* *")
309 || content.contains("_ _");
310 if has_potential_hrs {
311 self.detect_horizontal_rules(content);
312 }
313
314 if crate::utils::early_returns::has_urls(content) {
316 self.has_urls = true;
317 }
318
319 if has_html_blocks && (content.contains("</") || content.contains("/>")) {
321 self.has_html = true;
322 }
323 }
324
325 fn compute_code_block_bitmap(&mut self, content: &str) {
327 let line_count = content.lines().count();
328 self.in_code_block = vec![false; line_count];
329
330 for block in &self.code_blocks {
331 let start = block.start_line.saturating_sub(1); let end = block.end_line.min(line_count); if let CodeBlockType::Fenced = block.block_type {
336 if end > start + 1 {
338 for i in (start + 1)..(end - 1) {
339 if i < self.in_code_block.len() {
340 self.in_code_block[i] = true;
341 }
342 }
343 }
344 } else {
345 for i in start..end {
347 if i < self.in_code_block.len() {
348 self.in_code_block[i] = true;
349 }
350 }
351 }
352 }
353 }
354
355 pub fn is_in_code_block(&self, line_num: usize) -> bool {
357 if line_num == 0 || line_num > self.in_code_block.len() {
358 return false;
359 }
360 self.in_code_block[line_num - 1] }
362
363 fn detect_headings(&mut self, content: &str) {
365 lazy_static! {
366 static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+|[^\s#])").unwrap();
367 static ref SETEXT_HEADING_UNDERLINE: Regex = Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
368 }
369
370 self.heading_lines.clear();
372 self.heading_levels.clear();
373 self.heading_regions.clear();
374 self.first_heading_style = None;
375
376 let lines: Vec<&str> = content.lines().collect();
377
378 for (i, line) in lines.iter().enumerate() {
379 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
381 continue;
382 }
383
384 if line.trim().is_empty() {
386 continue;
387 }
388
389 if let Some(captures) = ATX_HEADING.captures(line) {
391 let level = captures[2].len();
392 let mut chars = line.trim().chars();
394 while chars.next() == Some('#') {}
395 let heading_text = chars.as_str().trim();
396 if heading_text.is_empty() {
397 continue; }
399 self.heading_lines.push(i + 1);
400 self.heading_levels.push(level);
401 self.heading_regions.push((i + 1, i + 1)); if self.first_heading_style.is_none() {
405 if line.trim().ends_with('#') {
407 self.first_heading_style = Some(HeadingStyle::AtxClosed);
408 } else {
409 self.first_heading_style = Some(HeadingStyle::Atx);
410 }
411 }
412 continue;
413 }
414
415 if i > 0 && !lines[i - 1].trim().is_empty() &&
417 !self.is_in_front_matter(i) && SETEXT_HEADING_UNDERLINE.is_match(line)
419 {
420 let content_line = lines[i - 1].trim();
421 if content_line.is_empty() {
422 continue; }
424 let level = if line.trim().starts_with('=') { 1 } else { 2 };
425 self.heading_lines.push(i); self.heading_levels.push(level);
427 self.heading_regions.push((i, i + 1)); if self.first_heading_style.is_none() {
431 if level == 1 {
432 self.first_heading_style = Some(HeadingStyle::Setext1);
433 } else {
434 self.first_heading_style = Some(HeadingStyle::Setext2);
435 }
436 }
437 }
438 }
439
440 if self.heading_lines.is_empty() {
442 self.first_heading_style = Some(HeadingStyle::Atx);
443 }
444 }
445
446 fn detect_front_matter(&mut self, content: &str) {
448 let lines: Vec<&str> = content.lines().collect();
449
450 self.has_front_matter = false;
452 self.front_matter_range = None;
453
454 if !lines.is_empty() && lines[0] == "---" {
456 for (i, line) in lines.iter().enumerate().skip(1) {
458 if *line == "---" {
459 self.has_front_matter = true;
460 self.front_matter_range = Some((1, i + 1));
461 break;
462 }
463 }
464 }
465 }
466
    /// Scans `content` line by line and returns every code block found.
    ///
    /// Fenced blocks open on a line with up to three leading whitespace
    /// characters followed by three or more backticks or tildes, and close on a
    /// fence-only line using the same character, at least as many fence
    /// characters, and no more indentation than the opener. A fence that never
    /// closes yields a block ending on the last line. Outside fences, a
    /// non-blank line that is indented (tab or four spaces), not inside an HTML
    /// block and not list-item shaped starts an indented block, which may span
    /// blank lines when more qualifying indented lines follow.
    fn compute_code_blocks(&self, content: &str) -> Vec<CodeBlock> {
        lazy_static! {
            // Opening fence: indent (group 1), fence run (group 2), info word (group 3).
            static ref FENCED_START: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*([^`\s]*)").unwrap();
            // Closing fence: indent (group 1), fence run (group 2), nothing else on the line.
            static ref FENCED_END: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*$").unwrap();
        }

        let mut code_blocks = Vec::new();
        // State of the fenced block currently open, if any.
        let mut in_code_block = false;
        let mut current_block_start = 0;
        let mut current_language = None;
        let mut current_fence_char = ' ';
        let mut current_fence_length = 0;
        let mut current_fence_indent = 0;
        let lines: Vec<&str> = content.lines().collect();

        let mut i = 0;
        while i < lines.len() {
            let line = lines[i];

            if !in_code_block {
                if let Some(captures) = FENCED_START.captures(line) {
                    // Remember the opener so the matching closer can be recognised.
                    in_code_block = true;
                    current_block_start = i + 1;
                    let indent = captures.get(1).map_or("", |m| m.as_str());
                    current_fence_indent = indent.len();
                    let fence = captures.get(2).map_or("```", |m| m.as_str());
                    current_fence_char = fence.chars().next().unwrap();
                    current_fence_length = fence.len();

                    // An empty info word means "no language".
                    let lang = captures.get(3).map(|m| m.as_str().to_string());
                    current_language = lang.filter(|l| !l.is_empty());
                }
                else if Self::is_indented_code_line(line)
                    && !line.trim().is_empty()
                    && !self.is_in_html_block(i + 1)
                    && !Self::is_potential_list_item(line)
                {
                    // Start of an indented block; extend `end_line` (0-based) as far as it goes.
                    let mut end_line = i;

                    while end_line + 1 < lines.len() {
                        let next_line = lines[end_line + 1];

                        if Self::is_indented_code_line(next_line)
                            && !next_line.trim().is_empty()
                            && !self.is_in_html_block(end_line + 2)
                            && !Self::is_potential_list_item(next_line)
                        {
                            end_line += 1;
                        } else if next_line.trim().is_empty() {
                            // A blank line only belongs to the block when the next
                            // non-blank line is again a qualifying indented line.
                            let mut lookahead = end_line + 2;
                            let mut found_indented = false;

                            while lookahead < lines.len() {
                                let lookahead_line = lines[lookahead];
                                if Self::is_indented_code_line(lookahead_line)
                                    && !lookahead_line.trim().is_empty()
                                    && !self.is_in_html_block(lookahead + 1)
                                    && !Self::is_potential_list_item(lookahead_line)
                                {
                                    found_indented = true;
                                    break;
                                } else if !lookahead_line.trim().is_empty() {
                                    break;
                                }
                                lookahead += 1;
                            }

                            if found_indented {
                                end_line += 1;
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }

                    code_blocks.push(CodeBlock {
                        start_line: i + 1,
                        end_line: end_line + 1,
                        language: None,
                        block_type: CodeBlockType::Indented,
                    });

                    // Resume after the block (the shared `i += 1` below steps past `end_line`).
                    i = end_line;
                }
            } else {
                if let Some(captures) = FENCED_END.captures(line) {
                    let indent = captures.get(1).map_or("", |m| m.as_str());
                    let fence = captures.get(2).map_or("", |m| m.as_str());

                    // Only a fence compatible with the opener closes the block;
                    // other fence lines are treated as block content.
                    if fence.starts_with(current_fence_char)
                        && fence.len() >= current_fence_length
                        && indent.len() <= current_fence_indent
                    {
                        code_blocks.push(CodeBlock {
                            start_line: current_block_start,
                            end_line: i + 1,
                            language: current_language.clone(),
                            block_type: CodeBlockType::Fenced,
                        });

                        in_code_block = false;
                        current_language = None;
                        current_fence_char = ' ';
                        current_fence_length = 0;
                        current_fence_indent = 0;
                    }
                }
            }

            i += 1;
        }

        // Unclosed fence: the block runs to the end of the document.
        if in_code_block {
            code_blocks.push(CodeBlock {
                start_line: current_block_start,
                end_line: lines.len(),
                language: current_language,
                block_type: CodeBlockType::Fenced,
            });
        }

        code_blocks
    }
614
615 fn populate_fenced_code_blocks(&mut self) {
617 self.fenced_code_block_starts.clear();
618 self.fenced_code_block_ends.clear();
619
620 for block in &self.code_blocks {
621 if let CodeBlockType::Fenced = block.block_type {
622 self.fenced_code_block_starts.push(block.start_line);
623 self.fenced_code_block_ends.push(block.end_line);
624 }
625 }
626 }
627
628 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
630 if let Some((start, end)) = self.front_matter_range {
631 line_num >= start && line_num <= end
632 } else {
633 false
634 }
635 }
636
637 #[inline]
642 pub fn count_trailing_spaces(line: &str) -> usize {
643 let content = line.strip_suffix('\n').unwrap_or(line);
645
646 let mut space_count = 0;
648 for c in content.chars().rev() {
649 if c == ' ' {
650 space_count += 1;
651 } else {
652 break;
653 }
654 }
655
656 space_count
657 }
658
659 #[inline]
664 pub fn has_trailing_spaces(line: &str) -> bool {
665 Self::count_trailing_spaces(line) > 0
666 }
667
668 #[inline]
674 fn is_indented_code_line(line: &str) -> bool {
675 if line.starts_with('\t') {
676 return true;
677 }
678
679 let mut space_count = 0;
681 for c in line.chars() {
682 if c == ' ' {
683 space_count += 1;
684 } else {
685 break;
686 }
687 }
688
689 space_count >= 4
690 }
691
692 #[inline]
695 fn is_potential_list_item(line: &str) -> bool {
696 lazy_static! {
697 static ref LIST_ITEM_PATTERN: Regex = Regex::new(
700 r"^[ \t]*([*+-]|\d+[.)]])[ \t]"
701 ).unwrap();
702 }
703 LIST_ITEM_PATTERN.is_match(line)
704 }
705
706 pub fn get_list_start_indices(&self) -> Vec<usize> {
709 if self.list_lines.is_empty() {
710 return Vec::new();
711 }
712
713 let mut list_starts = Vec::new();
714 let mut prev_line = 0;
715
716 for (i, &line_num) in self.list_lines.iter().enumerate() {
717 if i == 0 || line_num > prev_line + 1 {
720 list_starts.push(line_num - 1); }
722 prev_line = line_num;
723 }
724
725 list_starts
726 }
727
728 pub fn get_list_end_indices(&self) -> Vec<usize> {
731 if self.list_lines.is_empty() {
732 return Vec::new();
733 }
734
735 let mut list_ends = Vec::new();
736 let list_lines = &self.list_lines;
737
738 for (i, &line_num) in list_lines.iter().enumerate() {
739 if i == list_lines.len() - 1 || list_lines[i + 1] > line_num + 1 {
742 list_ends.push(line_num - 1); }
744 }
745
746 list_ends
747 }
748
749 fn detect_code_spans(&mut self, content: &str) {
751 self.code_spans.clear();
753
754 let lines: Vec<&str> = content.lines().collect();
755
756 for (line_num, line) in lines.iter().enumerate() {
759 if self.is_in_code_block(line_num + 1) {
761 continue;
762 }
763
764 if line.is_empty() {
766 continue;
767 }
768
769 let mut i = 0;
770 while i < line.len() {
771 if let Some(start_pos) = line[i..].find('`') {
773 let start_idx = i + start_pos;
774
775 if let Some(end_pos) = line[start_idx + 1..].find('`') {
777 let end_idx = start_idx + 1 + end_pos;
778
779 let content = line[start_idx + 1..end_idx].to_string();
781
782 self.code_spans.push(CodeSpan {
784 line: line_num + 1, start_col: start_idx + 1, end_col: end_idx + 1, content,
788 });
789
790 for col in start_idx..=end_idx {
792 if col < self.in_code_span[line_num].len() {
793 self.in_code_span[line_num][col] = true;
794 }
795 }
796
797 i = end_idx + 1;
799 } else {
800 i = start_idx + 1;
802 }
803 } else {
804 break;
806 }
807 }
808 }
809 }
810
811 fn detect_links_and_images(&mut self, content: &str) {
813 lazy_static! {
814 static ref INLINE_LINK: FancyRegex = FancyRegex::new(r"(?x)
816 (?<!\\) # Not preceded by backslash
817 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text (handles nested brackets and escapes)
818 \(([^)]*)\) # URL in parentheses
819 ").unwrap();
820 static ref REFERENCE_LINK: FancyRegex = FancyRegex::new(r"(?x)
822 (?<!\\) # Not preceded by backslash
823 \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Link text (handles nested brackets and escapes)
824 \[([^\]]*)\] # Reference ID
825 ").unwrap();
826 static ref SHORTCUT_LINK: FancyRegex = FancyRegex::new(r"(?x)
828 (?<!\\) # Not preceded by backslash
829 \[([^\]]+)\] # Link text
830 (?!\(|\[) # Not followed by ( or [
831 ").unwrap();
832 static ref LINK_DEFINITION: Regex = Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap();
834 static ref INLINE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
836 (?<!\\) # Not preceded by backslash
837 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
838 \(([^)]*)\) # Source URL
839 ").unwrap();
840 static ref REFERENCE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
842 (?<!\\) # Not preceded by backslash
843 !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
844 \[([^\]]*)\] # Reference ID
845 ").unwrap();
846 }
847
848 self.links.clear();
850 self.images.clear();
851
852 let lines: Vec<&str> = content.lines().collect();
853
854 let mut link_defs = std::collections::HashMap::new();
856 for (line_num, line) in lines.iter().enumerate() {
857 if self.is_in_code_block(line_num + 1) {
859 continue;
860 }
861
862 if let Some(cap) = LINK_DEFINITION.captures(line) {
864 let id = cap.get(1).map_or("", |m| m.as_str()).to_string();
865 let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
866 link_defs.insert(id.to_lowercase(), url);
867 }
868 }
869
870 for (line_num, line) in lines.iter().enumerate() {
872 if self.is_in_code_block(line_num + 1) {
874 continue;
875 }
876
877 if line.is_empty() {
879 continue;
880 }
881
882 if !line.contains('[') && !line.contains('!') {
884 continue;
885 }
886
887 let mut i = 0;
889 while i < line.len() {
890 if i < self.in_code_span[line_num].len() && self.in_code_span[line_num][i] {
892 i += 1;
893 continue;
894 }
895
896 if let Some(rest) = line.get(i..) {
898 if rest.starts_with('[') {
899 let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
901 let is_escaped_image =
902 i > 1 && line.chars().nth(i - 2) == Some('\\') && line.chars().nth(i - 1) == Some('!');
903 if !is_escaped && !is_escaped_image {
904 if let Ok(Some(cap)) = INLINE_LINK.captures(rest) {
905 let whole_match = cap.get(0).unwrap();
906 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
907 let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
908
909 let is_in_span = (i..i + whole_match.end()).any(|pos| {
911 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
912 });
913
914 if !is_in_span {
915 self.links.push(Link {
916 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), text,
920 url,
921 is_reference: false,
922 reference_id: None,
923 });
924 }
925
926 i += whole_match.end();
928 } else if let Ok(Some(cap)) = REFERENCE_LINK.captures(rest) {
929 let whole_match = cap.get(0).unwrap();
930 let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
931 let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
932
933 let ref_id = if id.is_empty() { text.clone() } else { id };
935
936 let url = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
938
939 let is_in_span = (i..i + whole_match.end()).any(|pos| {
941 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
942 });
943
944 if !is_in_span {
945 self.links.push(Link {
946 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), text,
950 url,
951 is_reference: true,
952 reference_id: Some(ref_id),
953 });
954 }
955
956 i += whole_match.end();
958 } else {
959 i += 1;
961 }
962 } else {
963 i += 1;
965 }
966 } else if rest.starts_with("![") {
967 let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
969 if !is_escaped {
970 if let Ok(Some(cap)) = INLINE_IMAGE.captures(rest) {
971 let whole_match = cap.get(0).unwrap();
972 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
973 let src = cap.get(2).map_or("", |m| m.as_str()).to_string();
974
975 let is_in_span = (i..i + whole_match.end()).any(|pos| {
977 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
978 });
979
980 if !is_in_span {
981 self.images.push(Image {
982 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), alt_text,
986 src,
987 is_reference: false,
988 reference_id: None,
989 });
990 }
991
992 i += whole_match.end();
994 } else if let Ok(Some(cap)) = REFERENCE_IMAGE.captures(rest) {
995 let whole_match = cap.get(0).unwrap();
996 let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
997 let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
998
999 let ref_id = if id.is_empty() { alt_text.clone() } else { id };
1001
1002 let src = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
1004
1005 let is_in_span = (i..i + whole_match.end()).any(|pos| {
1007 pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
1008 });
1009
1010 if !is_in_span {
1011 self.images.push(Image {
1012 line: line_num + 1, start_col: i + 1, end_col: i + whole_match.end(), alt_text,
1016 src,
1017 is_reference: true,
1018 reference_id: Some(ref_id),
1019 });
1020 }
1021
1022 i += whole_match.end();
1024 } else {
1025 i += 1;
1027 }
1028 } else {
1029 i += 1;
1031 }
1032 } else {
1033 i += 1;
1035 }
1036 } else {
1037 break;
1039 }
1040 }
1041 }
1042 }
1043
1044 fn detect_list_items(&mut self, content: &str) {
1046 lazy_static! {
1053 static ref UL_MARKER: FancyRegex =
1054 FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1055 static ref OL_MARKER: FancyRegex =
1056 FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1057 static ref TASK_MARKER: FancyRegex = FancyRegex::new(
1058 r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)\[(?P<checked>[ xX])\](?P<content>.*)$"
1059 )
1060 .unwrap();
1061 }
1062 self.list_items.clear();
1063 self.list_lines.clear();
1064 let lines: Vec<&str> = content.lines().collect();
1065 for (line_num, line) in lines.iter().enumerate() {
1066 if self.is_in_code_block(line_num + 1) || self.is_in_front_matter(line_num + 1) {
1067 continue;
1068 }
1069 if line.trim().is_empty() {
1070 continue;
1071 }
1072 if let Ok(Some(cap)) = TASK_MARKER.captures(line) {
1074 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1075 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1076 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1077 self.list_lines.push(line_num + 1);
1078 self.list_items.push(ListItem {
1079 line_number: line_num + 1,
1080 indentation,
1081 marker: marker.clone(),
1082 marker_type: ListMarkerType::Task,
1083 content,
1084 });
1085 continue;
1086 }
1087 if let Ok(Some(cap)) = UL_MARKER.captures(line) {
1088 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1089 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1090 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1091 self.list_lines.push(line_num + 1);
1092 self.list_items.push(ListItem {
1093 line_number: line_num + 1,
1094 indentation,
1095 marker: marker.clone(),
1096 marker_type: ListMarkerType::Unordered,
1097 content,
1098 });
1099 continue;
1100 }
1101 if let Ok(Some(cap)) = OL_MARKER.captures(line) {
1102 let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1103 let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1104 let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1105 self.list_lines.push(line_num + 1);
1106 self.list_items.push(ListItem {
1107 line_number: line_num + 1,
1108 indentation,
1109 marker: marker.clone(),
1110 marker_type: ListMarkerType::Ordered,
1111 content,
1112 });
1113 continue;
1114 }
1115 }
1116 }
1117
1118 fn detect_blockquotes(&mut self, content: &str) {
1120 lazy_static! {
1121 static ref BLOCKQUOTE_MARKER: Regex = Regex::new(r"^\s*>(.*)$").unwrap();
1122 }
1123
1124 self.blockquotes.clear();
1126
1127 let lines: Vec<&str> = content.lines().collect();
1128
1129 let mut in_blockquote = false;
1132 let mut start_line = 0;
1133
1134 for (i, line) in lines.iter().enumerate() {
1135 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1137 continue;
1138 }
1139
1140 let is_blockquote_line = BLOCKQUOTE_MARKER.is_match(line);
1141
1142 if is_blockquote_line {
1143 self.in_blockquote[i] = true;
1145
1146 if !in_blockquote {
1147 in_blockquote = true;
1149 start_line = i + 1; }
1151 } else if in_blockquote {
1152 self.blockquotes.push(BlockquoteRange {
1154 start_line,
1155 end_line: i, });
1157
1158 in_blockquote = false;
1159 }
1160 }
1161
1162 if in_blockquote {
1164 self.blockquotes.push(BlockquoteRange {
1165 start_line,
1166 end_line: lines.len(), });
1168 }
1169 }
1170
1171 fn detect_horizontal_rules(&mut self, content: &str) {
1173 lazy_static! {
1174 static ref HR_HYPHEN: Regex = Regex::new(r"^[ \t]*-[ \t]*-[ \t]*-[ \t-]*$").unwrap();
1176 static ref HR_ASTERISK: Regex = Regex::new(r"^[ \t]*\*[ \t]*\*[ \t]*\*[ \t\*]*$").unwrap();
1177 static ref HR_UNDERSCORE: Regex = Regex::new(r"^[ \t]*_[ \t]*_[ \t]*_[ \t_]*$").unwrap();
1178 }
1179
1180 self.horizontal_rule_lines.clear();
1182
1183 let lines: Vec<&str> = content.lines().collect();
1184
1185 for (i, line) in lines.iter().enumerate() {
1186 if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1188 continue;
1189 }
1190
1191 if HR_HYPHEN.is_match(line) || HR_ASTERISK.is_match(line) || HR_UNDERSCORE.is_match(line) {
1193 let is_setext_marker = if i > 0 {
1196 let prev_line = lines[i - 1].trim();
1197 !prev_line.is_empty()
1198 && !self.is_in_code_block(i)
1199 && !self.is_in_front_matter(i)
1200 && line.trim().chars().all(|c| c == '-' || c == ' ')
1201 } else {
1202 false
1203 };
1204
1205 if !is_setext_marker {
1206 self.horizontal_rule_lines.push(i + 1); }
1208 }
1209 }
1210 }
1211
1212 fn detect_html_blocks(&mut self, content: &str) {
1214 let lines: Vec<&str> = content.lines().collect();
1215 let mut i = 0;
1218 while i < lines.len() {
1219 let line = lines[i];
1220 let trimmed = line.trim_start();
1221
1222 if self.is_in_code_block(i + 1) {
1224 i += 1;
1225 continue;
1226 }
1227
1228 if self.is_html_block_start(trimmed) {
1230 let start_line = i;
1231
1232 let end_line = self.find_html_block_end(&lines, start_line);
1234
1235 for line_idx in start_line..=end_line {
1237 if line_idx < self.in_html_block.len() {
1238 self.in_html_block[line_idx] = true;
1239 }
1240 }
1241
1242 i = end_line + 1;
1244 } else {
1245 i += 1;
1246 }
1247 }
1248 }
1249
1250 fn is_html_block_start(&self, trimmed: &str) -> bool {
1252 if trimmed.is_empty() || !trimmed.starts_with('<') {
1253 return false;
1254 }
1255
1256 let mut chars = trimmed[1..].chars();
1258 let mut tag_name = String::new();
1259
1260 let is_closing = chars.as_str().starts_with('/');
1262 if is_closing {
1263 chars.next(); }
1265
1266 for ch in chars {
1268 if ch.is_ascii_alphabetic() || ch == '-' {
1269 tag_name.push(ch);
1270 } else {
1271 break;
1272 }
1273 }
1274
1275 if tag_name.is_empty() {
1276 return false;
1277 }
1278
1279 const BLOCK_ELEMENTS: &[&str] = &[
1281 "address",
1282 "article",
1283 "aside",
1284 "base",
1285 "basefont",
1286 "blockquote",
1287 "body",
1288 "caption",
1289 "center",
1290 "col",
1291 "colgroup",
1292 "dd",
1293 "details",
1294 "dialog",
1295 "dir",
1296 "div",
1297 "dl",
1298 "dt",
1299 "fieldset",
1300 "figcaption",
1301 "figure",
1302 "footer",
1303 "form",
1304 "frame",
1305 "frameset",
1306 "h1",
1307 "h2",
1308 "h3",
1309 "h4",
1310 "h5",
1311 "h6",
1312 "head",
1313 "header",
1314 "hr",
1315 "html",
1316 "iframe",
1317 "legend",
1318 "li",
1319 "link",
1320 "main",
1321 "menu",
1322 "menuitem",
1323 "nav",
1324 "noframes",
1325 "ol",
1326 "optgroup",
1327 "option",
1328 "p",
1329 "param",
1330 "section",
1331 "source",
1332 "summary",
1333 "table",
1334 "tbody",
1335 "td",
1336 "tfoot",
1337 "th",
1338 "thead",
1339 "title",
1340 "tr",
1341 "track",
1342 "ul",
1343 "img",
1344 "picture",
1345 ];
1346
1347 BLOCK_ELEMENTS.contains(&tag_name.to_ascii_lowercase().as_str())
1348 }
1349
1350 fn find_html_block_end(&self, lines: &[&str], start_line: usize) -> usize {
1352 let start_trimmed = lines[start_line].trim_start();
1353
1354 let tag_name = self.extract_tag_name(start_trimmed);
1356
1357 for (i, line) in lines.iter().enumerate().skip(start_line + 1) {
1359 let trimmed = line.trim();
1360
1361 if trimmed.is_empty() {
1363 return i - 1; }
1365
1366 if let Some(ref tag) = tag_name {
1368 let closing_tag = format!("</{tag}");
1369 if trimmed.contains(&closing_tag) {
1370 return i;
1371 }
1372 }
1373 }
1374
1375 lines.len() - 1
1377 }
1378
1379 fn extract_tag_name(&self, trimmed: &str) -> Option<String> {
1381 if !trimmed.starts_with('<') {
1382 return None;
1383 }
1384
1385 let mut chars = trimmed[1..].chars();
1386
1387 if chars.as_str().starts_with('/') {
1389 chars.next();
1390 }
1391
1392 let mut tag_name = String::new();
1393 for ch in chars {
1394 if ch.is_ascii_alphabetic() || ch == '-' {
1395 tag_name.push(ch);
1396 } else {
1397 break;
1398 }
1399 }
1400
1401 if tag_name.is_empty() {
1402 None
1403 } else {
1404 Some(tag_name.to_ascii_lowercase())
1405 }
1406 }
1407
1408 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1410 if line_num == 0 || line_num > self.in_code_span.len() {
1411 return false;
1412 }
1413
1414 let line_idx = line_num - 1; if col == 0 || col > self.in_code_span[line_idx].len() {
1417 return false;
1418 }
1419
1420 self.in_code_span[line_idx][col - 1] }
1422
1423 pub fn is_in_blockquote(&self, line_num: usize) -> bool {
1425 if line_num == 0 || line_num > self.in_blockquote.len() {
1426 return false;
1427 }
1428
1429 self.in_blockquote[line_num - 1] }
1431
1432 pub fn get_list_item_at_line(&self, line_num: usize) -> Option<&ListItem> {
1434 self.list_items.iter().find(|item| item.line_number == line_num)
1435 }
1436
1437 pub fn get_list_items_by_type(&self, marker_type: ListMarkerType) -> Vec<&ListItem> {
1439 self.list_items
1440 .iter()
1441 .filter(|item| item.marker_type == marker_type)
1442 .collect()
1443 }
1444
1445 pub fn get_empty_links(&self) -> Vec<&Link> {
1447 self.links
1448 .iter()
1449 .filter(|link| link.text.trim().is_empty() || link.url.trim().is_empty())
1450 .collect()
1451 }
1452
1453 pub fn get_images_without_alt_text(&self) -> Vec<&Image> {
1455 self.images
1456 .iter()
1457 .filter(|img| img.alt_text.trim().is_empty())
1458 .collect()
1459 }
1460
1461 pub fn is_in_html_block(&self, line_num: usize) -> bool {
1463 if line_num == 0 || line_num > self.in_html_block.len() {
1464 return false;
1465 }
1466 self.in_html_block[line_num - 1]
1467 }
1468}
1469
/// Hooks that let an implementor consult a [`DocumentStructure`] when deciding
/// what to process. Both methods have default implementations.
pub trait DocumentStructureExtensions {
    /// Returns `true` when the 1-based `line_num` should be processed.
    ///
    /// The default processes every line that is not flagged as code block content.
    fn should_process_line(&self, line_num: usize, doc_structure: &DocumentStructure) -> bool {
        !doc_structure.is_in_code_block(line_num)
    }

    /// Returns `true` when the document contains anything relevant to the implementor.
    ///
    /// The default ignores both arguments and always answers `true`.
    fn has_relevant_elements(
        &self,
        _ctx: &crate::lint_context::LintContext,
        _doc_structure: &DocumentStructure,
    ) -> bool {
        true
    }
}
1488
/// Builds a [`DocumentStructure`] from `content`.
///
/// This is a free-function wrapper that forwards directly to
/// [`DocumentStructure::new`].
pub fn document_structure_from_str(content: &str) -> DocumentStructure {
    DocumentStructure::new(content)
}
1493
1494#[cfg(test)]
1495mod tests {
1496 use super::*;
1497
1498 #[test]
1499 fn test_document_structure_creation() {
1500 let content = "# Heading 1\n\nSome text.\n\n## Heading 2\n\nMore text.\n\n```\nCode block\n```\n";
1501 let structure = DocumentStructure::new(content);
1502
1503 assert_eq!(structure.heading_lines.len(), 2);
1504 assert_eq!(structure.heading_levels.len(), 2);
1505 assert!(structure.has_code_blocks);
1506 assert_eq!(structure.code_blocks.len(), 1);
1507 }
1508
    // An outer fenced block that itself contains fence lines must be recorded
    // as a single block.
    #[test]
    fn test_nested_code_blocks() {
        // Nine-line fixture: a `markdown` fence wrapping a list whose first
        // item contains a `python` fence.
        // NOTE(review): the inner fence lines are indented by a single space
        // here, which is short of the list item's content column — confirm the
        // fixture's intended indentation.
        let content = r#"```markdown
1. First item

 ```python
 code_in_list()
 ```

2. Second item
```"#;

        let structure = DocumentStructure::new(content);

        // Only the outer block is recorded, spanning the whole fixture.
        assert_eq!(structure.code_blocks.len(), 1);
        assert_eq!(structure.code_blocks[0].start_line, 1);
        assert_eq!(structure.code_blocks[0].end_line, 9);

        // Every line between the outer fences, inner fence lines included,
        // is reported as code.
        for line in 2..=8 {
            assert!(structure.is_in_code_block(line), "Line {line} should be in code block");
        }
    }
1533
1534 #[test]
1535 fn test_document_with_front_matter() {
1536 let content = "---\ntitle: Test Document\ndate: 2021-01-01\n---\n\n# Heading 1\n\nSome text.\n";
1537 let structure = DocumentStructure::new(content);
1538
1539 assert!(structure.has_front_matter);
1540 assert!(structure.front_matter_range.is_some());
1541 assert_eq!(structure.heading_lines.len(), 1);
1542 assert!(!structure.has_code_blocks);
1543 }
1544
1545 #[test]
1546 fn test_is_in_code_block() {
1547 let content = "# Heading\n\nText.\n\n```\ncode line 1\ncode line 2\n```\n\nMore text.\n";
1548 let structure = DocumentStructure::new(content);
1549
1550 assert!(!structure.is_in_code_block(1)); assert!(!structure.is_in_code_block(3)); assert!(!structure.is_in_code_block(5)); assert!(structure.is_in_code_block(6)); assert!(structure.is_in_code_block(7)); assert!(!structure.is_in_code_block(8)); assert!(!structure.is_in_code_block(10)); }
1558
1559 #[test]
1560 fn test_headings_edge_cases() {
1561 let content =
1563 " # ATX Heading\n# Closed ATX Heading #\nSetext H1\n=======\nSetext H2\n-------\n\n# ATX Again\n";
1564 let structure = DocumentStructure::new(content);
1565 assert_eq!(structure.heading_lines, vec![1, 2, 3, 5, 8]);
1566 assert_eq!(structure.heading_levels, vec![1, 1, 1, 2, 1]);
1567
1568 let content = "---\ntitle: Test\n---\n# Heading 1\n\n```\n# Not a heading\n```\n# Heading 2\n";
1570 let structure = DocumentStructure::new(content);
1571 assert_eq!(structure.heading_lines, vec![4, 9]);
1572 assert_eq!(structure.heading_levels, vec![1, 1]);
1573
1574 let content = "#\n## \n### \n# Not Empty\n";
1576 let structure = DocumentStructure::new(content);
1577 assert_eq!(structure.heading_lines, vec![4]);
1578 assert_eq!(structure.heading_levels, vec![1]);
1579
1580 let content = "# Heading \n# Heading\n";
1582 let structure = DocumentStructure::new(content);
1583 assert_eq!(structure.heading_lines, vec![1, 2]);
1584 assert_eq!(structure.heading_levels, vec![1, 1]);
1585
1586 let content = " # Indented\n # Not a heading (too much indent)\n# Valid\n";
1588 let structure = DocumentStructure::new(content);
1589 assert_eq!(structure.heading_lines, vec![1, 3]);
1590 assert_eq!(structure.heading_levels, vec![1, 1]);
1591
1592 let content = "# Dup\n# Dup\n# Unique\n# Dup\n";
1594 let structure = DocumentStructure::new(content);
1595 assert_eq!(structure.heading_lines, vec![1, 2, 3, 4]);
1596 assert_eq!(structure.heading_levels, vec![1, 1, 1, 1]);
1597
1598 let content = "```\n# Not a heading\n```\n# Real Heading\n";
1600 let structure = DocumentStructure::new(content);
1601 assert_eq!(structure.heading_lines, vec![4]);
1602 assert_eq!(structure.heading_levels, vec![1]);
1603
1604 let content = "---\ntitle: Test\n---\n# Heading\n";
1605 let structure = DocumentStructure::new(content);
1606 assert_eq!(structure.heading_lines, vec![4]);
1607 assert_eq!(structure.heading_levels, vec![1]);
1608
1609 let content = "\nSetext\n=======\n\nSetext2\n-------\n";
1611 let structure = DocumentStructure::new(content);
1612 assert_eq!(structure.heading_lines, vec![2, 5]);
1613 assert_eq!(structure.heading_levels, vec![1, 2]);
1614
1615 let content = "# Heading!@#$%^&*()\nSetext Special\n=======\n";
1617 let structure = DocumentStructure::new(content);
1618 assert_eq!(structure.heading_lines, vec![1, 2]);
1619 assert_eq!(structure.heading_levels, vec![1, 1]);
1620 }
1621
1622 #[test]
1623 fn test_horizontal_rule_detection() {
1624 let content = "Text\n\n---\n\nMore text\n\n***\n\nFinal\n\n___\n\nEnd";
1626 let structure = DocumentStructure::new(content);
1627 assert_eq!(structure.horizontal_rule_lines, vec![3, 7, 11]);
1628
1629 let content = "Text\n\n- - -\n\n* * *\n\n_ _ _\n\nEnd";
1631 let structure = DocumentStructure::new(content);
1632 assert_eq!(structure.horizontal_rule_lines, vec![3, 5, 7]);
1633
1634 let content = "# ATX\n\nSetext\n------\n\n---\n\nAnother\n======\n";
1636 let structure = DocumentStructure::new(content);
1637 assert_eq!(structure.horizontal_rule_lines, vec![6]); assert_eq!(structure.heading_lines, vec![1, 3, 8]); let content = "Text\n\n```\n---\n***\n```\n\n---\n\nEnd";
1642 let structure = DocumentStructure::new(content);
1643 assert_eq!(structure.horizontal_rule_lines, vec![8]); let content = "---\ntitle: Test\n---\n\n---\n\nContent";
1647 let structure = DocumentStructure::new(content);
1648 assert_eq!(structure.horizontal_rule_lines, vec![5]); }
1650}