rumdl_lib/utils/
document_structure.rs

1use crate::rules::heading_utils::HeadingStyle;
2use fancy_regex::Regex as FancyRegex;
3use lazy_static::lazy_static;
4use regex::Regex;
5
6/// A struct that contains pre-computed information about a markdown document structure
7/// to avoid redundant parsing of the same elements by multiple rules.
8#[derive(Debug, Clone)]
9pub struct DocumentStructure {
10    /// Information about code block regions
11    pub code_blocks: Vec<CodeBlock>,
12    /// Whether the document contains code blocks
13    pub has_code_blocks: bool,
14    /// Line numbers of headings (1-indexed)
15    pub heading_lines: Vec<usize>,
16    /// Heading levels (1-6) for each heading
17    pub heading_levels: Vec<usize>,
18    /// Heading regions (start_line, end_line) for each heading (ATX: start==end, Setext: start=content, end=marker)
19    pub heading_regions: Vec<(usize, usize)>,
20    /// Line numbers of list items (1-indexed)
21    pub list_lines: Vec<usize>,
22    /// Whether the document contains front matter
23    pub has_front_matter: bool,
24    /// Line range of front matter (1-indexed, inclusive)
25    pub front_matter_range: Option<(usize, usize)>,
26    /// Whether the document contains URLs
27    pub has_urls: bool,
28    /// Whether the document contains inline HTML
29    pub has_html: bool,
30    /// Bitmap of code block regions for fast lookups
31    pub in_code_block: Vec<bool>,
32    /// Line numbers of fenced code block starts (1-indexed)
33    pub fenced_code_block_starts: Vec<usize>,
34    /// Line numbers of fenced code block ends (1-indexed)
35    pub fenced_code_block_ends: Vec<usize>,
36    /// Style of the first heading found in the document (for consistent style rules)
37    pub first_heading_style: Option<HeadingStyle>,
38    /// OPTIMIZATION 1: Detailed information about inline code spans
39    pub code_spans: Vec<CodeSpan>,
40    /// OPTIMIZATION 1: Bitmap indicating which line-column positions are within code spans
41    pub in_code_span: Vec<Vec<bool>>,
42    /// OPTIMIZATION 2: Collection of links in the document
43    pub links: Vec<Link>,
44    /// OPTIMIZATION 2: Collection of images in the document
45    pub images: Vec<Image>,
46    /// OPTIMIZATION 3: Detailed information about list items
47    pub list_items: Vec<ListItem>,
48    /// OPTIMIZATION 4: Blockquotes in the document
49    pub blockquotes: Vec<BlockquoteRange>,
50    /// OPTIMIZATION 4: Bitmap indicating which lines are inside blockquotes
51    pub in_blockquote: Vec<bool>,
52    /// Bitmap indicating which lines are inside HTML blocks
53    pub in_html_block: Vec<bool>,
54    /// Line numbers of horizontal rules (1-indexed)
55    pub horizontal_rule_lines: Vec<usize>,
56}
57
/// A front matter block together with its raw text.
#[derive(Debug, Clone)]
pub struct FrontMatter {
    /// Line on which the block starts (presumably 1-indexed like the rest of this file; confirm with the producer).
    pub start_line: usize,
    /// Line on which the block ends (presumably 1-indexed and inclusive; confirm with the producer).
    pub end_line: usize,
    /// Text of the front matter block.
    pub content: String,
}
65
/// Description of a single heading.
///
/// Every field has a total equality, so `Eq` is derived alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Heading {
    /// Heading text (presumably with markers removed; confirm with the producer).
    pub text: String,
    /// Heading level.
    pub level: u32,
    /// Line on which the heading appears.
    pub line_number: usize,
    /// The heading as originally written (presumably the raw source line; confirm with the producer).
    pub original_text: String,
    /// Leading indentation that preceded the heading.
    pub indentation: String,
}
75
76/// Simple code block representation for document structure
77#[derive(Debug, Clone)]
78pub struct CodeBlock {
79    /// The line where the code block starts (1-indexed)
80    pub start_line: usize,
81    /// The line where the code block ends (1-indexed, inclusive)
82    pub end_line: usize,
83    /// Optional language specifier
84    pub language: Option<String>,
85    /// Type of code block (fenced or indented)
86    pub block_type: CodeBlockType,
87}
88
/// The two kinds of code block the structure distinguishes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Block delimited by ``` or ~~~ fences.
    Fenced,
    /// Block formed by indentation alone.
    Indented,
}
97
98/// List item information
99#[derive(Debug, Clone)]
100pub struct ListItem {
101    pub line_number: usize,
102    pub indentation: usize,
103    pub marker: String,
104    pub marker_type: ListMarkerType,
105    pub content: String,
106}
107
/// Category of a list marker.
///
/// The enum carries no data, so equality is total and `Eq` is derived alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Unordered list marker.
    Unordered,
    /// Ordered list marker.
    Ordered,
    /// Task list marker.
    Task,
}
115
/// Span of lines covered by one blockquote.
#[derive(Debug, Clone)]
pub struct BlockquoteRange {
    /// First line of the blockquote (presumably 1-indexed; confirm with the producer).
    pub start_line: usize,
    /// Last line of the blockquote (presumably 1-indexed and inclusive; confirm with the producer).
    pub end_line: usize,
}
122
/// One inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line containing the span (1-indexed).
    pub line: usize,
    /// Column at which the span starts (1-indexed).
    pub start_col: usize,
    /// Column at which the span ends (1-indexed).
    pub end_col: usize,
    /// Text between the backticks; the backticks themselves are excluded.
    pub content: String,
}
135
/// One link found in the document.
#[derive(Debug, Clone)]
pub struct Link {
    /// Line containing the link (1-indexed).
    pub line: usize,
    /// Column at which the link starts (1-indexed).
    pub start_col: usize,
    /// Column at which the link ends (1-indexed).
    pub end_col: usize,
    /// Text shown for the link.
    pub text: String,
    /// Destination URL.
    pub url: String,
    /// `true` for reference-style links of the form `[text][reference]`.
    pub is_reference: bool,
    /// Reference ID, for reference links.
    pub reference_id: Option<String>,
}
154
/// One image found in the document.
#[derive(Debug, Clone)]
pub struct Image {
    /// Line containing the image (1-indexed).
    pub line: usize,
    /// Column at which the image starts (1-indexed).
    pub start_col: usize,
    /// Column at which the image ends (1-indexed).
    pub end_col: usize,
    /// Alternative text of the image.
    pub alt_text: String,
    /// Source URL of the image.
    pub src: String,
    /// `true` for reference-style images of the form `![text][reference]`.
    pub is_reference: bool,
    /// Reference ID, for reference images.
    pub reference_id: Option<String>,
}
173
174// Cached regex patterns for performance
175lazy_static! {
176    // Quick check patterns
177    static ref CONTAINS_ATX_HEADING: Regex = Regex::new(r"(?m)^(\s*)#{1,6}").unwrap();
178    static ref CONTAINS_SETEXT_UNDERLINE: Regex = Regex::new(r"(?m)^(\s*)(=+|-+)\s*$").unwrap();
179    static ref CONTAINS_LIST_MARKERS: Regex = Regex::new(r"(?m)^(\s*)([*+-]|\d+\.)").unwrap();
180    static ref CONTAINS_BLOCKQUOTE: Regex = Regex::new(r"(?m)^(\s*)>").unwrap();
181    static ref CONTAINS_HTML_BLOCK: Regex = Regex::new(r"(?m)^(\s*)<[a-zA-Z]").unwrap();
182}
183
184impl DocumentStructure {
185    /// Create a new DocumentStructure by analyzing the document content
186    pub fn new(content: &str) -> Self {
187        // Initialize with default values
188        let mut structure = DocumentStructure {
189            code_blocks: Vec::new(),
190            has_code_blocks: false,
191            heading_lines: Vec::new(),
192            heading_levels: Vec::new(),
193            heading_regions: Vec::new(),
194            list_lines: Vec::new(),
195            has_front_matter: false,
196            front_matter_range: None,
197            has_urls: false,
198            has_html: false,
199            in_code_block: Vec::new(),
200            fenced_code_block_starts: Vec::new(),
201            fenced_code_block_ends: Vec::new(),
202            first_heading_style: None,
203            // Initialize new optimization fields
204            code_spans: Vec::new(),
205            in_code_span: Vec::new(),
206            links: Vec::new(),
207            images: Vec::new(),
208            list_items: Vec::new(),
209            blockquotes: Vec::new(),
210            in_blockquote: Vec::new(),
211            in_html_block: Vec::new(),
212            horizontal_rule_lines: Vec::new(),
213        };
214
215        // Analyze the document and populate the structure
216        structure.analyze(content);
217        structure
218    }
219
220    /// Analyze the document content and populate the structure
221    fn analyze(&mut self, content: &str) {
222        // Early return for empty content
223        if content.is_empty() {
224            return;
225        }
226
227        // Initialize line-based bitmaps early to avoid index errors
228        let lines: Vec<&str> = content.lines().collect();
229        self.in_code_span = vec![Vec::new(); lines.len()];
230        for (i, line) in lines.iter().enumerate() {
231            self.in_code_span[i] = vec![false; line.len() + 1]; // +1 for 1-indexed columns
232        }
233        self.in_blockquote = vec![false; lines.len()];
234        self.in_html_block = vec![false; lines.len()];
235
236        // Detect front matter FIRST (needed before heading detection)
237        self.detect_front_matter(content);
238
239        // Quick checks to skip expensive operations if not needed
240        let has_blockquote_markers = CONTAINS_BLOCKQUOTE.is_match(content);
241        let has_html_blocks = CONTAINS_HTML_BLOCK.is_match(content);
242
243        // Detect HTML blocks BEFORE computing code blocks (HTML blocks should not be treated as indented code)
244        if has_html_blocks {
245            self.detect_html_blocks(content);
246        }
247
248        // Compute code blocks
249        self.code_blocks = self.compute_code_blocks(content);
250        self.has_code_blocks = !self.code_blocks.is_empty();
251
252        // Compute bitmap of code block regions
253        self.compute_code_block_bitmap(content);
254
255        // Populate fenced code block starts and ends
256        self.populate_fenced_code_blocks();
257        let has_backticks = content.contains('`');
258        let has_brackets = content.contains('[');
259        let has_headings = CONTAINS_ATX_HEADING.is_match(content) || CONTAINS_SETEXT_UNDERLINE.is_match(content);
260        // More comprehensive list detection to handle edge cases
261        let has_list_markers = CONTAINS_LIST_MARKERS.is_match(content)
262            || content.contains("- ")
263            || content.contains("* ")
264            || content.contains("+ ")
265            || content.contains("1. ")
266            || content.contains("2. ")
267            || content.contains("3. ")
268            || content.contains("4. ")
269            || content.contains("5. ")
270            || content.contains("6. ")
271            || content.contains("7. ")
272            || content.contains("8. ")
273            || content.contains("9. ")
274            || content.contains("10. ")
275            || content.contains("11. ")
276            || content.contains("12. ");
277
278        // OPTIMIZATION 4: Detect blockquotes only if needed
279        if has_blockquote_markers {
280            self.detect_blockquotes(content);
281        }
282
283        // OPTIMIZATION 1: Detect inline code spans only if needed
284        if has_backticks {
285            self.detect_code_spans(content);
286        }
287
288        // OPTIMIZATION 2: Detect links and images only if needed
289        if has_brackets {
290            self.detect_links_and_images(content);
291        }
292
293        // Detect headings only if needed
294        if has_headings {
295            self.detect_headings(content);
296        }
297
298        // OPTIMIZATION 3: Detect lists only if needed
299        if has_list_markers {
300            self.detect_list_items(content);
301        }
302
303        // Detect horizontal rules only if needed
304        let has_potential_hrs = content.contains("---")
305            || content.contains("***")
306            || content.contains("___")
307            || content.contains("- -")
308            || content.contains("* *")
309            || content.contains("_ _");
310        if has_potential_hrs {
311            self.detect_horizontal_rules(content);
312        }
313
314        // Check for URLs only if needed
315        if crate::utils::early_returns::has_urls(content) {
316            self.has_urls = true;
317        }
318
319        // Check for HTML tags only if needed
320        if has_html_blocks && (content.contains("</") || content.contains("/>")) {
321            self.has_html = true;
322        }
323    }
324
325    /// Compute a bitmap of code block regions for fast lookups
326    fn compute_code_block_bitmap(&mut self, content: &str) {
327        let line_count = content.lines().count();
328        self.in_code_block = vec![false; line_count];
329
330        for block in &self.code_blocks {
331            let start = block.start_line.saturating_sub(1); // Convert 1-indexed to 0-indexed
332            let end = block.end_line.min(line_count); // Ensure we don't go out of bounds
333
334            // For fenced code blocks, skip the start and end lines (the "```" lines)
335            if let CodeBlockType::Fenced = block.block_type {
336                // Mark only the lines between fences as in code block
337                if end > start + 1 {
338                    for i in (start + 1)..(end - 1) {
339                        if i < self.in_code_block.len() {
340                            self.in_code_block[i] = true;
341                        }
342                    }
343                }
344            } else {
345                // For indented code blocks, mark all lines
346                for i in start..end {
347                    if i < self.in_code_block.len() {
348                        self.in_code_block[i] = true;
349                    }
350                }
351            }
352        }
353    }
354
355    /// Check if a particular line is inside a code block
356    pub fn is_in_code_block(&self, line_num: usize) -> bool {
357        if line_num == 0 || line_num > self.in_code_block.len() {
358            return false;
359        }
360        self.in_code_block[line_num - 1] // Convert 1-indexed to 0-indexed
361    }
362
363    /// Detect headings in the document
364    fn detect_headings(&mut self, content: &str) {
365        lazy_static! {
366            static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+|[^\s#])").unwrap();
367            static ref SETEXT_HEADING_UNDERLINE: Regex = Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
368        }
369
370        // Clear existing data
371        self.heading_lines.clear();
372        self.heading_levels.clear();
373        self.heading_regions.clear();
374        self.first_heading_style = None;
375
376        let lines: Vec<&str> = content.lines().collect();
377
378        for (i, line) in lines.iter().enumerate() {
379            // Skip lines in code blocks or front matter
380            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
381                continue;
382            }
383
384            // Skip empty lines
385            if line.trim().is_empty() {
386                continue;
387            }
388
389            // Check for ATX headings (both with and without spaces)
390            if let Some(captures) = ATX_HEADING.captures(line) {
391                let level = captures[2].len();
392                // Extract heading text after hashes and whitespace
393                let mut chars = line.trim().chars();
394                while chars.next() == Some('#') {}
395                let heading_text = chars.as_str().trim();
396                if heading_text.is_empty() {
397                    continue; // Skip empty ATX headings
398                }
399                self.heading_lines.push(i + 1);
400                self.heading_levels.push(level);
401                self.heading_regions.push((i + 1, i + 1)); // ATX: start==end
402
403                // If this is the first heading detected, set the style
404                if self.first_heading_style.is_none() {
405                    // Determine if it's a closed ATX heading
406                    if line.trim().ends_with('#') {
407                        self.first_heading_style = Some(HeadingStyle::AtxClosed);
408                    } else {
409                        self.first_heading_style = Some(HeadingStyle::Atx);
410                    }
411                }
412                continue;
413            }
414
415            // Check for setext headings (line with ===== or ----- below)
416            if i > 0 && !lines[i - 1].trim().is_empty() &&
417               !self.is_in_front_matter(i) && // Check that previous line is not in front matter
418               SETEXT_HEADING_UNDERLINE.is_match(line)
419            {
420                let content_line = lines[i - 1].trim();
421                if content_line.is_empty() {
422                    continue; // Skip empty Setext headings
423                }
424                let level = if line.trim().starts_with('=') { 1 } else { 2 };
425                self.heading_lines.push(i); // The heading is the previous line (content line)
426                self.heading_levels.push(level);
427                self.heading_regions.push((i, i + 1)); // Setext: (content, marker)
428
429                // If this is the first heading detected, set the style
430                if self.first_heading_style.is_none() {
431                    if level == 1 {
432                        self.first_heading_style = Some(HeadingStyle::Setext1);
433                    } else {
434                        self.first_heading_style = Some(HeadingStyle::Setext2);
435                    }
436                }
437            }
438        }
439
440        // Default to ATX if no headings are found
441        if self.heading_lines.is_empty() {
442            self.first_heading_style = Some(HeadingStyle::Atx);
443        }
444    }
445
446    /// Detect front matter in the document
447    fn detect_front_matter(&mut self, content: &str) {
448        let lines: Vec<&str> = content.lines().collect();
449
450        // Clear existing data
451        self.has_front_matter = false;
452        self.front_matter_range = None;
453
454        // If document starts with ---, it might have front matter
455        if !lines.is_empty() && lines[0] == "---" {
456            // Look for the closing delimiter
457            for (i, line) in lines.iter().enumerate().skip(1) {
458                if *line == "---" {
459                    self.has_front_matter = true;
460                    self.front_matter_range = Some((1, i + 1));
461                    break;
462                }
463            }
464        }
465    }
466
467    /// Compute code blocks in the document
468    fn compute_code_blocks(&self, content: &str) -> Vec<CodeBlock> {
469        lazy_static! {
470            // Fenced code blocks can be indented 0-3 spaces according to CommonMark
471            static ref FENCED_START: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*([^`\s]*)").unwrap();
472            static ref FENCED_END: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*$").unwrap();
473        }
474
475        let mut code_blocks = Vec::new();
476        let mut in_code_block = false;
477        let mut current_block_start = 0;
478        let mut current_language = None;
479        let mut current_fence_char = ' ';
480        let mut current_fence_length = 0; // Track fence length for proper nesting
481        let mut current_fence_indent = 0; // Track fence indentation
482        let lines: Vec<&str> = content.lines().collect();
483
484        let mut i = 0;
485        while i < lines.len() {
486            let line = lines[i];
487
488            if !in_code_block {
489                // Check for fenced code block start
490                if let Some(captures) = FENCED_START.captures(line) {
491                    in_code_block = true;
492                    current_block_start = i + 1;
493                    let indent = captures.get(1).map_or("", |m| m.as_str());
494                    current_fence_indent = indent.len();
495                    let fence = captures.get(2).map_or("```", |m| m.as_str());
496                    current_fence_char = fence.chars().next().unwrap();
497                    current_fence_length = fence.len();
498
499                    // Only set language if it's not empty
500                    let lang = captures.get(3).map(|m| m.as_str().to_string());
501                    current_language = lang.filter(|l| !l.is_empty());
502                }
503                // Check for indented code block (CommonMark compliant)
504                // But skip if we're inside an HTML block
505                else if Self::is_indented_code_line(line) && !line.trim().is_empty() && !self.is_in_html_block(i + 1)
506                {
507                    // According to CommonMark, any content indented by 4+ spaces OR a tab is a code block
508                    // unless it's inside an HTML block
509                    let mut end_line = i;
510
511                    // Find the end of this indented code block
512                    // Continue while we have indented lines OR blank lines that are followed by more indented lines
513                    while end_line + 1 < lines.len() {
514                        let next_line = lines[end_line + 1];
515
516                        if Self::is_indented_code_line(next_line)
517                            && !next_line.trim().is_empty()
518                            && !self.is_in_html_block(end_line + 2)
519                        {
520                            // Found another indented line that's not in HTML, continue the block
521                            end_line += 1;
522                        } else if next_line.trim().is_empty() {
523                            // Found a blank line, check if there are more indented lines after it
524                            let mut lookahead = end_line + 2;
525                            let mut found_indented = false;
526
527                            while lookahead < lines.len() {
528                                let lookahead_line = lines[lookahead];
529                                if Self::is_indented_code_line(lookahead_line)
530                                    && !lookahead_line.trim().is_empty()
531                                    && !self.is_in_html_block(lookahead + 1)
532                                {
533                                    found_indented = true;
534                                    break;
535                                } else if !lookahead_line.trim().is_empty() {
536                                    // Found non-empty, non-indented line, stop looking
537                                    break;
538                                }
539                                lookahead += 1;
540                            }
541
542                            if found_indented {
543                                // Include this blank line as part of the code block
544                                end_line += 1;
545                            } else {
546                                // No more indented lines, end the block here
547                                break;
548                            }
549                        } else {
550                            // Found non-empty, non-indented line, end the block
551                            break;
552                        }
553                    }
554
555                    code_blocks.push(CodeBlock {
556                        start_line: i + 1,
557                        end_line: end_line + 1,
558                        language: None,
559                        block_type: CodeBlockType::Indented,
560                    });
561
562                    // Skip to end of block
563                    i = end_line;
564                }
565            } else {
566                // Check for fenced code block end - must start with the same fence character,
567                // be at least as long as the opening fence, and have same or less indentation
568                if let Some(captures) = FENCED_END.captures(line) {
569                    let indent = captures.get(1).map_or("", |m| m.as_str());
570                    let fence = captures.get(2).map_or("", |m| m.as_str());
571
572                    // CommonMark: closing fence must have same or less indentation than opening
573                    if fence.starts_with(current_fence_char)
574                        && fence.len() >= current_fence_length
575                        && indent.len() <= current_fence_indent
576                    {
577                        code_blocks.push(CodeBlock {
578                            start_line: current_block_start,
579                            end_line: i + 1,
580                            language: current_language.clone(),
581                            block_type: CodeBlockType::Fenced,
582                        });
583
584                        in_code_block = false;
585                        current_language = None;
586                        current_fence_char = ' ';
587                        current_fence_length = 0;
588                        current_fence_indent = 0;
589                    }
590                }
591            }
592
593            i += 1;
594        }
595
596        // Handle case where file ends without closing code fence
597        if in_code_block {
598            code_blocks.push(CodeBlock {
599                start_line: current_block_start,
600                end_line: lines.len(),
601                language: current_language,
602                block_type: CodeBlockType::Fenced,
603            });
604        }
605
606        code_blocks
607    }
608
609    /// Populate fenced code block starts and ends
610    fn populate_fenced_code_blocks(&mut self) {
611        self.fenced_code_block_starts.clear();
612        self.fenced_code_block_ends.clear();
613
614        for block in &self.code_blocks {
615            if let CodeBlockType::Fenced = block.block_type {
616                self.fenced_code_block_starts.push(block.start_line);
617                self.fenced_code_block_ends.push(block.end_line);
618            }
619        }
620    }
621
622    /// Check if a line is in front matter
623    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
624        if let Some((start, end)) = self.front_matter_range {
625            line_num >= start && line_num <= end
626        } else {
627            false
628        }
629    }
630
631    /// Count the number of trailing spaces in a line
632    ///
633    /// This function returns the number of trailing spaces in a line,
634    /// ignoring newlines but counting spaces before newlines.
635    #[inline]
636    pub fn count_trailing_spaces(line: &str) -> usize {
637        // Prepare the string without newline if it ends with one
638        let content = line.strip_suffix('\n').unwrap_or(line);
639
640        // Count trailing spaces at the end, not including tabs
641        let mut space_count = 0;
642        for c in content.chars().rev() {
643            if c == ' ' {
644                space_count += 1;
645            } else {
646                break;
647            }
648        }
649
650        space_count
651    }
652
653    /// Check if a line has trailing whitespace
654    ///
655    /// This function returns true if the line has trailing spaces,
656    /// false otherwise.
657    #[inline]
658    pub fn has_trailing_spaces(line: &str) -> bool {
659        Self::count_trailing_spaces(line) > 0
660    }
661
662    /// Check if a line is indented code according to CommonMark specification
663    ///
664    /// According to CommonMark, a line is considered indented code if it starts with:
665    /// - 4 or more spaces, OR
666    /// - A tab character
667    #[inline]
668    fn is_indented_code_line(line: &str) -> bool {
669        if line.starts_with('\t') {
670            return true;
671        }
672
673        // Count leading spaces
674        let mut space_count = 0;
675        for c in line.chars() {
676            if c == ' ' {
677                space_count += 1;
678            } else {
679                break;
680            }
681        }
682
683        space_count >= 4
684    }
685
686    /// Get a list of list start indices
687    /// This method analyzes the list_lines to find where lists begin
688    pub fn get_list_start_indices(&self) -> Vec<usize> {
689        if self.list_lines.is_empty() {
690            return Vec::new();
691        }
692
693        let mut list_starts = Vec::new();
694        let mut prev_line = 0;
695
696        for (i, &line_num) in self.list_lines.iter().enumerate() {
697            // If this is the first item or there's a gap in line numbers,
698            // it's the start of a new list
699            if i == 0 || line_num > prev_line + 1 {
700                list_starts.push(line_num - 1); // Convert from 1-indexed to 0-indexed
701            }
702            prev_line = line_num;
703        }
704
705        list_starts
706    }
707
708    /// Get a list of list end indices
709    /// This method analyzes the list_lines to find where lists end
710    pub fn get_list_end_indices(&self) -> Vec<usize> {
711        if self.list_lines.is_empty() {
712            return Vec::new();
713        }
714
715        let mut list_ends = Vec::new();
716        let list_lines = &self.list_lines;
717
718        for (i, &line_num) in list_lines.iter().enumerate() {
719            // If this is the last item or there's a gap after this item,
720            // it's the end of a list
721            if i == list_lines.len() - 1 || list_lines[i + 1] > line_num + 1 {
722                list_ends.push(line_num - 1); // Convert from 1-indexed to 0-indexed
723            }
724        }
725
726        list_ends
727    }
728
729    /// OPTIMIZATION 1: Detect inline code spans in the document
730    fn detect_code_spans(&mut self, content: &str) {
731        // Clear existing data
732        self.code_spans.clear();
733
734        let lines: Vec<&str> = content.lines().collect();
735
736        // Note: in_code_span bitmap is already initialized in analyze() method
737
738        for (line_num, line) in lines.iter().enumerate() {
739            // Skip lines in code blocks
740            if self.is_in_code_block(line_num + 1) {
741                continue;
742            }
743
744            // Skip empty lines
745            if line.is_empty() {
746                continue;
747            }
748
749            let mut i = 0;
750            while i < line.len() {
751                // Look for backtick
752                if let Some(start_pos) = line[i..].find('`') {
753                    let start_idx = i + start_pos;
754
755                    // Look for closing backtick
756                    if let Some(end_pos) = line[start_idx + 1..].find('`') {
757                        let end_idx = start_idx + 1 + end_pos;
758
759                        // We found a code span
760                        let content = line[start_idx + 1..end_idx].to_string();
761
762                        // Add to code_spans collection
763                        self.code_spans.push(CodeSpan {
764                            line: line_num + 1,       // 1-indexed
765                            start_col: start_idx + 1, // 1-indexed
766                            end_col: end_idx + 1,     // 1-indexed
767                            content,
768                        });
769
770                        // Mark in the bitmap
771                        for col in start_idx..=end_idx {
772                            if col < self.in_code_span[line_num].len() {
773                                self.in_code_span[line_num][col] = true;
774                            }
775                        }
776
777                        // Continue from after the closing backtick
778                        i = end_idx + 1;
779                    } else {
780                        // No closing backtick found
781                        i = start_idx + 1;
782                    }
783                } else {
784                    // No more backticks in this line
785                    break;
786                }
787            }
788        }
789    }
790
791    /// OPTIMIZATION 2: Detect links and images in the document
792    fn detect_links_and_images(&mut self, content: &str) {
793        lazy_static! {
794            // Regex for inline links: [text](url) - handles escaped brackets
795            static ref INLINE_LINK: FancyRegex = FancyRegex::new(r"(?x)
796                (?<!\\)                               # Not preceded by backslash
797                \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]  # Link text (handles nested brackets and escapes)
798                \(([^)]*)\)                           # URL in parentheses
799            ").unwrap();
800            // Regex for reference links: [text][id] or [text][] (implicit) - handles escaped brackets
801            static ref REFERENCE_LINK: FancyRegex = FancyRegex::new(r"(?x)
802                (?<!\\)                               # Not preceded by backslash
803                \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]  # Link text (handles nested brackets and escapes)
804                \[([^\]]*)\]                          # Reference ID
805            ").unwrap();
806            // Regex for shortcut reference links: [text]
807            static ref SHORTCUT_LINK: FancyRegex = FancyRegex::new(r"(?x)
808                (?<!\\)                               # Not preceded by backslash
809                \[([^\]]+)\]                          # Link text
810                (?!\(|\[)                             # Not followed by ( or [
811            ").unwrap();
812            // Regex for link definitions: [id]: url
813            static ref LINK_DEFINITION: Regex = Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap();
814            // Regex for inline images: ![alt](src) - handles escaped brackets
815            static ref INLINE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
816                (?<!\\)                               # Not preceded by backslash
817                !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
818                \(([^)]*)\)                           # Source URL
819            ").unwrap();
820            // Regex for reference images: ![alt][id] - handles escaped brackets
821            static ref REFERENCE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
822                (?<!\\)                               # Not preceded by backslash
823                !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
824                \[([^\]]*)\]                          # Reference ID
825            ").unwrap();
826        }
827
828        // Clear existing data
829        self.links.clear();
830        self.images.clear();
831
832        let lines: Vec<&str> = content.lines().collect();
833
834        // First, find all link definitions
835        let mut link_defs = std::collections::HashMap::new();
836        for (line_num, line) in lines.iter().enumerate() {
837            // Skip lines in code blocks
838            if self.is_in_code_block(line_num + 1) {
839                continue;
840            }
841
842            // Check for link definitions
843            if let Some(cap) = LINK_DEFINITION.captures(line) {
844                let id = cap.get(1).map_or("", |m| m.as_str()).to_string();
845                let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
846                link_defs.insert(id.to_lowercase(), url);
847            }
848        }
849
850        // Now find all links and images
851        for (line_num, line) in lines.iter().enumerate() {
852            // Skip lines in code blocks
853            if self.is_in_code_block(line_num + 1) {
854                continue;
855            }
856
857            // Skip empty lines
858            if line.is_empty() {
859                continue;
860            }
861
862            // Check if this line contains a character that would indicate a link or image
863            if !line.contains('[') && !line.contains('!') {
864                continue;
865            }
866
867            // Process each character position to ensure we don't detect links inside code spans
868            let mut i = 0;
869            while i < line.len() {
870                // Skip if this position is in a code span
871                if i < self.in_code_span[line_num].len() && self.in_code_span[line_num][i] {
872                    i += 1;
873                    continue;
874                }
875
876                // Check for inline links starting at this position
877                if let Some(rest) = line.get(i..) {
878                    if rest.starts_with('[') {
879                        // Check if this bracket is escaped or part of an escaped image
880                        let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
881                        let is_escaped_image =
882                            i > 1 && line.chars().nth(i - 2) == Some('\\') && line.chars().nth(i - 1) == Some('!');
883                        if !is_escaped && !is_escaped_image {
884                            if let Ok(Some(cap)) = INLINE_LINK.captures(rest) {
885                                let whole_match = cap.get(0).unwrap();
886                                let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
887                                let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
888
889                                // Ensure we're not inside a code span
890                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
891                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
892                                });
893
894                                if !is_in_span {
895                                    self.links.push(Link {
896                                        line: line_num + 1,             // 1-indexed
897                                        start_col: i + 1,               // 1-indexed
898                                        end_col: i + whole_match.end(), // 1-indexed
899                                        text,
900                                        url,
901                                        is_reference: false,
902                                        reference_id: None,
903                                    });
904                                }
905
906                                // Skip past this link
907                                i += whole_match.end();
908                            } else if let Ok(Some(cap)) = REFERENCE_LINK.captures(rest) {
909                                let whole_match = cap.get(0).unwrap();
910                                let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
911                                let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
912
913                                // Use the ID or text as the reference
914                                let ref_id = if id.is_empty() { text.clone() } else { id };
915
916                                // Look up the URL from link definitions
917                                let url = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
918
919                                // Ensure we're not inside a code span
920                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
921                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
922                                });
923
924                                if !is_in_span {
925                                    self.links.push(Link {
926                                        line: line_num + 1,             // 1-indexed
927                                        start_col: i + 1,               // 1-indexed
928                                        end_col: i + whole_match.end(), // 1-indexed
929                                        text,
930                                        url,
931                                        is_reference: true,
932                                        reference_id: Some(ref_id),
933                                    });
934                                }
935
936                                // Skip past this link
937                                i += whole_match.end();
938                            } else {
939                                // No match found, move to next character
940                                i += 1;
941                            }
942                        } else {
943                            // Bracket is escaped or part of escaped image, skip it
944                            i += 1;
945                        }
946                    } else if rest.starts_with("![") {
947                        // Check if this image is escaped
948                        let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
949                        if !is_escaped {
950                            if let Ok(Some(cap)) = INLINE_IMAGE.captures(rest) {
951                                let whole_match = cap.get(0).unwrap();
952                                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
953                                let src = cap.get(2).map_or("", |m| m.as_str()).to_string();
954
955                                // Ensure we're not inside a code span
956                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
957                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
958                                });
959
960                                if !is_in_span {
961                                    self.images.push(Image {
962                                        line: line_num + 1,             // 1-indexed
963                                        start_col: i + 1,               // 1-indexed
964                                        end_col: i + whole_match.end(), // 1-indexed
965                                        alt_text,
966                                        src,
967                                        is_reference: false,
968                                        reference_id: None,
969                                    });
970                                }
971
972                                // Skip past this image
973                                i += whole_match.end();
974                            } else if let Ok(Some(cap)) = REFERENCE_IMAGE.captures(rest) {
975                                let whole_match = cap.get(0).unwrap();
976                                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
977                                let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
978
979                                // Use the ID or alt_text as the reference
980                                let ref_id = if id.is_empty() { alt_text.clone() } else { id };
981
982                                // Look up the URL from link definitions
983                                let src = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
984
985                                // Ensure we're not inside a code span
986                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
987                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
988                                });
989
990                                if !is_in_span {
991                                    self.images.push(Image {
992                                        line: line_num + 1,             // 1-indexed
993                                        start_col: i + 1,               // 1-indexed
994                                        end_col: i + whole_match.end(), // 1-indexed
995                                        alt_text,
996                                        src,
997                                        is_reference: true,
998                                        reference_id: Some(ref_id),
999                                    });
1000                                }
1001
1002                                // Skip past this image
1003                                i += whole_match.end();
1004                            } else {
1005                                // No match found, move to next character
1006                                i += 1;
1007                            }
1008                        } else {
1009                            // Image is escaped, skip it
1010                            i += 1;
1011                        }
1012                    } else {
1013                        // Neither a link nor an image, move to next character
1014                        i += 1;
1015                    }
1016                } else {
1017                    // We've reached the end of the line
1018                    break;
1019                }
1020            }
1021        }
1022    }
1023
1024    /// OPTIMIZATION 3: Detect list items with detailed information
1025    fn detect_list_items(&mut self, content: &str) {
1026        // Use fancy-regex for advanced Markdown list item detection
1027        // - Allow any number of spaces/tabs before the marker
1028        // - Marker must be *, +, or -
1029        // - At least one space/tab after the marker
1030        // - Use lookbehind to ensure marker is at the start or after whitespace
1031        // - Use Unicode support for whitespace
1032        lazy_static! {
1033            static ref UL_MARKER: FancyRegex =
1034                FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1035            static ref OL_MARKER: FancyRegex =
1036                FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1037            static ref TASK_MARKER: FancyRegex = FancyRegex::new(
1038                r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)\[(?P<checked>[ xX])\](?P<content>.*)$"
1039            )
1040            .unwrap();
1041        }
1042        self.list_items.clear();
1043        self.list_lines.clear();
1044        let lines: Vec<&str> = content.lines().collect();
1045        for (line_num, line) in lines.iter().enumerate() {
1046            if self.is_in_code_block(line_num + 1) || self.is_in_front_matter(line_num + 1) {
1047                continue;
1048            }
1049            if line.trim().is_empty() {
1050                continue;
1051            }
1052            // Use fancy-regex for advanced matching
1053            if let Ok(Some(cap)) = TASK_MARKER.captures(line) {
1054                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1055                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1056                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1057                self.list_lines.push(line_num + 1);
1058                self.list_items.push(ListItem {
1059                    line_number: line_num + 1,
1060                    indentation,
1061                    marker: marker.clone(),
1062                    marker_type: ListMarkerType::Task,
1063                    content,
1064                });
1065                continue;
1066            }
1067            if let Ok(Some(cap)) = UL_MARKER.captures(line) {
1068                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1069                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1070                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1071                self.list_lines.push(line_num + 1);
1072                self.list_items.push(ListItem {
1073                    line_number: line_num + 1,
1074                    indentation,
1075                    marker: marker.clone(),
1076                    marker_type: ListMarkerType::Unordered,
1077                    content,
1078                });
1079                continue;
1080            }
1081            if let Ok(Some(cap)) = OL_MARKER.captures(line) {
1082                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1083                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1084                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1085                self.list_lines.push(line_num + 1);
1086                self.list_items.push(ListItem {
1087                    line_number: line_num + 1,
1088                    indentation,
1089                    marker: marker.clone(),
1090                    marker_type: ListMarkerType::Ordered,
1091                    content,
1092                });
1093                continue;
1094            }
1095        }
1096    }
1097
1098    /// OPTIMIZATION 4: Detect blockquotes in the document
1099    fn detect_blockquotes(&mut self, content: &str) {
1100        lazy_static! {
1101            static ref BLOCKQUOTE_MARKER: Regex = Regex::new(r"^\s*>(.*)$").unwrap();
1102        }
1103
1104        // Clear existing data
1105        self.blockquotes.clear();
1106
1107        let lines: Vec<&str> = content.lines().collect();
1108
1109        // Note: in_blockquote bitmap is already initialized in analyze() method
1110
1111        let mut in_blockquote = false;
1112        let mut start_line = 0;
1113
1114        for (i, line) in lines.iter().enumerate() {
1115            // Skip lines in code blocks or front matter
1116            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1117                continue;
1118            }
1119
1120            let is_blockquote_line = BLOCKQUOTE_MARKER.is_match(line);
1121
1122            if is_blockquote_line {
1123                // Mark this line as inside a blockquote
1124                self.in_blockquote[i] = true;
1125
1126                if !in_blockquote {
1127                    // Start of a new blockquote
1128                    in_blockquote = true;
1129                    start_line = i + 1; // 1-indexed
1130                }
1131            } else if in_blockquote {
1132                // End of a blockquote
1133                self.blockquotes.push(BlockquoteRange {
1134                    start_line,
1135                    end_line: i, // Previous line was the end
1136                });
1137
1138                in_blockquote = false;
1139            }
1140        }
1141
1142        // Handle case where file ends with a blockquote
1143        if in_blockquote {
1144            self.blockquotes.push(BlockquoteRange {
1145                start_line,
1146                end_line: lines.len(), // Last line
1147            });
1148        }
1149    }
1150
1151    /// Detect horizontal rules in the document
1152    fn detect_horizontal_rules(&mut self, content: &str) {
1153        lazy_static! {
1154            // Horizontal rule patterns - simplified to match Markdown spec
1155            static ref HR_HYPHEN: Regex = Regex::new(r"^[ \t]*-[ \t]*-[ \t]*-[ \t-]*$").unwrap();
1156            static ref HR_ASTERISK: Regex = Regex::new(r"^[ \t]*\*[ \t]*\*[ \t]*\*[ \t\*]*$").unwrap();
1157            static ref HR_UNDERSCORE: Regex = Regex::new(r"^[ \t]*_[ \t]*_[ \t]*_[ \t_]*$").unwrap();
1158        }
1159
1160        // Clear existing data
1161        self.horizontal_rule_lines.clear();
1162
1163        let lines: Vec<&str> = content.lines().collect();
1164
1165        for (i, line) in lines.iter().enumerate() {
1166            // Skip lines in code blocks or front matter
1167            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1168                continue;
1169            }
1170
1171            // Check for horizontal rule patterns
1172            if HR_HYPHEN.is_match(line) || HR_ASTERISK.is_match(line) || HR_UNDERSCORE.is_match(line) {
1173                // Additional validation: ensure it's not part of a setext heading
1174                // (setext headings have content on the previous line)
1175                let is_setext_marker = if i > 0 {
1176                    let prev_line = lines[i - 1].trim();
1177                    !prev_line.is_empty()
1178                        && !self.is_in_code_block(i)
1179                        && !self.is_in_front_matter(i)
1180                        && line.trim().chars().all(|c| c == '-' || c == ' ')
1181                } else {
1182                    false
1183                };
1184
1185                if !is_setext_marker {
1186                    self.horizontal_rule_lines.push(i + 1); // 1-indexed
1187                }
1188            }
1189        }
1190    }
1191
1192    /// Detect HTML blocks (block-level HTML regions) according to CommonMark spec
1193    fn detect_html_blocks(&mut self, content: &str) {
1194        let lines: Vec<&str> = content.lines().collect();
1195        // Note: in_html_block bitmap is already initialized in analyze() method
1196
1197        let mut i = 0;
1198        while i < lines.len() {
1199            let line = lines[i];
1200            let trimmed = line.trim_start();
1201
1202            // Skip lines already in code blocks
1203            if self.is_in_code_block(i + 1) {
1204                i += 1;
1205                continue;
1206            }
1207
1208            // Check for HTML block start conditions (simplified version of CommonMark)
1209            if self.is_html_block_start(trimmed) {
1210                let start_line = i;
1211
1212                // Find the end of the HTML block
1213                let end_line = self.find_html_block_end(&lines, start_line);
1214
1215                // Mark all lines in the block as HTML
1216                for line_idx in start_line..=end_line {
1217                    if line_idx < self.in_html_block.len() {
1218                        self.in_html_block[line_idx] = true;
1219                    }
1220                }
1221
1222                // Skip to after the block
1223                i = end_line + 1;
1224            } else {
1225                i += 1;
1226            }
1227        }
1228    }
1229
1230    /// Check if a line starts an HTML block
1231    fn is_html_block_start(&self, trimmed: &str) -> bool {
1232        if trimmed.is_empty() || !trimmed.starts_with('<') {
1233            return false;
1234        }
1235
1236        // Extract tag name
1237        let mut chars = trimmed[1..].chars();
1238        let mut tag_name = String::new();
1239
1240        // Handle closing tags
1241        let is_closing = chars.as_str().starts_with('/');
1242        if is_closing {
1243            chars.next(); // Skip the '/'
1244        }
1245
1246        // Extract tag name
1247        for ch in chars {
1248            if ch.is_ascii_alphabetic() || ch == '-' {
1249                tag_name.push(ch);
1250            } else {
1251                break;
1252            }
1253        }
1254
1255        if tag_name.is_empty() {
1256            return false;
1257        }
1258
1259        // List of HTML block elements (based on CommonMark and markdownlint)
1260        const BLOCK_ELEMENTS: &[&str] = &[
1261            "address",
1262            "article",
1263            "aside",
1264            "base",
1265            "basefont",
1266            "blockquote",
1267            "body",
1268            "caption",
1269            "center",
1270            "col",
1271            "colgroup",
1272            "dd",
1273            "details",
1274            "dialog",
1275            "dir",
1276            "div",
1277            "dl",
1278            "dt",
1279            "fieldset",
1280            "figcaption",
1281            "figure",
1282            "footer",
1283            "form",
1284            "frame",
1285            "frameset",
1286            "h1",
1287            "h2",
1288            "h3",
1289            "h4",
1290            "h5",
1291            "h6",
1292            "head",
1293            "header",
1294            "hr",
1295            "html",
1296            "iframe",
1297            "legend",
1298            "li",
1299            "link",
1300            "main",
1301            "menu",
1302            "menuitem",
1303            "nav",
1304            "noframes",
1305            "ol",
1306            "optgroup",
1307            "option",
1308            "p",
1309            "param",
1310            "section",
1311            "source",
1312            "summary",
1313            "table",
1314            "tbody",
1315            "td",
1316            "tfoot",
1317            "th",
1318            "thead",
1319            "title",
1320            "tr",
1321            "track",
1322            "ul",
1323            "img",
1324            "picture",
1325        ];
1326
1327        BLOCK_ELEMENTS.contains(&tag_name.to_ascii_lowercase().as_str())
1328    }
1329
1330    /// Find the end line of an HTML block starting at start_line
1331    fn find_html_block_end(&self, lines: &[&str], start_line: usize) -> usize {
1332        let start_trimmed = lines[start_line].trim_start();
1333
1334        // Extract the tag name from the start line
1335        let tag_name = self.extract_tag_name(start_trimmed);
1336
1337        // Look for the closing tag or blank line
1338        for (i, line) in lines.iter().enumerate().skip(start_line + 1) {
1339            let trimmed = line.trim();
1340
1341            // HTML block ends on blank line
1342            if trimmed.is_empty() {
1343                return i - 1; // Don't include the blank line
1344            }
1345
1346            // HTML block ends when we find the matching closing tag
1347            if let Some(ref tag) = tag_name {
1348                let closing_tag = format!("</{tag}");
1349                if trimmed.contains(&closing_tag) {
1350                    return i;
1351                }
1352            }
1353        }
1354
1355        // If no end found, block continues to end of document
1356        lines.len() - 1
1357    }
1358
1359    /// Extract tag name from an HTML line
1360    fn extract_tag_name(&self, trimmed: &str) -> Option<String> {
1361        if !trimmed.starts_with('<') {
1362            return None;
1363        }
1364
1365        let mut chars = trimmed[1..].chars();
1366
1367        // Skip closing tag indicator
1368        if chars.as_str().starts_with('/') {
1369            chars.next();
1370        }
1371
1372        let mut tag_name = String::new();
1373        for ch in chars {
1374            if ch.is_ascii_alphabetic() || ch == '-' {
1375                tag_name.push(ch);
1376            } else {
1377                break;
1378            }
1379        }
1380
1381        if tag_name.is_empty() {
1382            None
1383        } else {
1384            Some(tag_name.to_ascii_lowercase())
1385        }
1386    }
1387
1388    /// Check if a position is inside a code span
1389    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1390        if line_num == 0 || line_num > self.in_code_span.len() {
1391            return false;
1392        }
1393
1394        let line_idx = line_num - 1; // Convert 1-indexed to 0-indexed
1395
1396        if col == 0 || col > self.in_code_span[line_idx].len() {
1397            return false;
1398        }
1399
1400        self.in_code_span[line_idx][col - 1] // Convert 1-indexed to 0-indexed
1401    }
1402
1403    /// Check if a line is inside a blockquote
1404    pub fn is_in_blockquote(&self, line_num: usize) -> bool {
1405        if line_num == 0 || line_num > self.in_blockquote.len() {
1406            return false;
1407        }
1408
1409        self.in_blockquote[line_num - 1] // Convert 1-indexed to 0-indexed
1410    }
1411
1412    /// Get detailed information about a list item at a specific line
1413    pub fn get_list_item_at_line(&self, line_num: usize) -> Option<&ListItem> {
1414        self.list_items.iter().find(|item| item.line_number == line_num)
1415    }
1416
1417    /// Get all list items with a specific marker type
1418    pub fn get_list_items_by_type(&self, marker_type: ListMarkerType) -> Vec<&ListItem> {
1419        self.list_items
1420            .iter()
1421            .filter(|item| item.marker_type == marker_type)
1422            .collect()
1423    }
1424
1425    /// Get all links with empty text or URLs
1426    pub fn get_empty_links(&self) -> Vec<&Link> {
1427        self.links
1428            .iter()
1429            .filter(|link| link.text.trim().is_empty() || link.url.trim().is_empty())
1430            .collect()
1431    }
1432
1433    /// Get all images with empty alt text
1434    pub fn get_images_without_alt_text(&self) -> Vec<&Image> {
1435        self.images
1436            .iter()
1437            .filter(|img| img.alt_text.trim().is_empty())
1438            .collect()
1439    }
1440
1441    /// Check if a line is inside an HTML block
1442    pub fn is_in_html_block(&self, line_num: usize) -> bool {
1443        if line_num == 0 || line_num > self.in_html_block.len() {
1444            return false;
1445        }
1446        self.in_html_block[line_num - 1]
1447    }
1448}
1449
1450/// Extended rule trait methods for using the document structure
1451pub trait DocumentStructureExtensions {
1452    /// Check if a rule should operate on a given line
1453    fn should_process_line(&self, line_num: usize, doc_structure: &DocumentStructure) -> bool {
1454        // Skip lines in code blocks by default
1455        !doc_structure.is_in_code_block(line_num)
1456    }
1457
1458    /// Check if content contains elements relevant to this rule
1459    fn has_relevant_elements(
1460        &self,
1461        _ctx: &crate::lint_context::LintContext,
1462        _doc_structure: &DocumentStructure,
1463    ) -> bool {
1464        // Default implementation returns true - rules should override this
1465        true
1466    }
1467}
1468
/// Create a DocumentStructure from a string
///
/// Convenience wrapper that forwards `content` unchanged to
/// `DocumentStructure::new` and returns the result.
pub fn document_structure_from_str(content: &str) -> DocumentStructure {
    DocumentStructure::new(content)
}
1473
1474#[cfg(test)]
1475mod tests {
1476    use super::*;
1477
    /// Smoke test: a document with two ATX headings and one fenced code block
    /// yields two headings and exactly one code block.
    #[test]
    fn test_document_structure_creation() {
        let content = "# Heading 1\n\nSome text.\n\n## Heading 2\n\nMore text.\n\n```\nCode block\n```\n";
        let structure = DocumentStructure::new(content);

        // One entry per heading in both parallel vectors
        assert_eq!(structure.heading_lines.len(), 2);
        assert_eq!(structure.heading_levels.len(), 2);
        assert!(structure.has_code_blocks);
        assert_eq!(structure.code_blocks.len(), 1);
    }
1488
    /// A fenced block whose body contains an indented inner fence must be
    /// reported as a single code block spanning the outer fences.
    #[test]
    fn test_nested_code_blocks() {
        // Outer ```markdown fence on lines 1 and 9; the indented ```python
        // fence on lines 4-6 is part of the outer block's content.
        let content = r#"```markdown
1. First item

   ```python
   code_in_list()
   ```

2. Second item
```"#;

        let structure = DocumentStructure::new(content);

        // Should have exactly one code block (the outer markdown block)
        assert_eq!(structure.code_blocks.len(), 1);
        assert_eq!(structure.code_blocks[0].start_line, 1);
        assert_eq!(structure.code_blocks[0].end_line, 9);

        // Lines 2-8 should be inside the code block
        for line in 2..=8 {
            assert!(structure.is_in_code_block(line), "Line {line} should be in code block");
        }
    }
1513
    /// Front matter delimited by `---` lines is detected, and the heading
    /// after it is still found.
    #[test]
    fn test_document_with_front_matter() {
        let content = "---\ntitle: Test Document\ndate: 2021-01-01\n---\n\n# Heading 1\n\nSome text.\n";
        let structure = DocumentStructure::new(content);

        assert!(structure.has_front_matter);
        assert!(structure.front_matter_range.is_some());
        // Only the ATX heading after the front matter is counted
        assert_eq!(structure.heading_lines.len(), 1);
        assert!(!structure.has_code_blocks);
    }
1524
1525    #[test]
1526    fn test_is_in_code_block() {
1527        let content = "# Heading\n\nText.\n\n```\ncode line 1\ncode line 2\n```\n\nMore text.\n";
1528        let structure = DocumentStructure::new(content);
1529
1530        assert!(!structure.is_in_code_block(1)); // # Heading
1531        assert!(!structure.is_in_code_block(3)); // Text.
1532        assert!(!structure.is_in_code_block(5)); // ```
1533        assert!(structure.is_in_code_block(6)); // code line 1
1534        assert!(structure.is_in_code_block(7)); // code line 2
1535        assert!(!structure.is_in_code_block(8)); // ```
1536        assert!(!structure.is_in_code_block(10)); // More text.
1537    }
1538
1539    #[test]
1540    fn test_headings_edge_cases() {
1541        // ATX, closed ATX, Setext, mixed styles
1542        let content =
1543            "  # ATX Heading\n# Closed ATX Heading #\nSetext H1\n=======\nSetext H2\n-------\n\n# ATX Again\n";
1544        let structure = DocumentStructure::new(content);
1545        assert_eq!(structure.heading_lines, vec![1, 2, 3, 5, 8]);
1546        assert_eq!(structure.heading_levels, vec![1, 1, 1, 2, 1]);
1547
1548        // Headings in code blocks and front matter (should be ignored)
1549        let content = "---\ntitle: Test\n---\n# Heading 1\n\n```\n# Not a heading\n```\n# Heading 2\n";
1550        let structure = DocumentStructure::new(content);
1551        assert_eq!(structure.heading_lines, vec![4, 9]);
1552        assert_eq!(structure.heading_levels, vec![1, 1]);
1553
1554        // Empty headings
1555        let content = "#\n## \n###  \n# Not Empty\n";
1556        let structure = DocumentStructure::new(content);
1557        assert_eq!(structure.heading_lines, vec![4]);
1558        assert_eq!(structure.heading_levels, vec![1]);
1559
1560        // Headings with trailing whitespace
1561        let content = "# Heading \n# Heading\n";
1562        let structure = DocumentStructure::new(content);
1563        assert_eq!(structure.heading_lines, vec![1, 2]);
1564        assert_eq!(structure.heading_levels, vec![1, 1]);
1565
1566        // Headings with indentation
1567        let content = "   # Indented\n    # Not a heading (too much indent)\n# Valid\n";
1568        let structure = DocumentStructure::new(content);
1569        assert_eq!(structure.heading_lines, vec![1, 3]);
1570        assert_eq!(structure.heading_levels, vec![1, 1]);
1571
1572        // Multiple duplicates and edge line numbers
1573        let content = "# Dup\n# Dup\n# Unique\n# Dup\n";
1574        let structure = DocumentStructure::new(content);
1575        assert_eq!(structure.heading_lines, vec![1, 2, 3, 4]);
1576        assert_eq!(structure.heading_levels, vec![1, 1, 1, 1]);
1577
1578        // Headings after code blocks/front matter
1579        let content = "```\n# Not a heading\n```\n# Real Heading\n";
1580        let structure = DocumentStructure::new(content);
1581        assert_eq!(structure.heading_lines, vec![4]);
1582        assert_eq!(structure.heading_levels, vec![1]);
1583
1584        let content = "---\ntitle: Test\n---\n# Heading\n";
1585        let structure = DocumentStructure::new(content);
1586        assert_eq!(structure.heading_lines, vec![4]);
1587        assert_eq!(structure.heading_levels, vec![1]);
1588
1589        // Setext headings with blank lines before/after
1590        let content = "\nSetext\n=======\n\nSetext2\n-------\n";
1591        let structure = DocumentStructure::new(content);
1592        assert_eq!(structure.heading_lines, vec![2, 5]);
1593        assert_eq!(structure.heading_levels, vec![1, 2]);
1594
1595        // Headings with special characters
1596        let content = "# Heading!@#$%^&*()\nSetext Special\n=======\n";
1597        let structure = DocumentStructure::new(content);
1598        assert_eq!(structure.heading_lines, vec![1, 2]);
1599        assert_eq!(structure.heading_levels, vec![1, 1]);
1600    }
1601
1602    #[test]
1603    fn test_horizontal_rule_detection() {
1604        // Test basic horizontal rules
1605        let content = "Text\n\n---\n\nMore text\n\n***\n\nFinal\n\n___\n\nEnd";
1606        let structure = DocumentStructure::new(content);
1607        assert_eq!(structure.horizontal_rule_lines, vec![3, 7, 11]);
1608
1609        // Test horizontal rules with spaces
1610        let content = "Text\n\n- - -\n\n* * *\n\n_ _ _\n\nEnd";
1611        let structure = DocumentStructure::new(content);
1612        assert_eq!(structure.horizontal_rule_lines, vec![3, 5, 7]);
1613
1614        // Test setext headings are not detected as horizontal rules
1615        let content = "# ATX\n\nSetext\n------\n\n---\n\nAnother\n======\n";
1616        let structure = DocumentStructure::new(content);
1617        assert_eq!(structure.horizontal_rule_lines, vec![6]); // Only the actual HR
1618        assert_eq!(structure.heading_lines, vec![1, 3, 8]); // Three headings
1619
1620        // Test horizontal rules in code blocks are ignored
1621        let content = "Text\n\n```\n---\n***\n```\n\n---\n\nEnd";
1622        let structure = DocumentStructure::new(content);
1623        assert_eq!(structure.horizontal_rule_lines, vec![8]); // Only the one outside code block
1624
1625        // Test horizontal rules in front matter are ignored
1626        let content = "---\ntitle: Test\n---\n\n---\n\nContent";
1627        let structure = DocumentStructure::new(content);
1628        assert_eq!(structure.horizontal_rule_lines, vec![5]); // Only the one after front matter
1629    }
1630}
rumdl_lib/utils/document_structure.rs

rumdl_lib/utils/
document_structure.rs