rumdl_lib/utils/
document_structure.rs

1use crate::rules::heading_utils::HeadingStyle;
2use fancy_regex::Regex as FancyRegex;
3use lazy_static::lazy_static;
4use regex::Regex;
5
6/// A struct that contains pre-computed information about a markdown document structure
7/// to avoid redundant parsing of the same elements by multiple rules.
8#[derive(Debug, Clone)]
9pub struct DocumentStructure {
10    /// Information about code block regions
11    pub code_blocks: Vec<CodeBlock>,
12    /// Whether the document contains code blocks
13    pub has_code_blocks: bool,
14    /// Line numbers of headings (1-indexed)
15    pub heading_lines: Vec<usize>,
16    /// Heading levels (1-6) for each heading
17    pub heading_levels: Vec<usize>,
18    /// Heading regions (start_line, end_line) for each heading (ATX: start==end, Setext: start=content, end=marker)
19    pub heading_regions: Vec<(usize, usize)>,
20    /// Line numbers of list items (1-indexed)
21    pub list_lines: Vec<usize>,
22    /// Whether the document contains front matter
23    pub has_front_matter: bool,
24    /// Line range of front matter (1-indexed, inclusive)
25    pub front_matter_range: Option<(usize, usize)>,
26    /// Whether the document contains URLs
27    pub has_urls: bool,
28    /// Whether the document contains inline HTML
29    pub has_html: bool,
30    /// Bitmap of code block regions for fast lookups
31    pub in_code_block: Vec<bool>,
32    /// Line numbers of fenced code block starts (1-indexed)
33    pub fenced_code_block_starts: Vec<usize>,
34    /// Line numbers of fenced code block ends (1-indexed)
35    pub fenced_code_block_ends: Vec<usize>,
36    /// Style of the first heading found in the document (for consistent style rules)
37    pub first_heading_style: Option<HeadingStyle>,
38    /// OPTIMIZATION 1: Detailed information about inline code spans
39    pub code_spans: Vec<CodeSpan>,
40    /// OPTIMIZATION 1: Bitmap indicating which line-column positions are within code spans
41    pub in_code_span: Vec<Vec<bool>>,
42    /// OPTIMIZATION 2: Collection of links in the document
43    pub links: Vec<Link>,
44    /// OPTIMIZATION 2: Collection of images in the document
45    pub images: Vec<Image>,
46    /// OPTIMIZATION 3: Detailed information about list items
47    pub list_items: Vec<ListItem>,
48    /// OPTIMIZATION 4: Blockquotes in the document
49    pub blockquotes: Vec<BlockquoteRange>,
50    /// OPTIMIZATION 4: Bitmap indicating which lines are inside blockquotes
51    pub in_blockquote: Vec<bool>,
52    /// Bitmap indicating which lines are inside HTML blocks
53    pub in_html_block: Vec<bool>,
54    /// Line numbers of horizontal rules (1-indexed)
55    pub horizontal_rule_lines: Vec<usize>,
56}
57
/// A front matter block together with its raw text.
#[derive(Debug, Clone)]
pub struct FrontMatter {
    /// Line on which the block starts (presumably 1-indexed like the rest of this file; confirm with callers)
    pub start_line: usize,
    /// Line on which the block ends (presumably inclusive; confirm with callers)
    pub end_line: usize,
    /// Text of the front matter block
    pub content: String,
}
65
/// Everything recorded about a single heading.
#[derive(Debug, Clone, PartialEq)]
pub struct Heading {
    /// Heading text
    pub text: String,
    /// Heading level
    pub level: u32,
    /// Line the heading is on (presumably 1-indexed like the rest of this file; confirm with callers)
    pub line_number: usize,
    /// The heading as it was written in the source
    pub original_text: String,
    /// Leading indentation of the heading line
    pub indentation: String,
}
75
76/// Simple code block representation for document structure
77#[derive(Debug, Clone)]
78pub struct CodeBlock {
79    /// The line where the code block starts (1-indexed)
80    pub start_line: usize,
81    /// The line where the code block ends (1-indexed, inclusive)
82    pub end_line: usize,
83    /// Optional language specifier
84    pub language: Option<String>,
85    /// Type of code block (fenced or indented)
86    pub block_type: CodeBlockType,
87}
88
/// The two flavours of code block the analyzer distinguishes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Delimited by a ``` or ~~~ fence
    Fenced,
    /// Introduced purely by indentation
    Indented,
}
97
98/// List item information
99#[derive(Debug, Clone)]
100pub struct ListItem {
101    pub line_number: usize,
102    pub indentation: usize,
103    pub marker: String,
104    pub marker_type: ListMarkerType,
105    pub content: String,
106}
107
/// Kinds of list marker a `ListItem` can carry.
#[derive(Debug, Clone, PartialEq)]
pub enum ListMarkerType {
    /// Unordered list marker
    Unordered,
    /// Ordered list marker
    Ordered,
    /// Task list marker
    Task,
}
115
/// Span of lines occupied by one blockquote.
#[derive(Debug, Clone)]
pub struct BlockquoteRange {
    /// First line of the blockquote (presumably 1-indexed; confirm with the code that fills it)
    pub start_line: usize,
    /// Last line of the blockquote (presumably inclusive; confirm with the code that fills it)
    pub end_line: usize,
}
122
/// One inline code span, located by line and column.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-indexed line containing the span
    pub line: usize,
    /// 1-indexed column where the span starts
    pub start_col: usize,
    /// 1-indexed column where the span ends
    pub end_col: usize,
    /// Text between the backticks (the backticks themselves are excluded)
    pub content: String,
}
135
/// One link found in the document.
#[derive(Debug, Clone)]
pub struct Link {
    /// 1-indexed line containing the link
    pub line: usize,
    /// 1-indexed column where the link starts
    pub start_col: usize,
    /// 1-indexed column where the link ends
    pub end_col: usize,
    /// Visible link text
    pub text: String,
    /// Destination URL
    pub url: String,
    /// `true` for reference-style links of the form `[text][reference]`
    pub is_reference: bool,
    /// Reference ID, present for reference-style links
    pub reference_id: Option<String>,
}
154
/// One image found in the document.
#[derive(Debug, Clone)]
pub struct Image {
    /// 1-indexed line containing the image
    pub line: usize,
    /// 1-indexed column where the image starts
    pub start_col: usize,
    /// 1-indexed column where the image ends
    pub end_col: usize,
    /// Alternative text of the image
    pub alt_text: String,
    /// Source URL
    pub src: String,
    /// `true` for reference-style images of the form `![text][reference]`
    pub is_reference: bool,
    /// Reference ID, present for reference-style images
    pub reference_id: Option<String>,
}
173
174// Cached regex patterns for performance
175lazy_static! {
176    // Quick check patterns
177    static ref CONTAINS_ATX_HEADING: Regex = Regex::new(r"(?m)^(\s*)#{1,6}").unwrap();
178    static ref CONTAINS_SETEXT_UNDERLINE: Regex = Regex::new(r"(?m)^(\s*)(=+|-+)\s*$").unwrap();
179    static ref CONTAINS_LIST_MARKERS: Regex = Regex::new(r"(?m)^(\s*)([*+-]|\d+\.)").unwrap();
180    static ref CONTAINS_BLOCKQUOTE: Regex = Regex::new(r"(?m)^(\s*)>").unwrap();
181    static ref CONTAINS_HTML_BLOCK: Regex = Regex::new(r"(?m)^(\s*)<[a-zA-Z]").unwrap();
182}
183
184impl DocumentStructure {
185    /// Create a new DocumentStructure by analyzing the document content
186    pub fn new(content: &str) -> Self {
187        // Initialize with default values
188        let mut structure = DocumentStructure {
189            code_blocks: Vec::new(),
190            has_code_blocks: false,
191            heading_lines: Vec::new(),
192            heading_levels: Vec::new(),
193            heading_regions: Vec::new(),
194            list_lines: Vec::new(),
195            has_front_matter: false,
196            front_matter_range: None,
197            has_urls: false,
198            has_html: false,
199            in_code_block: Vec::new(),
200            fenced_code_block_starts: Vec::new(),
201            fenced_code_block_ends: Vec::new(),
202            first_heading_style: None,
203            // Initialize new optimization fields
204            code_spans: Vec::new(),
205            in_code_span: Vec::new(),
206            links: Vec::new(),
207            images: Vec::new(),
208            list_items: Vec::new(),
209            blockquotes: Vec::new(),
210            in_blockquote: Vec::new(),
211            in_html_block: Vec::new(),
212            horizontal_rule_lines: Vec::new(),
213        };
214
215        // Analyze the document and populate the structure
216        structure.analyze(content);
217        structure
218    }
219
220    /// Analyze the document content and populate the structure
221    fn analyze(&mut self, content: &str) {
222        // Early return for empty content
223        if content.is_empty() {
224            return;
225        }
226
227        // Initialize line-based bitmaps early to avoid index errors
228        let lines: Vec<&str> = content.lines().collect();
229        self.in_code_span = vec![Vec::new(); lines.len()];
230        for (i, line) in lines.iter().enumerate() {
231            self.in_code_span[i] = vec![false; line.len() + 1]; // +1 for 1-indexed columns
232        }
233        self.in_blockquote = vec![false; lines.len()];
234        self.in_html_block = vec![false; lines.len()];
235
236        // Detect front matter FIRST (needed before heading detection)
237        self.detect_front_matter(content);
238
239        // Quick checks to skip expensive operations if not needed
240        let has_blockquote_markers = CONTAINS_BLOCKQUOTE.is_match(content);
241        let has_html_blocks = CONTAINS_HTML_BLOCK.is_match(content);
242
243        // Detect HTML blocks BEFORE computing code blocks (HTML blocks should not be treated as indented code)
244        if has_html_blocks {
245            self.detect_html_blocks(content);
246        }
247
248        // Compute code blocks
249        self.code_blocks = self.compute_code_blocks(content);
250        self.has_code_blocks = !self.code_blocks.is_empty();
251
252        // Compute bitmap of code block regions
253        self.compute_code_block_bitmap(content);
254
255        // Populate fenced code block starts and ends
256        self.populate_fenced_code_blocks();
257        let has_backticks = content.contains('`');
258        let has_brackets = content.contains('[');
259        let has_headings = CONTAINS_ATX_HEADING.is_match(content) || CONTAINS_SETEXT_UNDERLINE.is_match(content);
260        // More comprehensive list detection to handle edge cases
261        let has_list_markers = CONTAINS_LIST_MARKERS.is_match(content)
262            || content.contains("- ")
263            || content.contains("* ")
264            || content.contains("+ ")
265            || content.contains("1. ")
266            || content.contains("2. ")
267            || content.contains("3. ")
268            || content.contains("4. ")
269            || content.contains("5. ")
270            || content.contains("6. ")
271            || content.contains("7. ")
272            || content.contains("8. ")
273            || content.contains("9. ")
274            || content.contains("10. ")
275            || content.contains("11. ")
276            || content.contains("12. ");
277
278        // OPTIMIZATION 4: Detect blockquotes only if needed
279        if has_blockquote_markers {
280            self.detect_blockquotes(content);
281        }
282
283        // OPTIMIZATION 1: Detect inline code spans only if needed
284        if has_backticks {
285            self.detect_code_spans(content);
286        }
287
288        // OPTIMIZATION 2: Detect links and images only if needed
289        if has_brackets {
290            self.detect_links_and_images(content);
291        }
292
293        // Detect headings only if needed
294        if has_headings {
295            self.detect_headings(content);
296        }
297
298        // OPTIMIZATION 3: Detect lists only if needed
299        if has_list_markers {
300            self.detect_list_items(content);
301        }
302
303        // Detect horizontal rules only if needed
304        let has_potential_hrs = content.contains("---")
305            || content.contains("***")
306            || content.contains("___")
307            || content.contains("- -")
308            || content.contains("* *")
309            || content.contains("_ _");
310        if has_potential_hrs {
311            self.detect_horizontal_rules(content);
312        }
313
314        // Check for URLs only if needed
315        if crate::utils::early_returns::has_urls(content) {
316            self.has_urls = true;
317        }
318
319        // Check for HTML tags only if needed
320        if has_html_blocks && (content.contains("</") || content.contains("/>")) {
321            self.has_html = true;
322        }
323    }
324
325    /// Compute a bitmap of code block regions for fast lookups
326    fn compute_code_block_bitmap(&mut self, content: &str) {
327        let line_count = content.lines().count();
328        self.in_code_block = vec![false; line_count];
329
330        for block in &self.code_blocks {
331            let start = block.start_line.saturating_sub(1); // Convert 1-indexed to 0-indexed
332            let end = block.end_line.min(line_count); // Ensure we don't go out of bounds
333
334            // For fenced code blocks, skip the start and end lines (the "```" lines)
335            if let CodeBlockType::Fenced = block.block_type {
336                // Mark only the lines between fences as in code block
337                if end > start + 1 {
338                    for i in (start + 1)..(end - 1) {
339                        if i < self.in_code_block.len() {
340                            self.in_code_block[i] = true;
341                        }
342                    }
343                }
344            } else {
345                // For indented code blocks, mark all lines
346                for i in start..end {
347                    if i < self.in_code_block.len() {
348                        self.in_code_block[i] = true;
349                    }
350                }
351            }
352        }
353    }
354
355    /// Check if a particular line is inside a code block
356    pub fn is_in_code_block(&self, line_num: usize) -> bool {
357        if line_num == 0 || line_num > self.in_code_block.len() {
358            return false;
359        }
360        self.in_code_block[line_num - 1] // Convert 1-indexed to 0-indexed
361    }
362
363    /// Detect headings in the document
364    fn detect_headings(&mut self, content: &str) {
365        lazy_static! {
366            static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+|[^\s#])").unwrap();
367            static ref SETEXT_HEADING_UNDERLINE: Regex = Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
368        }
369
370        // Clear existing data
371        self.heading_lines.clear();
372        self.heading_levels.clear();
373        self.heading_regions.clear();
374        self.first_heading_style = None;
375
376        let lines: Vec<&str> = content.lines().collect();
377
378        for (i, line) in lines.iter().enumerate() {
379            // Skip lines in code blocks or front matter
380            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
381                continue;
382            }
383
384            // Skip empty lines
385            if line.trim().is_empty() {
386                continue;
387            }
388
389            // Check for ATX headings (both with and without spaces)
390            if let Some(captures) = ATX_HEADING.captures(line) {
391                let level = captures[2].len();
392                // Extract heading text after hashes and whitespace
393                let mut chars = line.trim().chars();
394                while chars.next() == Some('#') {}
395                let heading_text = chars.as_str().trim();
396                if heading_text.is_empty() {
397                    continue; // Skip empty ATX headings
398                }
399                self.heading_lines.push(i + 1);
400                self.heading_levels.push(level);
401                self.heading_regions.push((i + 1, i + 1)); // ATX: start==end
402
403                // If this is the first heading detected, set the style
404                if self.first_heading_style.is_none() {
405                    // Determine if it's a closed ATX heading
406                    if line.trim().ends_with('#') {
407                        self.first_heading_style = Some(HeadingStyle::AtxClosed);
408                    } else {
409                        self.first_heading_style = Some(HeadingStyle::Atx);
410                    }
411                }
412                continue;
413            }
414
415            // Check for setext headings (line with ===== or ----- below)
416            if i > 0 && !lines[i - 1].trim().is_empty() &&
417               !self.is_in_front_matter(i) && // Check that previous line is not in front matter
418               SETEXT_HEADING_UNDERLINE.is_match(line)
419            {
420                let content_line = lines[i - 1].trim();
421                if content_line.is_empty() {
422                    continue; // Skip empty Setext headings
423                }
424                let level = if line.trim().starts_with('=') { 1 } else { 2 };
425                self.heading_lines.push(i); // The heading is the previous line (content line)
426                self.heading_levels.push(level);
427                self.heading_regions.push((i, i + 1)); // Setext: (content, marker)
428
429                // If this is the first heading detected, set the style
430                if self.first_heading_style.is_none() {
431                    if level == 1 {
432                        self.first_heading_style = Some(HeadingStyle::Setext1);
433                    } else {
434                        self.first_heading_style = Some(HeadingStyle::Setext2);
435                    }
436                }
437            }
438        }
439
440        // Default to ATX if no headings are found
441        if self.heading_lines.is_empty() {
442            self.first_heading_style = Some(HeadingStyle::Atx);
443        }
444    }
445
446    /// Detect front matter in the document
447    fn detect_front_matter(&mut self, content: &str) {
448        let lines: Vec<&str> = content.lines().collect();
449
450        // Clear existing data
451        self.has_front_matter = false;
452        self.front_matter_range = None;
453
454        // If document starts with ---, it might have front matter
455        if !lines.is_empty() && lines[0] == "---" {
456            // Look for the closing delimiter
457            for (i, line) in lines.iter().enumerate().skip(1) {
458                if *line == "---" {
459                    self.has_front_matter = true;
460                    self.front_matter_range = Some((1, i + 1));
461                    break;
462                }
463            }
464        }
465    }
466
467    /// Compute code blocks in the document
468    fn compute_code_blocks(&self, content: &str) -> Vec<CodeBlock> {
469        lazy_static! {
470            // Fenced code blocks can be indented 0-3 spaces according to CommonMark
471            static ref FENCED_START: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*([^`\s]*)").unwrap();
472            static ref FENCED_END: Regex = Regex::new(r"^(\s{0,3})(`{3,}|~{3,})\s*$").unwrap();
473        }
474
475        let mut code_blocks = Vec::new();
476        let mut in_code_block = false;
477        let mut current_block_start = 0;
478        let mut current_language = None;
479        let mut current_fence_char = ' ';
480        let mut current_fence_length = 0; // Track fence length for proper nesting
481        let mut current_fence_indent = 0; // Track fence indentation
482        let lines: Vec<&str> = content.lines().collect();
483
484        let mut i = 0;
485        while i < lines.len() {
486            let line = lines[i];
487
488            if !in_code_block {
489                // Check for fenced code block start
490                if let Some(captures) = FENCED_START.captures(line) {
491                    in_code_block = true;
492                    current_block_start = i + 1;
493                    let indent = captures.get(1).map_or("", |m| m.as_str());
494                    current_fence_indent = indent.len();
495                    let fence = captures.get(2).map_or("```", |m| m.as_str());
496                    current_fence_char = fence.chars().next().unwrap();
497                    current_fence_length = fence.len();
498
499                    // Only set language if it's not empty
500                    let lang = captures.get(3).map(|m| m.as_str().to_string());
501                    current_language = lang.filter(|l| !l.is_empty());
502                }
503                // Check for indented code block (CommonMark compliant)
504                // But skip if we're inside an HTML block OR if it's a list item
505                // According to CommonMark, list items take precedence over indented code blocks
506                else if Self::is_indented_code_line(line)
507                    && !line.trim().is_empty()
508                    && !self.is_in_html_block(i + 1)
509                    && !Self::is_potential_list_item(line)
510                {
511                    // According to CommonMark, any content indented by 4+ spaces OR a tab is a code block
512                    // unless it's inside an HTML block or it's a list item
513                    let mut end_line = i;
514
515                    // Find the end of this indented code block
516                    // Continue while we have indented lines OR blank lines that are followed by more indented lines
517                    while end_line + 1 < lines.len() {
518                        let next_line = lines[end_line + 1];
519
520                        if Self::is_indented_code_line(next_line)
521                            && !next_line.trim().is_empty()
522                            && !self.is_in_html_block(end_line + 2)
523                            && !Self::is_potential_list_item(next_line)
524                        {
525                            // Found another indented line that's not in HTML or a list item, continue the block
526                            end_line += 1;
527                        } else if next_line.trim().is_empty() {
528                            // Found a blank line, check if there are more indented lines after it
529                            let mut lookahead = end_line + 2;
530                            let mut found_indented = false;
531
532                            while lookahead < lines.len() {
533                                let lookahead_line = lines[lookahead];
534                                if Self::is_indented_code_line(lookahead_line)
535                                    && !lookahead_line.trim().is_empty()
536                                    && !self.is_in_html_block(lookahead + 1)
537                                    && !Self::is_potential_list_item(lookahead_line)
538                                {
539                                    found_indented = true;
540                                    break;
541                                } else if !lookahead_line.trim().is_empty() {
542                                    // Found non-empty, non-indented line, stop looking
543                                    break;
544                                }
545                                lookahead += 1;
546                            }
547
548                            if found_indented {
549                                // Include this blank line as part of the code block
550                                end_line += 1;
551                            } else {
552                                // No more indented lines, end the block here
553                                break;
554                            }
555                        } else {
556                            // Found non-empty, non-indented line, end the block
557                            break;
558                        }
559                    }
560
561                    code_blocks.push(CodeBlock {
562                        start_line: i + 1,
563                        end_line: end_line + 1,
564                        language: None,
565                        block_type: CodeBlockType::Indented,
566                    });
567
568                    // Skip to end of block
569                    i = end_line;
570                }
571            } else {
572                // Check for fenced code block end - must start with the same fence character,
573                // be at least as long as the opening fence, and have same or less indentation
574                if let Some(captures) = FENCED_END.captures(line) {
575                    let indent = captures.get(1).map_or("", |m| m.as_str());
576                    let fence = captures.get(2).map_or("", |m| m.as_str());
577
578                    // CommonMark: closing fence must have same or less indentation than opening
579                    if fence.starts_with(current_fence_char)
580                        && fence.len() >= current_fence_length
581                        && indent.len() <= current_fence_indent
582                    {
583                        code_blocks.push(CodeBlock {
584                            start_line: current_block_start,
585                            end_line: i + 1,
586                            language: current_language.clone(),
587                            block_type: CodeBlockType::Fenced,
588                        });
589
590                        in_code_block = false;
591                        current_language = None;
592                        current_fence_char = ' ';
593                        current_fence_length = 0;
594                        current_fence_indent = 0;
595                    }
596                }
597            }
598
599            i += 1;
600        }
601
602        // Handle case where file ends without closing code fence
603        if in_code_block {
604            code_blocks.push(CodeBlock {
605                start_line: current_block_start,
606                end_line: lines.len(),
607                language: current_language,
608                block_type: CodeBlockType::Fenced,
609            });
610        }
611
612        code_blocks
613    }
614
615    /// Populate fenced code block starts and ends
616    fn populate_fenced_code_blocks(&mut self) {
617        self.fenced_code_block_starts.clear();
618        self.fenced_code_block_ends.clear();
619
620        for block in &self.code_blocks {
621            if let CodeBlockType::Fenced = block.block_type {
622                self.fenced_code_block_starts.push(block.start_line);
623                self.fenced_code_block_ends.push(block.end_line);
624            }
625        }
626    }
627
628    /// Check if a line is in front matter
629    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
630        if let Some((start, end)) = self.front_matter_range {
631            line_num >= start && line_num <= end
632        } else {
633            false
634        }
635    }
636
637    /// Count the number of trailing spaces in a line
638    ///
639    /// This function returns the number of trailing spaces in a line,
640    /// ignoring newlines but counting spaces before newlines.
641    #[inline]
642    pub fn count_trailing_spaces(line: &str) -> usize {
643        // Prepare the string without newline if it ends with one
644        let content = line.strip_suffix('\n').unwrap_or(line);
645
646        // Count trailing spaces at the end, not including tabs
647        let mut space_count = 0;
648        for c in content.chars().rev() {
649            if c == ' ' {
650                space_count += 1;
651            } else {
652                break;
653            }
654        }
655
656        space_count
657    }
658
659    /// Check if a line has trailing whitespace
660    ///
661    /// This function returns true if the line has trailing spaces,
662    /// false otherwise.
663    #[inline]
664    pub fn has_trailing_spaces(line: &str) -> bool {
665        Self::count_trailing_spaces(line) > 0
666    }
667
668    /// Check if a line is indented code according to CommonMark specification
669    ///
670    /// According to CommonMark, a line is considered indented code if it starts with:
671    /// - 4 or more spaces, OR
672    /// - A tab character
673    #[inline]
674    fn is_indented_code_line(line: &str) -> bool {
675        if line.starts_with('\t') {
676            return true;
677        }
678
679        // Count leading spaces
680        let mut space_count = 0;
681        for c in line.chars() {
682            if c == ' ' {
683                space_count += 1;
684            } else {
685                break;
686            }
687        }
688
689        space_count >= 4
690    }
691
692    /// Check if a line is potentially a list item
693    /// This is used to prevent list items from being detected as indented code blocks
694    #[inline]
695    fn is_potential_list_item(line: &str) -> bool {
696        lazy_static! {
697            // Simple regex to detect potential list items
698            // Matches lines that start with optional whitespace followed by a list marker
699            static ref LIST_ITEM_PATTERN: Regex = Regex::new(
700                r"^[ \t]*([*+-]|\d+[.)]])[ \t]"
701            ).unwrap();
702        }
703        LIST_ITEM_PATTERN.is_match(line)
704    }
705
706    /// Get a list of list start indices
707    /// This method analyzes the list_lines to find where lists begin
708    pub fn get_list_start_indices(&self) -> Vec<usize> {
709        if self.list_lines.is_empty() {
710            return Vec::new();
711        }
712
713        let mut list_starts = Vec::new();
714        let mut prev_line = 0;
715
716        for (i, &line_num) in self.list_lines.iter().enumerate() {
717            // If this is the first item or there's a gap in line numbers,
718            // it's the start of a new list
719            if i == 0 || line_num > prev_line + 1 {
720                list_starts.push(line_num - 1); // Convert from 1-indexed to 0-indexed
721            }
722            prev_line = line_num;
723        }
724
725        list_starts
726    }
727
728    /// Get a list of list end indices
729    /// This method analyzes the list_lines to find where lists end
730    pub fn get_list_end_indices(&self) -> Vec<usize> {
731        if self.list_lines.is_empty() {
732            return Vec::new();
733        }
734
735        let mut list_ends = Vec::new();
736        let list_lines = &self.list_lines;
737
738        for (i, &line_num) in list_lines.iter().enumerate() {
739            // If this is the last item or there's a gap after this item,
740            // it's the end of a list
741            if i == list_lines.len() - 1 || list_lines[i + 1] > line_num + 1 {
742                list_ends.push(line_num - 1); // Convert from 1-indexed to 0-indexed
743            }
744        }
745
746        list_ends
747    }
748
749    /// OPTIMIZATION 1: Detect inline code spans in the document
750    fn detect_code_spans(&mut self, content: &str) {
751        // Clear existing data
752        self.code_spans.clear();
753
754        let lines: Vec<&str> = content.lines().collect();
755
756        // Note: in_code_span bitmap is already initialized in analyze() method
757
758        for (line_num, line) in lines.iter().enumerate() {
759            // Skip lines in code blocks
760            if self.is_in_code_block(line_num + 1) {
761                continue;
762            }
763
764            // Skip empty lines
765            if line.is_empty() {
766                continue;
767            }
768
769            let mut i = 0;
770            while i < line.len() {
771                // Look for backtick
772                if let Some(start_pos) = line[i..].find('`') {
773                    let start_idx = i + start_pos;
774
775                    // Look for closing backtick
776                    if let Some(end_pos) = line[start_idx + 1..].find('`') {
777                        let end_idx = start_idx + 1 + end_pos;
778
779                        // We found a code span
780                        let content = line[start_idx + 1..end_idx].to_string();
781
782                        // Add to code_spans collection
783                        self.code_spans.push(CodeSpan {
784                            line: line_num + 1,       // 1-indexed
785                            start_col: start_idx + 1, // 1-indexed
786                            end_col: end_idx + 1,     // 1-indexed
787                            content,
788                        });
789
790                        // Mark in the bitmap
791                        for col in start_idx..=end_idx {
792                            if col < self.in_code_span[line_num].len() {
793                                self.in_code_span[line_num][col] = true;
794                            }
795                        }
796
797                        // Continue from after the closing backtick
798                        i = end_idx + 1;
799                    } else {
800                        // No closing backtick found
801                        i = start_idx + 1;
802                    }
803                } else {
804                    // No more backticks in this line
805                    break;
806                }
807            }
808        }
809    }
810
811    /// OPTIMIZATION 2: Detect links and images in the document
812    fn detect_links_and_images(&mut self, content: &str) {
813        lazy_static! {
814            // Regex for inline links: [text](url) - handles escaped brackets
815            static ref INLINE_LINK: FancyRegex = FancyRegex::new(r"(?x)
816                (?<!\\)                               # Not preceded by backslash
817                \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]  # Link text (handles nested brackets and escapes)
818                \(([^)]*)\)                           # URL in parentheses
819            ").unwrap();
820            // Regex for reference links: [text][id] or [text][] (implicit) - handles escaped brackets
821            static ref REFERENCE_LINK: FancyRegex = FancyRegex::new(r"(?x)
822                (?<!\\)                               # Not preceded by backslash
823                \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]  # Link text (handles nested brackets and escapes)
824                \[([^\]]*)\]                          # Reference ID
825            ").unwrap();
826            // Regex for shortcut reference links: [text]
827            static ref SHORTCUT_LINK: FancyRegex = FancyRegex::new(r"(?x)
828                (?<!\\)                               # Not preceded by backslash
829                \[([^\]]+)\]                          # Link text
830                (?!\(|\[)                             # Not followed by ( or [
831            ").unwrap();
832            // Regex for link definitions: [id]: url
833            static ref LINK_DEFINITION: Regex = Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap();
834            // Regex for inline images: ![alt](src) - handles escaped brackets
835            static ref INLINE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
836                (?<!\\)                               # Not preceded by backslash
837                !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
838                \(([^)]*)\)                           # Source URL
839            ").unwrap();
840            // Regex for reference images: ![alt][id] - handles escaped brackets
841            static ref REFERENCE_IMAGE: FancyRegex = FancyRegex::new(r"(?x)
842                (?<!\\)                               # Not preceded by backslash
843                !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\] # Alt text (handles nested brackets and escapes)
844                \[([^\]]*)\]                          # Reference ID
845            ").unwrap();
846        }
847
848        // Clear existing data
849        self.links.clear();
850        self.images.clear();
851
852        let lines: Vec<&str> = content.lines().collect();
853
854        // First, find all link definitions
855        let mut link_defs = std::collections::HashMap::new();
856        for (line_num, line) in lines.iter().enumerate() {
857            // Skip lines in code blocks
858            if self.is_in_code_block(line_num + 1) {
859                continue;
860            }
861
862            // Check for link definitions
863            if let Some(cap) = LINK_DEFINITION.captures(line) {
864                let id = cap.get(1).map_or("", |m| m.as_str()).to_string();
865                let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
866                link_defs.insert(id.to_lowercase(), url);
867            }
868        }
869
870        // Now find all links and images
871        for (line_num, line) in lines.iter().enumerate() {
872            // Skip lines in code blocks
873            if self.is_in_code_block(line_num + 1) {
874                continue;
875            }
876
877            // Skip empty lines
878            if line.is_empty() {
879                continue;
880            }
881
882            // Check if this line contains a character that would indicate a link or image
883            if !line.contains('[') && !line.contains('!') {
884                continue;
885            }
886
887            // Process each character position to ensure we don't detect links inside code spans
888            let mut i = 0;
889            while i < line.len() {
890                // Skip if this position is in a code span
891                if i < self.in_code_span[line_num].len() && self.in_code_span[line_num][i] {
892                    i += 1;
893                    continue;
894                }
895
896                // Check for inline links starting at this position
897                if let Some(rest) = line.get(i..) {
898                    if rest.starts_with('[') {
899                        // Check if this bracket is escaped or part of an escaped image
900                        let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
901                        let is_escaped_image =
902                            i > 1 && line.chars().nth(i - 2) == Some('\\') && line.chars().nth(i - 1) == Some('!');
903                        if !is_escaped && !is_escaped_image {
904                            if let Ok(Some(cap)) = INLINE_LINK.captures(rest) {
905                                let whole_match = cap.get(0).unwrap();
906                                let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
907                                let url = cap.get(2).map_or("", |m| m.as_str()).to_string();
908
909                                // Ensure we're not inside a code span
910                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
911                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
912                                });
913
914                                if !is_in_span {
915                                    self.links.push(Link {
916                                        line: line_num + 1,             // 1-indexed
917                                        start_col: i + 1,               // 1-indexed
918                                        end_col: i + whole_match.end(), // 1-indexed
919                                        text,
920                                        url,
921                                        is_reference: false,
922                                        reference_id: None,
923                                    });
924                                }
925
926                                // Skip past this link
927                                i += whole_match.end();
928                            } else if let Ok(Some(cap)) = REFERENCE_LINK.captures(rest) {
929                                let whole_match = cap.get(0).unwrap();
930                                let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
931                                let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
932
933                                // Use the ID or text as the reference
934                                let ref_id = if id.is_empty() { text.clone() } else { id };
935
936                                // Look up the URL from link definitions
937                                let url = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
938
939                                // Ensure we're not inside a code span
940                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
941                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
942                                });
943
944                                if !is_in_span {
945                                    self.links.push(Link {
946                                        line: line_num + 1,             // 1-indexed
947                                        start_col: i + 1,               // 1-indexed
948                                        end_col: i + whole_match.end(), // 1-indexed
949                                        text,
950                                        url,
951                                        is_reference: true,
952                                        reference_id: Some(ref_id),
953                                    });
954                                }
955
956                                // Skip past this link
957                                i += whole_match.end();
958                            } else {
959                                // No match found, move to next character
960                                i += 1;
961                            }
962                        } else {
963                            // Bracket is escaped or part of escaped image, skip it
964                            i += 1;
965                        }
966                    } else if rest.starts_with("![") {
967                        // Check if this image is escaped
968                        let is_escaped = i > 0 && line.chars().nth(i - 1) == Some('\\');
969                        if !is_escaped {
970                            if let Ok(Some(cap)) = INLINE_IMAGE.captures(rest) {
971                                let whole_match = cap.get(0).unwrap();
972                                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
973                                let src = cap.get(2).map_or("", |m| m.as_str()).to_string();
974
975                                // Ensure we're not inside a code span
976                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
977                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
978                                });
979
980                                if !is_in_span {
981                                    self.images.push(Image {
982                                        line: line_num + 1,             // 1-indexed
983                                        start_col: i + 1,               // 1-indexed
984                                        end_col: i + whole_match.end(), // 1-indexed
985                                        alt_text,
986                                        src,
987                                        is_reference: false,
988                                        reference_id: None,
989                                    });
990                                }
991
992                                // Skip past this image
993                                i += whole_match.end();
994                            } else if let Ok(Some(cap)) = REFERENCE_IMAGE.captures(rest) {
995                                let whole_match = cap.get(0).unwrap();
996                                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
997                                let id = cap.get(2).map_or("", |m| m.as_str()).to_string();
998
999                                // Use the ID or alt_text as the reference
1000                                let ref_id = if id.is_empty() { alt_text.clone() } else { id };
1001
1002                                // Look up the URL from link definitions
1003                                let src = link_defs.get(&ref_id.to_lowercase()).cloned().unwrap_or_default();
1004
1005                                // Ensure we're not inside a code span
1006                                let is_in_span = (i..i + whole_match.end()).any(|pos| {
1007                                    pos < self.in_code_span[line_num].len() && self.in_code_span[line_num][pos]
1008                                });
1009
1010                                if !is_in_span {
1011                                    self.images.push(Image {
1012                                        line: line_num + 1,             // 1-indexed
1013                                        start_col: i + 1,               // 1-indexed
1014                                        end_col: i + whole_match.end(), // 1-indexed
1015                                        alt_text,
1016                                        src,
1017                                        is_reference: true,
1018                                        reference_id: Some(ref_id),
1019                                    });
1020                                }
1021
1022                                // Skip past this image
1023                                i += whole_match.end();
1024                            } else {
1025                                // No match found, move to next character
1026                                i += 1;
1027                            }
1028                        } else {
1029                            // Image is escaped, skip it
1030                            i += 1;
1031                        }
1032                    } else {
1033                        // Neither a link nor an image, move to next character
1034                        i += 1;
1035                    }
1036                } else {
1037                    // We've reached the end of the line
1038                    break;
1039                }
1040            }
1041        }
1042    }
1043
1044    /// OPTIMIZATION 3: Detect list items with detailed information
1045    fn detect_list_items(&mut self, content: &str) {
1046        // Use fancy-regex for advanced Markdown list item detection
1047        // - Allow any number of spaces/tabs before the marker
1048        // - Marker must be *, +, or -
1049        // - At least one space/tab after the marker
1050        // - Use lookbehind to ensure marker is at the start or after whitespace
1051        // - Use Unicode support for whitespace
1052        lazy_static! {
1053            static ref UL_MARKER: FancyRegex =
1054                FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1055            static ref OL_MARKER: FancyRegex =
1056                FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]+)(?P<content>.*)$").unwrap();
1057            static ref TASK_MARKER: FancyRegex = FancyRegex::new(
1058                r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]+)\[(?P<checked>[ xX])\](?P<content>.*)$"
1059            )
1060            .unwrap();
1061        }
1062        self.list_items.clear();
1063        self.list_lines.clear();
1064        let lines: Vec<&str> = content.lines().collect();
1065        for (line_num, line) in lines.iter().enumerate() {
1066            if self.is_in_code_block(line_num + 1) || self.is_in_front_matter(line_num + 1) {
1067                continue;
1068            }
1069            if line.trim().is_empty() {
1070                continue;
1071            }
1072            // Use fancy-regex for advanced matching
1073            if let Ok(Some(cap)) = TASK_MARKER.captures(line) {
1074                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1075                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1076                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1077                self.list_lines.push(line_num + 1);
1078                self.list_items.push(ListItem {
1079                    line_number: line_num + 1,
1080                    indentation,
1081                    marker: marker.clone(),
1082                    marker_type: ListMarkerType::Task,
1083                    content,
1084                });
1085                continue;
1086            }
1087            if let Ok(Some(cap)) = UL_MARKER.captures(line) {
1088                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1089                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1090                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1091                self.list_lines.push(line_num + 1);
1092                self.list_items.push(ListItem {
1093                    line_number: line_num + 1,
1094                    indentation,
1095                    marker: marker.clone(),
1096                    marker_type: ListMarkerType::Unordered,
1097                    content,
1098                });
1099                continue;
1100            }
1101            if let Ok(Some(cap)) = OL_MARKER.captures(line) {
1102                let indentation = cap.name("indent").map_or(0, |m| m.as_str().len());
1103                let marker = cap.name("marker").map_or("", |m| m.as_str()).to_string();
1104                let content = cap.name("content").map_or("", |m| m.as_str()).to_string();
1105                self.list_lines.push(line_num + 1);
1106                self.list_items.push(ListItem {
1107                    line_number: line_num + 1,
1108                    indentation,
1109                    marker: marker.clone(),
1110                    marker_type: ListMarkerType::Ordered,
1111                    content,
1112                });
1113                continue;
1114            }
1115        }
1116    }
1117
1118    /// OPTIMIZATION 4: Detect blockquotes in the document
1119    fn detect_blockquotes(&mut self, content: &str) {
1120        lazy_static! {
1121            static ref BLOCKQUOTE_MARKER: Regex = Regex::new(r"^\s*>(.*)$").unwrap();
1122        }
1123
1124        // Clear existing data
1125        self.blockquotes.clear();
1126
1127        let lines: Vec<&str> = content.lines().collect();
1128
1129        // Note: in_blockquote bitmap is already initialized in analyze() method
1130
1131        let mut in_blockquote = false;
1132        let mut start_line = 0;
1133
1134        for (i, line) in lines.iter().enumerate() {
1135            // Skip lines in code blocks or front matter
1136            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1137                continue;
1138            }
1139
1140            let is_blockquote_line = BLOCKQUOTE_MARKER.is_match(line);
1141
1142            if is_blockquote_line {
1143                // Mark this line as inside a blockquote
1144                self.in_blockquote[i] = true;
1145
1146                if !in_blockquote {
1147                    // Start of a new blockquote
1148                    in_blockquote = true;
1149                    start_line = i + 1; // 1-indexed
1150                }
1151            } else if in_blockquote {
1152                // End of a blockquote
1153                self.blockquotes.push(BlockquoteRange {
1154                    start_line,
1155                    end_line: i, // Previous line was the end
1156                });
1157
1158                in_blockquote = false;
1159            }
1160        }
1161
1162        // Handle case where file ends with a blockquote
1163        if in_blockquote {
1164            self.blockquotes.push(BlockquoteRange {
1165                start_line,
1166                end_line: lines.len(), // Last line
1167            });
1168        }
1169    }
1170
1171    /// Detect horizontal rules in the document
1172    fn detect_horizontal_rules(&mut self, content: &str) {
1173        lazy_static! {
1174            // Horizontal rule patterns - simplified to match Markdown spec
1175            static ref HR_HYPHEN: Regex = Regex::new(r"^[ \t]*-[ \t]*-[ \t]*-[ \t-]*$").unwrap();
1176            static ref HR_ASTERISK: Regex = Regex::new(r"^[ \t]*\*[ \t]*\*[ \t]*\*[ \t\*]*$").unwrap();
1177            static ref HR_UNDERSCORE: Regex = Regex::new(r"^[ \t]*_[ \t]*_[ \t]*_[ \t_]*$").unwrap();
1178        }
1179
1180        // Clear existing data
1181        self.horizontal_rule_lines.clear();
1182
1183        let lines: Vec<&str> = content.lines().collect();
1184
1185        for (i, line) in lines.iter().enumerate() {
1186            // Skip lines in code blocks or front matter
1187            if self.is_in_code_block(i + 1) || self.is_in_front_matter(i + 1) {
1188                continue;
1189            }
1190
1191            // Check for horizontal rule patterns
1192            if HR_HYPHEN.is_match(line) || HR_ASTERISK.is_match(line) || HR_UNDERSCORE.is_match(line) {
1193                // Additional validation: ensure it's not part of a setext heading
1194                // (setext headings have content on the previous line)
1195                let is_setext_marker = if i > 0 {
1196                    let prev_line = lines[i - 1].trim();
1197                    !prev_line.is_empty()
1198                        && !self.is_in_code_block(i)
1199                        && !self.is_in_front_matter(i)
1200                        && line.trim().chars().all(|c| c == '-' || c == ' ')
1201                } else {
1202                    false
1203                };
1204
1205                if !is_setext_marker {
1206                    self.horizontal_rule_lines.push(i + 1); // 1-indexed
1207                }
1208            }
1209        }
1210    }
1211
1212    /// Detect HTML blocks (block-level HTML regions) according to CommonMark spec
1213    fn detect_html_blocks(&mut self, content: &str) {
1214        let lines: Vec<&str> = content.lines().collect();
1215        // Note: in_html_block bitmap is already initialized in analyze() method
1216
1217        let mut i = 0;
1218        while i < lines.len() {
1219            let line = lines[i];
1220            let trimmed = line.trim_start();
1221
1222            // Skip lines already in code blocks
1223            if self.is_in_code_block(i + 1) {
1224                i += 1;
1225                continue;
1226            }
1227
1228            // Check for HTML block start conditions (simplified version of CommonMark)
1229            if self.is_html_block_start(trimmed) {
1230                let start_line = i;
1231
1232                // Find the end of the HTML block
1233                let end_line = self.find_html_block_end(&lines, start_line);
1234
1235                // Mark all lines in the block as HTML
1236                for line_idx in start_line..=end_line {
1237                    if line_idx < self.in_html_block.len() {
1238                        self.in_html_block[line_idx] = true;
1239                    }
1240                }
1241
1242                // Skip to after the block
1243                i = end_line + 1;
1244            } else {
1245                i += 1;
1246            }
1247        }
1248    }
1249
1250    /// Check if a line starts an HTML block
1251    fn is_html_block_start(&self, trimmed: &str) -> bool {
1252        if trimmed.is_empty() || !trimmed.starts_with('<') {
1253            return false;
1254        }
1255
1256        // Extract tag name
1257        let mut chars = trimmed[1..].chars();
1258        let mut tag_name = String::new();
1259
1260        // Handle closing tags
1261        let is_closing = chars.as_str().starts_with('/');
1262        if is_closing {
1263            chars.next(); // Skip the '/'
1264        }
1265
1266        // Extract tag name
1267        for ch in chars {
1268            if ch.is_ascii_alphabetic() || ch == '-' {
1269                tag_name.push(ch);
1270            } else {
1271                break;
1272            }
1273        }
1274
1275        if tag_name.is_empty() {
1276            return false;
1277        }
1278
1279        // List of HTML block elements (based on CommonMark and markdownlint)
1280        const BLOCK_ELEMENTS: &[&str] = &[
1281            "address",
1282            "article",
1283            "aside",
1284            "base",
1285            "basefont",
1286            "blockquote",
1287            "body",
1288            "caption",
1289            "center",
1290            "col",
1291            "colgroup",
1292            "dd",
1293            "details",
1294            "dialog",
1295            "dir",
1296            "div",
1297            "dl",
1298            "dt",
1299            "fieldset",
1300            "figcaption",
1301            "figure",
1302            "footer",
1303            "form",
1304            "frame",
1305            "frameset",
1306            "h1",
1307            "h2",
1308            "h3",
1309            "h4",
1310            "h5",
1311            "h6",
1312            "head",
1313            "header",
1314            "hr",
1315            "html",
1316            "iframe",
1317            "legend",
1318            "li",
1319            "link",
1320            "main",
1321            "menu",
1322            "menuitem",
1323            "nav",
1324            "noframes",
1325            "ol",
1326            "optgroup",
1327            "option",
1328            "p",
1329            "param",
1330            "section",
1331            "source",
1332            "summary",
1333            "table",
1334            "tbody",
1335            "td",
1336            "tfoot",
1337            "th",
1338            "thead",
1339            "title",
1340            "tr",
1341            "track",
1342            "ul",
1343            "img",
1344            "picture",
1345        ];
1346
1347        BLOCK_ELEMENTS.contains(&tag_name.to_ascii_lowercase().as_str())
1348    }
1349
1350    /// Find the end line of an HTML block starting at start_line
1351    fn find_html_block_end(&self, lines: &[&str], start_line: usize) -> usize {
1352        let start_trimmed = lines[start_line].trim_start();
1353
1354        // Extract the tag name from the start line
1355        let tag_name = self.extract_tag_name(start_trimmed);
1356
1357        // Look for the closing tag or blank line
1358        for (i, line) in lines.iter().enumerate().skip(start_line + 1) {
1359            let trimmed = line.trim();
1360
1361            // HTML block ends on blank line
1362            if trimmed.is_empty() {
1363                return i - 1; // Don't include the blank line
1364            }
1365
1366            // HTML block ends when we find the matching closing tag
1367            if let Some(ref tag) = tag_name {
1368                let closing_tag = format!("</{tag}");
1369                if trimmed.contains(&closing_tag) {
1370                    return i;
1371                }
1372            }
1373        }
1374
1375        // If no end found, block continues to end of document
1376        lines.len() - 1
1377    }
1378
1379    /// Extract tag name from an HTML line
1380    fn extract_tag_name(&self, trimmed: &str) -> Option<String> {
1381        if !trimmed.starts_with('<') {
1382            return None;
1383        }
1384
1385        let mut chars = trimmed[1..].chars();
1386
1387        // Skip closing tag indicator
1388        if chars.as_str().starts_with('/') {
1389            chars.next();
1390        }
1391
1392        let mut tag_name = String::new();
1393        for ch in chars {
1394            if ch.is_ascii_alphabetic() || ch == '-' {
1395                tag_name.push(ch);
1396            } else {
1397                break;
1398            }
1399        }
1400
1401        if tag_name.is_empty() {
1402            None
1403        } else {
1404            Some(tag_name.to_ascii_lowercase())
1405        }
1406    }
1407
1408    /// Check if a position is inside a code span
1409    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1410        if line_num == 0 || line_num > self.in_code_span.len() {
1411            return false;
1412        }
1413
1414        let line_idx = line_num - 1; // Convert 1-indexed to 0-indexed
1415
1416        if col == 0 || col > self.in_code_span[line_idx].len() {
1417            return false;
1418        }
1419
1420        self.in_code_span[line_idx][col - 1] // Convert 1-indexed to 0-indexed
1421    }
1422
1423    /// Check if a line is inside a blockquote
1424    pub fn is_in_blockquote(&self, line_num: usize) -> bool {
1425        if line_num == 0 || line_num > self.in_blockquote.len() {
1426            return false;
1427        }
1428
1429        self.in_blockquote[line_num - 1] // Convert 1-indexed to 0-indexed
1430    }
1431
1432    /// Get detailed information about a list item at a specific line
1433    pub fn get_list_item_at_line(&self, line_num: usize) -> Option<&ListItem> {
1434        self.list_items.iter().find(|item| item.line_number == line_num)
1435    }
1436
1437    /// Get all list items with a specific marker type
1438    pub fn get_list_items_by_type(&self, marker_type: ListMarkerType) -> Vec<&ListItem> {
1439        self.list_items
1440            .iter()
1441            .filter(|item| item.marker_type == marker_type)
1442            .collect()
1443    }
1444
1445    /// Get all links with empty text or URLs
1446    pub fn get_empty_links(&self) -> Vec<&Link> {
1447        self.links
1448            .iter()
1449            .filter(|link| link.text.trim().is_empty() || link.url.trim().is_empty())
1450            .collect()
1451    }
1452
1453    /// Get all images with empty alt text
1454    pub fn get_images_without_alt_text(&self) -> Vec<&Image> {
1455        self.images
1456            .iter()
1457            .filter(|img| img.alt_text.trim().is_empty())
1458            .collect()
1459    }
1460
1461    /// Check if a line is inside an HTML block
1462    pub fn is_in_html_block(&self, line_num: usize) -> bool {
1463        if line_num == 0 || line_num > self.in_html_block.len() {
1464            return false;
1465        }
1466        self.in_html_block[line_num - 1]
1467    }
1468}
1469
1470/// Extended rule trait methods for using the document structure
1471pub trait DocumentStructureExtensions {
1472    /// Check if a rule should operate on a given line
1473    fn should_process_line(&self, line_num: usize, doc_structure: &DocumentStructure) -> bool {
1474        // Skip lines in code blocks by default
1475        !doc_structure.is_in_code_block(line_num)
1476    }
1477
1478    /// Check if content contains elements relevant to this rule
1479    fn has_relevant_elements(
1480        &self,
1481        _ctx: &crate::lint_context::LintContext,
1482        _doc_structure: &DocumentStructure,
1483    ) -> bool {
1484        // Default implementation returns true - rules should override this
1485        true
1486    }
1487}
1488
1489/// Create a DocumentStructure from a string
1490pub fn document_structure_from_str(content: &str) -> DocumentStructure {
1491    DocumentStructure::new(content)
1492}
1493
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `content` and check the detected heading line numbers and levels.
    ///
    /// `#[track_caller]` makes a failure point at the calling case rather than
    /// at this helper.
    #[track_caller]
    fn assert_headings(content: &str, expected_lines: &[usize], expected_levels: &[usize]) {
        let parsed = DocumentStructure::new(content);
        assert_eq!(parsed.heading_lines, expected_lines, "heading lines for {content:?}");
        assert_eq!(parsed.heading_levels, expected_levels, "heading levels for {content:?}");
    }

    /// A small document with two headings and one fenced block is summarised correctly.
    #[test]
    fn test_document_structure_creation() {
        let parsed = DocumentStructure::new(
            "# Heading 1\n\nSome text.\n\n## Heading 2\n\nMore text.\n\n```\nCode block\n```\n",
        );

        assert_eq!(parsed.heading_lines.len(), 2);
        assert_eq!(parsed.heading_levels.len(), 2);
        assert!(parsed.has_code_blocks);
        assert_eq!(parsed.code_blocks.len(), 1);
    }

    /// A fence nested inside an outer fenced block must not split the outer block.
    #[test]
    fn test_nested_code_blocks() {
        let content = [
            "```markdown",
            "1. First item",
            "",
            "   ```python",
            "   code_in_list()",
            "   ```",
            "",
            "2. Second item",
            "```",
        ]
        .join("\n");

        let parsed = DocumentStructure::new(&content);

        // Should have exactly one code block (the outer markdown block)
        assert_eq!(parsed.code_blocks.len(), 1);
        let outer = &parsed.code_blocks[0];
        assert_eq!(outer.start_line, 1);
        assert_eq!(outer.end_line, 9);

        // Lines 2-8 should be inside the code block
        if let Some(line) = (2..=8).find(|&candidate| !parsed.is_in_code_block(candidate)) {
            panic!("Line {line} should be in code block");
        }
    }

    /// Front matter is detected and does not hide the heading that follows it.
    #[test]
    fn test_document_with_front_matter() {
        let parsed = DocumentStructure::new(
            "---\ntitle: Test Document\ndate: 2021-01-01\n---\n\n# Heading 1\n\nSome text.\n",
        );

        assert!(parsed.has_front_matter);
        assert!(parsed.front_matter_range.is_some());
        assert_eq!(parsed.heading_lines.len(), 1);
        assert!(!parsed.has_code_blocks);
    }

    /// Per-line code block membership for a document with a single fenced block.
    #[test]
    fn test_is_in_code_block() {
        let parsed =
            DocumentStructure::new("# Heading\n\nText.\n\n```\ncode line 1\ncode line 2\n```\n\nMore text.\n");

        // (1-indexed line number, expected `is_in_code_block` result)
        let expectations = [
            (1, false),  // # Heading
            (3, false),  // Text.
            (5, false),  // ```
            (6, true),   // code line 1
            (7, true),   // code line 2
            (8, false),  // ```
            (10, false), // More text.
        ];

        for (line, expected) in expectations {
            assert_eq!(
                parsed.is_in_code_block(line),
                expected,
                "unexpected code block state for line {line}"
            );
        }
    }

    /// Heading detection across ATX/Setext styles and assorted edge cases.
    #[test]
    fn test_headings_edge_cases() {
        // ATX, closed ATX, Setext, mixed styles
        assert_headings(
            "  # ATX Heading\n# Closed ATX Heading #\nSetext H1\n=======\nSetext H2\n-------\n\n# ATX Again\n",
            &[1, 2, 3, 5, 8],
            &[1, 1, 1, 2, 1],
        );

        // Headings in code blocks and front matter (should be ignored)
        assert_headings(
            "---\ntitle: Test\n---\n# Heading 1\n\n```\n# Not a heading\n```\n# Heading 2\n",
            &[4, 9],
            &[1, 1],
        );

        // Empty headings
        assert_headings("#\n## \n###  \n# Not Empty\n", &[4], &[1]);

        // Headings with trailing whitespace
        assert_headings("# Heading \n# Heading\n", &[1, 2], &[1, 1]);

        // Headings with indentation
        assert_headings(
            "   # Indented\n    # Not a heading (too much indent)\n# Valid\n",
            &[1, 3],
            &[1, 1],
        );

        // Multiple duplicates and edge line numbers
        assert_headings("# Dup\n# Dup\n# Unique\n# Dup\n", &[1, 2, 3, 4], &[1, 1, 1, 1]);

        // Headings after code blocks/front matter
        assert_headings("```\n# Not a heading\n```\n# Real Heading\n", &[4], &[1]);
        assert_headings("---\ntitle: Test\n---\n# Heading\n", &[4], &[1]);

        // Setext headings with blank lines before/after
        assert_headings("\nSetext\n=======\n\nSetext2\n-------\n", &[2, 5], &[1, 2]);

        // Headings with special characters
        assert_headings("# Heading!@#$%^&*()\nSetext Special\n=======\n", &[1, 2], &[1, 1]);
    }

    /// Horizontal rules are recorded, except where the line is a Setext
    /// underline, inside a code block, or part of front matter.
    #[test]
    fn test_horizontal_rule_detection() {
        // Cases that only check the detected horizontal rule lines.
        let rule_only_cases = [
            // Test basic horizontal rules
            ("Text\n\n---\n\nMore text\n\n***\n\nFinal\n\n___\n\nEnd", vec![3, 7, 11]),
            // Test horizontal rules with spaces
            ("Text\n\n- - -\n\n* * *\n\n_ _ _\n\nEnd", vec![3, 5, 7]),
            // Test horizontal rules in code blocks are ignored
            // (only the one outside the code block is reported)
            ("Text\n\n```\n---\n***\n```\n\n---\n\nEnd", vec![8]),
            // Test horizontal rules in front matter are ignored
            // (only the one after the front matter is reported)
            ("---\ntitle: Test\n---\n\n---\n\nContent", vec![5]),
        ];

        for (content, expected_rules) in rule_only_cases {
            let parsed = DocumentStructure::new(content);
            assert_eq!(
                parsed.horizontal_rule_lines, expected_rules,
                "horizontal rule lines for {content:?}"
            );
        }

        // Test setext headings are not detected as horizontal rules
        let with_setext = DocumentStructure::new("# ATX\n\nSetext\n------\n\n---\n\nAnother\n======\n");
        assert_eq!(with_setext.horizontal_rule_lines, vec![6]); // Only the actual HR
        assert_eq!(with_setext.heading_lines, vec![1, 3, 8]); // Three headings
    }
}
rumdl_lib/utils/document_structure.rs

rumdl_lib/utils/
document_structure.rs