rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::ast_utils::get_cached_ast;
4use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
5use lazy_static::lazy_static;
6use markdown::mdast::Node;
7use regex::Regex;
8
lazy_static! {
    // Matches both inline links `[text](url)` and reference links `[text][ref]`.
    // Flags: `s` lets `.` match newlines (so `\\.` can span a line break),
    // `x` enables verbose mode so the pattern can carry its own `#` comments.
    // The pattern itself does not reject a leading `!`; callers must filter images.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Image pattern: same shape as LINK_PATTERN but requires the `!` prefix.
    // Flags: `s` lets `.` match newlines, `x` enables verbose mode.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition `[id]: url "title"` anchored to a whole line (`m` flag),
    // allowing up to three leading spaces.
    // Groups: 1 = id, 2 = url, 3 = double-quoted title, 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Matches a single run of one or more backticks, i.e. a code-span delimiter.
    // It does NOT capture the span content; pairing an opening run with a closing
    // run of equal length has to be done by whoever consumes the matches.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Bare URLs: an http/https/ftp scheme followed by host, optional port, path,
    // query and fragment, none of which may contain whitespace, brackets,
    // parentheses, backslashes, quotes or backticks.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Bare email addresses (local part, `@`, domain, TLD of at least two letters).
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Angle-bracket autolinks `<scheme://...>` / `<user@host.tld>`
    // (used to exclude them from bare URL detection). Group 1 is the inner target.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Leading blockquote prefix (optional whitespace, one or more `>`, optional
    // whitespace) captured as group 1; intended for parse_list_blocks.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
63
/// Pre-computed information about a single line of the document.
///
/// One `LineInfo` is stored per line in `LintContext::lines`, so rules can look
/// up line-level facts without re-scanning the content.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
}
90
/// Information about a list item.
///
/// Implements `PartialEq`/`Eq` so parsed items can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
105
/// Heading style type.
///
/// Besides `PartialEq`, the enum is `Eq` and `Hash` so it can be used as a
/// map/set key and inside other types that derive `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
116
/// Parsed link information.
///
/// Implements `PartialEq`/`Eq` so parser output can be asserted against
/// expected values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedLink {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<String>,
}
139
/// Parsed image information.
///
/// Implements `PartialEq`/`Eq` so parser output can be asserted against
/// expected values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
162
/// Reference definition [ref]: url "title".
///
/// Implements `PartialEq`/`Eq` so definitions can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
175
/// Parsed code span information.
///
/// Implements `PartialEq`/`Eq` so spans can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
194
195/// Information about a heading
196#[derive(Debug, Clone)]
197pub struct HeadingInfo {
198    /// Heading level (1-6 for ATX, 1-2 for Setext)
199    pub level: u8,
200    /// Style of heading
201    pub style: HeadingStyle,
202    /// The heading marker (# characters or underline)
203    pub marker: String,
204    /// Column where the marker starts (0-based)
205    pub marker_column: usize,
206    /// Column where heading text starts
207    pub content_column: usize,
208    /// The heading text (without markers and without custom ID syntax)
209    pub text: String,
210    /// Custom header ID if present (e.g., from {#custom-id} syntax)
211    pub custom_id: Option<String>,
212    /// Original heading text including custom ID syntax
213    pub raw_text: String,
214    /// Whether it has a closing sequence (for ATX)
215    pub has_closing_sequence: bool,
216    /// The closing sequence if present
217    pub closing_sequence: String,
218}
219
/// Information about a blockquote line.
///
/// Implements `PartialEq`/`Eq` so parsed blockquote data can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
240
/// Information about a list block.
///
/// Implements `PartialEq`/`Eq` so parsed blocks can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
261
262use std::sync::{Arc, Mutex};
263
/// Character frequency data for fast content analysis.
///
/// Implements `PartialEq`/`Eq` so two frequency tables can be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
292
/// Pre-parsed HTML tag information.
///
/// Implements `PartialEq`/`Eq` so parsed tags can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
315
/// Pre-parsed emphasis span information.
///
/// Implements `PartialEq`/`Eq` so parsed spans can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
336
/// Pre-parsed table row information.
///
/// Implements `PartialEq`/`Eq` so parsed rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row: "left", "center", "right" or "none"
    pub column_alignments: Vec<String>,
}
349
/// Pre-parsed bare URL information (not in links).
///
/// Implements `PartialEq`/`Eq` so parsed URLs can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
368
/// Shared, pre-parsed view of one Markdown document.
///
/// Public fields are filled eagerly by `LintContext::new`; the private
/// `*_cache` fields sit behind a `Mutex` and are filled through their accessor
/// methods.
pub struct LintContext<'a> {
    /// The raw document text.
    pub content: &'a str,
    /// Byte offset at which each line starts (the first entry is always 0).
    pub line_offsets: Vec<usize>,
    /// Cached code block ranges (not including inline code spans).
    pub code_blocks: Vec<(usize, usize)>,
    /// Pre-computed line information, one entry per line.
    pub lines: Vec<LineInfo>,
    /// Pre-parsed links.
    pub links: Vec<ParsedLink>,
    /// Pre-parsed images.
    pub images: Vec<ParsedImage>,
    /// Reference definitions.
    pub reference_defs: Vec<ReferenceDef>,
    /// Inline code spans. `new` fills this eagerly (link/image parsing needs
    /// them); `code_spans()` still recomputes if the slot is ever empty.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Pre-parsed list blocks.
    pub list_blocks: Vec<ListBlock>,
    /// Character frequency analysis.
    pub char_frequency: CharFrequency,
    /// Lazy-loaded HTML tags.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazy-loaded emphasis spans.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazy-loaded table rows.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazy-loaded bare URLs.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Lazy-loaded AST.
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor being used.
    pub flavor: MarkdownFlavor,
}
387
388impl<'a> LintContext<'a> {
389    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
390        let mut line_offsets = vec![0];
391        for (i, c) in content.char_indices() {
392            if c == '\n' {
393                line_offsets.push(i + 1);
394            }
395        }
396
397        // Detect code blocks once and cache them
398        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
399
400        // Pre-compute line information
401        let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
402
403        // Parse code spans early so we can exclude them from link/image parsing
404        let ast = get_cached_ast(content);
405        let code_spans = Self::parse_code_spans(content, &lines, &ast);
406
407        // Parse links, images, references, and list blocks
408        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
409        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
410        let reference_defs = Self::parse_reference_defs(content, &lines);
411        // Use line-by-line list parsing for MD032 compatibility
412        // TODO: Consider using AST-based parsing in the future when MD032 is updated
413        let list_blocks = Self::parse_list_blocks(&lines);
414
415        // Detect HTML blocks
416        Self::detect_html_blocks(&mut lines);
417
418        // Compute character frequency for fast content analysis
419        let char_frequency = Self::compute_char_frequency(content);
420
421        Self {
422            content,
423            line_offsets,
424            code_blocks,
425            lines,
426            links,
427            images,
428            reference_defs,
429            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
430            list_blocks,
431            char_frequency,
432            html_tags_cache: Mutex::new(None),
433            emphasis_spans_cache: Mutex::new(None),
434            table_rows_cache: Mutex::new(None),
435            bare_urls_cache: Mutex::new(None),
436            ast_cache: Mutex::new(None),
437            flavor,
438        }
439    }
440
441    /// Get AST - uses global cache for deduplication
442    pub fn get_ast(&self) -> Arc<Node> {
443        let mut cache = self.ast_cache.lock().unwrap();
444
445        if cache.is_none() {
446            // Use global AST cache to avoid duplicate parsing
447            // MarkdownAst is just a type alias for Node, so no conversion needed
448            *cache = Some(get_cached_ast(self.content));
449        }
450
451        cache.as_ref().unwrap().clone()
452    }
453
454    /// Get code spans - computed lazily on first access
455    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
456        let mut cache = self.code_spans_cache.lock().unwrap();
457
458        // Check if we need to compute code spans
459        if cache.is_none() {
460            let ast = self.get_ast();
461            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
462            *cache = Some(Arc::new(code_spans));
463        }
464
465        // Return a reference to the cached code spans
466        cache.as_ref().unwrap().clone()
467    }
468
469    /// Get HTML tags - computed lazily on first access
470    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
471        let mut cache = self.html_tags_cache.lock().unwrap();
472
473        if cache.is_none() {
474            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
475            *cache = Some(Arc::new(html_tags));
476        }
477
478        cache.as_ref().unwrap().clone()
479    }
480
481    /// Get emphasis spans - computed lazily on first access
482    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
483        let mut cache = self.emphasis_spans_cache.lock().unwrap();
484
485        if cache.is_none() {
486            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
487            *cache = Some(Arc::new(emphasis_spans));
488        }
489
490        cache.as_ref().unwrap().clone()
491    }
492
493    /// Get table rows - computed lazily on first access
494    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
495        let mut cache = self.table_rows_cache.lock().unwrap();
496
497        if cache.is_none() {
498            let table_rows = Self::parse_table_rows(&self.lines);
499            *cache = Some(Arc::new(table_rows));
500        }
501
502        cache.as_ref().unwrap().clone()
503    }
504
505    /// Get bare URLs - computed lazily on first access
506    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
507        let mut cache = self.bare_urls_cache.lock().unwrap();
508
509        if cache.is_none() {
510            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
511            *cache = Some(Arc::new(bare_urls));
512        }
513
514        cache.as_ref().unwrap().clone()
515    }
516
517    /// Map a byte offset to (line, column)
518    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
519        match self.line_offsets.binary_search(&offset) {
520            Ok(line) => (line + 1, 1),
521            Err(line) => {
522                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
523                (line, offset - line_start + 1)
524            }
525        }
526    }
527
528    /// Check if a position is within a code block or code span
529    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
530        // Check code blocks first
531        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
532            return true;
533        }
534
535        // Check inline code spans (lazy load if needed)
536        self.code_spans()
537            .iter()
538            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
539    }
540
541    /// Get line information by line number (1-indexed)
542    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
543        if line_num > 0 {
544            self.lines.get(line_num - 1)
545        } else {
546            None
547        }
548    }
549
550    /// Get byte offset for a line number (1-indexed)
551    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
552        self.line_info(line_num).map(|info| info.byte_offset)
553    }
554
555    /// Get URL for a reference link/image by its ID
556    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
557        let normalized_id = ref_id.to_lowercase();
558        self.reference_defs
559            .iter()
560            .find(|def| def.id == normalized_id)
561            .map(|def| def.url.as_str())
562    }
563
564    /// Get links on a specific line
565    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
566        self.links.iter().filter(|link| link.line == line_num).collect()
567    }
568
569    /// Get images on a specific line
570    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
571        self.images.iter().filter(|img| img.line == line_num).collect()
572    }
573
574    /// Check if a line is part of a list block
575    pub fn is_in_list_block(&self, line_num: usize) -> bool {
576        self.list_blocks
577            .iter()
578            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
579    }
580
581    /// Get the list block containing a specific line
582    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
583        self.list_blocks
584            .iter()
585            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
586    }
587
588    // Compatibility methods for DocumentStructure migration
589
590    /// Check if a line is within a code block
591    pub fn is_in_code_block(&self, line_num: usize) -> bool {
592        if line_num == 0 || line_num > self.lines.len() {
593            return false;
594        }
595        self.lines[line_num - 1].in_code_block
596    }
597
598    /// Check if a line is within front matter
599    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
600        if line_num == 0 || line_num > self.lines.len() {
601            return false;
602        }
603        self.lines[line_num - 1].in_front_matter
604    }
605
606    /// Check if a line is within an HTML block
607    pub fn is_in_html_block(&self, line_num: usize) -> bool {
608        if line_num == 0 || line_num > self.lines.len() {
609            return false;
610        }
611        self.lines[line_num - 1].in_html_block
612    }
613
614    /// Check if a line and column is within a code span
615    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
616        if line_num == 0 || line_num > self.lines.len() {
617            return false;
618        }
619
620        // Use the code spans cache to check
621        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
622        // Convert col to 0-indexed for comparison
623        let col_0indexed = if col > 0 { col - 1 } else { 0 };
624        let code_spans = self.code_spans();
625        code_spans
626            .iter()
627            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
628    }
629
630    /// Check if content has any instances of a specific character (fast)
631    pub fn has_char(&self, ch: char) -> bool {
632        match ch {
633            '#' => self.char_frequency.hash_count > 0,
634            '*' => self.char_frequency.asterisk_count > 0,
635            '_' => self.char_frequency.underscore_count > 0,
636            '-' => self.char_frequency.hyphen_count > 0,
637            '+' => self.char_frequency.plus_count > 0,
638            '>' => self.char_frequency.gt_count > 0,
639            '|' => self.char_frequency.pipe_count > 0,
640            '[' => self.char_frequency.bracket_count > 0,
641            '`' => self.char_frequency.backtick_count > 0,
642            '<' => self.char_frequency.lt_count > 0,
643            '!' => self.char_frequency.exclamation_count > 0,
644            '\n' => self.char_frequency.newline_count > 0,
645            _ => self.content.contains(ch), // Fallback for other characters
646        }
647    }
648
649    /// Get count of a specific character (fast)
650    pub fn char_count(&self, ch: char) -> usize {
651        match ch {
652            '#' => self.char_frequency.hash_count,
653            '*' => self.char_frequency.asterisk_count,
654            '_' => self.char_frequency.underscore_count,
655            '-' => self.char_frequency.hyphen_count,
656            '+' => self.char_frequency.plus_count,
657            '>' => self.char_frequency.gt_count,
658            '|' => self.char_frequency.pipe_count,
659            '[' => self.char_frequency.bracket_count,
660            '`' => self.char_frequency.backtick_count,
661            '<' => self.char_frequency.lt_count,
662            '!' => self.char_frequency.exclamation_count,
663            '\n' => self.char_frequency.newline_count,
664            _ => self.content.matches(ch).count(), // Fallback for other characters
665        }
666    }
667
668    /// Check if content likely contains headings (fast)
669    pub fn likely_has_headings(&self) -> bool {
670        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
671    }
672
673    /// Check if content likely contains lists (fast)
674    pub fn likely_has_lists(&self) -> bool {
675        self.char_frequency.asterisk_count > 0
676            || self.char_frequency.hyphen_count > 0
677            || self.char_frequency.plus_count > 0
678    }
679
680    /// Check if content likely contains emphasis (fast)
681    pub fn likely_has_emphasis(&self) -> bool {
682        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
683    }
684
685    /// Check if content likely contains tables (fast)
686    pub fn likely_has_tables(&self) -> bool {
687        self.char_frequency.pipe_count > 2
688    }
689
690    /// Check if content likely contains blockquotes (fast)
691    pub fn likely_has_blockquotes(&self) -> bool {
692        self.char_frequency.gt_count > 0
693    }
694
695    /// Check if content likely contains code (fast)
696    pub fn likely_has_code(&self) -> bool {
697        self.char_frequency.backtick_count > 0
698    }
699
700    /// Check if content likely contains links or images (fast)
701    pub fn likely_has_links_or_images(&self) -> bool {
702        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
703    }
704
705    /// Check if content likely contains HTML (fast)
706    pub fn likely_has_html(&self) -> bool {
707        self.char_frequency.lt_count > 0
708    }
709
710    /// Get HTML tags on a specific line
711    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
712        self.html_tags()
713            .iter()
714            .filter(|tag| tag.line == line_num)
715            .cloned()
716            .collect()
717    }
718
719    /// Get emphasis spans on a specific line
720    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
721        self.emphasis_spans()
722            .iter()
723            .filter(|span| span.line == line_num)
724            .cloned()
725            .collect()
726    }
727
728    /// Get table rows on a specific line
729    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
730        self.table_rows()
731            .iter()
732            .filter(|row| row.line == line_num)
733            .cloned()
734            .collect()
735    }
736
737    /// Get bare URLs on a specific line
738    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
739        self.bare_urls()
740            .iter()
741            .filter(|url| url.line == line_num)
742            .cloned()
743            .collect()
744    }
745
746    /// Parse all links in the content
747    fn parse_links(
748        content: &str,
749        lines: &[LineInfo],
750        code_blocks: &[(usize, usize)],
751        code_spans: &[CodeSpan],
752        flavor: MarkdownFlavor,
753    ) -> Vec<ParsedLink> {
754        use crate::utils::skip_context::is_mkdocs_snippet_line;
755
756        // Pre-size based on a heuristic: most markdown files have relatively few links
757        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
758
759        // Parse links across the entire content, not line by line
760        for cap in LINK_PATTERN.captures_iter(content) {
761            let full_match = cap.get(0).unwrap();
762            let match_start = full_match.start();
763            let match_end = full_match.end();
764
765            // Skip if the opening bracket is escaped (preceded by \)
766            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
767                continue;
768            }
769
770            // Skip if this is actually an image (preceded by !)
771            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
772                continue;
773            }
774
775            // Skip if in code block
776            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
777                continue;
778            }
779
780            // Skip if in code span
781            if code_spans
782                .iter()
783                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
784            {
785                continue;
786            }
787
788            // Skip if this link is on a MkDocs snippet line
789            // Find which line this link is on
790            let line_idx = lines
791                .iter()
792                .position(|line| {
793                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
794                })
795                .unwrap_or(0);
796
797            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
798                continue;
799            }
800
801            // Find which line this link starts on
802            let mut line_num = 1;
803            let mut col_start = match_start;
804            for (idx, line_info) in lines.iter().enumerate() {
805                if match_start >= line_info.byte_offset {
806                    line_num = idx + 1;
807                    col_start = match_start - line_info.byte_offset;
808                } else {
809                    break;
810                }
811            }
812
813            // Find which line this link ends on (and calculate column on that line)
814            let mut end_line_num = 1;
815            let mut col_end = match_end;
816            for (idx, line_info) in lines.iter().enumerate() {
817                if match_end > line_info.byte_offset {
818                    end_line_num = idx + 1;
819                    col_end = match_end - line_info.byte_offset;
820                } else {
821                    break;
822                }
823            }
824
825            // For single-line links, use the same approach as before
826            if line_num == end_line_num {
827                // col_end is already correct
828            } else {
829                // For multi-line links, col_end represents the column on the ending line
830                // which is what we want
831            }
832
833            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
834
835            if let Some(inline_url) = cap.get(2) {
836                // Inline link
837                links.push(ParsedLink {
838                    line: line_num,
839                    start_col: col_start,
840                    end_col: col_end,
841                    byte_offset: match_start,
842                    byte_end: match_end,
843                    text,
844                    url: inline_url.as_str().to_string(),
845                    is_reference: false,
846                    reference_id: None,
847                });
848            } else if let Some(ref_id) = cap.get(3) {
849                // Reference link
850                let ref_id_str = ref_id.as_str();
851                let normalized_ref = if ref_id_str.is_empty() {
852                    text.to_lowercase() // Implicit reference
853                } else {
854                    ref_id_str.to_lowercase()
855                };
856
857                links.push(ParsedLink {
858                    line: line_num,
859                    start_col: col_start,
860                    end_col: col_end,
861                    byte_offset: match_start,
862                    byte_end: match_end,
863                    text,
864                    url: String::new(), // Will be resolved with reference_defs
865                    is_reference: true,
866                    reference_id: Some(normalized_ref),
867                });
868            }
869        }
870
871        links
872    }
873
874    /// Parse all images in the content
875    fn parse_images(
876        content: &str,
877        lines: &[LineInfo],
878        code_blocks: &[(usize, usize)],
879        code_spans: &[CodeSpan],
880    ) -> Vec<ParsedImage> {
881        // Pre-size based on a heuristic: images are less common than links
882        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
883
884        // Parse images across the entire content, not line by line
885        for cap in IMAGE_PATTERN.captures_iter(content) {
886            let full_match = cap.get(0).unwrap();
887            let match_start = full_match.start();
888            let match_end = full_match.end();
889
890            // Skip if the ! is escaped (preceded by \)
891            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
892                continue;
893            }
894
895            // Skip if in code block
896            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
897                continue;
898            }
899
900            // Skip if in code span
901            if code_spans
902                .iter()
903                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
904            {
905                continue;
906            }
907
908            // Find which line this image starts on
909            let mut line_num = 1;
910            let mut col_start = match_start;
911            for (idx, line_info) in lines.iter().enumerate() {
912                if match_start >= line_info.byte_offset {
913                    line_num = idx + 1;
914                    col_start = match_start - line_info.byte_offset;
915                } else {
916                    break;
917                }
918            }
919
920            // Find which line this image ends on (and calculate column on that line)
921            let mut end_line_num = 1;
922            let mut col_end = match_end;
923            for (idx, line_info) in lines.iter().enumerate() {
924                if match_end > line_info.byte_offset {
925                    end_line_num = idx + 1;
926                    col_end = match_end - line_info.byte_offset;
927                } else {
928                    break;
929                }
930            }
931
932            // For single-line images, use the same approach as before
933            if line_num == end_line_num {
934                // col_end is already correct
935            } else {
936                // For multi-line images, col_end represents the column on the ending line
937                // which is what we want
938            }
939
940            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
941
942            if let Some(inline_url) = cap.get(2) {
943                // Inline image
944                images.push(ParsedImage {
945                    line: line_num,
946                    start_col: col_start,
947                    end_col: col_end,
948                    byte_offset: match_start,
949                    byte_end: match_end,
950                    alt_text,
951                    url: inline_url.as_str().to_string(),
952                    is_reference: false,
953                    reference_id: None,
954                });
955            } else if let Some(ref_id) = cap.get(3) {
956                // Reference image
957                let ref_id_str = ref_id.as_str();
958                let normalized_ref = if ref_id_str.is_empty() {
959                    alt_text.to_lowercase() // Implicit reference
960                } else {
961                    ref_id_str.to_lowercase()
962                };
963
964                images.push(ParsedImage {
965                    line: line_num,
966                    start_col: col_start,
967                    end_col: col_end,
968                    byte_offset: match_start,
969                    byte_end: match_end,
970                    alt_text,
971                    url: String::new(), // Will be resolved with reference_defs
972                    is_reference: true,
973                    reference_id: Some(normalized_ref),
974                });
975            }
976        }
977
978        images
979    }
980
981    /// Parse reference definitions
982    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
983        // Pre-size based on lines count as reference definitions are line-based
984        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
985
986        for (line_idx, line_info) in lines.iter().enumerate() {
987            // Skip lines in code blocks
988            if line_info.in_code_block {
989                continue;
990            }
991
992            let line = &line_info.content;
993            let line_num = line_idx + 1;
994
995            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
996                let id = cap.get(1).unwrap().as_str().to_lowercase();
997                let url = cap.get(2).unwrap().as_str().to_string();
998                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
999
1000                refs.push(ReferenceDef {
1001                    line: line_num,
1002                    id,
1003                    url,
1004                    title,
1005                });
1006            }
1007        }
1008
1009        refs
1010    }
1011
1012    /// Pre-compute line information
1013    fn compute_line_info(
1014        content: &str,
1015        line_offsets: &[usize],
1016        code_blocks: &[(usize, usize)],
1017        flavor: MarkdownFlavor,
1018    ) -> Vec<LineInfo> {
1019        lazy_static! {
1020            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1021            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1022            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1023
1024            // Regex for blockquote prefix
1025            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1026
1027            // Regex for heading detection
1028            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1029            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1030
1031            // Regex for blockquote detection
1032            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1033        }
1034
1035        let content_lines: Vec<&str> = content.lines().collect();
1036        let mut lines = Vec::with_capacity(content_lines.len());
1037
1038        // Detect front matter boundaries FIRST, before any other parsing
1039        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1040        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1041
1042        for (i, line) in content_lines.iter().enumerate() {
1043            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1044            let indent = line.len() - line.trim_start().len();
1045            // For blank detection, consider blockquote context
1046            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1047                // In blockquote context, check if content after prefix is blank
1048                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1049                after_prefix.trim().is_empty()
1050            } else {
1051                line.trim().is_empty()
1052            };
1053            // Check if this line is inside a code block (not inline code span)
1054            // We only want to check for fenced/indented code blocks, not inline code
1055            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1056                // Only consider ranges that span multiple lines (code blocks)
1057                // Inline code spans are typically on a single line
1058
1059                // Ensure we're at valid UTF-8 boundaries
1060                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1061                    // Find the nearest valid boundary before start
1062                    let mut boundary = start;
1063                    while boundary > 0 && !content.is_char_boundary(boundary) {
1064                        boundary -= 1;
1065                    }
1066                    boundary
1067                } else {
1068                    start
1069                };
1070
1071                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1072                    // Find the nearest valid boundary after end
1073                    let mut boundary = end;
1074                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1075                        boundary += 1;
1076                    }
1077                    boundary
1078                } else {
1079                    end.min(content.len())
1080                };
1081
1082                let block_content = &content[safe_start..safe_end];
1083                let is_multiline = block_content.contains('\n');
1084                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1085                let is_indented = !is_fenced
1086                    && block_content
1087                        .lines()
1088                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1089
1090                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1091            });
1092
1093            // Detect list items (skip if in frontmatter or in mkdocstrings block)
1094            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1095                && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1096            let list_item =
1097                if !(in_code_block || is_blank || in_mkdocstrings || (front_matter_end > 0 && i < front_matter_end)) {
1098                    // Strip blockquote prefix if present for list detection
1099                    let (line_for_list_check, blockquote_prefix_len) =
1100                        if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1101                            let prefix = caps.get(1).unwrap().as_str();
1102                            let content = caps.get(2).unwrap().as_str();
1103                            (content, prefix.len())
1104                        } else {
1105                            (&**line, 0)
1106                        };
1107
1108                    if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1109                        let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1110                        let marker = caps.get(2).map_or("", |m| m.as_str());
1111                        let spacing = caps.get(3).map_or("", |m| m.as_str());
1112                        let _content = caps.get(4).map_or("", |m| m.as_str());
1113                        let marker_column = blockquote_prefix_len + leading_spaces.len();
1114                        let content_column = marker_column + marker.len() + spacing.len();
1115
1116                        // According to CommonMark spec, unordered list items MUST have at least one space
1117                        // after the marker (-, *, or +). Without a space, it's not a list item.
1118                        // This also naturally handles cases like:
1119                        // - *emphasis* (not a list)
1120                        // - **bold** (not a list)
1121                        // - --- (horizontal rule, not a list)
1122                        if spacing.is_empty() {
1123                            None
1124                        } else {
1125                            Some(ListItemInfo {
1126                                marker: marker.to_string(),
1127                                is_ordered: false,
1128                                number: None,
1129                                marker_column,
1130                                content_column,
1131                            })
1132                        }
1133                    } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1134                        let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1135                        let number_str = caps.get(2).map_or("", |m| m.as_str());
1136                        let delimiter = caps.get(3).map_or("", |m| m.as_str());
1137                        let spacing = caps.get(4).map_or("", |m| m.as_str());
1138                        let _content = caps.get(5).map_or("", |m| m.as_str());
1139                        let marker = format!("{number_str}{delimiter}");
1140                        let marker_column = blockquote_prefix_len + leading_spaces.len();
1141                        let content_column = marker_column + marker.len() + spacing.len();
1142
1143                        // According to CommonMark spec, ordered list items MUST have at least one space
1144                        // after the marker (period or parenthesis). Without a space, it's not a list item.
1145                        if spacing.is_empty() {
1146                            None
1147                        } else {
1148                            Some(ListItemInfo {
1149                                marker,
1150                                is_ordered: true,
1151                                number: number_str.parse().ok(),
1152                                marker_column,
1153                                content_column,
1154                            })
1155                        }
1156                    } else {
1157                        None
1158                    }
1159                } else {
1160                    None
1161                };
1162
1163            lines.push(LineInfo {
1164                content: line.to_string(),
1165                byte_offset,
1166                indent,
1167                is_blank,
1168                in_code_block,
1169                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1170                in_html_block: false, // Will be populated after line creation
1171                list_item,
1172                heading: None,    // Will be populated in second pass for Setext headings
1173                blockquote: None, // Will be populated after line creation
1174                in_mkdocstrings,
1175            });
1176        }
1177
1178        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1179        for i in 0..content_lines.len() {
1180            if lines[i].in_code_block {
1181                continue;
1182            }
1183
1184            // Skip lines in front matter
1185            if front_matter_end > 0 && i < front_matter_end {
1186                continue;
1187            }
1188
1189            let line = content_lines[i];
1190
1191            // Check for blockquotes (even on blank lines within blockquotes)
1192            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1193                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1194                let markers = caps.get(2).map_or("", |m| m.as_str());
1195                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1196                let content = caps.get(4).map_or("", |m| m.as_str());
1197
1198                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1199                let marker_column = indent_str.len();
1200
1201                // Build the prefix (indentation + markers + space)
1202                let prefix = format!("{indent_str}{markers}{spaces_after}");
1203
1204                // Check for various blockquote issues
1205                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1206                // Consider tabs as multiple spaces, or actual multiple spaces
1207                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1208
1209                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1210                // MD028 flags empty blockquote lines that don't have a single space after the marker
1211                // Lines like "> " or ">> " are already correct and don't need fixing
1212                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1213
1214                lines[i].blockquote = Some(BlockquoteInfo {
1215                    nesting_level,
1216                    indent: indent_str.to_string(),
1217                    marker_column,
1218                    prefix,
1219                    content: content.to_string(),
1220                    has_no_space_after_marker: has_no_space,
1221                    has_multiple_spaces_after_marker: has_multiple_spaces,
1222                    needs_md028_fix,
1223                });
1224            }
1225
1226            // Skip heading detection for blank lines
1227            if lines[i].is_blank {
1228                continue;
1229            }
1230
1231            // Check for ATX headings (but skip MkDocs snippet lines)
1232            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1233            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1234                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1235                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1236            } else {
1237                false
1238            };
1239
1240            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1241                // Skip headings inside HTML comments
1242                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1243                    continue;
1244                }
1245                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1246                let hashes = caps.get(2).map_or("", |m| m.as_str());
1247                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1248                let rest = caps.get(4).map_or("", |m| m.as_str());
1249
1250                let level = hashes.len() as u8;
1251                let marker_column = leading_spaces.len();
1252
1253                // Check for closing sequence, but handle custom IDs that might come after
1254                let (text, has_closing, closing_seq) = {
1255                    // First check if there's a custom ID at the end
1256                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1257                        // Check if this looks like a valid custom ID (ends with })
1258                        if rest[id_start..].trim_end().ends_with('}') {
1259                            // Split off the custom ID
1260                            (&rest[..id_start], &rest[id_start..])
1261                        } else {
1262                            (rest, "")
1263                        }
1264                    } else {
1265                        (rest, "")
1266                    };
1267
1268                    // Now look for closing hashes in the part before the custom ID
1269                    let trimmed_rest = rest_without_id.trim_end();
1270                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1271                        // Look for the start of the hash sequence
1272                        let mut start_of_hashes = last_hash_pos;
1273                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1274                            start_of_hashes -= 1;
1275                        }
1276
1277                        // Check if there's at least one space before the closing hashes
1278                        let has_space_before = start_of_hashes == 0
1279                            || trimmed_rest
1280                                .chars()
1281                                .nth(start_of_hashes - 1)
1282                                .is_some_and(|c| c.is_whitespace());
1283
1284                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1285                        let potential_closing = &trimmed_rest[start_of_hashes..];
1286                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1287
1288                        if is_all_hashes && has_space_before {
1289                            // This is a closing sequence
1290                            let closing_hashes = potential_closing.to_string();
1291                            // The text is everything before the closing hashes
1292                            // Don't include the custom ID here - it will be extracted later
1293                            let text_part = if !custom_id_part.is_empty() {
1294                                // If we have a custom ID, append it back to get the full rest
1295                                // This allows the extract_header_id function to handle it properly
1296                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1297                            } else {
1298                                rest_without_id[..start_of_hashes].trim_end().to_string()
1299                            };
1300                            (text_part, true, closing_hashes)
1301                        } else {
1302                            // Not a valid closing sequence, return the full content
1303                            (rest.to_string(), false, String::new())
1304                        }
1305                    } else {
1306                        // No hashes found, return the full content
1307                        (rest.to_string(), false, String::new())
1308                    }
1309                };
1310
1311                let content_column = marker_column + hashes.len() + spaces_after.len();
1312
1313                // Extract custom header ID if present
1314                let raw_text = text.trim().to_string();
1315                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1316
1317                // If no custom ID was found on the header line, check the next line for standalone attr-list
1318                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1319                    let next_line = content_lines[i + 1];
1320                    if !lines[i + 1].in_code_block
1321                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1322                        && let Some(next_line_id) =
1323                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1324                    {
1325                        custom_id = Some(next_line_id);
1326                    }
1327                }
1328
1329                lines[i].heading = Some(HeadingInfo {
1330                    level,
1331                    style: HeadingStyle::ATX,
1332                    marker: hashes.to_string(),
1333                    marker_column,
1334                    content_column,
1335                    text: clean_text,
1336                    custom_id,
1337                    raw_text,
1338                    has_closing_sequence: has_closing,
1339                    closing_sequence: closing_seq,
1340                });
1341            }
1342            // Check for Setext headings (need to look at next line)
1343            else if i + 1 < content_lines.len() {
1344                let next_line = content_lines[i + 1];
1345                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1346                    // Skip if next line is front matter delimiter
1347                    if front_matter_end > 0 && i < front_matter_end {
1348                        continue;
1349                    }
1350
1351                    // Skip Setext headings inside HTML comments
1352                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1353                        continue;
1354                    }
1355
1356                    let underline = next_line.trim();
1357
1358                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1359                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1360                    if underline == "---" {
1361                        continue;
1362                    }
1363
1364                    // Skip if the current line looks like YAML key-value syntax
1365                    let current_line_trimmed = line.trim();
1366                    if current_line_trimmed.contains(':')
1367                        && !current_line_trimmed.starts_with('#')
1368                        && !current_line_trimmed.contains('[')
1369                        && !current_line_trimmed.contains("](")
1370                    {
1371                        // This looks like "key: value" which suggests YAML, not a heading
1372                        continue;
1373                    }
1374
1375                    let level = if underline.starts_with('=') { 1 } else { 2 };
1376                    let style = if level == 1 {
1377                        HeadingStyle::Setext1
1378                    } else {
1379                        HeadingStyle::Setext2
1380                    };
1381
1382                    // Extract custom header ID if present
1383                    let raw_text = line.trim().to_string();
1384                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1385
1386                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1387                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1388                        let attr_line = content_lines[i + 2];
1389                        if !lines[i + 2].in_code_block
1390                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1391                            && let Some(attr_line_id) =
1392                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1393                        {
1394                            custom_id = Some(attr_line_id);
1395                        }
1396                    }
1397
1398                    lines[i].heading = Some(HeadingInfo {
1399                        level,
1400                        style,
1401                        marker: underline.to_string(),
1402                        marker_column: next_line.len() - next_line.trim_start().len(),
1403                        content_column: lines[i].indent,
1404                        text: clean_text,
1405                        custom_id,
1406                        raw_text,
1407                        has_closing_sequence: false,
1408                        closing_sequence: String::new(),
1409                    });
1410                }
1411            }
1412        }
1413
1414        lines
1415    }
1416
1417    /// Detect HTML blocks in the content
1418    fn detect_html_blocks(lines: &mut [LineInfo]) {
1419        // HTML block elements that trigger block context
1420        const BLOCK_ELEMENTS: &[&str] = &[
1421            "address",
1422            "article",
1423            "aside",
1424            "blockquote",
1425            "details",
1426            "dialog",
1427            "dd",
1428            "div",
1429            "dl",
1430            "dt",
1431            "fieldset",
1432            "figcaption",
1433            "figure",
1434            "footer",
1435            "form",
1436            "h1",
1437            "h2",
1438            "h3",
1439            "h4",
1440            "h5",
1441            "h6",
1442            "header",
1443            "hr",
1444            "li",
1445            "main",
1446            "nav",
1447            "ol",
1448            "p",
1449            "pre",
1450            "section",
1451            "table",
1452            "tbody",
1453            "td",
1454            "tfoot",
1455            "th",
1456            "thead",
1457            "tr",
1458            "ul",
1459        ];
1460
1461        let mut i = 0;
1462        while i < lines.len() {
1463            // Skip if already in code block or front matter
1464            if lines[i].in_code_block || lines[i].in_front_matter {
1465                i += 1;
1466                continue;
1467            }
1468
1469            let trimmed = lines[i].content.trim_start();
1470
1471            // Check if line starts with an HTML tag
1472            if trimmed.starts_with('<') && trimmed.len() > 1 {
1473                // Extract tag name safely
1474                let after_bracket = &trimmed[1..];
1475                let is_closing = after_bracket.starts_with('/');
1476                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1477
1478                // Extract tag name (stop at space, >, /, or end of string)
1479                let tag_name = tag_start
1480                    .chars()
1481                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1482                    .collect::<String>()
1483                    .to_lowercase();
1484
1485                // Check if it's a block element
1486                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1487                    // Mark this line as in HTML block
1488                    lines[i].in_html_block = true;
1489
1490                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1491                    // This avoids complex nesting logic that might cause infinite loops
1492                    if !is_closing {
1493                        let closing_tag = format!("</{tag_name}>");
1494                        let mut j = i + 1;
1495                        while j < lines.len() && j < i + 100 {
1496                            // Limit search to 100 lines
1497                            // Stop at blank lines
1498                            if lines[j].is_blank {
1499                                break;
1500                            }
1501
1502                            lines[j].in_html_block = true;
1503
1504                            // Check if this line contains the closing tag
1505                            if lines[j].content.contains(&closing_tag) {
1506                                break;
1507                            }
1508                            j += 1;
1509                        }
1510                    }
1511                }
1512            }
1513
1514            i += 1;
1515        }
1516    }
1517
1518    /// Parse all inline code spans in the content using AST
1519    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1520        let mut code_spans = Vec::new();
1521
1522        // Quick check - if no backticks, no code spans
1523        if !content.contains('`') {
1524            return code_spans;
1525        }
1526
1527        // Helper function to recursively extract inline code spans from AST nodes
1528        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1529            match node {
1530                Node::InlineCode(inline_code) => {
1531                    if let Some(pos) = &inline_code.position {
1532                        let start_pos = pos.start.offset;
1533                        let end_pos = pos.end.offset;
1534
1535                        // The position includes the backticks, extract the actual content
1536                        let full_span = &content[start_pos..end_pos];
1537                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1538
1539                        // Extract content between backticks, preserving spaces
1540                        let content_start = start_pos + backtick_count;
1541                        let content_end = end_pos - backtick_count;
1542                        let span_content = if content_start < content_end {
1543                            content[content_start..content_end].to_string()
1544                        } else {
1545                            String::new()
1546                        };
1547
1548                        // Find which line this code span starts on
1549                        let mut line_num = 1;
1550                        let mut col_start = start_pos;
1551                        for (idx, line_info) in lines.iter().enumerate() {
1552                            if start_pos >= line_info.byte_offset {
1553                                line_num = idx + 1;
1554                                col_start = start_pos - line_info.byte_offset;
1555                            } else {
1556                                break;
1557                            }
1558                        }
1559
1560                        // Find end column
1561                        let mut col_end = end_pos;
1562                        for line_info in lines.iter() {
1563                            if end_pos > line_info.byte_offset {
1564                                col_end = end_pos - line_info.byte_offset;
1565                            } else {
1566                                break;
1567                            }
1568                        }
1569
1570                        spans.push(CodeSpan {
1571                            line: line_num,
1572                            start_col: col_start,
1573                            end_col: col_end,
1574                            byte_offset: start_pos,
1575                            byte_end: end_pos,
1576                            backtick_count,
1577                            content: span_content,
1578                        });
1579                    }
1580                }
1581                // Recursively process children
1582                Node::Root(root) => {
1583                    for child in &root.children {
1584                        extract_code_spans(child, content, lines, spans);
1585                    }
1586                }
1587                Node::Paragraph(para) => {
1588                    for child in &para.children {
1589                        extract_code_spans(child, content, lines, spans);
1590                    }
1591                }
1592                Node::Heading(heading) => {
1593                    for child in &heading.children {
1594                        extract_code_spans(child, content, lines, spans);
1595                    }
1596                }
1597                Node::List(list) => {
1598                    for child in &list.children {
1599                        extract_code_spans(child, content, lines, spans);
1600                    }
1601                }
1602                Node::ListItem(item) => {
1603                    for child in &item.children {
1604                        extract_code_spans(child, content, lines, spans);
1605                    }
1606                }
1607                Node::Blockquote(blockquote) => {
1608                    for child in &blockquote.children {
1609                        extract_code_spans(child, content, lines, spans);
1610                    }
1611                }
1612                Node::Table(table) => {
1613                    for child in &table.children {
1614                        extract_code_spans(child, content, lines, spans);
1615                    }
1616                }
1617                Node::TableRow(row) => {
1618                    for child in &row.children {
1619                        extract_code_spans(child, content, lines, spans);
1620                    }
1621                }
1622                Node::TableCell(cell) => {
1623                    for child in &cell.children {
1624                        extract_code_spans(child, content, lines, spans);
1625                    }
1626                }
1627                Node::Emphasis(emphasis) => {
1628                    for child in &emphasis.children {
1629                        extract_code_spans(child, content, lines, spans);
1630                    }
1631                }
1632                Node::Strong(strong) => {
1633                    for child in &strong.children {
1634                        extract_code_spans(child, content, lines, spans);
1635                    }
1636                }
1637                Node::Link(link) => {
1638                    for child in &link.children {
1639                        extract_code_spans(child, content, lines, spans);
1640                    }
1641                }
1642                Node::LinkReference(link_ref) => {
1643                    for child in &link_ref.children {
1644                        extract_code_spans(child, content, lines, spans);
1645                    }
1646                }
1647                Node::FootnoteDefinition(footnote) => {
1648                    for child in &footnote.children {
1649                        extract_code_spans(child, content, lines, spans);
1650                    }
1651                }
1652                Node::Delete(delete) => {
1653                    for child in &delete.children {
1654                        extract_code_spans(child, content, lines, spans);
1655                    }
1656                }
1657                // Terminal nodes or nodes without relevant children
1658                Node::Code(_)
1659                | Node::Text(_)
1660                | Node::Html(_)
1661                | Node::Image(_)
1662                | Node::ImageReference(_)
1663                | Node::FootnoteReference(_)
1664                | Node::Break(_)
1665                | Node::ThematicBreak(_)
1666                | Node::Definition(_)
1667                | Node::Yaml(_)
1668                | Node::Toml(_)
1669                | Node::Math(_)
1670                | Node::InlineMath(_)
1671                | Node::MdxJsxFlowElement(_)
1672                | Node::MdxFlowExpression(_)
1673                | Node::MdxJsxTextElement(_)
1674                | Node::MdxTextExpression(_)
1675                | Node::MdxjsEsm(_) => {
1676                    // No children to process or not relevant for code spans
1677                }
1678            }
1679        }
1680
1681        // Extract all code spans from the AST
1682        extract_code_spans(ast, content, lines, &mut code_spans);
1683
1684        // Sort by position to ensure consistent ordering
1685        code_spans.sort_by_key(|span| span.byte_offset);
1686
1687        code_spans
1688    }
1689
1690    /// Parse all list blocks in the content (legacy line-by-line approach)
1691    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1692        // Pre-size based on lines that could be list items
1693        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1694        let mut current_block: Option<ListBlock> = None;
1695        let mut last_list_item_line = 0;
1696        let mut current_indent_level = 0;
1697        let mut last_marker_width = 0;
1698
1699        for (line_idx, line_info) in lines.iter().enumerate() {
1700            let line_num = line_idx + 1;
1701
1702            // Enhanced code block handling using Design #3's context analysis
1703            if line_info.in_code_block {
1704                if let Some(ref mut block) = current_block {
1705                    // Calculate minimum indentation for list continuation
1706                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1707
1708                    // Analyze code block context using the three-tier classification
1709                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1710
1711                    match context {
1712                        CodeBlockContext::Indented => {
1713                            // Code block is properly indented - continues the list
1714                            block.end_line = line_num;
1715                            continue;
1716                        }
1717                        CodeBlockContext::Standalone => {
1718                            // Code block separates lists - end current block
1719                            let completed_block = current_block.take().unwrap();
1720                            list_blocks.push(completed_block);
1721                            continue;
1722                        }
1723                        CodeBlockContext::Adjacent => {
1724                            // Edge case - use conservative behavior (continue list)
1725                            block.end_line = line_num;
1726                            continue;
1727                        }
1728                    }
1729                } else {
1730                    // No current list block - skip code block lines
1731                    continue;
1732                }
1733            }
1734
1735            // Extract blockquote prefix if any
1736            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1737                caps.get(0).unwrap().as_str().to_string()
1738            } else {
1739                String::new()
1740            };
1741
1742            // Check if this line is a list item
1743            if let Some(list_item) = &line_info.list_item {
1744                // Calculate nesting level based on indentation
1745                let item_indent = list_item.marker_column;
1746                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1747
1748                if let Some(ref mut block) = current_block {
1749                    // Check if this continues the current block
1750                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1751                    // or a continuation at the same or lower level
1752                    let is_nested = nesting > block.nesting_level;
1753                    let same_type =
1754                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1755                    let same_context = block.blockquote_prefix == blockquote_prefix;
1756                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1757
1758                    // For unordered lists, also check marker consistency
1759                    let marker_compatible =
1760                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1761
1762                    // Check if there's non-list content between the last item and this one
1763                    let has_non_list_content = {
1764                        let mut found_non_list = false;
1765                        // Use the last item from the current block, not the global last_list_item_line
1766                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1767
1768                        // Debug: Special check for problematic line
1769                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1770                            let last_line = &lines[block_last_item_line - 1];
1771                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1772                                log::debug!(
1773                                    "After problematic line {}: checking lines {} to {} for non-list content",
1774                                    block_last_item_line,
1775                                    block_last_item_line + 1,
1776                                    line_num
1777                                );
1778                                // If they're consecutive list items, there's no content between
1779                                if line_num == block_last_item_line + 1 {
1780                                    log::debug!("Lines are consecutive, no content between");
1781                                }
1782                            }
1783                        }
1784
1785                        for check_line in (block_last_item_line + 1)..line_num {
1786                            let check_idx = check_line - 1;
1787                            if check_idx < lines.len() {
1788                                let check_info = &lines[check_idx];
1789                                // Check for content that breaks the list
1790                                let is_list_breaking_content = if check_info.in_code_block {
1791                                    // Use enhanced code block classification for list separation
1792                                    let last_item_marker_width =
1793                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1794                                            lines[block_last_item_line - 1]
1795                                                .list_item
1796                                                .as_ref()
1797                                                .map(|li| {
1798                                                    if li.is_ordered {
1799                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1800                                                    } else {
1801                                                        li.marker.len()
1802                                                    }
1803                                                })
1804                                                .unwrap_or(3) // fallback to 3 if no list item found
1805                                        } else {
1806                                            3 // fallback
1807                                        };
1808
1809                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1810
1811                                    // Analyze code block context using our enhanced classification
1812                                    let context = CodeBlockUtils::analyze_code_block_context(
1813                                        lines,
1814                                        check_line - 1,
1815                                        min_continuation,
1816                                    );
1817
1818                                    // Standalone code blocks break lists, indented ones continue them
1819                                    matches!(context, CodeBlockContext::Standalone)
1820                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1821                                    // Check for structural separators that should break lists (from issue #42)
1822                                    let line_content = check_info.content.trim();
1823
1824                                    // Any of these structural separators break lists
1825                                    if check_info.heading.is_some()
1826                                        || line_content.starts_with("---")
1827                                        || line_content.starts_with("***")
1828                                        || line_content.starts_with("___")
1829                                        || (line_content.contains('|')
1830                                            && !line_content.contains("](")
1831                                            && !line_content.contains("http")
1832                                            && (line_content.matches('|').count() > 1
1833                                                || line_content.starts_with('|')
1834                                                || line_content.ends_with('|')))
1835                                        || line_content.starts_with(">")
1836                                    {
1837                                        true
1838                                    }
1839                                    // Other non-list content - check if properly indented
1840                                    else {
1841                                        let last_item_marker_width =
1842                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1843                                                lines[block_last_item_line - 1]
1844                                                    .list_item
1845                                                    .as_ref()
1846                                                    .map(|li| {
1847                                                        if li.is_ordered {
1848                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1849                                                        } else {
1850                                                            li.marker.len()
1851                                                        }
1852                                                    })
1853                                                    .unwrap_or(3) // fallback to 3 if no list item found
1854                                            } else {
1855                                                3 // fallback
1856                                            };
1857
1858                                        let min_continuation =
1859                                            if block.is_ordered { last_item_marker_width } else { 2 };
1860                                        check_info.indent < min_continuation
1861                                    }
1862                                } else {
1863                                    false
1864                                };
1865
1866                                if is_list_breaking_content {
1867                                    // Not indented enough, so it breaks the list
1868                                    found_non_list = true;
1869                                    break;
1870                                }
1871                            }
1872                        }
1873                        found_non_list
1874                    };
1875
1876                    // A list continues if:
1877                    // 1. It's a nested item (indented more than the parent), OR
1878                    // 2. It's the same type at the same level with reasonable distance
1879                    let mut continues_list = if is_nested {
1880                        // Nested items always continue the list if they're in the same context
1881                        same_context && reasonable_distance && !has_non_list_content
1882                    } else {
1883                        // Same-level items need to match type and markers
1884                        let result = same_type
1885                            && same_context
1886                            && reasonable_distance
1887                            && marker_compatible
1888                            && !has_non_list_content;
1889
1890                        // Debug logging for lines after problematic content
1891                        if block.item_lines.last().is_some_and(|&last_line| {
1892                            last_line > 0
1893                                && last_line <= lines.len()
1894                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1895                                && lines[last_line - 1].content.contains(r"\`")
1896                        }) {
1897                            log::debug!(
1898                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1899                            );
1900                            if line_num > 0 && line_num <= lines.len() {
1901                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1902                            }
1903                        }
1904
1905                        result
1906                    };
1907
1908                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1909                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1910                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1911                        // Check if the previous line was a list item
1912                        if block.item_lines.contains(&(line_num - 1)) {
1913                            // They're consecutive list items - force them to be in the same list
1914                            continues_list = true;
1915                        }
1916                    }
1917
1918                    if continues_list {
1919                        // Extend current block
1920                        block.end_line = line_num;
1921                        block.item_lines.push(line_num);
1922
1923                        // Update max marker width
1924                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1925                            list_item.marker.len() + 1
1926                        } else {
1927                            list_item.marker.len()
1928                        });
1929
1930                        // Update marker consistency for unordered lists
1931                        if !block.is_ordered
1932                            && block.marker.is_some()
1933                            && block.marker.as_ref() != Some(&list_item.marker)
1934                        {
1935                            // Mixed markers, clear the marker field
1936                            block.marker = None;
1937                        }
1938                    } else {
1939                        // End current block and start a new one
1940
1941                        list_blocks.push(block.clone());
1942
1943                        *block = ListBlock {
1944                            start_line: line_num,
1945                            end_line: line_num,
1946                            is_ordered: list_item.is_ordered,
1947                            marker: if list_item.is_ordered {
1948                                None
1949                            } else {
1950                                Some(list_item.marker.clone())
1951                            },
1952                            blockquote_prefix: blockquote_prefix.clone(),
1953                            item_lines: vec![line_num],
1954                            nesting_level: nesting,
1955                            max_marker_width: if list_item.is_ordered {
1956                                list_item.marker.len() + 1
1957                            } else {
1958                                list_item.marker.len()
1959                            },
1960                        };
1961                    }
1962                } else {
1963                    // Start a new block
1964                    current_block = Some(ListBlock {
1965                        start_line: line_num,
1966                        end_line: line_num,
1967                        is_ordered: list_item.is_ordered,
1968                        marker: if list_item.is_ordered {
1969                            None
1970                        } else {
1971                            Some(list_item.marker.clone())
1972                        },
1973                        blockquote_prefix,
1974                        item_lines: vec![line_num],
1975                        nesting_level: nesting,
1976                        max_marker_width: list_item.marker.len(),
1977                    });
1978                }
1979
1980                last_list_item_line = line_num;
1981                current_indent_level = item_indent;
1982                last_marker_width = if list_item.is_ordered {
1983                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
1984                } else {
1985                    list_item.marker.len()
1986                };
1987            } else if let Some(ref mut block) = current_block {
1988                // Not a list item - check if it continues the current block
1989
1990                // For MD032 compatibility, we use a simple approach:
1991                // - Indented lines continue the list
1992                // - Blank lines followed by indented content continue the list
1993                // - Everything else ends the list
1994
1995                // Check if the last line in the list block ended with a backslash (hard line break)
1996                // This handles cases where list items use backslash for hard line breaks
1997                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
1998                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
1999                } else {
2000                    false
2001                };
2002
2003                // Calculate minimum indentation for list continuation
2004                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2005                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2006                let min_continuation_indent = if block.is_ordered {
2007                    current_indent_level + last_marker_width
2008                } else {
2009                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2010                };
2011
2012                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2013                    // Indented line or backslash continuation continues the list
2014                    block.end_line = line_num;
2015                } else if line_info.is_blank {
2016                    // Blank line - check if it's internal to the list or ending it
2017                    // We only include blank lines that are followed by more list content
2018                    let mut check_idx = line_idx + 1;
2019                    let mut found_continuation = false;
2020
2021                    // Skip additional blank lines
2022                    while check_idx < lines.len() && lines[check_idx].is_blank {
2023                        check_idx += 1;
2024                    }
2025
2026                    if check_idx < lines.len() {
2027                        let next_line = &lines[check_idx];
2028                        // Check if followed by indented content (list continuation)
2029                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2030                            found_continuation = true;
2031                        }
2032                        // Check if followed by another list item at the same level
2033                        else if !next_line.in_code_block
2034                            && next_line.list_item.is_some()
2035                            && let Some(item) = &next_line.list_item
2036                        {
2037                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2038                                .find(&next_line.content)
2039                                .map_or(String::new(), |m| m.as_str().to_string());
2040                            if item.marker_column == current_indent_level
2041                                && item.is_ordered == block.is_ordered
2042                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2043                            {
2044                                // Check if there was meaningful content between the list items (unused now)
2045                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2046                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2047                                    if let Some(between_line) = lines.get(idx) {
2048                                        let trimmed = between_line.content.trim();
2049                                        // Skip empty lines
2050                                        if trimmed.is_empty() {
2051                                            return false;
2052                                        }
2053                                        // Check for meaningful content
2054                                        let line_indent =
2055                                            between_line.content.len() - between_line.content.trim_start().len();
2056
2057                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2058                                        if trimmed.starts_with("```")
2059                                            || trimmed.starts_with("~~~")
2060                                            || trimmed.starts_with("---")
2061                                            || trimmed.starts_with("***")
2062                                            || trimmed.starts_with("___")
2063                                            || trimmed.starts_with(">")
2064                                            || trimmed.contains('|') // Tables
2065                                            || between_line.heading.is_some()
2066                                        {
2067                                            return true; // These are structural separators - meaningful content that breaks lists
2068                                        }
2069
2070                                        // Only properly indented content continues the list
2071                                        line_indent >= min_continuation_indent
2072                                    } else {
2073                                        false
2074                                    }
2075                                });
2076
2077                                if block.is_ordered {
2078                                    // For ordered lists: don't continue if there are structural separators
2079                                    // Check if there are structural separators between the list items
2080                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2081                                        if let Some(between_line) = lines.get(idx) {
2082                                            let trimmed = between_line.content.trim();
2083                                            if trimmed.is_empty() {
2084                                                return false;
2085                                            }
2086                                            // Check for structural separators that break lists
2087                                            trimmed.starts_with("```")
2088                                                || trimmed.starts_with("~~~")
2089                                                || trimmed.starts_with("---")
2090                                                || trimmed.starts_with("***")
2091                                                || trimmed.starts_with("___")
2092                                                || trimmed.starts_with(">")
2093                                                || trimmed.contains('|') // Tables
2094                                                || between_line.heading.is_some()
2095                                        } else {
2096                                            false
2097                                        }
2098                                    });
2099                                    found_continuation = !has_structural_separators;
2100                                } else {
2101                                    // For unordered lists: also check for structural separators
2102                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2103                                        if let Some(between_line) = lines.get(idx) {
2104                                            let trimmed = between_line.content.trim();
2105                                            if trimmed.is_empty() {
2106                                                return false;
2107                                            }
2108                                            // Check for structural separators that break lists
2109                                            trimmed.starts_with("```")
2110                                                || trimmed.starts_with("~~~")
2111                                                || trimmed.starts_with("---")
2112                                                || trimmed.starts_with("***")
2113                                                || trimmed.starts_with("___")
2114                                                || trimmed.starts_with(">")
2115                                                || trimmed.contains('|') // Tables
2116                                                || between_line.heading.is_some()
2117                                        } else {
2118                                            false
2119                                        }
2120                                    });
2121                                    found_continuation = !has_structural_separators;
2122                                }
2123                            }
2124                        }
2125                    }
2126
2127                    if found_continuation {
2128                        // Include the blank line in the block
2129                        block.end_line = line_num;
2130                    } else {
2131                        // Blank line ends the list - don't include it
2132                        list_blocks.push(block.clone());
2133                        current_block = None;
2134                    }
2135                } else {
2136                    // Check for lazy continuation - non-indented line immediately after a list item
2137                    // But only if the line has sufficient indentation for the list type
2138                    let min_required_indent = if block.is_ordered {
2139                        current_indent_level + last_marker_width
2140                    } else {
2141                        current_indent_level + 2
2142                    };
2143
2144                    // For lazy continuation to apply, the line must either:
2145                    // 1. Have no indentation (true lazy continuation)
2146                    // 2. Have sufficient indentation for the list type
2147                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2148                    let line_content = line_info.content.trim();
2149                    let is_structural_separator = line_info.heading.is_some()
2150                        || line_content.starts_with("```")
2151                        || line_content.starts_with("~~~")
2152                        || line_content.starts_with("---")
2153                        || line_content.starts_with("***")
2154                        || line_content.starts_with("___")
2155                        || line_content.starts_with(">")
2156                        || (line_content.contains('|')
2157                            && !line_content.contains("](")
2158                            && !line_content.contains("http")
2159                            && (line_content.matches('|').count() > 1
2160                                || line_content.starts_with('|')
2161                                || line_content.ends_with('|'))); // Tables
2162
2163                    // Allow lazy continuation if we're still within the same list block
2164                    // (not just immediately after a list item)
2165                    let is_lazy_continuation = !is_structural_separator
2166                        && !line_info.is_blank
2167                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2168
2169                    if is_lazy_continuation {
2170                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2171                        // it's probably not a continuation
2172                        let content_to_check = if !blockquote_prefix.is_empty() {
2173                            // Strip blockquote prefix to check the actual content
2174                            line_info
2175                                .content
2176                                .strip_prefix(&blockquote_prefix)
2177                                .unwrap_or(&line_info.content)
2178                                .trim()
2179                        } else {
2180                            line_info.content.trim()
2181                        };
2182
2183                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2184
2185                        // If it starts with uppercase and the previous line ended with punctuation,
2186                        // it's likely a new paragraph, not a continuation
2187                        if starts_with_uppercase && last_list_item_line > 0 {
2188                            // This looks like a new paragraph
2189                            list_blocks.push(block.clone());
2190                            current_block = None;
2191                        } else {
2192                            // This is a lazy continuation line
2193                            block.end_line = line_num;
2194                        }
2195                    } else {
2196                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2197                        list_blocks.push(block.clone());
2198                        current_block = None;
2199                    }
2200                }
2201            }
2202        }
2203
2204        // Don't forget the last block
2205        if let Some(block) = current_block {
2206            list_blocks.push(block);
2207        }
2208
2209        // Merge adjacent blocks that should be one
2210        merge_adjacent_list_blocks(&mut list_blocks, lines);
2211
2212        list_blocks
2213    }
2214
2215    /// Compute character frequency for fast content analysis
2216    fn compute_char_frequency(content: &str) -> CharFrequency {
2217        let mut frequency = CharFrequency::default();
2218
2219        for ch in content.chars() {
2220            match ch {
2221                '#' => frequency.hash_count += 1,
2222                '*' => frequency.asterisk_count += 1,
2223                '_' => frequency.underscore_count += 1,
2224                '-' => frequency.hyphen_count += 1,
2225                '+' => frequency.plus_count += 1,
2226                '>' => frequency.gt_count += 1,
2227                '|' => frequency.pipe_count += 1,
2228                '[' => frequency.bracket_count += 1,
2229                '`' => frequency.backtick_count += 1,
2230                '<' => frequency.lt_count += 1,
2231                '!' => frequency.exclamation_count += 1,
2232                '\n' => frequency.newline_count += 1,
2233                _ => {}
2234            }
2235        }
2236
2237        frequency
2238    }
2239
2240    /// Parse HTML tags in the content
2241    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2242        lazy_static! {
2243            static ref HTML_TAG_REGEX: regex::Regex =
2244                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2245        }
2246
2247        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2248
2249        for cap in HTML_TAG_REGEX.captures_iter(content) {
2250            let full_match = cap.get(0).unwrap();
2251            let match_start = full_match.start();
2252            let match_end = full_match.end();
2253
2254            // Skip if in code block
2255            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2256                continue;
2257            }
2258
2259            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2260            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2261            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2262
2263            // Find which line this tag is on
2264            let mut line_num = 1;
2265            let mut col_start = match_start;
2266            let mut col_end = match_end;
2267            for (idx, line_info) in lines.iter().enumerate() {
2268                if match_start >= line_info.byte_offset {
2269                    line_num = idx + 1;
2270                    col_start = match_start - line_info.byte_offset;
2271                    col_end = match_end - line_info.byte_offset;
2272                } else {
2273                    break;
2274                }
2275            }
2276
2277            html_tags.push(HtmlTag {
2278                line: line_num,
2279                start_col: col_start,
2280                end_col: col_end,
2281                byte_offset: match_start,
2282                byte_end: match_end,
2283                tag_name,
2284                is_closing,
2285                is_self_closing,
2286                raw_content: full_match.as_str().to_string(),
2287            });
2288        }
2289
2290        html_tags
2291    }
2292
2293    /// Parse emphasis spans in the content
2294    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2295        lazy_static! {
2296            static ref EMPHASIS_REGEX: regex::Regex =
2297                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2298        }
2299
2300        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2301
2302        for cap in EMPHASIS_REGEX.captures_iter(content) {
2303            let full_match = cap.get(0).unwrap();
2304            let match_start = full_match.start();
2305            let match_end = full_match.end();
2306
2307            // Skip if in code block
2308            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2309                continue;
2310            }
2311
2312            let opening_markers = cap.get(1).unwrap().as_str();
2313            let content_part = cap.get(2).unwrap().as_str();
2314            let closing_markers = cap.get(3).unwrap().as_str();
2315
2316            // Validate matching markers
2317            if opening_markers.chars().next() != closing_markers.chars().next()
2318                || opening_markers.len() != closing_markers.len()
2319            {
2320                continue;
2321            }
2322
2323            let marker = opening_markers.chars().next().unwrap();
2324            let marker_count = opening_markers.len();
2325
2326            // Find which line this emphasis is on
2327            let mut line_num = 1;
2328            let mut col_start = match_start;
2329            let mut col_end = match_end;
2330            for (idx, line_info) in lines.iter().enumerate() {
2331                if match_start >= line_info.byte_offset {
2332                    line_num = idx + 1;
2333                    col_start = match_start - line_info.byte_offset;
2334                    col_end = match_end - line_info.byte_offset;
2335                } else {
2336                    break;
2337                }
2338            }
2339
2340            emphasis_spans.push(EmphasisSpan {
2341                line: line_num,
2342                start_col: col_start,
2343                end_col: col_end,
2344                byte_offset: match_start,
2345                byte_end: match_end,
2346                marker,
2347                marker_count,
2348                content: content_part.to_string(),
2349            });
2350        }
2351
2352        emphasis_spans
2353    }
2354
2355    /// Parse table rows in the content
2356    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2357        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2358
2359        for (line_idx, line_info) in lines.iter().enumerate() {
2360            // Skip lines in code blocks or blank lines
2361            if line_info.in_code_block || line_info.is_blank {
2362                continue;
2363            }
2364
2365            let line = &line_info.content;
2366            let line_num = line_idx + 1;
2367
2368            // Check if this line contains pipes (potential table row)
2369            if !line.contains('|') {
2370                continue;
2371            }
2372
2373            // Count columns by splitting on pipes
2374            let parts: Vec<&str> = line.split('|').collect();
2375            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2376
2377            // Check if this is a separator row
2378            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2379            let mut column_alignments = Vec::new();
2380
2381            if is_separator {
2382                for part in &parts[1..parts.len() - 1] {
2383                    // Skip first and last empty parts
2384                    let trimmed = part.trim();
2385                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2386                        "center".to_string()
2387                    } else if trimmed.ends_with(':') {
2388                        "right".to_string()
2389                    } else if trimmed.starts_with(':') {
2390                        "left".to_string()
2391                    } else {
2392                        "none".to_string()
2393                    };
2394                    column_alignments.push(alignment);
2395                }
2396            }
2397
2398            table_rows.push(TableRow {
2399                line: line_num,
2400                is_separator,
2401                column_count,
2402                column_alignments,
2403            });
2404        }
2405
2406        table_rows
2407    }
2408
2409    /// Parse bare URLs and emails in the content
2410    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2411        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2412
2413        // Check for bare URLs (not in angle brackets or markdown links)
2414        for cap in BARE_URL_PATTERN.captures_iter(content) {
2415            let full_match = cap.get(0).unwrap();
2416            let match_start = full_match.start();
2417            let match_end = full_match.end();
2418
2419            // Skip if in code block
2420            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2421                continue;
2422            }
2423
2424            // Skip if already in angle brackets or markdown links
2425            let preceding_char = if match_start > 0 {
2426                content.chars().nth(match_start - 1)
2427            } else {
2428                None
2429            };
2430            let following_char = content.chars().nth(match_end);
2431
2432            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2433                continue;
2434            }
2435            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2436                continue;
2437            }
2438
2439            let url = full_match.as_str();
2440            let url_type = if url.starts_with("https://") {
2441                "https"
2442            } else if url.starts_with("http://") {
2443                "http"
2444            } else if url.starts_with("ftp://") {
2445                "ftp"
2446            } else {
2447                "other"
2448            };
2449
2450            // Find which line this URL is on
2451            let mut line_num = 1;
2452            let mut col_start = match_start;
2453            let mut col_end = match_end;
2454            for (idx, line_info) in lines.iter().enumerate() {
2455                if match_start >= line_info.byte_offset {
2456                    line_num = idx + 1;
2457                    col_start = match_start - line_info.byte_offset;
2458                    col_end = match_end - line_info.byte_offset;
2459                } else {
2460                    break;
2461                }
2462            }
2463
2464            bare_urls.push(BareUrl {
2465                line: line_num,
2466                start_col: col_start,
2467                end_col: col_end,
2468                byte_offset: match_start,
2469                byte_end: match_end,
2470                url: url.to_string(),
2471                url_type: url_type.to_string(),
2472            });
2473        }
2474
2475        // Check for bare email addresses
2476        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2477            let full_match = cap.get(0).unwrap();
2478            let match_start = full_match.start();
2479            let match_end = full_match.end();
2480
2481            // Skip if in code block
2482            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2483                continue;
2484            }
2485
2486            // Skip if already in angle brackets or markdown links
2487            let preceding_char = if match_start > 0 {
2488                content.chars().nth(match_start - 1)
2489            } else {
2490                None
2491            };
2492            let following_char = content.chars().nth(match_end);
2493
2494            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2495                continue;
2496            }
2497            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2498                continue;
2499            }
2500
2501            let email = full_match.as_str();
2502
2503            // Find which line this email is on
2504            let mut line_num = 1;
2505            let mut col_start = match_start;
2506            let mut col_end = match_end;
2507            for (idx, line_info) in lines.iter().enumerate() {
2508                if match_start >= line_info.byte_offset {
2509                    line_num = idx + 1;
2510                    col_start = match_start - line_info.byte_offset;
2511                    col_end = match_end - line_info.byte_offset;
2512                } else {
2513                    break;
2514                }
2515            }
2516
2517            bare_urls.push(BareUrl {
2518                line: line_num,
2519                start_col: col_start,
2520                end_col: col_end,
2521                byte_offset: match_start,
2522                byte_end: match_end,
2523                url: email.to_string(),
2524                url_type: "email".to_string(),
2525            });
2526        }
2527
2528        bare_urls
2529    }
2530}
2531
2532/// Merge adjacent list blocks that should be treated as one
2533fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2534    if list_blocks.len() < 2 {
2535        return;
2536    }
2537
2538    let mut merger = ListBlockMerger::new(lines);
2539    *list_blocks = merger.merge(list_blocks);
2540}
2541
2542/// Helper struct to manage the complex logic of merging list blocks
2543struct ListBlockMerger<'a> {
2544    lines: &'a [LineInfo],
2545}
2546
2547impl<'a> ListBlockMerger<'a> {
2548    fn new(lines: &'a [LineInfo]) -> Self {
2549        Self { lines }
2550    }
2551
2552    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2553        let mut merged = Vec::with_capacity(list_blocks.len());
2554        let mut current = list_blocks[0].clone();
2555
2556        for next in list_blocks.iter().skip(1) {
2557            if self.should_merge_blocks(&current, next) {
2558                current = self.merge_two_blocks(current, next);
2559            } else {
2560                merged.push(current);
2561                current = next.clone();
2562            }
2563        }
2564
2565        merged.push(current);
2566        merged
2567    }
2568
2569    /// Determine if two adjacent list blocks should be merged
2570    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2571        // Basic compatibility checks
2572        if !self.blocks_are_compatible(current, next) {
2573            return false;
2574        }
2575
2576        // Check spacing and content between blocks
2577        let spacing = self.analyze_spacing_between(current, next);
2578        match spacing {
2579            BlockSpacing::Consecutive => true,
2580            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2581            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2582                self.can_merge_with_content_between(current, next)
2583            }
2584        }
2585    }
2586
2587    /// Check if blocks have compatible structure for merging
2588    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2589        current.is_ordered == next.is_ordered
2590            && current.blockquote_prefix == next.blockquote_prefix
2591            && current.nesting_level == next.nesting_level
2592    }
2593
2594    /// Analyze the spacing between two list blocks
2595    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2596        let gap = next.start_line - current.end_line;
2597
2598        match gap {
2599            1 => BlockSpacing::Consecutive,
2600            2 => BlockSpacing::SingleBlank,
2601            _ if gap > 2 => {
2602                if self.has_only_blank_lines_between(current, next) {
2603                    BlockSpacing::MultipleBlanks
2604                } else {
2605                    BlockSpacing::ContentBetween
2606                }
2607            }
2608            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2609        }
2610    }
2611
2612    /// Check if unordered lists can be merged with a single blank line between
2613    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2614        // Check if there are structural separators between the blocks
2615        // If has_meaningful_content_between returns true, it means there are structural separators
2616        if has_meaningful_content_between(current, next, self.lines) {
2617            return false; // Structural separators prevent merging
2618        }
2619
2620        // Only merge unordered lists with same marker across single blank
2621        !current.is_ordered && current.marker == next.marker
2622    }
2623
2624    /// Check if ordered lists can be merged when there's content between them
2625    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2626        // Do not merge lists if there are structural separators between them
2627        if has_meaningful_content_between(current, next, self.lines) {
2628            return false; // Structural separators prevent merging
2629        }
2630
2631        // Only consider merging ordered lists if there's no structural content between
2632        current.is_ordered && next.is_ordered
2633    }
2634
2635    /// Check if there are only blank lines between blocks
2636    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2637        for line_num in (current.end_line + 1)..next.start_line {
2638            if let Some(line_info) = self.lines.get(line_num - 1)
2639                && !line_info.content.trim().is_empty()
2640            {
2641                return false;
2642            }
2643        }
2644        true
2645    }
2646
2647    /// Merge two compatible list blocks into one
2648    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2649        current.end_line = next.end_line;
2650        current.item_lines.extend_from_slice(&next.item_lines);
2651
2652        // Update max marker width
2653        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2654
2655        // Handle marker consistency for unordered lists
2656        if !current.is_ordered && self.markers_differ(&current, next) {
2657            current.marker = None; // Mixed markers
2658        }
2659
2660        current
2661    }
2662
2663    /// Check if two blocks have different markers
2664    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2665        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2666    }
2667}
2668
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the difference between
/// the next block's `start_line` and the current block's `end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks (line difference of 1; also used as the fallback for 0).
    Consecutive,
    /// One line between blocks (line difference of 2), assumed to be blank.
    SingleBlank,
    /// Multiple blank lines but no content (line difference above 2).
    MultipleBlanks,
    /// Content exists between blocks (line difference above 2).
    ContentBetween,
}
2677
2678/// Check if there's meaningful content (not just blank lines) between two list blocks
2679fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2680    // Check lines between current.end_line and next.start_line
2681    for line_num in (current.end_line + 1)..next.start_line {
2682        if let Some(line_info) = lines.get(line_num - 1) {
2683            // Convert to 0-indexed
2684            let trimmed = line_info.content.trim();
2685
2686            // Skip empty lines
2687            if trimmed.is_empty() {
2688                continue;
2689            }
2690
2691            // Check for structural separators that should separate lists (CommonMark compliant)
2692
2693            // Headings separate lists
2694            if line_info.heading.is_some() {
2695                return true; // Has meaningful content - headings separate lists
2696            }
2697
2698            // Horizontal rules separate lists (---, ***, ___)
2699            if is_horizontal_rule(trimmed) {
2700                return true; // Has meaningful content - horizontal rules separate lists
2701            }
2702
2703            // Tables separate lists (lines containing | but not in URLs or code)
2704            // Simple heuristic: tables typically have | at start/end or multiple |
2705            if trimmed.contains('|') && trimmed.len() > 1 {
2706                // Don't treat URLs with | as tables
2707                if !trimmed.contains("](") && !trimmed.contains("http") {
2708                    // More robust check: tables usually have multiple | or | at edges
2709                    let pipe_count = trimmed.matches('|').count();
2710                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2711                        return true; // Has meaningful content - tables separate lists
2712                    }
2713                }
2714            }
2715
2716            // Blockquotes separate lists
2717            if trimmed.starts_with('>') {
2718                return true; // Has meaningful content - blockquotes separate lists
2719            }
2720
2721            // Code block fences separate lists (unless properly indented as list content)
2722            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2723                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2724
2725                // Check if this code block is properly indented as list continuation
2726                let min_continuation_indent = if current.is_ordered {
2727                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2728                } else {
2729                    current.nesting_level + 2
2730                };
2731
2732                if line_indent < min_continuation_indent {
2733                    // This is a standalone code block that separates lists
2734                    return true; // Has meaningful content - standalone code blocks separate lists
2735                }
2736            }
2737
2738            // Check if this line has proper indentation for list continuation
2739            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2740
2741            // Calculate minimum indentation needed to be list continuation
2742            let min_indent = if current.is_ordered {
2743                current.nesting_level + current.max_marker_width
2744            } else {
2745                current.nesting_level + 2
2746            };
2747
2748            // If the line is not indented enough to be list continuation, it's meaningful content
2749            if line_indent < min_indent {
2750                return true; // Has meaningful content - content not indented as list continuation
2751            }
2752
2753            // If we reach here, the line is properly indented as list continuation
2754            // Continue checking other lines
2755        }
2756    }
2757
2758    // Only blank lines or properly indented list continuation content between blocks
2759    false
2760}
2761
/// Check if a line is a horizontal rule (`---`, `***`, `___`).
///
/// `trimmed` must already have its leading whitespace removed: the very first
/// character has to be the rule marker (`-`, `*` or `_`), otherwise the line is
/// rejected. Every following character must be either that same marker or
/// interior/trailing whitespace (space or tab).
///
/// Returns `true` only when the marker occurs at least three times and no
/// other character appears on the line.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Three marker characters are the minimum, so shorter input can never match.
    if trimmed.len() < 3 {
        return false;
    }

    // Walk the characters lazily instead of collecting them into a Vec first.
    let mut rest = trimmed.chars();
    let marker = match rest.next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // The leading marker has already been consumed, hence the count starts at 1.
    let mut marker_count = 1usize;
    for ch in rest {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Mixed markers or any other text disqualify the line.
            return false;
        }
    }

    marker_count >= 3
}
2785
/// Unit tests for `LintContext`: line-offset bookkeeping, offset-to-position
/// mapping, per-line metadata, and list item detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still yields a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert!(ctx.lines.is_empty());
    }

    /// A document without newlines maps every offset onto line 1.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);

        for &(offset, expected) in &[(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Line offsets and (line, column) lookups across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected 1-based (line, column))
        let cases = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for &(offset, expected) in &cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Per-line metadata (content, byte offset, indent, blank/code flags).
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        let [heading, indented, blank, ..] = &ctx.lines[..] else {
            panic!("expected at least three lines");
        };

        // Line 1: "# Title"
        assert_eq!(heading.content, "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        // Line 2: "    indented"
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Accessor helpers take 1-based line numbers.
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Ordered and unordered markers are recognised; plain text is not a list item.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_item = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be detected as a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_item = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be detected as a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let numbered_item = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be detected as a list item");
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets at and just past each single-character line.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let cases = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for &(offset, expected) in &cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs