// rumdl_lib/lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::utils::ast_utils::get_cached_ast;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use markdown::mdast::Node;
6use regex::Regex;
7
8lazy_static! {
9    // Comprehensive link pattern that captures both inline and reference links
10    // Use (?s) flag to make . match newlines
11    static ref LINK_PATTERN: Regex = Regex::new(
12        r"(?sx)
13        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
14        (?:
15            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
16            |
17            \[([^\]]*)\]      # Reference ID in group 3
18        )"
19    ).unwrap();
20
21    // Image pattern (similar to links but with ! prefix)
22    // Use (?s) flag to make . match newlines
23    static ref IMAGE_PATTERN: Regex = Regex::new(
24        r"(?sx)
25        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
26        (?:
27            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
28            |
29            \[([^\]]*)\]      # Reference ID in group 3
30        )"
31    ).unwrap();
32
33    // Reference definition pattern
34    static ref REF_DEF_PATTERN: Regex = Regex::new(
35        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
36    ).unwrap();
37
38    // Code span pattern - matches backticks and captures content
39    // This handles multi-backtick code spans correctly
40    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
41        r"`+"
42    ).unwrap();
43
44    // Pattern for bare URLs
45    static ref BARE_URL_PATTERN: Regex = Regex::new(
46        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
47    ).unwrap();
48
49    // Pattern for email addresses
50    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
51        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
52    ).unwrap();
53
54    // Pattern for angle bracket links (to exclude from bare URL detection)
55    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
56        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
57    ).unwrap();
58
59    // Pattern for blockquote prefix in parse_list_blocks
60    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
61}
62
63/// Pre-computed information about a line
64#[derive(Debug, Clone)]
65pub struct LineInfo {
66    /// The actual line content (without newline)
67    pub content: String,
68    /// Byte offset where this line starts in the document
69    pub byte_offset: usize,
70    /// Number of leading spaces/tabs
71    pub indent: usize,
72    /// Whether the line is blank (empty or only whitespace)
73    pub is_blank: bool,
74    /// Whether this line is inside a code block
75    pub in_code_block: bool,
76    /// Whether this line is inside front matter
77    pub in_front_matter: bool,
78    /// Whether this line is inside an HTML block
79    pub in_html_block: bool,
80    /// List item information if this line starts a list item
81    pub list_item: Option<ListItemInfo>,
82    /// Heading information if this line is a heading
83    pub heading: Option<HeadingInfo>,
84    /// Blockquote information if this line is a blockquote
85    pub blockquote: Option<BlockquoteInfo>,
86    /// Whether this line is inside a mkdocstrings autodoc block
87    pub in_mkdocstrings: bool,
88}
89
/// Information about a list item
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` for unordered items)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
104
/// Heading style type
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
115
/// Parsed link information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedLink {
    /// Line number (1-indexed) on which the link starts
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed); for a link spanning several lines this is
    /// measured on the line where the link ends
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL (empty for reference links until resolved against the reference definitions)
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<String>,
}
138
/// Parsed image information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
161
/// Reference definition [ref]: url "title"
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
174
/// Parsed code span information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
193
194/// Information about a heading
195#[derive(Debug, Clone)]
196pub struct HeadingInfo {
197    /// Heading level (1-6 for ATX, 1-2 for Setext)
198    pub level: u8,
199    /// Style of heading
200    pub style: HeadingStyle,
201    /// The heading marker (# characters or underline)
202    pub marker: String,
203    /// Column where the marker starts (0-based)
204    pub marker_column: usize,
205    /// Column where heading text starts
206    pub content_column: usize,
207    /// The heading text (without markers and without custom ID syntax)
208    pub text: String,
209    /// Custom header ID if present (e.g., from {#custom-id} syntax)
210    pub custom_id: Option<String>,
211    /// Original heading text including custom ID syntax
212    pub raw_text: String,
213    /// Whether it has a closing sequence (for ATX)
214    pub has_closing_sequence: bool,
215    /// The closing sequence if present
216    pub closing_sequence: String,
217}
218
/// Information about a blockquote line
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
239
/// Information about a list block
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
260
261use std::sync::{Arc, Mutex};
262
/// Character frequency data for fast content analysis
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
291
/// Pre-parsed HTML tag information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (</tag>)
    pub is_closing: bool,
    /// Whether it's self-closing (<tag />)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
314
/// Pre-parsed emphasis span information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
335
/// Pre-parsed table row information
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right", "none"
    pub column_alignments: Vec<String>,
}
348
/// Pre-parsed bare URL information (not in links)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
367
368pub struct LintContext<'a> {
369    pub content: &'a str,
370    pub line_offsets: Vec<usize>,
371    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
372    pub lines: Vec<LineInfo>,             // Pre-computed line information
373    pub links: Vec<ParsedLink>,           // Pre-parsed links
374    pub images: Vec<ParsedImage>,         // Pre-parsed images
375    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
376    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
377    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
378    pub char_frequency: CharFrequency,    // Character frequency analysis
379    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
380    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
381    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
382    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
383    ast_cache: Mutex<Option<Arc<Node>>>,  // Lazy-loaded AST
384    pub flavor: MarkdownFlavor,           // Markdown flavor being used
385}
386
387impl<'a> LintContext<'a> {
388    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
389        let mut line_offsets = vec![0];
390        for (i, c) in content.char_indices() {
391            if c == '\n' {
392                line_offsets.push(i + 1);
393            }
394        }
395
396        // Detect code blocks once and cache them
397        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
398
399        // Pre-compute line information
400        let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
401
402        // Parse code spans early so we can exclude them from link/image parsing
403        let ast = get_cached_ast(content);
404        let code_spans = Self::parse_code_spans(content, &lines, &ast);
405
406        // Parse links, images, references, and list blocks
407        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
408        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
409        let reference_defs = Self::parse_reference_defs(content, &lines);
410        // Use line-by-line list parsing for MD032 compatibility
411        // TODO: Consider using AST-based parsing in the future when MD032 is updated
412        let list_blocks = Self::parse_list_blocks(&lines);
413
414        // Detect HTML blocks
415        Self::detect_html_blocks(&mut lines);
416
417        // Compute character frequency for fast content analysis
418        let char_frequency = Self::compute_char_frequency(content);
419
420        Self {
421            content,
422            line_offsets,
423            code_blocks,
424            lines,
425            links,
426            images,
427            reference_defs,
428            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
429            list_blocks,
430            char_frequency,
431            html_tags_cache: Mutex::new(None),
432            emphasis_spans_cache: Mutex::new(None),
433            table_rows_cache: Mutex::new(None),
434            bare_urls_cache: Mutex::new(None),
435            ast_cache: Mutex::new(None),
436            flavor,
437        }
438    }
439
440    /// Get AST - uses global cache for deduplication
441    pub fn get_ast(&self) -> Arc<Node> {
442        let mut cache = self.ast_cache.lock().unwrap();
443
444        if cache.is_none() {
445            // Use global AST cache to avoid duplicate parsing
446            // MarkdownAst is just a type alias for Node, so no conversion needed
447            *cache = Some(get_cached_ast(self.content));
448        }
449
450        cache.as_ref().unwrap().clone()
451    }
452
453    /// Get code spans - computed lazily on first access
454    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
455        let mut cache = self.code_spans_cache.lock().unwrap();
456
457        // Check if we need to compute code spans
458        if cache.is_none() {
459            let ast = self.get_ast();
460            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
461            *cache = Some(Arc::new(code_spans));
462        }
463
464        // Return a reference to the cached code spans
465        cache.as_ref().unwrap().clone()
466    }
467
468    /// Get HTML tags - computed lazily on first access
469    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
470        let mut cache = self.html_tags_cache.lock().unwrap();
471
472        if cache.is_none() {
473            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
474            *cache = Some(Arc::new(html_tags));
475        }
476
477        cache.as_ref().unwrap().clone()
478    }
479
480    /// Get emphasis spans - computed lazily on first access
481    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
482        let mut cache = self.emphasis_spans_cache.lock().unwrap();
483
484        if cache.is_none() {
485            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
486            *cache = Some(Arc::new(emphasis_spans));
487        }
488
489        cache.as_ref().unwrap().clone()
490    }
491
492    /// Get table rows - computed lazily on first access
493    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
494        let mut cache = self.table_rows_cache.lock().unwrap();
495
496        if cache.is_none() {
497            let table_rows = Self::parse_table_rows(&self.lines);
498            *cache = Some(Arc::new(table_rows));
499        }
500
501        cache.as_ref().unwrap().clone()
502    }
503
504    /// Get bare URLs - computed lazily on first access
505    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
506        let mut cache = self.bare_urls_cache.lock().unwrap();
507
508        if cache.is_none() {
509            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
510            *cache = Some(Arc::new(bare_urls));
511        }
512
513        cache.as_ref().unwrap().clone()
514    }
515
516    /// Map a byte offset to (line, column)
517    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
518        match self.line_offsets.binary_search(&offset) {
519            Ok(line) => (line + 1, 1),
520            Err(line) => {
521                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
522                (line, offset - line_start + 1)
523            }
524        }
525    }
526
527    /// Check if a position is within a code block or code span
528    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
529        // Check code blocks first
530        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
531            return true;
532        }
533
534        // Check inline code spans (lazy load if needed)
535        self.code_spans()
536            .iter()
537            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
538    }
539
540    /// Get line information by line number (1-indexed)
541    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
542        if line_num > 0 {
543            self.lines.get(line_num - 1)
544        } else {
545            None
546        }
547    }
548
549    /// Get byte offset for a line number (1-indexed)
550    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
551        self.line_info(line_num).map(|info| info.byte_offset)
552    }
553
554    /// Get URL for a reference link/image by its ID
555    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
556        let normalized_id = ref_id.to_lowercase();
557        self.reference_defs
558            .iter()
559            .find(|def| def.id == normalized_id)
560            .map(|def| def.url.as_str())
561    }
562
563    /// Get links on a specific line
564    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
565        self.links.iter().filter(|link| link.line == line_num).collect()
566    }
567
568    /// Get images on a specific line
569    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
570        self.images.iter().filter(|img| img.line == line_num).collect()
571    }
572
573    /// Check if a line is part of a list block
574    pub fn is_in_list_block(&self, line_num: usize) -> bool {
575        self.list_blocks
576            .iter()
577            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
578    }
579
580    /// Get the list block containing a specific line
581    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
582        self.list_blocks
583            .iter()
584            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
585    }
586
    // Compatibility methods for DocumentStructure migration
588
589    /// Check if a line is within a code block
590    pub fn is_in_code_block(&self, line_num: usize) -> bool {
591        if line_num == 0 || line_num > self.lines.len() {
592            return false;
593        }
594        self.lines[line_num - 1].in_code_block
595    }
596
597    /// Check if a line is within front matter
598    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
599        if line_num == 0 || line_num > self.lines.len() {
600            return false;
601        }
602        self.lines[line_num - 1].in_front_matter
603    }
604
605    /// Check if a line is within an HTML block
606    pub fn is_in_html_block(&self, line_num: usize) -> bool {
607        if line_num == 0 || line_num > self.lines.len() {
608            return false;
609        }
610        self.lines[line_num - 1].in_html_block
611    }
612
613    /// Check if a line and column is within a code span
614    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
615        if line_num == 0 || line_num > self.lines.len() {
616            return false;
617        }
618
619        // Use the code spans cache to check
620        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
621        // Convert col to 0-indexed for comparison
622        let col_0indexed = if col > 0 { col - 1 } else { 0 };
623        let code_spans = self.code_spans();
624        code_spans
625            .iter()
626            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
627    }
628
629    /// Check if content has any instances of a specific character (fast)
630    pub fn has_char(&self, ch: char) -> bool {
631        match ch {
632            '#' => self.char_frequency.hash_count > 0,
633            '*' => self.char_frequency.asterisk_count > 0,
634            '_' => self.char_frequency.underscore_count > 0,
635            '-' => self.char_frequency.hyphen_count > 0,
636            '+' => self.char_frequency.plus_count > 0,
637            '>' => self.char_frequency.gt_count > 0,
638            '|' => self.char_frequency.pipe_count > 0,
639            '[' => self.char_frequency.bracket_count > 0,
640            '`' => self.char_frequency.backtick_count > 0,
641            '<' => self.char_frequency.lt_count > 0,
642            '!' => self.char_frequency.exclamation_count > 0,
643            '\n' => self.char_frequency.newline_count > 0,
644            _ => self.content.contains(ch), // Fallback for other characters
645        }
646    }
647
648    /// Get count of a specific character (fast)
649    pub fn char_count(&self, ch: char) -> usize {
650        match ch {
651            '#' => self.char_frequency.hash_count,
652            '*' => self.char_frequency.asterisk_count,
653            '_' => self.char_frequency.underscore_count,
654            '-' => self.char_frequency.hyphen_count,
655            '+' => self.char_frequency.plus_count,
656            '>' => self.char_frequency.gt_count,
657            '|' => self.char_frequency.pipe_count,
658            '[' => self.char_frequency.bracket_count,
659            '`' => self.char_frequency.backtick_count,
660            '<' => self.char_frequency.lt_count,
661            '!' => self.char_frequency.exclamation_count,
662            '\n' => self.char_frequency.newline_count,
663            _ => self.content.matches(ch).count(), // Fallback for other characters
664        }
665    }
666
667    /// Check if content likely contains headings (fast)
668    pub fn likely_has_headings(&self) -> bool {
669        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
670    }
671
672    /// Check if content likely contains lists (fast)
673    pub fn likely_has_lists(&self) -> bool {
674        self.char_frequency.asterisk_count > 0
675            || self.char_frequency.hyphen_count > 0
676            || self.char_frequency.plus_count > 0
677    }
678
679    /// Check if content likely contains emphasis (fast)
680    pub fn likely_has_emphasis(&self) -> bool {
681        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
682    }
683
684    /// Check if content likely contains tables (fast)
685    pub fn likely_has_tables(&self) -> bool {
686        self.char_frequency.pipe_count > 2
687    }
688
689    /// Check if content likely contains blockquotes (fast)
690    pub fn likely_has_blockquotes(&self) -> bool {
691        self.char_frequency.gt_count > 0
692    }
693
694    /// Check if content likely contains code (fast)
695    pub fn likely_has_code(&self) -> bool {
696        self.char_frequency.backtick_count > 0
697    }
698
699    /// Check if content likely contains links or images (fast)
700    pub fn likely_has_links_or_images(&self) -> bool {
701        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
702    }
703
704    /// Check if content likely contains HTML (fast)
705    pub fn likely_has_html(&self) -> bool {
706        self.char_frequency.lt_count > 0
707    }
708
709    /// Get HTML tags on a specific line
710    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
711        self.html_tags()
712            .iter()
713            .filter(|tag| tag.line == line_num)
714            .cloned()
715            .collect()
716    }
717
718    /// Get emphasis spans on a specific line
719    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
720        self.emphasis_spans()
721            .iter()
722            .filter(|span| span.line == line_num)
723            .cloned()
724            .collect()
725    }
726
727    /// Get table rows on a specific line
728    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
729        self.table_rows()
730            .iter()
731            .filter(|row| row.line == line_num)
732            .cloned()
733            .collect()
734    }
735
736    /// Get bare URLs on a specific line
737    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
738        self.bare_urls()
739            .iter()
740            .filter(|url| url.line == line_num)
741            .cloned()
742            .collect()
743    }
744
745    /// Parse all links in the content
746    fn parse_links(
747        content: &str,
748        lines: &[LineInfo],
749        code_blocks: &[(usize, usize)],
750        code_spans: &[CodeSpan],
751        flavor: MarkdownFlavor,
752    ) -> Vec<ParsedLink> {
753        use crate::utils::skip_context::is_mkdocs_snippet_line;
754
755        // Pre-size based on a heuristic: most markdown files have relatively few links
756        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
757
758        // Parse links across the entire content, not line by line
759        for cap in LINK_PATTERN.captures_iter(content) {
760            let full_match = cap.get(0).unwrap();
761            let match_start = full_match.start();
762            let match_end = full_match.end();
763
764            // Skip if the opening bracket is escaped (preceded by \)
765            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
766                continue;
767            }
768
769            // Skip if this is actually an image (preceded by !)
770            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
771                continue;
772            }
773
774            // Skip if in code block
775            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
776                continue;
777            }
778
779            // Skip if in code span
780            if code_spans
781                .iter()
782                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
783            {
784                continue;
785            }
786
787            // Skip if this link is on a MkDocs snippet line
788            // Find which line this link is on
789            let line_idx = lines
790                .iter()
791                .position(|line| {
792                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
793                })
794                .unwrap_or(0);
795
796            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
797                continue;
798            }
799
800            // Find which line this link starts on
801            let mut line_num = 1;
802            let mut col_start = match_start;
803            for (idx, line_info) in lines.iter().enumerate() {
804                if match_start >= line_info.byte_offset {
805                    line_num = idx + 1;
806                    col_start = match_start - line_info.byte_offset;
807                } else {
808                    break;
809                }
810            }
811
812            // Find which line this link ends on (and calculate column on that line)
813            let mut end_line_num = 1;
814            let mut col_end = match_end;
815            for (idx, line_info) in lines.iter().enumerate() {
816                if match_end > line_info.byte_offset {
817                    end_line_num = idx + 1;
818                    col_end = match_end - line_info.byte_offset;
819                } else {
820                    break;
821                }
822            }
823
824            // For single-line links, use the same approach as before
825            if line_num == end_line_num {
826                // col_end is already correct
827            } else {
828                // For multi-line links, col_end represents the column on the ending line
829                // which is what we want
830            }
831
832            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
833
834            if let Some(inline_url) = cap.get(2) {
835                // Inline link
836                links.push(ParsedLink {
837                    line: line_num,
838                    start_col: col_start,
839                    end_col: col_end,
840                    byte_offset: match_start,
841                    byte_end: match_end,
842                    text,
843                    url: inline_url.as_str().to_string(),
844                    is_reference: false,
845                    reference_id: None,
846                });
847            } else if let Some(ref_id) = cap.get(3) {
848                // Reference link
849                let ref_id_str = ref_id.as_str();
850                let normalized_ref = if ref_id_str.is_empty() {
851                    text.to_lowercase() // Implicit reference
852                } else {
853                    ref_id_str.to_lowercase()
854                };
855
856                links.push(ParsedLink {
857                    line: line_num,
858                    start_col: col_start,
859                    end_col: col_end,
860                    byte_offset: match_start,
861                    byte_end: match_end,
862                    text,
863                    url: String::new(), // Will be resolved with reference_defs
864                    is_reference: true,
865                    reference_id: Some(normalized_ref),
866                });
867            }
868        }
869
870        links
871    }
872
873    /// Parse all images in the content
874    fn parse_images(
875        content: &str,
876        lines: &[LineInfo],
877        code_blocks: &[(usize, usize)],
878        code_spans: &[CodeSpan],
879    ) -> Vec<ParsedImage> {
880        // Pre-size based on a heuristic: images are less common than links
881        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
882
883        // Parse images across the entire content, not line by line
884        for cap in IMAGE_PATTERN.captures_iter(content) {
885            let full_match = cap.get(0).unwrap();
886            let match_start = full_match.start();
887            let match_end = full_match.end();
888
889            // Skip if the ! is escaped (preceded by \)
890            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
891                continue;
892            }
893
894            // Skip if in code block
895            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
896                continue;
897            }
898
899            // Skip if in code span
900            if code_spans
901                .iter()
902                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
903            {
904                continue;
905            }
906
907            // Find which line this image starts on
908            let mut line_num = 1;
909            let mut col_start = match_start;
910            for (idx, line_info) in lines.iter().enumerate() {
911                if match_start >= line_info.byte_offset {
912                    line_num = idx + 1;
913                    col_start = match_start - line_info.byte_offset;
914                } else {
915                    break;
916                }
917            }
918
919            // Find which line this image ends on (and calculate column on that line)
920            let mut end_line_num = 1;
921            let mut col_end = match_end;
922            for (idx, line_info) in lines.iter().enumerate() {
923                if match_end > line_info.byte_offset {
924                    end_line_num = idx + 1;
925                    col_end = match_end - line_info.byte_offset;
926                } else {
927                    break;
928                }
929            }
930
931            // For single-line images, use the same approach as before
932            if line_num == end_line_num {
933                // col_end is already correct
934            } else {
935                // For multi-line images, col_end represents the column on the ending line
936                // which is what we want
937            }
938
939            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
940
941            if let Some(inline_url) = cap.get(2) {
942                // Inline image
943                images.push(ParsedImage {
944                    line: line_num,
945                    start_col: col_start,
946                    end_col: col_end,
947                    byte_offset: match_start,
948                    byte_end: match_end,
949                    alt_text,
950                    url: inline_url.as_str().to_string(),
951                    is_reference: false,
952                    reference_id: None,
953                });
954            } else if let Some(ref_id) = cap.get(3) {
955                // Reference image
956                let ref_id_str = ref_id.as_str();
957                let normalized_ref = if ref_id_str.is_empty() {
958                    alt_text.to_lowercase() // Implicit reference
959                } else {
960                    ref_id_str.to_lowercase()
961                };
962
963                images.push(ParsedImage {
964                    line: line_num,
965                    start_col: col_start,
966                    end_col: col_end,
967                    byte_offset: match_start,
968                    byte_end: match_end,
969                    alt_text,
970                    url: String::new(), // Will be resolved with reference_defs
971                    is_reference: true,
972                    reference_id: Some(normalized_ref),
973                });
974            }
975        }
976
977        images
978    }
979
980    /// Parse reference definitions
981    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
982        // Pre-size based on lines count as reference definitions are line-based
983        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
984
985        for (line_idx, line_info) in lines.iter().enumerate() {
986            // Skip lines in code blocks
987            if line_info.in_code_block {
988                continue;
989            }
990
991            let line = &line_info.content;
992            let line_num = line_idx + 1;
993
994            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
995                let id = cap.get(1).unwrap().as_str().to_lowercase();
996                let url = cap.get(2).unwrap().as_str().to_string();
997                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
998
999                refs.push(ReferenceDef {
1000                    line: line_num,
1001                    id,
1002                    url,
1003                    title,
1004                });
1005            }
1006        }
1007
1008        refs
1009    }
1010
1011    /// Pre-compute line information
1012    fn compute_line_info(
1013        content: &str,
1014        line_offsets: &[usize],
1015        code_blocks: &[(usize, usize)],
1016        flavor: MarkdownFlavor,
1017    ) -> Vec<LineInfo> {
1018        lazy_static! {
1019            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1020            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1021            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1022
1023            // Regex for blockquote prefix
1024            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1025
1026            // Regex for heading detection
1027            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1028            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1029
1030            // Regex for blockquote detection
1031            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1032        }
1033
1034        let content_lines: Vec<&str> = content.lines().collect();
1035        let mut lines = Vec::with_capacity(content_lines.len());
1036
1037        // Detect front matter boundaries FIRST, before any other parsing
1038        let mut in_front_matter = false;
1039        let mut front_matter_end = 0;
1040        if content_lines.first().map(|l| l.trim()) == Some("---") {
1041            in_front_matter = true;
1042            for (idx, line) in content_lines.iter().enumerate().skip(1) {
1043                if line.trim() == "---" {
1044                    front_matter_end = idx;
1045                    break;
1046                }
1047            }
1048        }
1049
1050        for (i, line) in content_lines.iter().enumerate() {
1051            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1052            let indent = line.len() - line.trim_start().len();
1053            // For blank detection, consider blockquote context
1054            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1055                // In blockquote context, check if content after prefix is blank
1056                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1057                after_prefix.trim().is_empty()
1058            } else {
1059                line.trim().is_empty()
1060            };
1061            // Check if this line is inside a code block (not inline code span)
1062            // We only want to check for fenced/indented code blocks, not inline code
1063            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1064                // Only consider ranges that span multiple lines (code blocks)
1065                // Inline code spans are typically on a single line
1066
1067                // Ensure we're at valid UTF-8 boundaries
1068                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1069                    // Find the nearest valid boundary before start
1070                    let mut boundary = start;
1071                    while boundary > 0 && !content.is_char_boundary(boundary) {
1072                        boundary -= 1;
1073                    }
1074                    boundary
1075                } else {
1076                    start
1077                };
1078
1079                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1080                    // Find the nearest valid boundary after end
1081                    let mut boundary = end;
1082                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1083                        boundary += 1;
1084                    }
1085                    boundary
1086                } else {
1087                    end.min(content.len())
1088                };
1089
1090                let block_content = &content[safe_start..safe_end];
1091                let is_multiline = block_content.contains('\n');
1092                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1093                let is_indented = !is_fenced
1094                    && block_content
1095                        .lines()
1096                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1097
1098                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1099            });
1100
1101            // Detect list items (skip if in frontmatter or in mkdocstrings block)
1102            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1103                && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1104            let list_item =
1105                if !(in_code_block || is_blank || in_mkdocstrings || in_front_matter && i <= front_matter_end) {
1106                    // Strip blockquote prefix if present for list detection
1107                    let (line_for_list_check, blockquote_prefix_len) =
1108                        if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1109                            let prefix = caps.get(1).unwrap().as_str();
1110                            let content = caps.get(2).unwrap().as_str();
1111                            (content, prefix.len())
1112                        } else {
1113                            (&**line, 0)
1114                        };
1115
1116                    if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1117                        let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1118                        let marker = caps.get(2).map_or("", |m| m.as_str());
1119                        let spacing = caps.get(3).map_or("", |m| m.as_str());
1120                        let _content = caps.get(4).map_or("", |m| m.as_str());
1121                        let marker_column = blockquote_prefix_len + leading_spaces.len();
1122                        let content_column = marker_column + marker.len() + spacing.len();
1123
1124                        // According to CommonMark spec, unordered list items MUST have at least one space
1125                        // after the marker (-, *, or +). Without a space, it's not a list item.
1126                        // This also naturally handles cases like:
1127                        // - *emphasis* (not a list)
1128                        // - **bold** (not a list)
1129                        // - --- (horizontal rule, not a list)
1130                        if spacing.is_empty() {
1131                            None
1132                        } else {
1133                            Some(ListItemInfo {
1134                                marker: marker.to_string(),
1135                                is_ordered: false,
1136                                number: None,
1137                                marker_column,
1138                                content_column,
1139                            })
1140                        }
1141                    } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1142                        let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1143                        let number_str = caps.get(2).map_or("", |m| m.as_str());
1144                        let delimiter = caps.get(3).map_or("", |m| m.as_str());
1145                        let spacing = caps.get(4).map_or("", |m| m.as_str());
1146                        let _content = caps.get(5).map_or("", |m| m.as_str());
1147                        let marker = format!("{number_str}{delimiter}");
1148                        let marker_column = blockquote_prefix_len + leading_spaces.len();
1149                        let content_column = marker_column + marker.len() + spacing.len();
1150
1151                        // According to CommonMark spec, ordered list items MUST have at least one space
1152                        // after the marker (period or parenthesis). Without a space, it's not a list item.
1153                        if spacing.is_empty() {
1154                            None
1155                        } else {
1156                            Some(ListItemInfo {
1157                                marker,
1158                                is_ordered: true,
1159                                number: number_str.parse().ok(),
1160                                marker_column,
1161                                content_column,
1162                            })
1163                        }
1164                    } else {
1165                        None
1166                    }
1167                } else {
1168                    None
1169                };
1170
1171            lines.push(LineInfo {
1172                content: line.to_string(),
1173                byte_offset,
1174                indent,
1175                is_blank,
1176                in_code_block,
1177                in_front_matter: in_front_matter && i <= front_matter_end,
1178                in_html_block: false, // Will be populated after line creation
1179                list_item,
1180                heading: None,    // Will be populated in second pass for Setext headings
1181                blockquote: None, // Will be populated after line creation
1182                in_mkdocstrings,
1183            });
1184        }
1185
1186        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1187        for i in 0..content_lines.len() {
1188            if lines[i].in_code_block {
1189                continue;
1190            }
1191
1192            // Skip lines in front matter
1193            if in_front_matter && i <= front_matter_end {
1194                continue;
1195            }
1196
1197            let line = content_lines[i];
1198
1199            // Check for blockquotes (even on blank lines within blockquotes)
1200            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1201                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1202                let markers = caps.get(2).map_or("", |m| m.as_str());
1203                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1204                let content = caps.get(4).map_or("", |m| m.as_str());
1205
1206                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1207                let marker_column = indent_str.len();
1208
1209                // Build the prefix (indentation + markers + space)
1210                let prefix = format!("{indent_str}{markers}{spaces_after}");
1211
1212                // Check for various blockquote issues
1213                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1214                // Consider tabs as multiple spaces, or actual multiple spaces
1215                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1216
1217                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1218                // MD028 flags empty blockquote lines that don't have a single space after the marker
1219                // Lines like "> " or ">> " are already correct and don't need fixing
1220                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1221
1222                lines[i].blockquote = Some(BlockquoteInfo {
1223                    nesting_level,
1224                    indent: indent_str.to_string(),
1225                    marker_column,
1226                    prefix,
1227                    content: content.to_string(),
1228                    has_no_space_after_marker: has_no_space,
1229                    has_multiple_spaces_after_marker: has_multiple_spaces,
1230                    needs_md028_fix,
1231                });
1232            }
1233
1234            // Skip heading detection for blank lines
1235            if lines[i].is_blank {
1236                continue;
1237            }
1238
1239            // Check for ATX headings (but skip MkDocs snippet lines)
1240            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1241            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1242                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1243                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1244            } else {
1245                false
1246            };
1247
1248            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1249                // Skip headings inside HTML comments
1250                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1251                    continue;
1252                }
1253                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1254                let hashes = caps.get(2).map_or("", |m| m.as_str());
1255                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1256                let rest = caps.get(4).map_or("", |m| m.as_str());
1257
1258                let level = hashes.len() as u8;
1259                let marker_column = leading_spaces.len();
1260
1261                // Check for closing sequence, but handle custom IDs that might come after
1262                let (text, has_closing, closing_seq) = {
1263                    // First check if there's a custom ID at the end
1264                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1265                        // Check if this looks like a valid custom ID (ends with })
1266                        if rest[id_start..].trim_end().ends_with('}') {
1267                            // Split off the custom ID
1268                            (&rest[..id_start], &rest[id_start..])
1269                        } else {
1270                            (rest, "")
1271                        }
1272                    } else {
1273                        (rest, "")
1274                    };
1275
1276                    // Now look for closing hashes in the part before the custom ID
1277                    let trimmed_rest = rest_without_id.trim_end();
1278                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1279                        // Look for the start of the hash sequence
1280                        let mut start_of_hashes = last_hash_pos;
1281                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1282                            start_of_hashes -= 1;
1283                        }
1284
1285                        // Check if there's at least one space before the closing hashes
1286                        let has_space_before = start_of_hashes == 0
1287                            || trimmed_rest
1288                                .chars()
1289                                .nth(start_of_hashes - 1)
1290                                .is_some_and(|c| c.is_whitespace());
1291
1292                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1293                        let potential_closing = &trimmed_rest[start_of_hashes..];
1294                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1295
1296                        if is_all_hashes && has_space_before {
1297                            // This is a closing sequence
1298                            let closing_hashes = potential_closing.to_string();
1299                            // The text is everything before the closing hashes
1300                            // Don't include the custom ID here - it will be extracted later
1301                            let text_part = if !custom_id_part.is_empty() {
1302                                // If we have a custom ID, append it back to get the full rest
1303                                // This allows the extract_header_id function to handle it properly
1304                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1305                            } else {
1306                                rest_without_id[..start_of_hashes].trim_end().to_string()
1307                            };
1308                            (text_part, true, closing_hashes)
1309                        } else {
1310                            // Not a valid closing sequence, return the full content
1311                            (rest.to_string(), false, String::new())
1312                        }
1313                    } else {
1314                        // No hashes found, return the full content
1315                        (rest.to_string(), false, String::new())
1316                    }
1317                };
1318
1319                let content_column = marker_column + hashes.len() + spaces_after.len();
1320
1321                // Extract custom header ID if present
1322                let raw_text = text.trim().to_string();
1323                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1324
1325                // If no custom ID was found on the header line, check the next line for standalone attr-list
1326                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1327                    let next_line = content_lines[i + 1];
1328                    if !lines[i + 1].in_code_block
1329                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1330                        && let Some(next_line_id) =
1331                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1332                    {
1333                        custom_id = Some(next_line_id);
1334                    }
1335                }
1336
1337                lines[i].heading = Some(HeadingInfo {
1338                    level,
1339                    style: HeadingStyle::ATX,
1340                    marker: hashes.to_string(),
1341                    marker_column,
1342                    content_column,
1343                    text: clean_text,
1344                    custom_id,
1345                    raw_text,
1346                    has_closing_sequence: has_closing,
1347                    closing_sequence: closing_seq,
1348                });
1349            }
1350            // Check for Setext headings (need to look at next line)
1351            else if i + 1 < content_lines.len() {
1352                let next_line = content_lines[i + 1];
1353                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1354                    // Skip if next line is front matter delimiter
1355                    if in_front_matter && i < front_matter_end {
1356                        continue;
1357                    }
1358
1359                    // Skip Setext headings inside HTML comments
1360                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1361                        continue;
1362                    }
1363
1364                    let underline = next_line.trim();
1365
1366                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1367                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1368                    if underline == "---" {
1369                        continue;
1370                    }
1371
1372                    // Skip if the current line looks like YAML key-value syntax
1373                    let current_line_trimmed = line.trim();
1374                    if current_line_trimmed.contains(':')
1375                        && !current_line_trimmed.starts_with('#')
1376                        && !current_line_trimmed.contains('[')
1377                        && !current_line_trimmed.contains("](")
1378                    {
1379                        // This looks like "key: value" which suggests YAML, not a heading
1380                        continue;
1381                    }
1382
1383                    let level = if underline.starts_with('=') { 1 } else { 2 };
1384                    let style = if level == 1 {
1385                        HeadingStyle::Setext1
1386                    } else {
1387                        HeadingStyle::Setext2
1388                    };
1389
1390                    // Extract custom header ID if present
1391                    let raw_text = line.trim().to_string();
1392                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1393
1394                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1395                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1396                        let attr_line = content_lines[i + 2];
1397                        if !lines[i + 2].in_code_block
1398                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1399                            && let Some(attr_line_id) =
1400                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1401                        {
1402                            custom_id = Some(attr_line_id);
1403                        }
1404                    }
1405
1406                    lines[i].heading = Some(HeadingInfo {
1407                        level,
1408                        style,
1409                        marker: underline.to_string(),
1410                        marker_column: next_line.len() - next_line.trim_start().len(),
1411                        content_column: lines[i].indent,
1412                        text: clean_text,
1413                        custom_id,
1414                        raw_text,
1415                        has_closing_sequence: false,
1416                        closing_sequence: String::new(),
1417                    });
1418                }
1419            }
1420        }
1421
1422        lines
1423    }
1424
1425    /// Detect HTML blocks in the content
1426    fn detect_html_blocks(lines: &mut [LineInfo]) {
1427        // HTML block elements that trigger block context
1428        const BLOCK_ELEMENTS: &[&str] = &[
1429            "address",
1430            "article",
1431            "aside",
1432            "blockquote",
1433            "details",
1434            "dialog",
1435            "dd",
1436            "div",
1437            "dl",
1438            "dt",
1439            "fieldset",
1440            "figcaption",
1441            "figure",
1442            "footer",
1443            "form",
1444            "h1",
1445            "h2",
1446            "h3",
1447            "h4",
1448            "h5",
1449            "h6",
1450            "header",
1451            "hr",
1452            "li",
1453            "main",
1454            "nav",
1455            "ol",
1456            "p",
1457            "pre",
1458            "section",
1459            "table",
1460            "tbody",
1461            "td",
1462            "tfoot",
1463            "th",
1464            "thead",
1465            "tr",
1466            "ul",
1467        ];
1468
1469        let mut i = 0;
1470        while i < lines.len() {
1471            // Skip if already in code block or front matter
1472            if lines[i].in_code_block || lines[i].in_front_matter {
1473                i += 1;
1474                continue;
1475            }
1476
1477            let trimmed = lines[i].content.trim_start();
1478
1479            // Check if line starts with an HTML tag
1480            if trimmed.starts_with('<') && trimmed.len() > 1 {
1481                // Extract tag name safely
1482                let after_bracket = &trimmed[1..];
1483                let is_closing = after_bracket.starts_with('/');
1484                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1485
1486                // Extract tag name (stop at space, >, /, or end of string)
1487                let tag_name = tag_start
1488                    .chars()
1489                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1490                    .collect::<String>()
1491                    .to_lowercase();
1492
1493                // Check if it's a block element
1494                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1495                    // Mark this line as in HTML block
1496                    lines[i].in_html_block = true;
1497
1498                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1499                    // This avoids complex nesting logic that might cause infinite loops
1500                    if !is_closing {
1501                        let closing_tag = format!("</{tag_name}>");
1502                        let mut j = i + 1;
1503                        while j < lines.len() && j < i + 100 {
1504                            // Limit search to 100 lines
1505                            // Stop at blank lines
1506                            if lines[j].is_blank {
1507                                break;
1508                            }
1509
1510                            lines[j].in_html_block = true;
1511
1512                            // Check if this line contains the closing tag
1513                            if lines[j].content.contains(&closing_tag) {
1514                                break;
1515                            }
1516                            j += 1;
1517                        }
1518                    }
1519                }
1520            }
1521
1522            i += 1;
1523        }
1524    }
1525
1526    /// Parse all inline code spans in the content using AST
1527    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1528        let mut code_spans = Vec::new();
1529
1530        // Quick check - if no backticks, no code spans
1531        if !content.contains('`') {
1532            return code_spans;
1533        }
1534
1535        // Helper function to recursively extract inline code spans from AST nodes
1536        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1537            match node {
1538                Node::InlineCode(inline_code) => {
1539                    if let Some(pos) = &inline_code.position {
1540                        let start_pos = pos.start.offset;
1541                        let end_pos = pos.end.offset;
1542
1543                        // The position includes the backticks, extract the actual content
1544                        let full_span = &content[start_pos..end_pos];
1545                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1546
1547                        // Extract content between backticks, preserving spaces
1548                        let content_start = start_pos + backtick_count;
1549                        let content_end = end_pos - backtick_count;
1550                        let span_content = if content_start < content_end {
1551                            content[content_start..content_end].to_string()
1552                        } else {
1553                            String::new()
1554                        };
1555
1556                        // Find which line this code span starts on
1557                        let mut line_num = 1;
1558                        let mut col_start = start_pos;
1559                        for (idx, line_info) in lines.iter().enumerate() {
1560                            if start_pos >= line_info.byte_offset {
1561                                line_num = idx + 1;
1562                                col_start = start_pos - line_info.byte_offset;
1563                            } else {
1564                                break;
1565                            }
1566                        }
1567
1568                        // Find end column
1569                        let mut col_end = end_pos;
1570                        for line_info in lines.iter() {
1571                            if end_pos > line_info.byte_offset {
1572                                col_end = end_pos - line_info.byte_offset;
1573                            } else {
1574                                break;
1575                            }
1576                        }
1577
1578                        spans.push(CodeSpan {
1579                            line: line_num,
1580                            start_col: col_start,
1581                            end_col: col_end,
1582                            byte_offset: start_pos,
1583                            byte_end: end_pos,
1584                            backtick_count,
1585                            content: span_content,
1586                        });
1587                    }
1588                }
1589                // Recursively process children
1590                Node::Root(root) => {
1591                    for child in &root.children {
1592                        extract_code_spans(child, content, lines, spans);
1593                    }
1594                }
1595                Node::Paragraph(para) => {
1596                    for child in &para.children {
1597                        extract_code_spans(child, content, lines, spans);
1598                    }
1599                }
1600                Node::Heading(heading) => {
1601                    for child in &heading.children {
1602                        extract_code_spans(child, content, lines, spans);
1603                    }
1604                }
1605                Node::List(list) => {
1606                    for child in &list.children {
1607                        extract_code_spans(child, content, lines, spans);
1608                    }
1609                }
1610                Node::ListItem(item) => {
1611                    for child in &item.children {
1612                        extract_code_spans(child, content, lines, spans);
1613                    }
1614                }
1615                Node::Blockquote(blockquote) => {
1616                    for child in &blockquote.children {
1617                        extract_code_spans(child, content, lines, spans);
1618                    }
1619                }
1620                Node::Table(table) => {
1621                    for child in &table.children {
1622                        extract_code_spans(child, content, lines, spans);
1623                    }
1624                }
1625                Node::TableRow(row) => {
1626                    for child in &row.children {
1627                        extract_code_spans(child, content, lines, spans);
1628                    }
1629                }
1630                Node::TableCell(cell) => {
1631                    for child in &cell.children {
1632                        extract_code_spans(child, content, lines, spans);
1633                    }
1634                }
1635                Node::Emphasis(emphasis) => {
1636                    for child in &emphasis.children {
1637                        extract_code_spans(child, content, lines, spans);
1638                    }
1639                }
1640                Node::Strong(strong) => {
1641                    for child in &strong.children {
1642                        extract_code_spans(child, content, lines, spans);
1643                    }
1644                }
1645                Node::Link(link) => {
1646                    for child in &link.children {
1647                        extract_code_spans(child, content, lines, spans);
1648                    }
1649                }
1650                Node::LinkReference(link_ref) => {
1651                    for child in &link_ref.children {
1652                        extract_code_spans(child, content, lines, spans);
1653                    }
1654                }
1655                Node::FootnoteDefinition(footnote) => {
1656                    for child in &footnote.children {
1657                        extract_code_spans(child, content, lines, spans);
1658                    }
1659                }
1660                Node::Delete(delete) => {
1661                    for child in &delete.children {
1662                        extract_code_spans(child, content, lines, spans);
1663                    }
1664                }
1665                // Terminal nodes or nodes without relevant children
1666                Node::Code(_)
1667                | Node::Text(_)
1668                | Node::Html(_)
1669                | Node::Image(_)
1670                | Node::ImageReference(_)
1671                | Node::FootnoteReference(_)
1672                | Node::Break(_)
1673                | Node::ThematicBreak(_)
1674                | Node::Definition(_)
1675                | Node::Yaml(_)
1676                | Node::Toml(_)
1677                | Node::Math(_)
1678                | Node::InlineMath(_)
1679                | Node::MdxJsxFlowElement(_)
1680                | Node::MdxFlowExpression(_)
1681                | Node::MdxJsxTextElement(_)
1682                | Node::MdxTextExpression(_)
1683                | Node::MdxjsEsm(_) => {
1684                    // No children to process or not relevant for code spans
1685                }
1686            }
1687        }
1688
1689        // Extract all code spans from the AST
1690        extract_code_spans(ast, content, lines, &mut code_spans);
1691
1692        // Sort by position to ensure consistent ordering
1693        code_spans.sort_by_key(|span| span.byte_offset);
1694
1695        code_spans
1696    }
1697
1698    /// Parse all list blocks in the content (legacy line-by-line approach)
1699    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1700        // Pre-size based on lines that could be list items
1701        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1702        let mut current_block: Option<ListBlock> = None;
1703        let mut last_list_item_line = 0;
1704        let mut current_indent_level = 0;
1705        let mut last_marker_width = 0;
1706
1707        for (line_idx, line_info) in lines.iter().enumerate() {
1708            let line_num = line_idx + 1;
1709
1710            // Enhanced code block handling using Design #3's context analysis
1711            if line_info.in_code_block {
1712                if let Some(ref mut block) = current_block {
1713                    // Calculate minimum indentation for list continuation
1714                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1715
1716                    // Analyze code block context using the three-tier classification
1717                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1718
1719                    match context {
1720                        CodeBlockContext::Indented => {
1721                            // Code block is properly indented - continues the list
1722                            block.end_line = line_num;
1723                            continue;
1724                        }
1725                        CodeBlockContext::Standalone => {
1726                            // Code block separates lists - end current block
1727                            let completed_block = current_block.take().unwrap();
1728                            list_blocks.push(completed_block);
1729                            continue;
1730                        }
1731                        CodeBlockContext::Adjacent => {
1732                            // Edge case - use conservative behavior (continue list)
1733                            block.end_line = line_num;
1734                            continue;
1735                        }
1736                    }
1737                } else {
1738                    // No current list block - skip code block lines
1739                    continue;
1740                }
1741            }
1742
1743            // Extract blockquote prefix if any
1744            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1745                caps.get(0).unwrap().as_str().to_string()
1746            } else {
1747                String::new()
1748            };
1749
1750            // Check if this line is a list item
1751            if let Some(list_item) = &line_info.list_item {
1752                // Calculate nesting level based on indentation
1753                let item_indent = list_item.marker_column;
1754                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1755
1756                if let Some(ref mut block) = current_block {
1757                    // Check if this continues the current block
1758                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1759                    // or a continuation at the same or lower level
1760                    let is_nested = nesting > block.nesting_level;
1761                    let same_type =
1762                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1763                    let same_context = block.blockquote_prefix == blockquote_prefix;
1764                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1765
1766                    // For unordered lists, also check marker consistency
1767                    let marker_compatible =
1768                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1769
1770                    // Check if there's non-list content between the last item and this one
1771                    let has_non_list_content = {
1772                        let mut found_non_list = false;
1773                        // Use the last item from the current block, not the global last_list_item_line
1774                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1775
1776                        // Debug: Special check for problematic line
1777                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1778                            let last_line = &lines[block_last_item_line - 1];
1779                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1780                                log::debug!(
1781                                    "After problematic line {}: checking lines {} to {} for non-list content",
1782                                    block_last_item_line,
1783                                    block_last_item_line + 1,
1784                                    line_num
1785                                );
1786                                // If they're consecutive list items, there's no content between
1787                                if line_num == block_last_item_line + 1 {
1788                                    log::debug!("Lines are consecutive, no content between");
1789                                }
1790                            }
1791                        }
1792
1793                        for check_line in (block_last_item_line + 1)..line_num {
1794                            let check_idx = check_line - 1;
1795                            if check_idx < lines.len() {
1796                                let check_info = &lines[check_idx];
1797                                // Check for content that breaks the list
1798                                let is_list_breaking_content = if check_info.in_code_block {
1799                                    // Use enhanced code block classification for list separation
1800                                    let last_item_marker_width =
1801                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1802                                            lines[block_last_item_line - 1]
1803                                                .list_item
1804                                                .as_ref()
1805                                                .map(|li| {
1806                                                    if li.is_ordered {
1807                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1808                                                    } else {
1809                                                        li.marker.len()
1810                                                    }
1811                                                })
1812                                                .unwrap_or(3) // fallback to 3 if no list item found
1813                                        } else {
1814                                            3 // fallback
1815                                        };
1816
1817                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1818
1819                                    // Analyze code block context using our enhanced classification
1820                                    let context = CodeBlockUtils::analyze_code_block_context(
1821                                        lines,
1822                                        check_line - 1,
1823                                        min_continuation,
1824                                    );
1825
1826                                    // Standalone code blocks break lists, indented ones continue them
1827                                    matches!(context, CodeBlockContext::Standalone)
1828                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1829                                    // Check for structural separators that should break lists (from issue #42)
1830                                    let line_content = check_info.content.trim();
1831
1832                                    // Any of these structural separators break lists
1833                                    if check_info.heading.is_some()
1834                                        || line_content.starts_with("---")
1835                                        || line_content.starts_with("***")
1836                                        || line_content.starts_with("___")
1837                                        || (line_content.contains('|')
1838                                            && !line_content.contains("](")
1839                                            && !line_content.contains("http")
1840                                            && (line_content.matches('|').count() > 1
1841                                                || line_content.starts_with('|')
1842                                                || line_content.ends_with('|')))
1843                                        || line_content.starts_with(">")
1844                                    {
1845                                        true
1846                                    }
1847                                    // Other non-list content - check if properly indented
1848                                    else {
1849                                        let last_item_marker_width =
1850                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1851                                                lines[block_last_item_line - 1]
1852                                                    .list_item
1853                                                    .as_ref()
1854                                                    .map(|li| {
1855                                                        if li.is_ordered {
1856                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1857                                                        } else {
1858                                                            li.marker.len()
1859                                                        }
1860                                                    })
1861                                                    .unwrap_or(3) // fallback to 3 if no list item found
1862                                            } else {
1863                                                3 // fallback
1864                                            };
1865
1866                                        let min_continuation =
1867                                            if block.is_ordered { last_item_marker_width } else { 2 };
1868                                        check_info.indent < min_continuation
1869                                    }
1870                                } else {
1871                                    false
1872                                };
1873
1874                                if is_list_breaking_content {
1875                                    // Not indented enough, so it breaks the list
1876                                    found_non_list = true;
1877                                    break;
1878                                }
1879                            }
1880                        }
1881                        found_non_list
1882                    };
1883
1884                    // A list continues if:
1885                    // 1. It's a nested item (indented more than the parent), OR
1886                    // 2. It's the same type at the same level with reasonable distance
1887                    let mut continues_list = if is_nested {
1888                        // Nested items always continue the list if they're in the same context
1889                        same_context && reasonable_distance && !has_non_list_content
1890                    } else {
1891                        // Same-level items need to match type and markers
1892                        let result = same_type
1893                            && same_context
1894                            && reasonable_distance
1895                            && marker_compatible
1896                            && !has_non_list_content;
1897
1898                        // Debug logging for lines after problematic content
1899                        if block.item_lines.last().is_some_and(|&last_line| {
1900                            last_line > 0
1901                                && last_line <= lines.len()
1902                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1903                                && lines[last_line - 1].content.contains(r"\`")
1904                        }) {
1905                            log::debug!(
1906                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1907                            );
1908                            if line_num > 0 && line_num <= lines.len() {
1909                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1910                            }
1911                        }
1912
1913                        result
1914                    };
1915
1916                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1917                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1918                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1919                        // Check if the previous line was a list item
1920                        if block.item_lines.contains(&(line_num - 1)) {
1921                            // They're consecutive list items - force them to be in the same list
1922                            continues_list = true;
1923                        }
1924                    }
1925
1926                    if continues_list {
1927                        // Extend current block
1928                        block.end_line = line_num;
1929                        block.item_lines.push(line_num);
1930
1931                        // Update max marker width
1932                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1933                            list_item.marker.len() + 1
1934                        } else {
1935                            list_item.marker.len()
1936                        });
1937
1938                        // Update marker consistency for unordered lists
1939                        if !block.is_ordered
1940                            && block.marker.is_some()
1941                            && block.marker.as_ref() != Some(&list_item.marker)
1942                        {
1943                            // Mixed markers, clear the marker field
1944                            block.marker = None;
1945                        }
1946                    } else {
1947                        // End current block and start a new one
1948
1949                        list_blocks.push(block.clone());
1950
1951                        *block = ListBlock {
1952                            start_line: line_num,
1953                            end_line: line_num,
1954                            is_ordered: list_item.is_ordered,
1955                            marker: if list_item.is_ordered {
1956                                None
1957                            } else {
1958                                Some(list_item.marker.clone())
1959                            },
1960                            blockquote_prefix: blockquote_prefix.clone(),
1961                            item_lines: vec![line_num],
1962                            nesting_level: nesting,
1963                            max_marker_width: if list_item.is_ordered {
1964                                list_item.marker.len() + 1
1965                            } else {
1966                                list_item.marker.len()
1967                            },
1968                        };
1969                    }
1970                } else {
1971                    // Start a new block
1972                    current_block = Some(ListBlock {
1973                        start_line: line_num,
1974                        end_line: line_num,
1975                        is_ordered: list_item.is_ordered,
1976                        marker: if list_item.is_ordered {
1977                            None
1978                        } else {
1979                            Some(list_item.marker.clone())
1980                        },
1981                        blockquote_prefix,
1982                        item_lines: vec![line_num],
1983                        nesting_level: nesting,
1984                        max_marker_width: list_item.marker.len(),
1985                    });
1986                }
1987
1988                last_list_item_line = line_num;
1989                current_indent_level = item_indent;
1990                last_marker_width = if list_item.is_ordered {
1991                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
1992                } else {
1993                    list_item.marker.len()
1994                };
1995            } else if let Some(ref mut block) = current_block {
1996                // Not a list item - check if it continues the current block
1997
1998                // For MD032 compatibility, we use a simple approach:
1999                // - Indented lines continue the list
2000                // - Blank lines followed by indented content continue the list
2001                // - Everything else ends the list
2002
2003                // Check if the last line in the list block ended with a backslash (hard line break)
2004                // This handles cases where list items use backslash for hard line breaks
2005                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2006                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2007                } else {
2008                    false
2009                };
2010
2011                // Calculate minimum indentation for list continuation
2012                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2013                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2014                let min_continuation_indent = if block.is_ordered {
2015                    current_indent_level + last_marker_width
2016                } else {
2017                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2018                };
2019
2020                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2021                    // Indented line or backslash continuation continues the list
2022                    block.end_line = line_num;
2023                } else if line_info.is_blank {
2024                    // Blank line - check if it's internal to the list or ending it
2025                    // We only include blank lines that are followed by more list content
2026                    let mut check_idx = line_idx + 1;
2027                    let mut found_continuation = false;
2028
2029                    // Skip additional blank lines
2030                    while check_idx < lines.len() && lines[check_idx].is_blank {
2031                        check_idx += 1;
2032                    }
2033
2034                    if check_idx < lines.len() {
2035                        let next_line = &lines[check_idx];
2036                        // Check if followed by indented content (list continuation)
2037                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2038                            found_continuation = true;
2039                        }
2040                        // Check if followed by another list item at the same level
2041                        else if !next_line.in_code_block
2042                            && next_line.list_item.is_some()
2043                            && let Some(item) = &next_line.list_item
2044                        {
2045                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2046                                .find(&next_line.content)
2047                                .map_or(String::new(), |m| m.as_str().to_string());
2048                            if item.marker_column == current_indent_level
2049                                && item.is_ordered == block.is_ordered
2050                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2051                            {
2052                                // Check if there was meaningful content between the list items (unused now)
2053                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2054                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2055                                    if let Some(between_line) = lines.get(idx) {
2056                                        let trimmed = between_line.content.trim();
2057                                        // Skip empty lines
2058                                        if trimmed.is_empty() {
2059                                            return false;
2060                                        }
2061                                        // Check for meaningful content
2062                                        let line_indent =
2063                                            between_line.content.len() - between_line.content.trim_start().len();
2064
2065                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2066                                        if trimmed.starts_with("```")
2067                                            || trimmed.starts_with("~~~")
2068                                            || trimmed.starts_with("---")
2069                                            || trimmed.starts_with("***")
2070                                            || trimmed.starts_with("___")
2071                                            || trimmed.starts_with(">")
2072                                            || trimmed.contains('|') // Tables
2073                                            || between_line.heading.is_some()
2074                                        {
2075                                            return true; // These are structural separators - meaningful content that breaks lists
2076                                        }
2077
2078                                        // Only properly indented content continues the list
2079                                        line_indent >= min_continuation_indent
2080                                    } else {
2081                                        false
2082                                    }
2083                                });
2084
2085                                if block.is_ordered {
2086                                    // For ordered lists: don't continue if there are structural separators
2087                                    // Check if there are structural separators between the list items
2088                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2089                                        if let Some(between_line) = lines.get(idx) {
2090                                            let trimmed = between_line.content.trim();
2091                                            if trimmed.is_empty() {
2092                                                return false;
2093                                            }
2094                                            // Check for structural separators that break lists
2095                                            trimmed.starts_with("```")
2096                                                || trimmed.starts_with("~~~")
2097                                                || trimmed.starts_with("---")
2098                                                || trimmed.starts_with("***")
2099                                                || trimmed.starts_with("___")
2100                                                || trimmed.starts_with(">")
2101                                                || trimmed.contains('|') // Tables
2102                                                || between_line.heading.is_some()
2103                                        } else {
2104                                            false
2105                                        }
2106                                    });
2107                                    found_continuation = !has_structural_separators;
2108                                } else {
2109                                    // For unordered lists: also check for structural separators
2110                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2111                                        if let Some(between_line) = lines.get(idx) {
2112                                            let trimmed = between_line.content.trim();
2113                                            if trimmed.is_empty() {
2114                                                return false;
2115                                            }
2116                                            // Check for structural separators that break lists
2117                                            trimmed.starts_with("```")
2118                                                || trimmed.starts_with("~~~")
2119                                                || trimmed.starts_with("---")
2120                                                || trimmed.starts_with("***")
2121                                                || trimmed.starts_with("___")
2122                                                || trimmed.starts_with(">")
2123                                                || trimmed.contains('|') // Tables
2124                                                || between_line.heading.is_some()
2125                                        } else {
2126                                            false
2127                                        }
2128                                    });
2129                                    found_continuation = !has_structural_separators;
2130                                }
2131                            }
2132                        }
2133                    }
2134
2135                    if found_continuation {
2136                        // Include the blank line in the block
2137                        block.end_line = line_num;
2138                    } else {
2139                        // Blank line ends the list - don't include it
2140                        list_blocks.push(block.clone());
2141                        current_block = None;
2142                    }
2143                } else {
2144                    // Check for lazy continuation - non-indented line immediately after a list item
2145                    // But only if the line has sufficient indentation for the list type
2146                    let min_required_indent = if block.is_ordered {
2147                        current_indent_level + last_marker_width
2148                    } else {
2149                        current_indent_level + 2
2150                    };
2151
2152                    // For lazy continuation to apply, the line must either:
2153                    // 1. Have no indentation (true lazy continuation)
2154                    // 2. Have sufficient indentation for the list type
2155                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2156                    let line_content = line_info.content.trim();
2157                    let is_structural_separator = line_info.heading.is_some()
2158                        || line_content.starts_with("```")
2159                        || line_content.starts_with("~~~")
2160                        || line_content.starts_with("---")
2161                        || line_content.starts_with("***")
2162                        || line_content.starts_with("___")
2163                        || line_content.starts_with(">")
2164                        || (line_content.contains('|')
2165                            && !line_content.contains("](")
2166                            && !line_content.contains("http")
2167                            && (line_content.matches('|').count() > 1
2168                                || line_content.starts_with('|')
2169                                || line_content.ends_with('|'))); // Tables
2170
2171                    // Allow lazy continuation if we're still within the same list block
2172                    // (not just immediately after a list item)
2173                    let is_lazy_continuation = !is_structural_separator
2174                        && !line_info.is_blank
2175                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2176
2177                    if is_lazy_continuation {
2178                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2179                        // it's probably not a continuation
2180                        let content_to_check = if !blockquote_prefix.is_empty() {
2181                            // Strip blockquote prefix to check the actual content
2182                            line_info
2183                                .content
2184                                .strip_prefix(&blockquote_prefix)
2185                                .unwrap_or(&line_info.content)
2186                                .trim()
2187                        } else {
2188                            line_info.content.trim()
2189                        };
2190
2191                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2192
2193                        // If it starts with uppercase and the previous line ended with punctuation,
2194                        // it's likely a new paragraph, not a continuation
2195                        if starts_with_uppercase && last_list_item_line > 0 {
2196                            // This looks like a new paragraph
2197                            list_blocks.push(block.clone());
2198                            current_block = None;
2199                        } else {
2200                            // This is a lazy continuation line
2201                            block.end_line = line_num;
2202                        }
2203                    } else {
2204                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2205                        list_blocks.push(block.clone());
2206                        current_block = None;
2207                    }
2208                }
2209            }
2210        }
2211
2212        // Don't forget the last block
2213        if let Some(block) = current_block {
2214            list_blocks.push(block);
2215        }
2216
2217        // Merge adjacent blocks that should be one
2218        merge_adjacent_list_blocks(&mut list_blocks, lines);
2219
2220        list_blocks
2221    }
2222
2223    /// Compute character frequency for fast content analysis
2224    fn compute_char_frequency(content: &str) -> CharFrequency {
2225        let mut frequency = CharFrequency::default();
2226
2227        for ch in content.chars() {
2228            match ch {
2229                '#' => frequency.hash_count += 1,
2230                '*' => frequency.asterisk_count += 1,
2231                '_' => frequency.underscore_count += 1,
2232                '-' => frequency.hyphen_count += 1,
2233                '+' => frequency.plus_count += 1,
2234                '>' => frequency.gt_count += 1,
2235                '|' => frequency.pipe_count += 1,
2236                '[' => frequency.bracket_count += 1,
2237                '`' => frequency.backtick_count += 1,
2238                '<' => frequency.lt_count += 1,
2239                '!' => frequency.exclamation_count += 1,
2240                '\n' => frequency.newline_count += 1,
2241                _ => {}
2242            }
2243        }
2244
2245        frequency
2246    }
2247
2248    /// Parse HTML tags in the content
2249    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2250        lazy_static! {
2251            static ref HTML_TAG_REGEX: regex::Regex =
2252                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2253        }
2254
2255        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2256
2257        for cap in HTML_TAG_REGEX.captures_iter(content) {
2258            let full_match = cap.get(0).unwrap();
2259            let match_start = full_match.start();
2260            let match_end = full_match.end();
2261
2262            // Skip if in code block
2263            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2264                continue;
2265            }
2266
2267            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2268            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2269            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2270
2271            // Find which line this tag is on
2272            let mut line_num = 1;
2273            let mut col_start = match_start;
2274            let mut col_end = match_end;
2275            for (idx, line_info) in lines.iter().enumerate() {
2276                if match_start >= line_info.byte_offset {
2277                    line_num = idx + 1;
2278                    col_start = match_start - line_info.byte_offset;
2279                    col_end = match_end - line_info.byte_offset;
2280                } else {
2281                    break;
2282                }
2283            }
2284
2285            html_tags.push(HtmlTag {
2286                line: line_num,
2287                start_col: col_start,
2288                end_col: col_end,
2289                byte_offset: match_start,
2290                byte_end: match_end,
2291                tag_name,
2292                is_closing,
2293                is_self_closing,
2294                raw_content: full_match.as_str().to_string(),
2295            });
2296        }
2297
2298        html_tags
2299    }
2300
2301    /// Parse emphasis spans in the content
2302    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2303        lazy_static! {
2304            static ref EMPHASIS_REGEX: regex::Regex =
2305                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2306        }
2307
2308        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2309
2310        for cap in EMPHASIS_REGEX.captures_iter(content) {
2311            let full_match = cap.get(0).unwrap();
2312            let match_start = full_match.start();
2313            let match_end = full_match.end();
2314
2315            // Skip if in code block
2316            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2317                continue;
2318            }
2319
2320            let opening_markers = cap.get(1).unwrap().as_str();
2321            let content_part = cap.get(2).unwrap().as_str();
2322            let closing_markers = cap.get(3).unwrap().as_str();
2323
2324            // Validate matching markers
2325            if opening_markers.chars().next() != closing_markers.chars().next()
2326                || opening_markers.len() != closing_markers.len()
2327            {
2328                continue;
2329            }
2330
2331            let marker = opening_markers.chars().next().unwrap();
2332            let marker_count = opening_markers.len();
2333
2334            // Find which line this emphasis is on
2335            let mut line_num = 1;
2336            let mut col_start = match_start;
2337            let mut col_end = match_end;
2338            for (idx, line_info) in lines.iter().enumerate() {
2339                if match_start >= line_info.byte_offset {
2340                    line_num = idx + 1;
2341                    col_start = match_start - line_info.byte_offset;
2342                    col_end = match_end - line_info.byte_offset;
2343                } else {
2344                    break;
2345                }
2346            }
2347
2348            emphasis_spans.push(EmphasisSpan {
2349                line: line_num,
2350                start_col: col_start,
2351                end_col: col_end,
2352                byte_offset: match_start,
2353                byte_end: match_end,
2354                marker,
2355                marker_count,
2356                content: content_part.to_string(),
2357            });
2358        }
2359
2360        emphasis_spans
2361    }
2362
2363    /// Parse table rows in the content
2364    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2365        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2366
2367        for (line_idx, line_info) in lines.iter().enumerate() {
2368            // Skip lines in code blocks or blank lines
2369            if line_info.in_code_block || line_info.is_blank {
2370                continue;
2371            }
2372
2373            let line = &line_info.content;
2374            let line_num = line_idx + 1;
2375
2376            // Check if this line contains pipes (potential table row)
2377            if !line.contains('|') {
2378                continue;
2379            }
2380
2381            // Count columns by splitting on pipes
2382            let parts: Vec<&str> = line.split('|').collect();
2383            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2384
2385            // Check if this is a separator row
2386            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2387            let mut column_alignments = Vec::new();
2388
2389            if is_separator {
2390                for part in &parts[1..parts.len() - 1] {
2391                    // Skip first and last empty parts
2392                    let trimmed = part.trim();
2393                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2394                        "center".to_string()
2395                    } else if trimmed.ends_with(':') {
2396                        "right".to_string()
2397                    } else if trimmed.starts_with(':') {
2398                        "left".to_string()
2399                    } else {
2400                        "none".to_string()
2401                    };
2402                    column_alignments.push(alignment);
2403                }
2404            }
2405
2406            table_rows.push(TableRow {
2407                line: line_num,
2408                is_separator,
2409                column_count,
2410                column_alignments,
2411            });
2412        }
2413
2414        table_rows
2415    }
2416
2417    /// Parse bare URLs and emails in the content
2418    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2419        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2420
2421        // Check for bare URLs (not in angle brackets or markdown links)
2422        for cap in BARE_URL_PATTERN.captures_iter(content) {
2423            let full_match = cap.get(0).unwrap();
2424            let match_start = full_match.start();
2425            let match_end = full_match.end();
2426
2427            // Skip if in code block
2428            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2429                continue;
2430            }
2431
2432            // Skip if already in angle brackets or markdown links
2433            let preceding_char = if match_start > 0 {
2434                content.chars().nth(match_start - 1)
2435            } else {
2436                None
2437            };
2438            let following_char = content.chars().nth(match_end);
2439
2440            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2441                continue;
2442            }
2443            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2444                continue;
2445            }
2446
2447            let url = full_match.as_str();
2448            let url_type = if url.starts_with("https://") {
2449                "https"
2450            } else if url.starts_with("http://") {
2451                "http"
2452            } else if url.starts_with("ftp://") {
2453                "ftp"
2454            } else {
2455                "other"
2456            };
2457
2458            // Find which line this URL is on
2459            let mut line_num = 1;
2460            let mut col_start = match_start;
2461            let mut col_end = match_end;
2462            for (idx, line_info) in lines.iter().enumerate() {
2463                if match_start >= line_info.byte_offset {
2464                    line_num = idx + 1;
2465                    col_start = match_start - line_info.byte_offset;
2466                    col_end = match_end - line_info.byte_offset;
2467                } else {
2468                    break;
2469                }
2470            }
2471
2472            bare_urls.push(BareUrl {
2473                line: line_num,
2474                start_col: col_start,
2475                end_col: col_end,
2476                byte_offset: match_start,
2477                byte_end: match_end,
2478                url: url.to_string(),
2479                url_type: url_type.to_string(),
2480            });
2481        }
2482
2483        // Check for bare email addresses
2484        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2485            let full_match = cap.get(0).unwrap();
2486            let match_start = full_match.start();
2487            let match_end = full_match.end();
2488
2489            // Skip if in code block
2490            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2491                continue;
2492            }
2493
2494            // Skip if already in angle brackets or markdown links
2495            let preceding_char = if match_start > 0 {
2496                content.chars().nth(match_start - 1)
2497            } else {
2498                None
2499            };
2500            let following_char = content.chars().nth(match_end);
2501
2502            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2503                continue;
2504            }
2505            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2506                continue;
2507            }
2508
2509            let email = full_match.as_str();
2510
2511            // Find which line this email is on
2512            let mut line_num = 1;
2513            let mut col_start = match_start;
2514            let mut col_end = match_end;
2515            for (idx, line_info) in lines.iter().enumerate() {
2516                if match_start >= line_info.byte_offset {
2517                    line_num = idx + 1;
2518                    col_start = match_start - line_info.byte_offset;
2519                    col_end = match_end - line_info.byte_offset;
2520                } else {
2521                    break;
2522                }
2523            }
2524
2525            bare_urls.push(BareUrl {
2526                line: line_num,
2527                start_col: col_start,
2528                end_col: col_end,
2529                byte_offset: match_start,
2530                byte_end: match_end,
2531                url: email.to_string(),
2532                url_type: "email".to_string(),
2533            });
2534        }
2535
2536        bare_urls
2537    }
2538}
2539
2540/// Merge adjacent list blocks that should be treated as one
2541fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2542    if list_blocks.len() < 2 {
2543        return;
2544    }
2545
2546    let mut merger = ListBlockMerger::new(lines);
2547    *list_blocks = merger.merge(list_blocks);
2548}
2549
2550/// Helper struct to manage the complex logic of merging list blocks
2551struct ListBlockMerger<'a> {
2552    lines: &'a [LineInfo],
2553}
2554
2555impl<'a> ListBlockMerger<'a> {
2556    fn new(lines: &'a [LineInfo]) -> Self {
2557        Self { lines }
2558    }
2559
2560    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2561        let mut merged = Vec::with_capacity(list_blocks.len());
2562        let mut current = list_blocks[0].clone();
2563
2564        for next in list_blocks.iter().skip(1) {
2565            if self.should_merge_blocks(&current, next) {
2566                current = self.merge_two_blocks(current, next);
2567            } else {
2568                merged.push(current);
2569                current = next.clone();
2570            }
2571        }
2572
2573        merged.push(current);
2574        merged
2575    }
2576
2577    /// Determine if two adjacent list blocks should be merged
2578    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2579        // Basic compatibility checks
2580        if !self.blocks_are_compatible(current, next) {
2581            return false;
2582        }
2583
2584        // Check spacing and content between blocks
2585        let spacing = self.analyze_spacing_between(current, next);
2586        match spacing {
2587            BlockSpacing::Consecutive => true,
2588            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2589            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2590                self.can_merge_with_content_between(current, next)
2591            }
2592        }
2593    }
2594
2595    /// Check if blocks have compatible structure for merging
2596    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2597        current.is_ordered == next.is_ordered
2598            && current.blockquote_prefix == next.blockquote_prefix
2599            && current.nesting_level == next.nesting_level
2600    }
2601
2602    /// Analyze the spacing between two list blocks
2603    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2604        let gap = next.start_line - current.end_line;
2605
2606        match gap {
2607            1 => BlockSpacing::Consecutive,
2608            2 => BlockSpacing::SingleBlank,
2609            _ if gap > 2 => {
2610                if self.has_only_blank_lines_between(current, next) {
2611                    BlockSpacing::MultipleBlanks
2612                } else {
2613                    BlockSpacing::ContentBetween
2614                }
2615            }
2616            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2617        }
2618    }
2619
2620    /// Check if unordered lists can be merged with a single blank line between
2621    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2622        // Check if there are structural separators between the blocks
2623        // If has_meaningful_content_between returns true, it means there are structural separators
2624        if has_meaningful_content_between(current, next, self.lines) {
2625            return false; // Structural separators prevent merging
2626        }
2627
2628        // Only merge unordered lists with same marker across single blank
2629        !current.is_ordered && current.marker == next.marker
2630    }
2631
2632    /// Check if ordered lists can be merged when there's content between them
2633    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2634        // Do not merge lists if there are structural separators between them
2635        if has_meaningful_content_between(current, next, self.lines) {
2636            return false; // Structural separators prevent merging
2637        }
2638
2639        // Only consider merging ordered lists if there's no structural content between
2640        current.is_ordered && next.is_ordered
2641    }
2642
2643    /// Check if there are only blank lines between blocks
2644    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2645        for line_num in (current.end_line + 1)..next.start_line {
2646            if let Some(line_info) = self.lines.get(line_num - 1)
2647                && !line_info.content.trim().is_empty()
2648            {
2649                return false;
2650            }
2651        }
2652        true
2653    }
2654
2655    /// Merge two compatible list blocks into one
2656    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2657        current.end_line = next.end_line;
2658        current.item_lines.extend_from_slice(&next.item_lines);
2659
2660        // Update max marker width
2661        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2662
2663        // Handle marker consistency for unordered lists
2664        if !current.is_ordered && self.markers_differ(&current, next) {
2665            current.marker = None; // Mixed markers
2666        }
2667
2668        current
2669    }
2670
2671    /// Check if two blocks have different markers
2672    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2673        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2674    }
2675}
2676
2677/// Types of spacing between list blocks
2678#[derive(Debug, PartialEq)]
2679enum BlockSpacing {
2680    Consecutive,    // No gap between blocks
2681    SingleBlank,    // One blank line between blocks
2682    MultipleBlanks, // Multiple blank lines but no content
2683    ContentBetween, // Content exists between blocks
2684}
2685
2686/// Check if there's meaningful content (not just blank lines) between two list blocks
2687fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2688    // Check lines between current.end_line and next.start_line
2689    for line_num in (current.end_line + 1)..next.start_line {
2690        if let Some(line_info) = lines.get(line_num - 1) {
2691            // Convert to 0-indexed
2692            let trimmed = line_info.content.trim();
2693
2694            // Skip empty lines
2695            if trimmed.is_empty() {
2696                continue;
2697            }
2698
2699            // Check for structural separators that should separate lists (CommonMark compliant)
2700
2701            // Headings separate lists
2702            if line_info.heading.is_some() {
2703                return true; // Has meaningful content - headings separate lists
2704            }
2705
2706            // Horizontal rules separate lists (---, ***, ___)
2707            if is_horizontal_rule(trimmed) {
2708                return true; // Has meaningful content - horizontal rules separate lists
2709            }
2710
2711            // Tables separate lists (lines containing | but not in URLs or code)
2712            // Simple heuristic: tables typically have | at start/end or multiple |
2713            if trimmed.contains('|') && trimmed.len() > 1 {
2714                // Don't treat URLs with | as tables
2715                if !trimmed.contains("](") && !trimmed.contains("http") {
2716                    // More robust check: tables usually have multiple | or | at edges
2717                    let pipe_count = trimmed.matches('|').count();
2718                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2719                        return true; // Has meaningful content - tables separate lists
2720                    }
2721                }
2722            }
2723
2724            // Blockquotes separate lists
2725            if trimmed.starts_with('>') {
2726                return true; // Has meaningful content - blockquotes separate lists
2727            }
2728
2729            // Code block fences separate lists (unless properly indented as list content)
2730            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2731                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2732
2733                // Check if this code block is properly indented as list continuation
2734                let min_continuation_indent = if current.is_ordered {
2735                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2736                } else {
2737                    current.nesting_level + 2
2738                };
2739
2740                if line_indent < min_continuation_indent {
2741                    // This is a standalone code block that separates lists
2742                    return true; // Has meaningful content - standalone code blocks separate lists
2743                }
2744            }
2745
2746            // Check if this line has proper indentation for list continuation
2747            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2748
2749            // Calculate minimum indentation needed to be list continuation
2750            let min_indent = if current.is_ordered {
2751                current.nesting_level + current.max_marker_width
2752            } else {
2753                current.nesting_level + 2
2754            };
2755
2756            // If the line is not indented enough to be list continuation, it's meaningful content
2757            if line_indent < min_indent {
2758                return true; // Has meaningful content - content not indented as list continuation
2759            }
2760
2761            // If we reach here, the line is properly indented as list continuation
2762            // Continue checking other lines
2763        }
2764    }
2765
2766    // Only blank lines or properly indented list continuation content between blocks
2767    false
2768}
2769
/// Check if a line is a horizontal rule (`---`, `***`, `___`).
///
/// `trimmed` is expected to already have its surrounding whitespace removed:
/// the very first character must be the rule marker (`-`, `*`, or `_`),
/// otherwise the line is rejected.
///
/// Returns `true` when the line contains at least three occurrences of that
/// marker and nothing else except spaces or tabs (so `- - -` is accepted,
/// while `--x--` and `-*-` are not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes can never hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character decides which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch) if matches!(ch, '-' | '*' | '_') => ch,
        _ => return false,
    };

    // Single pass without an intermediate allocation: count markers and bail
    // out on the first character that is neither the marker nor a space/tab.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
2793
/// Unit tests for `LintContext`: line offsets, offset-to-(line, column)
/// conversion, per-line metadata, and list item detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input is expected to yield one line offset and no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.lines.len(), 0);
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
    }

    /// A document without a newline is expected to have exactly one line offset.
    #[test]
    fn test_single_line() {
        let source = "# Hello";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        assert_eq!(ctx.content, source);
        assert_eq!(ctx.line_offsets, vec![0]);

        // (byte offset, expected (line, column))
        for (offset, line_col) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), line_col);
        }
    }

    /// Line offsets and offset lookups across several lines, including a blank one.
    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line", MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let expectations = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, line_col) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), line_col);
        }
    }

    /// Per-line metadata (content, byte offset, indent, blank flag) plus the
    /// line-number based helper accessors.
    #[test]
    fn test_line_info() {
        let ctx = LintContext::new(
            "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```",
            MarkdownFlavor::Standard,
        );

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let title = &ctx.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Helper methods take line numbers starting at 1:
        // (line number, expected byte offset, expected indent)
        for (line_no, byte_offset, indent) in [(1, 0, 0), (2, 8, 4)] {
            assert_eq!(ctx.line_to_byte_offset(line_no), Some(byte_offset));
            assert_eq!(ctx.line_info(line_no).map(|info| info.indent), Some(indent));
        }
    }

    /// Unordered and ordered list markers are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let ctx = LintContext::new(
            "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list",
            MarkdownFlavor::Standard,
        );

        // Line 1: "- Unordered item"
        let dash_line = &ctx.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_line = &ctx.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_line = &ctx.lines[2];
        assert!(ordered_line.list_item.is_some());
        let ordered_item = ordered_line.list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        let plain_line = &ctx.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets at and just past each character of a three-line document,
    /// including the position after the final character.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc", MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        // (byte offset, expected (line, column))
        let expectations = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, line_col) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), line_col);
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs