rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use crate::utils::element_cache::ElementCache;
5use crate::utils::regex_cache::URL_SIMPLE_REGEX;
6use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
7use regex::Regex;
8use std::borrow::Cow;
9use std::path::PathBuf;
10use std::sync::LazyLock;
11
/// Macro for profiling sections - only active in non-WASM builds.
///
/// Evaluates `$code` and yields its value. When `$profile` is true, the
/// wall-clock time spent in `$code` is printed to stderr as
/// `[PROFILE] <name>: <elapsed>`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let start = std::time::Instant::now();
        let result = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}

/// WASM variant of `profile_section!`: no timing, no output; it simply yields
/// the value of `$code`.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // Reference the flag so the caller's `profile` binding still counts as
        // used on this target (otherwise it would never appear in the expansion).
        let _ = &$profile;
        $code
    }};
}
29
// Comprehensive link pattern that captures both inline links (`[text](url "title")`)
// and reference links (`[text][ref]`).
// Flags: `s` makes `.` match newlines; `x` is verbose mode, so the layout whitespace
// and the `# ...` annotations inside the pattern are ignored by the regex engine.
// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
// 4/5 = double-/single-quoted title, 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});

// Image pattern: same shape and capture groups as LINK_PATTERN, but anchored on
// the leading `!` and with alt text in group 1.
// Flags: `s` makes `.` match newlines; `x` enables verbose mode.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});

// Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
// `(?m)` lets `^`/`$` match at line boundaries.
// Capture groups: 1 = reference ID, 2 = URL, 3/4 = double-/single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());

// Pattern for bare URLs - uses centralized URL pattern from regex_cache
// (no local static is defined for it here).

// Pattern for email addresses (local part, `@`, domain, and a TLD of at least two letters)
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());

// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures optional leading whitespace, one or more `>` and any whitespace after them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
70
71/// Pre-computed information about a line
72#[derive(Debug, Clone)]
73pub struct LineInfo {
74    /// Byte offset where this line starts in the document
75    pub byte_offset: usize,
76    /// Length of the line in bytes (without newline)
77    pub byte_len: usize,
78    /// Number of bytes of leading whitespace (for substring extraction)
79    pub indent: usize,
80    /// Visual column width of leading whitespace (with proper tab expansion)
81    /// Per CommonMark, tabs expand to the next column that is a multiple of 4.
82    /// Use this for numeric comparisons like checking for indented code blocks (>= 4).
83    pub visual_indent: usize,
84    /// Whether the line is blank (empty or only whitespace)
85    pub is_blank: bool,
86    /// Whether this line is inside a code block
87    pub in_code_block: bool,
88    /// Whether this line is inside front matter
89    pub in_front_matter: bool,
90    /// Whether this line is inside an HTML block
91    pub in_html_block: bool,
92    /// Whether this line is inside an HTML comment
93    pub in_html_comment: bool,
94    /// List item information if this line starts a list item
95    pub list_item: Option<ListItemInfo>,
96    /// Heading information if this line is a heading
97    pub heading: Option<HeadingInfo>,
98    /// Blockquote information if this line is a blockquote
99    pub blockquote: Option<BlockquoteInfo>,
100    /// Whether this line is inside a mkdocstrings autodoc block
101    pub in_mkdocstrings: bool,
102    /// Whether this line is part of an ESM import/export block (MDX only)
103    pub in_esm_block: bool,
104    /// Whether this line is a continuation of a multi-line code span from a previous line
105    pub in_code_span_continuation: bool,
106    /// Whether this line is a horizontal rule (---, ***, ___, etc.)
107    /// Pre-computed for consistent detection across all rules
108    pub is_horizontal_rule: bool,
109    /// Whether this line is inside a math block ($$ ... $$)
110    pub in_math_block: bool,
111}
112
113impl LineInfo {
114    /// Get the line content as a string slice from the source document
115    pub fn content<'a>(&self, source: &'a str) -> &'a str {
116        &source[self.byte_offset..self.byte_offset + self.byte_len]
117    }
118}
119
/// Information about a list item, attached to the line on which the item starts
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` otherwise)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    /// (presumably 0-based like `marker_column` - confirm against the parser)
    pub content_column: usize,
}
134
/// Heading style type
///
/// Equality on this enum is total, so it derives `Eq` alongside `PartialEq`;
/// this lets it be used wherever an `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
145
/// Parsed link information
///
/// Text fields are `Cow`s tied to the document lifetime `'a`, so they may either
/// borrow from the source text or own a separately built string.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
170
/// Information about a broken link reported by pulldown-cmark
/// (a reference that could not be resolved)
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
179
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix), e.g. `1` or `note`
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
192
/// Parsed image information
///
/// Mirrors `ParsedLink` field for field, with `alt_text` in place of the link text.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
217
/// Reference definition [ref]: url "title"
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title (`None` when the definition has no title)
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
    /// Byte offset where the title starts (if present, includes quote)
    pub title_byte_start: Option<usize>,
    /// Byte offset where the title ends (if present, includes quote)
    pub title_byte_end: Option<usize>,
}
238
/// Parsed code span information
///
/// A span may cover several lines; `line` and `end_line` differ in that case.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
259
/// Information about a heading
///
/// Also recorded for malformed ATX headings (see `is_valid`), so consumers that
/// only want CommonMark-compliant headings must check that flag.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
    /// False for malformed headings like `#NoSpace` that MD018 should flag
    pub is_valid: bool,
}
287
/// A valid heading from a filtered iteration
///
/// Only includes headings that are CommonMark-compliant (have space after #).
/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
///
/// Both references borrow from the slice of `LineInfo` being iterated.
#[derive(Debug, Clone)]
pub struct ValidHeading<'a> {
    /// The 1-indexed line number in the document
    pub line_num: usize,
    /// Reference to the heading information
    pub heading: &'a HeadingInfo,
    /// Reference to the full line info (for rules that need additional context)
    pub line_info: &'a LineInfo,
}
301
302/// Iterator over valid CommonMark headings in a document
303///
304/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
305/// but should not be processed by other heading rules.
306pub struct ValidHeadingsIter<'a> {
307    lines: &'a [LineInfo],
308    current_index: usize,
309}
310
311impl<'a> ValidHeadingsIter<'a> {
312    fn new(lines: &'a [LineInfo]) -> Self {
313        Self {
314            lines,
315            current_index: 0,
316        }
317    }
318}
319
320impl<'a> Iterator for ValidHeadingsIter<'a> {
321    type Item = ValidHeading<'a>;
322
323    fn next(&mut self) -> Option<Self::Item> {
324        while self.current_index < self.lines.len() {
325            let idx = self.current_index;
326            self.current_index += 1;
327
328            let line_info = &self.lines[idx];
329            if let Some(heading) = &line_info.heading
330                && heading.is_valid
331            {
332                return Some(ValidHeading {
333                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
334                    heading,
335                    line_info,
336                });
337            }
338        }
339        None
340    }
341}
342
/// Information about a blockquote line
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
363
/// Information about a list block (a run of related list items)
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    /// (presumably 1-indexed like `start_line` - confirm against the parser)
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
384
385use std::sync::{Arc, OnceLock};
386
/// Map from line byte offset to list item data: (is_ordered, marker, marker_column, content_column, number)
///
/// The tuple carries the same pieces of information as the fields of `ListItemInfo`,
/// in a different order.
type ListItemMap = std::collections::HashMap<usize, (bool, String, usize, usize, Option<usize>)>;
389
/// Character frequency data for fast content analysis
///
/// Each field counts occurrences of one Markdown-significant character; the
/// parenthesised notes name the constructs that character can introduce.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
418
/// Pre-parsed HTML tag information (one entry per tag occurrence)
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
441
/// Pre-parsed emphasis span information
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis (presumably without the markers - confirm against the parser)
    pub content: String,
}
462
/// Pre-parsed table row information
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one entry per column, encoded as strings
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
475
/// Pre-parsed bare URL information (not in links)
///
/// Despite the name, email addresses are also represented (see `url_type`).
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
494
/// Shared, pre-computed view of one Markdown document.
///
/// Built once by `LintContext::new`. Most fields are filled eagerly during
/// construction; the private `*_cache` fields are `OnceLock`s read through
/// accessor methods, some of which are only populated on first access.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>, // Byte offset at which each line starts (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Inline code spans (already filled by `new`)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Emphasis spans (already filled by `new`)
    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
    has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
520
/// The four consecutive pieces of a blockquote line, each borrowed from the input.
///
/// Concatenating `indent`, `markers`, `spaces_after` and `content` reproduces the
/// original line.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into blockquote components without using a regex.
///
/// The line is consumed left to right: leading spaces/tabs, then a run of `>`,
/// then spaces/tabs, then everything else. Returns `None` when no `>` follows
/// the leading whitespace.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    /// Number of leading bytes of `text` accepted by `accept`.
    fn leading_run(text: &str, accept: impl Fn(u8) -> bool) -> usize {
        text.bytes().take_while(|&b| accept(b)).count()
    }

    let is_space_or_tab = |b: u8| b == b' ' || b == b'\t';

    // All split points sit right after ASCII bytes, so they are valid char boundaries.
    let (indent, after_indent) = line.split_at(leading_run(line, is_space_or_tab));

    let marker_len = leading_run(after_indent, |b| b == b'>');
    if marker_len == 0 {
        // Not a blockquote: at least one '>' is required.
        return None;
    }
    let (markers, after_markers) = after_indent.split_at(marker_len);

    let (spaces_after, content) = after_markers.split_at(leading_run(after_markers, is_space_or_tab));

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
565
566impl<'a> LintContext<'a> {
    /// Build a lint context for `content`, running all eager pre-computation passes.
    ///
    /// The passes run in a fixed order because later ones consume the results of
    /// earlier ones (for example, code blocks and HTML comment ranges feed the
    /// basic line info, and code spans feed link and image parsing).
    ///
    /// On non-WASM builds, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable makes every pass print its elapsed time to stderr.
    ///
    /// Code spans and emphasis spans are stored in their caches right away; the
    /// HTML tag, table row, bare URL and mixed-list-nesting caches start empty.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset at which each line starts; line 1 always starts at 0.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information AND emphasis spans (without headings/blockquotes yet)
        // Emphasis spans are captured during the same pulldown-cmark parse as list detection
        let (mut lines, emphasis_spans) = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                // (line numbers are 1-indexed, hence the `- 1` when indexing `lines`)
                for line_num in (span.line + 1)..=span.end_line {
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Already computed above, so the cache starts out filled.
            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
            list_blocks,
            char_frequency,
            html_tags_cache: OnceLock::new(),
            // Produced by the basic line info pass, so the cache starts out filled.
            emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
            table_rows_cache: OnceLock::new(),
            bare_urls_cache: OnceLock::new(),
            has_mixed_list_nesting_cache: OnceLock::new(),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
725
726    /// Get code spans - computed lazily on first access
727    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
728        Arc::clone(
729            self.code_spans_cache
730                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
731        )
732    }
733
734    /// Get HTML comment ranges - pre-computed during LintContext construction
735    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
736        &self.html_comment_ranges
737    }
738
739    /// Get HTML tags - computed lazily on first access
740    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
741        Arc::clone(self.html_tags_cache.get_or_init(|| {
742            Arc::new(Self::parse_html_tags(
743                self.content,
744                &self.lines,
745                &self.code_blocks,
746                self.flavor,
747            ))
748        }))
749    }
750
751    /// Get emphasis spans - pre-computed during construction
752    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
753        Arc::clone(
754            self.emphasis_spans_cache
755                .get()
756                .expect("emphasis_spans_cache initialized during construction"),
757        )
758    }
759
760    /// Get table rows - computed lazily on first access
761    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
762        Arc::clone(
763            self.table_rows_cache
764                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
765        )
766    }
767
768    /// Get bare URLs - computed lazily on first access
769    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
770        Arc::clone(
771            self.bare_urls_cache
772                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
773        )
774    }
775
776    /// Check if document has mixed ordered/unordered list nesting.
777    /// Result is cached after first computation (document-level invariant).
778    /// This is used by MD007 for smart style auto-detection.
779    pub fn has_mixed_list_nesting(&self) -> bool {
780        *self
781            .has_mixed_list_nesting_cache
782            .get_or_init(|| self.compute_mixed_list_nesting())
783    }
784
785    /// Internal computation for mixed list nesting (only called once per LintContext).
786    fn compute_mixed_list_nesting(&self) -> bool {
787        // Track parent list items by their marker position and type
788        // Using marker_column instead of indent because it works correctly
789        // for blockquoted content where indent doesn't account for the prefix
790        // Stack stores: (marker_column, is_ordered)
791        let mut stack: Vec<(usize, bool)> = Vec::new();
792        let mut last_was_blank = false;
793
794        for line_info in &self.lines {
795            // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
796            if line_info.in_code_block
797                || line_info.in_front_matter
798                || line_info.in_mkdocstrings
799                || line_info.in_html_comment
800                || line_info.in_esm_block
801            {
802                continue;
803            }
804
805            // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
806            if line_info.is_blank {
807                last_was_blank = true;
808                continue;
809            }
810
811            if let Some(list_item) = &line_info.list_item {
812                // Normalize column 1 to column 0 (consistent with MD007 check function)
813                let current_pos = if list_item.marker_column == 1 {
814                    0
815                } else {
816                    list_item.marker_column
817                };
818
819                // If there was a blank line and this item is at root level, reset stack
820                if last_was_blank && current_pos == 0 {
821                    stack.clear();
822                }
823                last_was_blank = false;
824
825                // Pop items at same or greater position (they're siblings or deeper, not parents)
826                while let Some(&(pos, _)) = stack.last() {
827                    if pos >= current_pos {
828                        stack.pop();
829                    } else {
830                        break;
831                    }
832                }
833
834                // Check if immediate parent has different type - this is mixed nesting
835                if let Some(&(_, parent_is_ordered)) = stack.last()
836                    && parent_is_ordered != list_item.is_ordered
837                {
838                    return true; // Found mixed nesting - early exit
839                }
840
841                stack.push((current_pos, list_item.is_ordered));
842            } else {
843                // Non-list line (but not blank) - could be paragraph or other content
844                last_was_blank = false;
845            }
846        }
847
848        false
849    }
850
851    /// Map a byte offset to (line, column)
852    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
853        match self.line_offsets.binary_search(&offset) {
854            Ok(line) => (line + 1, 1),
855            Err(line) => {
856                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
857                (line, offset - line_start + 1)
858            }
859        }
860    }
861
862    /// Check if a position is within a code block or code span
863    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
864        // Check code blocks first
865        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
866            return true;
867        }
868
869        // Check inline code spans (lazy load if needed)
870        self.code_spans()
871            .iter()
872            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
873    }
874
875    /// Get line information by line number (1-indexed)
876    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
877        if line_num > 0 {
878            self.lines.get(line_num - 1)
879        } else {
880            None
881        }
882    }
883
884    /// Get byte offset for a line number (1-indexed)
885    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
886        self.line_info(line_num).map(|info| info.byte_offset)
887    }
888
889    /// Get URL for a reference link/image by its ID
890    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
891        let normalized_id = ref_id.to_lowercase();
892        self.reference_defs
893            .iter()
894            .find(|def| def.id == normalized_id)
895            .map(|def| def.url.as_str())
896    }
897
898    /// Check if a line is part of a list block
899    pub fn is_in_list_block(&self, line_num: usize) -> bool {
900        self.list_blocks
901            .iter()
902            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
903    }
904
905    /// Get the list block containing a specific line
906    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
907        self.list_blocks
908            .iter()
909            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
910    }
911
912    // Compatibility methods for DocumentStructure migration
913
914    /// Check if a line is within a code block
915    pub fn is_in_code_block(&self, line_num: usize) -> bool {
916        if line_num == 0 || line_num > self.lines.len() {
917            return false;
918        }
919        self.lines[line_num - 1].in_code_block
920    }
921
922    /// Check if a line is within front matter
923    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
924        if line_num == 0 || line_num > self.lines.len() {
925            return false;
926        }
927        self.lines[line_num - 1].in_front_matter
928    }
929
930    /// Check if a line is within an HTML block
931    pub fn is_in_html_block(&self, line_num: usize) -> bool {
932        if line_num == 0 || line_num > self.lines.len() {
933            return false;
934        }
935        self.lines[line_num - 1].in_html_block
936    }
937
938    /// Check if a line and column is within a code span
939    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
940        if line_num == 0 || line_num > self.lines.len() {
941            return false;
942        }
943
944        // Use the code spans cache to check
945        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
946        // Convert col to 0-indexed for comparison
947        let col_0indexed = if col > 0 { col - 1 } else { 0 };
948        let code_spans = self.code_spans();
949        code_spans.iter().any(|span| {
950            // Check if line is within the span's line range
951            if line_num < span.line || line_num > span.end_line {
952                return false;
953            }
954
955            if span.line == span.end_line {
956                // Single-line span: check column bounds
957                col_0indexed >= span.start_col && col_0indexed < span.end_col
958            } else if line_num == span.line {
959                // First line of multi-line span: anything after start_col is in span
960                col_0indexed >= span.start_col
961            } else if line_num == span.end_line {
962                // Last line of multi-line span: anything before end_col is in span
963                col_0indexed < span.end_col
964            } else {
965                // Middle line of multi-line span: entire line is in span
966                true
967            }
968        })
969    }
970
971    /// Check if a byte offset is within a code span
972    #[inline]
973    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
974        let code_spans = self.code_spans();
975        code_spans
976            .iter()
977            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
978    }
979
980    /// Check if a byte position is within a reference definition
981    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
982    #[inline]
983    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
984        self.reference_defs
985            .iter()
986            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
987    }
988
989    /// Check if a byte position is within an HTML comment
990    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
991    /// where k is the number of HTML comments (typically very small)
992    #[inline]
993    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
994        self.html_comment_ranges
995            .iter()
996            .any(|range| byte_pos >= range.start && byte_pos < range.end)
997    }
998
999    /// Check if a byte position is within an HTML tag (including multiline tags)
1000    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
1001    #[inline]
1002    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1003        self.html_tags()
1004            .iter()
1005            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1006    }
1007
1008    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
1009    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1010        self.jinja_ranges
1011            .iter()
1012            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1013    }
1014
1015    /// Check if a byte position is within a link reference definition title
1016    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1017        self.reference_defs.iter().any(|def| {
1018            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1019                byte_pos >= start && byte_pos < end
1020            } else {
1021                false
1022            }
1023        })
1024    }
1025
1026    /// Check if content has any instances of a specific character (fast)
1027    pub fn has_char(&self, ch: char) -> bool {
1028        match ch {
1029            '#' => self.char_frequency.hash_count > 0,
1030            '*' => self.char_frequency.asterisk_count > 0,
1031            '_' => self.char_frequency.underscore_count > 0,
1032            '-' => self.char_frequency.hyphen_count > 0,
1033            '+' => self.char_frequency.plus_count > 0,
1034            '>' => self.char_frequency.gt_count > 0,
1035            '|' => self.char_frequency.pipe_count > 0,
1036            '[' => self.char_frequency.bracket_count > 0,
1037            '`' => self.char_frequency.backtick_count > 0,
1038            '<' => self.char_frequency.lt_count > 0,
1039            '!' => self.char_frequency.exclamation_count > 0,
1040            '\n' => self.char_frequency.newline_count > 0,
1041            _ => self.content.contains(ch), // Fallback for other characters
1042        }
1043    }
1044
1045    /// Get count of a specific character (fast)
1046    pub fn char_count(&self, ch: char) -> usize {
1047        match ch {
1048            '#' => self.char_frequency.hash_count,
1049            '*' => self.char_frequency.asterisk_count,
1050            '_' => self.char_frequency.underscore_count,
1051            '-' => self.char_frequency.hyphen_count,
1052            '+' => self.char_frequency.plus_count,
1053            '>' => self.char_frequency.gt_count,
1054            '|' => self.char_frequency.pipe_count,
1055            '[' => self.char_frequency.bracket_count,
1056            '`' => self.char_frequency.backtick_count,
1057            '<' => self.char_frequency.lt_count,
1058            '!' => self.char_frequency.exclamation_count,
1059            '\n' => self.char_frequency.newline_count,
1060            _ => self.content.matches(ch).count(), // Fallback for other characters
1061        }
1062    }
1063
1064    /// Check if content likely contains headings (fast)
1065    pub fn likely_has_headings(&self) -> bool {
1066        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1067    }
1068
1069    /// Check if content likely contains lists (fast)
1070    pub fn likely_has_lists(&self) -> bool {
1071        self.char_frequency.asterisk_count > 0
1072            || self.char_frequency.hyphen_count > 0
1073            || self.char_frequency.plus_count > 0
1074    }
1075
1076    /// Check if content likely contains emphasis (fast)
1077    pub fn likely_has_emphasis(&self) -> bool {
1078        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1079    }
1080
1081    /// Check if content likely contains tables (fast)
1082    pub fn likely_has_tables(&self) -> bool {
1083        self.char_frequency.pipe_count > 2
1084    }
1085
1086    /// Check if content likely contains blockquotes (fast)
1087    pub fn likely_has_blockquotes(&self) -> bool {
1088        self.char_frequency.gt_count > 0
1089    }
1090
1091    /// Check if content likely contains code (fast)
1092    pub fn likely_has_code(&self) -> bool {
1093        self.char_frequency.backtick_count > 0
1094    }
1095
1096    /// Check if content likely contains links or images (fast)
1097    pub fn likely_has_links_or_images(&self) -> bool {
1098        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1099    }
1100
1101    /// Check if content likely contains HTML (fast)
1102    pub fn likely_has_html(&self) -> bool {
1103        self.char_frequency.lt_count > 0
1104    }
1105
1106    /// Get HTML tags on a specific line
1107    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1108        self.html_tags()
1109            .iter()
1110            .filter(|tag| tag.line == line_num)
1111            .cloned()
1112            .collect()
1113    }
1114
1115    /// Get emphasis spans on a specific line
1116    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1117        self.emphasis_spans()
1118            .iter()
1119            .filter(|span| span.line == line_num)
1120            .cloned()
1121            .collect()
1122    }
1123
1124    /// Get table rows on a specific line
1125    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1126        self.table_rows()
1127            .iter()
1128            .filter(|row| row.line == line_num)
1129            .cloned()
1130            .collect()
1131    }
1132
1133    /// Get bare URLs on a specific line
1134    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1135        self.bare_urls()
1136            .iter()
1137            .filter(|url| url.line == line_num)
1138            .cloned()
1139            .collect()
1140    }
1141
1142    /// Find the line index for a given byte offset using binary search.
1143    /// Returns (line_index, line_number, column) where:
1144    /// - line_index is the 0-based index in the lines array
1145    /// - line_number is the 1-based line number
1146    /// - column is the byte offset within that line
1147    #[inline]
1148    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1149        // Binary search to find the line containing this byte offset
1150        let idx = match lines.binary_search_by(|line| {
1151            if byte_offset < line.byte_offset {
1152                std::cmp::Ordering::Greater
1153            } else if byte_offset > line.byte_offset + line.byte_len {
1154                std::cmp::Ordering::Less
1155            } else {
1156                std::cmp::Ordering::Equal
1157            }
1158        }) {
1159            Ok(idx) => idx,
1160            Err(idx) => idx.saturating_sub(1),
1161        };
1162
1163        let line = &lines[idx];
1164        let line_num = idx + 1;
1165        let col = byte_offset.saturating_sub(line.byte_offset);
1166
1167        (idx, line_num, col)
1168    }
1169
1170    /// Check if a byte offset is within a code span using binary search
1171    #[inline]
1172    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1173        // Since spans are sorted by byte_offset, use partition_point for binary search
1174        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1175
1176        // Check the span that starts at or before our offset
1177        if idx > 0 {
1178            let span = &code_spans[idx - 1];
1179            if offset >= span.byte_offset && offset < span.byte_end {
1180                return true;
1181            }
1182        }
1183
1184        false
1185    }
1186
1187    /// Collect byte ranges of all links using pulldown-cmark
1188    /// This is used to skip heading detection for lines that fall within link syntax
1189    /// (e.g., multiline links like `[text](url\n#fragment)`)
1190    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1191        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1192
1193        let mut link_ranges = Vec::new();
1194        let mut options = Options::empty();
1195        options.insert(Options::ENABLE_WIKILINKS);
1196        options.insert(Options::ENABLE_FOOTNOTES);
1197
1198        let parser = Parser::new_ext(content, options).into_offset_iter();
1199        let mut link_stack: Vec<usize> = Vec::new();
1200
1201        for (event, range) in parser {
1202            match event {
1203                Event::Start(Tag::Link { .. }) => {
1204                    link_stack.push(range.start);
1205                }
1206                Event::End(TagEnd::Link) => {
1207                    if let Some(start_pos) = link_stack.pop() {
1208                        link_ranges.push((start_pos, range.end));
1209                    }
1210                }
1211                _ => {}
1212            }
1213        }
1214
1215        link_ranges
1216    }
1217
1218    /// Parse all links in the content
1219    fn parse_links(
1220        content: &'a str,
1221        lines: &[LineInfo],
1222        code_blocks: &[(usize, usize)],
1223        code_spans: &[CodeSpan],
1224        flavor: MarkdownFlavor,
1225        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1226    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1227        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1228        use std::collections::HashSet;
1229
1230        let mut links = Vec::with_capacity(content.len() / 500);
1231        let mut broken_links = Vec::new();
1232        let mut footnote_refs = Vec::new();
1233
1234        // Track byte positions of links found by pulldown-cmark
1235        let mut found_positions = HashSet::new();
1236
1237        // Use pulldown-cmark's streaming parser with BrokenLink callback
1238        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1239        // This automatically handles:
1240        // - Escaped links (won't generate events)
1241        // - Links in code blocks/spans (won't generate Link events)
1242        // - Images (generates Tag::Image instead)
1243        // - Reference resolution (dest_url is already resolved!)
1244        // - Broken references (callback is invoked)
1245        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1246        let mut options = Options::empty();
1247        options.insert(Options::ENABLE_WIKILINKS);
1248        options.insert(Options::ENABLE_FOOTNOTES);
1249
1250        let parser = Parser::new_with_broken_link_callback(
1251            content,
1252            options,
1253            Some(|link: BrokenLink<'_>| {
1254                broken_links.push(BrokenLinkInfo {
1255                    reference: link.reference.to_string(),
1256                    span: link.span.clone(),
1257                });
1258                None
1259            }),
1260        )
1261        .into_offset_iter();
1262
1263        let mut link_stack: Vec<(
1264            usize,
1265            usize,
1266            pulldown_cmark::CowStr<'a>,
1267            LinkType,
1268            pulldown_cmark::CowStr<'a>,
1269        )> = Vec::new();
1270        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1271
1272        for (event, range) in parser {
1273            match event {
1274                Event::Start(Tag::Link {
1275                    link_type,
1276                    dest_url,
1277                    id,
1278                    ..
1279                }) => {
1280                    // Link start - record position, URL, and reference ID
1281                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1282                    text_chunks.clear();
1283                }
1284                Event::Text(text) if !link_stack.is_empty() => {
1285                    // Track text content with its byte range
1286                    text_chunks.push((text.to_string(), range.start, range.end));
1287                }
1288                Event::Code(code) if !link_stack.is_empty() => {
1289                    // Include inline code in link text (with backticks)
1290                    let code_text = format!("`{code}`");
1291                    text_chunks.push((code_text, range.start, range.end));
1292                }
1293                Event::End(TagEnd::Link) => {
1294                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1295                        // Skip if in HTML comment
1296                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1297                            text_chunks.clear();
1298                            continue;
1299                        }
1300
1301                        // Find line and column information
1302                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303
1304                        // Skip if this link is on a MkDocs snippet line
1305                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1306                            text_chunks.clear();
1307                            continue;
1308                        }
1309
1310                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1311
1312                        let is_reference = matches!(
1313                            link_type,
1314                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1315                        );
1316
1317                        // Extract link text directly from source bytes to preserve escaping
1318                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1319                        let link_text = if start_pos < content.len() {
1320                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1321
1322                            // Find MATCHING ] by tracking bracket depth for nested brackets
1323                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1324                            // Brackets inside code spans (between backticks) should be ignored
1325                            let mut close_pos = None;
1326                            let mut depth = 0;
1327                            let mut in_code_span = false;
1328
1329                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1330                                // Count preceding backslashes
1331                                let mut backslash_count = 0;
1332                                let mut j = i;
1333                                while j > 0 && link_bytes[j - 1] == b'\\' {
1334                                    backslash_count += 1;
1335                                    j -= 1;
1336                                }
1337                                let is_escaped = backslash_count % 2 != 0;
1338
1339                                // Track code spans - backticks toggle in/out of code
1340                                if byte == b'`' && !is_escaped {
1341                                    in_code_span = !in_code_span;
1342                                }
1343
1344                                // Only count brackets when NOT in a code span
1345                                if !is_escaped && !in_code_span {
1346                                    if byte == b'[' {
1347                                        depth += 1;
1348                                    } else if byte == b']' {
1349                                        if depth == 0 {
1350                                            // Found the matching closing bracket
1351                                            close_pos = Some(i);
1352                                            break;
1353                                        } else {
1354                                            depth -= 1;
1355                                        }
1356                                    }
1357                                }
1358                            }
1359
1360                            if let Some(pos) = close_pos {
1361                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1362                            } else {
1363                                Cow::Borrowed("")
1364                            }
1365                        } else {
1366                            Cow::Borrowed("")
1367                        };
1368
1369                        // For reference links, use the actual reference ID from pulldown-cmark
1370                        let reference_id = if is_reference && !ref_id.is_empty() {
1371                            Some(Cow::Owned(ref_id.to_lowercase()))
1372                        } else if is_reference {
1373                            // For collapsed/shortcut references without explicit ID, use the link text
1374                            Some(Cow::Owned(link_text.to_lowercase()))
1375                        } else {
1376                            None
1377                        };
1378
1379                        // Track this position as found
1380                        found_positions.insert(start_pos);
1381
1382                        links.push(ParsedLink {
1383                            line: line_num,
1384                            start_col: col_start,
1385                            end_col: col_end,
1386                            byte_offset: start_pos,
1387                            byte_end: range.end,
1388                            text: link_text,
1389                            url: Cow::Owned(url.to_string()),
1390                            is_reference,
1391                            reference_id,
1392                            link_type,
1393                        });
1394
1395                        text_chunks.clear();
1396                    }
1397                }
1398                Event::FootnoteReference(footnote_id) => {
1399                    // Capture footnote references like [^1], [^note]
1400                    // Skip if in HTML comment
1401                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1402                        continue;
1403                    }
1404
1405                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1406                    footnote_refs.push(FootnoteRef {
1407                        id: footnote_id.to_string(),
1408                        line: line_num,
1409                        byte_offset: range.start,
1410                        byte_end: range.end,
1411                    });
1412                }
1413                _ => {}
1414            }
1415        }
1416
1417        // Also find undefined references using regex
1418        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1419        // because the reference is undefined
1420        for cap in LINK_PATTERN.captures_iter(content) {
1421            let full_match = cap.get(0).unwrap();
1422            let match_start = full_match.start();
1423            let match_end = full_match.end();
1424
1425            // Skip if this was already found by pulldown-cmark (it's a valid link)
1426            if found_positions.contains(&match_start) {
1427                continue;
1428            }
1429
1430            // Skip if escaped
1431            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1432                continue;
1433            }
1434
1435            // Skip if it's an image
1436            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1437                continue;
1438            }
1439
1440            // Skip if in code block
1441            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1442                continue;
1443            }
1444
1445            // Skip if in code span
1446            if Self::is_offset_in_code_span(code_spans, match_start) {
1447                continue;
1448            }
1449
1450            // Skip if in HTML comment
1451            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1452                continue;
1453            }
1454
1455            // Find line and column information
1456            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1457
1458            // Skip if this link is on a MkDocs snippet line
1459            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1460                continue;
1461            }
1462
1463            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1464
1465            let text = cap.get(1).map_or("", |m| m.as_str());
1466
1467            // Only process reference links (group 6)
1468            if let Some(ref_id) = cap.get(6) {
1469                let ref_id_str = ref_id.as_str();
1470                let normalized_ref = if ref_id_str.is_empty() {
1471                    Cow::Owned(text.to_lowercase()) // Implicit reference
1472                } else {
1473                    Cow::Owned(ref_id_str.to_lowercase())
1474                };
1475
1476                // This is an undefined reference (pulldown-cmark didn't parse it)
1477                links.push(ParsedLink {
1478                    line: line_num,
1479                    start_col: col_start,
1480                    end_col: col_end,
1481                    byte_offset: match_start,
1482                    byte_end: match_end,
1483                    text: Cow::Borrowed(text),
1484                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1485                    is_reference: true,
1486                    reference_id: Some(normalized_ref),
1487                    link_type: LinkType::Reference, // Undefined references are reference-style
1488                });
1489            }
1490        }
1491
1492        (links, broken_links, footnote_refs)
1493    }
1494
1495    /// Parse all images in the content
1496    fn parse_images(
1497        content: &'a str,
1498        lines: &[LineInfo],
1499        code_blocks: &[(usize, usize)],
1500        code_spans: &[CodeSpan],
1501        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1502    ) -> Vec<ParsedImage<'a>> {
1503        use crate::utils::skip_context::is_in_html_comment_ranges;
1504        use std::collections::HashSet;
1505
1506        // Pre-size based on a heuristic: images are less common than links
1507        let mut images = Vec::with_capacity(content.len() / 1000);
1508        let mut found_positions = HashSet::new();
1509
1510        // Use pulldown-cmark for parsing - more accurate and faster
1511        let parser = Parser::new(content).into_offset_iter();
1512        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1513            Vec::new();
1514        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1515
1516        for (event, range) in parser {
1517            match event {
1518                Event::Start(Tag::Image {
1519                    link_type,
1520                    dest_url,
1521                    id,
1522                    ..
1523                }) => {
1524                    image_stack.push((range.start, dest_url, link_type, id));
1525                    text_chunks.clear();
1526                }
1527                Event::Text(text) if !image_stack.is_empty() => {
1528                    text_chunks.push((text.to_string(), range.start, range.end));
1529                }
1530                Event::Code(code) if !image_stack.is_empty() => {
1531                    let code_text = format!("`{code}`");
1532                    text_chunks.push((code_text, range.start, range.end));
1533                }
1534                Event::End(TagEnd::Image) => {
1535                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1536                        // Skip if in code block
1537                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1538                            continue;
1539                        }
1540
1541                        // Skip if in code span
1542                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1543                            continue;
1544                        }
1545
1546                        // Skip if in HTML comment
1547                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1548                            continue;
1549                        }
1550
1551                        // Find line and column using binary search
1552                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1553                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1554
1555                        let is_reference = matches!(
1556                            link_type,
1557                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1558                        );
1559
1560                        // Extract alt text directly from source bytes to preserve escaping
1561                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1562                        let alt_text = if start_pos < content.len() {
1563                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1564
1565                            // Find MATCHING ] by tracking bracket depth for nested brackets
1566                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1567                            let mut close_pos = None;
1568                            let mut depth = 0;
1569
1570                            if image_bytes.len() > 2 {
1571                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1572                                    // Count preceding backslashes
1573                                    let mut backslash_count = 0;
1574                                    let mut j = i;
1575                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1576                                        backslash_count += 1;
1577                                        j -= 1;
1578                                    }
1579                                    let is_escaped = backslash_count % 2 != 0;
1580
1581                                    if !is_escaped {
1582                                        if byte == b'[' {
1583                                            depth += 1;
1584                                        } else if byte == b']' {
1585                                            if depth == 0 {
1586                                                // Found the matching closing bracket
1587                                                close_pos = Some(i);
1588                                                break;
1589                                            } else {
1590                                                depth -= 1;
1591                                            }
1592                                        }
1593                                    }
1594                                }
1595                            }
1596
1597                            if let Some(pos) = close_pos {
1598                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1599                            } else {
1600                                Cow::Borrowed("")
1601                            }
1602                        } else {
1603                            Cow::Borrowed("")
1604                        };
1605
1606                        let reference_id = if is_reference && !ref_id.is_empty() {
1607                            Some(Cow::Owned(ref_id.to_lowercase()))
1608                        } else if is_reference {
1609                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1610                        } else {
1611                            None
1612                        };
1613
1614                        found_positions.insert(start_pos);
1615                        images.push(ParsedImage {
1616                            line: line_num,
1617                            start_col: col_start,
1618                            end_col: col_end,
1619                            byte_offset: start_pos,
1620                            byte_end: range.end,
1621                            alt_text,
1622                            url: Cow::Owned(url.to_string()),
1623                            is_reference,
1624                            reference_id,
1625                            link_type,
1626                        });
1627                    }
1628                }
1629                _ => {}
1630            }
1631        }
1632
1633        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1634        for cap in IMAGE_PATTERN.captures_iter(content) {
1635            let full_match = cap.get(0).unwrap();
1636            let match_start = full_match.start();
1637            let match_end = full_match.end();
1638
1639            // Skip if already found by pulldown-cmark
1640            if found_positions.contains(&match_start) {
1641                continue;
1642            }
1643
1644            // Skip if the ! is escaped
1645            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1646                continue;
1647            }
1648
1649            // Skip if in code block, code span, or HTML comment
1650            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1651                || Self::is_offset_in_code_span(code_spans, match_start)
1652                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1653            {
1654                continue;
1655            }
1656
1657            // Only process reference images (undefined references not found by pulldown-cmark)
1658            if let Some(ref_id) = cap.get(6) {
1659                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1660                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1661                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1662                let ref_id_str = ref_id.as_str();
1663                let normalized_ref = if ref_id_str.is_empty() {
1664                    Cow::Owned(alt_text.to_lowercase())
1665                } else {
1666                    Cow::Owned(ref_id_str.to_lowercase())
1667                };
1668
1669                images.push(ParsedImage {
1670                    line: line_num,
1671                    start_col: col_start,
1672                    end_col: col_end,
1673                    byte_offset: match_start,
1674                    byte_end: match_end,
1675                    alt_text: Cow::Borrowed(alt_text),
1676                    url: Cow::Borrowed(""),
1677                    is_reference: true,
1678                    reference_id: Some(normalized_ref),
1679                    link_type: LinkType::Reference, // Undefined references are reference-style
1680                });
1681            }
1682        }
1683
1684        images
1685    }
1686
1687    /// Parse reference definitions
1688    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1689        // Pre-size based on lines count as reference definitions are line-based
1690        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1691
1692        for (line_idx, line_info) in lines.iter().enumerate() {
1693            // Skip lines in code blocks
1694            if line_info.in_code_block {
1695                continue;
1696            }
1697
1698            let line = line_info.content(content);
1699            let line_num = line_idx + 1;
1700
1701            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1702                let id = cap.get(1).unwrap().as_str().to_lowercase();
1703                let url = cap.get(2).unwrap().as_str().to_string();
1704                let title_match = cap.get(3).or_else(|| cap.get(4));
1705                let title = title_match.map(|m| m.as_str().to_string());
1706
1707                // Calculate byte positions
1708                // The match starts at the beginning of the line (0) and extends to the end
1709                let match_obj = cap.get(0).unwrap();
1710                let byte_offset = line_info.byte_offset + match_obj.start();
1711                let byte_end = line_info.byte_offset + match_obj.end();
1712
1713                // Calculate title byte positions (includes the quote character before content)
1714                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1715                    // The match is the content inside quotes, so we include the quote before
1716                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1717                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1718                    (Some(start), Some(end))
1719                } else {
1720                    (None, None)
1721                };
1722
1723                refs.push(ReferenceDef {
1724                    line: line_num,
1725                    id,
1726                    url,
1727                    title,
1728                    byte_offset,
1729                    byte_end,
1730                    title_byte_start,
1731                    title_byte_end,
1732                });
1733            }
1734        }
1735
1736        refs
1737    }
1738
/// Hand-rolled blockquote prefix splitter (used instead of a regex for speed).
///
/// Consumes every leading `>` marker, including nested ones such as `> > > content`. Each
/// marker may be preceded by whitespace and followed by at most one space or tab, which is
/// counted as part of the prefix.
///
/// Returns `Some((prefix_with_ws, content_after_prefix))`, or `None` when the line does not
/// begin (after leading whitespace) with `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let mut prefix_len = 0;
    let mut rest = line;

    // Each iteration eats one nesting level: optional whitespace, '>', optional single pad.
    while let Some(after_marker) = rest.trim_start().strip_prefix('>') {
        let after_pad = after_marker.strip_prefix([' ', '\t']).unwrap_or(after_marker);
        prefix_len += rest.len() - after_pad.len();
        rest = after_pad;
    }

    // At least one '>' was consumed exactly when the prefix is non-empty.
    (prefix_len > 0).then(|| line.split_at(prefix_len))
}
1779
1780    /// Detect list items using pulldown-cmark for CommonMark-compliant parsing.
1781    ///
1782    /// Returns a HashMap keyed by line byte offset, containing:
1783    /// `(is_ordered, marker, marker_column, content_column, number)`
1784    ///
1785    /// ## Why pulldown-cmark?
1786    /// Using pulldown-cmark instead of regex ensures we only detect actual list items,
1787    /// not lines that merely look like lists (e.g., continuation paragraphs, code blocks).
1788    /// This fixes issue #253 where continuation lines were falsely detected.
1789    ///
1790    /// ## Tab indentation quirk
1791    /// Pulldown-cmark reports nested list items at the newline character position
1792    /// when tab indentation is used. For example, in `"* Item\n\t- Nested"`,
1793    /// the nested item is reported at byte 7 (the `\n`), not byte 8 (the `\t`).
1794    /// We detect this and advance to the correct line.
1795    ///
1796    /// ## HashMap key strategy
1797    /// We use `entry().or_insert()` because pulldown-cmark may emit multiple events
1798    /// that resolve to the same line (after newline adjustment). The first event
1799    /// for each line is authoritative.
1800    /// Detect list items and emphasis spans in a single pulldown-cmark pass.
1801    /// Returns both list items (for LineInfo) and emphasis spans (for MD030).
1802    /// This avoids a separate parse for emphasis detection.
1803    fn detect_list_items_and_emphasis_with_pulldown(
1804        content: &str,
1805        line_offsets: &[usize],
1806        flavor: MarkdownFlavor,
1807        front_matter_end: usize,
1808        code_blocks: &[(usize, usize)],
1809    ) -> (ListItemMap, Vec<EmphasisSpan>) {
1810        use std::collections::HashMap;
1811
1812        let mut list_items = HashMap::new();
1813        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1814
1815        let mut options = Options::empty();
1816        options.insert(Options::ENABLE_TABLES);
1817        options.insert(Options::ENABLE_FOOTNOTES);
1818        options.insert(Options::ENABLE_STRIKETHROUGH);
1819        options.insert(Options::ENABLE_TASKLISTS);
1820        // Always enable GFM features for consistency with existing behavior
1821        options.insert(Options::ENABLE_GFM);
1822
1823        // Suppress unused variable warning
1824        let _ = flavor;
1825
1826        let parser = Parser::new_ext(content, options).into_offset_iter();
1827        let mut list_depth: usize = 0;
1828        let mut list_stack: Vec<bool> = Vec::new();
1829
1830        for (event, range) in parser {
1831            match event {
1832                // Capture emphasis spans (for MD030's emphasis detection)
1833                Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1834                    let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1835                        2
1836                    } else {
1837                        1
1838                    };
1839                    let match_start = range.start;
1840                    let match_end = range.end;
1841
1842                    // Skip if in code block
1843                    if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1844                        // Determine marker character by looking at the content at the start
1845                        let marker = content[match_start..].chars().next().unwrap_or('*');
1846                        if marker == '*' || marker == '_' {
1847                            // Extract content between markers
1848                            let content_start = match_start + marker_count;
1849                            let content_end = if match_end >= marker_count {
1850                                match_end - marker_count
1851                            } else {
1852                                match_end
1853                            };
1854                            let content_part = if content_start < content_end && content_end <= content.len() {
1855                                &content[content_start..content_end]
1856                            } else {
1857                                ""
1858                            };
1859
1860                            // Find which line this emphasis is on using line_offsets
1861                            let line_idx = match line_offsets.binary_search(&match_start) {
1862                                Ok(idx) => idx,
1863                                Err(idx) => idx.saturating_sub(1),
1864                            };
1865                            let line_num = line_idx + 1;
1866                            let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1867                            let col_start = match_start - line_start;
1868                            let col_end = match_end - line_start;
1869
1870                            emphasis_spans.push(EmphasisSpan {
1871                                line: line_num,
1872                                start_col: col_start,
1873                                end_col: col_end,
1874                                byte_offset: match_start,
1875                                byte_end: match_end,
1876                                marker,
1877                                marker_count,
1878                                content: content_part.to_string(),
1879                            });
1880                        }
1881                    }
1882                }
1883                Event::Start(Tag::List(start_number)) => {
1884                    list_depth += 1;
1885                    list_stack.push(start_number.is_some());
1886                }
1887                Event::End(TagEnd::List(_)) => {
1888                    list_depth = list_depth.saturating_sub(1);
1889                    list_stack.pop();
1890                }
1891                Event::Start(Tag::Item) if list_depth > 0 => {
1892                    // Get the ordered state for the CURRENT (innermost) list
1893                    let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1894                    // Find which line this byte offset corresponds to
1895                    let item_start = range.start;
1896
1897                    // Binary search to find the line number
1898                    let mut line_idx = match line_offsets.binary_search(&item_start) {
1899                        Ok(idx) => idx,
1900                        Err(idx) => idx.saturating_sub(1),
1901                    };
1902
1903                    // Pulldown-cmark reports nested list items at the newline before the item
1904                    // when using tab indentation (e.g., "* Item\n\t- Nested").
1905                    // Advance to the actual content line in this case.
1906                    if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1907                        line_idx += 1;
1908                    }
1909
1910                    // Skip list items in frontmatter (they are YAML/TOML syntax, not Markdown)
1911                    if front_matter_end > 0 && line_idx < front_matter_end {
1912                        continue;
1913                    }
1914
1915                    if line_idx < line_offsets.len() {
1916                        let line_start_byte = line_offsets[line_idx];
1917                        let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
1918                        let line = &content[line_start_byte..line_end.min(content.len())];
1919
1920                        // Strip trailing newline
1921                        let line = line
1922                            .strip_suffix('\n')
1923                            .or_else(|| line.strip_suffix("\r\n"))
1924                            .unwrap_or(line);
1925
1926                        // Strip blockquote prefix if present
1927                        let blockquote_parse = Self::parse_blockquote_prefix(line);
1928                        let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
1929                            (prefix.len(), content)
1930                        } else {
1931                            (0, line)
1932                        };
1933
1934                        // Parse the list marker from the actual line
1935                        if current_list_is_ordered {
1936                            if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1937                                Self::parse_ordered_list(line_to_parse)
1938                            {
1939                                let marker = format!("{number_str}{delimiter}");
1940                                let marker_column = blockquote_prefix_len + leading_spaces.len();
1941                                let content_column = marker_column + marker.len() + spacing.len();
1942                                let number = number_str.parse().ok();
1943
1944                                list_items.entry(line_start_byte).or_insert((
1945                                    true,
1946                                    marker,
1947                                    marker_column,
1948                                    content_column,
1949                                    number,
1950                                ));
1951                            }
1952                        } else if let Some((leading_spaces, marker, spacing, _content)) =
1953                            Self::parse_unordered_list(line_to_parse)
1954                        {
1955                            let marker_column = blockquote_prefix_len + leading_spaces.len();
1956                            let content_column = marker_column + 1 + spacing.len();
1957
1958                            list_items.entry(line_start_byte).or_insert((
1959                                false,
1960                                marker.to_string(),
1961                                marker_column,
1962                                content_column,
1963                                None,
1964                            ));
1965                        }
1966                    }
1967                }
1968                _ => {}
1969            }
1970        }
1971
1972        (list_items, emphasis_spans)
1973    }
1974
/// Hand-rolled bullet-list line splitter (used instead of a regex for speed).
///
/// Equivalent pattern: `^([ \t]*)([-*+])([ \t]*)(.*)` — only spaces and tabs count as
/// whitespace here.
///
/// Returns `Some((leading_ws, marker, spacing, content))`, or `None` when the first
/// non-blank byte is missing or is not one of `-`, `*`, `+`.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    let is_blank = |b: u8| b == b' ' || b == b'\t';
    let bytes = line.as_bytes();

    // Indentation in front of the bullet.
    let marker_at = bytes.iter().take_while(|&&b| is_blank(b)).count();

    // The bullet itself; `?` covers empty and whitespace-only lines.
    let marker = char::from(*bytes.get(marker_at)?);
    if !matches!(marker, '-' | '*' | '+') {
        return None;
    }

    // Spaces/tabs separating the bullet from the item text.
    let gap_at = marker_at + 1;
    let text_at = gap_at + bytes[gap_at..].iter().take_while(|&&b| is_blank(b)).count();

    Some((&line[..marker_at], marker, &line[gap_at..text_at], &line[text_at..]))
}
2007
/// Hand-rolled ordered-list line splitter (used instead of a regex for speed).
///
/// Equivalent pattern: `^([ \t]*)(\d+)([.)])([ \t]*)(.*)` — only spaces and tabs count as
/// whitespace, and only ASCII digits count as digits.
///
/// Returns `Some((leading_ws, number_str, delimiter, spacing, content))`, or `None` when
/// the line has no digits after its indentation or the digits are not followed by `.` or `)`.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    let is_blank = |b: u8| b == b' ' || b == b'\t';
    let bytes = line.as_bytes();

    // Indentation in front of the number.
    let number_at = bytes.iter().take_while(|&&b| is_blank(b)).count();

    // The run of digits; an empty run means this is not an ordered item.
    let digit_count = bytes[number_at..].iter().take_while(|b| b.is_ascii_digit()).count();
    if digit_count == 0 {
        return None;
    }

    // The delimiter directly after the digits; `?` covers a line that ends on the number.
    let delimiter_at = number_at + digit_count;
    let delimiter = char::from(*bytes.get(delimiter_at)?);
    if !matches!(delimiter, '.' | ')') {
        return None;
    }

    // Spaces/tabs separating the delimiter from the item text.
    let gap_at = delimiter_at + 1;
    let text_at = gap_at + bytes[gap_at..].iter().take_while(|&&b| is_blank(b)).count();

    Some((
        &line[..number_at],
        &line[number_at..delimiter_at],
        delimiter,
        &line[gap_at..text_at],
        &line[text_at..],
    ))
}
2055
/// Build a per-line flag vector telling whether each line lies inside a code block.
///
/// `line_offsets` holds the starting byte offset of every line in ascending order, and
/// `code_blocks` holds `(start, end)` byte ranges. For each range the covered lines are found
/// with two binary searches and then flagged. The ranges are taken as given; no attempt is
/// made here to re-validate them as code blocks.
///
/// Returns a `Vec<bool>` with one entry per element of `line_offsets`; entry `i` is `true`
/// when line `i` overlaps some code block range.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(raw_start, raw_end) in code_blocks {
        // Snap the start backwards onto a UTF-8 character boundary.
        let mut block_start = raw_start;
        while block_start > 0 && !content.is_char_boundary(block_start) {
            block_start -= 1;
        }

        // Clamp the end to the content, then snap it forwards onto a character boundary.
        let mut block_end = raw_end.min(total_len);
        while block_end < total_len && !content.is_char_boundary(block_end) {
            block_end += 1;
        }

        // The line CONTAINING block_start is the last one whose offset is <= block_start:
        // partition_point yields the first line starting after it, hence the minus one.
        let first = line_offsets
            .partition_point(|&line_start| line_start <= block_start)
            .saturating_sub(1);
        // Exclusive upper bound: the number of lines starting before block_end.
        let past_last = line_offsets.partition_point(|&line_start| line_start < block_end);

        // `get_mut` yields None for an inverted range, so nothing is flagged in that case.
        if let Some(covered) = flags.get_mut(first..past_last) {
            covered.fill(true);
        }
    }

    flags
}
2115
/// Build a per-line flag vector telling whether each line lies inside a `$$ ... $$` math block.
///
/// Single pass over `content.lines()`. A line whose trimmed text is exactly `$$` toggles the
/// math state and is itself flagged (both the opening and the closing delimiter count as part
/// of the block). Lines that `code_block_map` marks as code are never flagged and never
/// toggle the state; indices missing from `code_block_map` are treated as not-code.
///
/// Returns a `Vec<bool>` with one entry per line of `content`.
fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
    let mut math_open = false;

    content
        .lines()
        .enumerate()
        .map(|(idx, text)| {
            // Math delimiters inside code are literal text.
            if code_block_map.get(idx).copied().unwrap_or(false) {
                return false;
            }

            let is_delimiter = text.trim() == "$$";
            // A delimiter line belongs to the block whether it opens or closes it.
            let flagged = math_open || is_delimiter;
            if is_delimiter {
                math_open = !math_open;
            }
            flagged
        })
        .collect()
}
2153
2154    /// Pre-compute basic line information (without headings/blockquotes)
2155    /// Also returns emphasis spans detected during the pulldown-cmark parse
2156    fn compute_basic_line_info(
2157        content: &str,
2158        line_offsets: &[usize],
2159        code_blocks: &[(usize, usize)],
2160        flavor: MarkdownFlavor,
2161        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2162        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2163    ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2164        let content_lines: Vec<&str> = content.lines().collect();
2165        let mut lines = Vec::with_capacity(content_lines.len());
2166
2167        // Pre-compute which lines are in code blocks
2168        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2169
2170        // Pre-compute which lines are in math blocks ($$ ... $$)
2171        let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2172
2173        // Detect front matter boundaries FIRST, before any other parsing
2174        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
2175        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2176
2177        // Use pulldown-cmark to detect list items AND emphasis spans in a single pass
2178        // (context-aware, eliminates false positives)
2179        let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2180            content,
2181            line_offsets,
2182            flavor,
2183            front_matter_end,
2184            code_blocks,
2185        );
2186
2187        for (i, line) in content_lines.iter().enumerate() {
2188            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2189            let indent = line.len() - line.trim_start().len();
2190            // Compute visual indent with proper CommonMark tab expansion
2191            let visual_indent = ElementCache::calculate_indentation_width_default(line);
2192
2193            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
2194            let blockquote_parse = Self::parse_blockquote_prefix(line);
2195
2196            // For blank detection, consider blockquote context
2197            let is_blank = if let Some((_, content)) = blockquote_parse {
2198                // In blockquote context, check if content after prefix is blank
2199                content.trim().is_empty()
2200            } else {
2201                line.trim().is_empty()
2202            };
2203
2204            // Use pre-computed map for O(1) lookup instead of O(m) iteration
2205            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2206
2207            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
2208            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2209                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2210            // Check if the ENTIRE line is within an HTML comment (not just the line start)
2211            // This ensures content after `-->` on the same line is not incorrectly skipped
2212            let line_end_offset = byte_offset + line.len();
2213            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2214                html_comment_ranges,
2215                byte_offset,
2216                line_end_offset,
2217            );
2218            // Use pulldown-cmark's list detection for context-aware parsing
2219            // This eliminates false positives on continuation lines (issue #253)
2220            let list_item =
2221                list_item_map
2222                    .get(&byte_offset)
2223                    .map(
2224                        |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2225                            marker: marker.clone(),
2226                            is_ordered: *is_ordered,
2227                            number: *number,
2228                            marker_column: *marker_column,
2229                            content_column: *content_column,
2230                        },
2231                    );
2232
2233            // Detect horizontal rules (only outside code blocks and frontmatter)
2234            // Uses CommonMark-compliant check including leading indentation validation
2235            let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2236            let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2237
2238            // Get math block status for this line
2239            let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2240
2241            lines.push(LineInfo {
2242                byte_offset,
2243                byte_len: line.len(),
2244                indent,
2245                visual_indent,
2246                is_blank,
2247                in_code_block,
2248                in_front_matter,
2249                in_html_block: false, // Will be populated after line creation
2250                in_html_comment,
2251                list_item,
2252                heading: None,    // Will be populated in second pass for Setext headings
2253                blockquote: None, // Will be populated after line creation
2254                in_mkdocstrings,
2255                in_esm_block: false, // Will be populated after line creation for MDX files
2256                in_code_span_continuation: false, // Will be populated after code spans are parsed
2257                is_horizontal_rule: is_hr,
2258                in_math_block,
2259            });
2260        }
2261
2262        (lines, emphasis_spans)
2263    }
2264
2265    /// Detect headings and blockquotes (called after HTML block detection)
2266    fn detect_headings_and_blockquotes(
2267        content: &str,
2268        lines: &mut [LineInfo],
2269        flavor: MarkdownFlavor,
2270        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2271        link_byte_ranges: &[(usize, usize)],
2272    ) {
2273        // Regex for heading detection
2274        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
2275            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
2276        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
2277            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
2278
2279        let content_lines: Vec<&str> = content.lines().collect();
2280
2281        // Detect front matter boundaries to skip those lines
2282        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2283
2284        // Detect headings (including Setext which needs look-ahead) and blockquotes
2285        for i in 0..lines.len() {
2286            let line = content_lines[i];
2287
2288            // Detect blockquotes FIRST, before any skip conditions.
2289            // A line can be both a blockquote AND contain a code block inside it.
2290            // We need to know about the blockquote marker regardless of code block status.
2291            // Skip only frontmatter lines - those are never blockquotes.
2292            if !(front_matter_end > 0 && i < front_matter_end)
2293                && let Some(bq) = parse_blockquote_detailed(line)
2294            {
2295                let nesting_level = bq.markers.len();
2296                let marker_column = bq.indent.len();
2297                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
2298                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
2299                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
2300                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
2301
2302                lines[i].blockquote = Some(BlockquoteInfo {
2303                    nesting_level,
2304                    indent: bq.indent.to_string(),
2305                    marker_column,
2306                    prefix,
2307                    content: bq.content.to_string(),
2308                    has_no_space_after_marker: has_no_space,
2309                    has_multiple_spaces_after_marker: has_multiple_spaces,
2310                    needs_md028_fix,
2311                });
2312            }
2313
2314            // Now apply skip conditions for heading detection
2315            if lines[i].in_code_block {
2316                continue;
2317            }
2318
2319            // Skip lines in front matter
2320            if front_matter_end > 0 && i < front_matter_end {
2321                continue;
2322            }
2323
2324            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
2325            if lines[i].in_html_block {
2326                continue;
2327            }
2328
2329            // Skip heading detection for blank lines
2330            if lines[i].is_blank {
2331                continue;
2332            }
2333
2334            // Check for ATX headings (but skip MkDocs snippet lines)
2335            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
2336            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
2337                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
2338                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
2339            } else {
2340                false
2341            };
2342
2343            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
2344                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
2345                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
2346                    continue;
2347                }
2348                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
2349                // This prevents false positives where `#fragment` is detected as a heading
2350                let line_offset = lines[i].byte_offset;
2351                if link_byte_ranges
2352                    .iter()
2353                    .any(|&(start, end)| line_offset > start && line_offset < end)
2354                {
2355                    continue;
2356                }
2357                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
2358                let hashes = caps.get(2).map_or("", |m| m.as_str());
2359                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
2360                let rest = caps.get(4).map_or("", |m| m.as_str());
2361
2362                let level = hashes.len() as u8;
2363                let marker_column = leading_spaces.len();
2364
2365                // Check for closing sequence, but handle custom IDs that might come after
2366                let (text, has_closing, closing_seq) = {
2367                    // First check if there's a custom ID at the end
2368                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
2369                        // Check if this looks like a valid custom ID (ends with })
2370                        if rest[id_start..].trim_end().ends_with('}') {
2371                            // Split off the custom ID
2372                            (&rest[..id_start], &rest[id_start..])
2373                        } else {
2374                            (rest, "")
2375                        }
2376                    } else {
2377                        (rest, "")
2378                    };
2379
2380                    // Now look for closing hashes in the part before the custom ID
2381                    let trimmed_rest = rest_without_id.trim_end();
2382                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
2383                        // Find the start of the hash sequence by walking backwards
2384                        // Use char_indices to get byte positions at char boundaries
2385                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();
2386
2387                        // Find which char index corresponds to last_hash_byte_pos
2388                        let last_hash_char_idx = char_positions
2389                            .iter()
2390                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);
2391
2392                        if let Some(mut char_idx) = last_hash_char_idx {
2393                            // Walk backwards to find start of hash sequence
2394                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
2395                                char_idx -= 1;
2396                            }
2397
2398                            // Get the byte position of the start of hashes
2399                            let start_of_hashes = char_positions[char_idx].0;
2400
2401                            // Check if there's at least one space before the closing hashes
2402                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();
2403
2404                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
2405                            let potential_closing = &trimmed_rest[start_of_hashes..];
2406                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');
2407
2408                            if is_all_hashes && has_space_before {
2409                                // This is a closing sequence
2410                                let closing_hashes = potential_closing.to_string();
2411                                // The text is everything before the closing hashes
2412                                // Don't include the custom ID here - it will be extracted later
2413                                let text_part = if !custom_id_part.is_empty() {
2414                                    // If we have a custom ID, append it back to get the full rest
2415                                    // This allows the extract_header_id function to handle it properly
2416                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
2417                                } else {
2418                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
2419                                };
2420                                (text_part, true, closing_hashes)
2421                            } else {
2422                                // Not a valid closing sequence, return the full content
2423                                (rest.to_string(), false, String::new())
2424                            }
2425                        } else {
2426                            // Couldn't find char boundary, return the full content
2427                            (rest.to_string(), false, String::new())
2428                        }
2429                    } else {
2430                        // No hashes found, return the full content
2431                        (rest.to_string(), false, String::new())
2432                    }
2433                };
2434
2435                let content_column = marker_column + hashes.len() + spaces_after.len();
2436
2437                // Extract custom header ID if present
2438                let raw_text = text.trim().to_string();
2439                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2440
2441                // If no custom ID was found on the header line, check the next line for standalone attr-list
2442                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2443                    let next_line = content_lines[i + 1];
2444                    if !lines[i + 1].in_code_block
2445                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2446                        && let Some(next_line_id) =
2447                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2448                    {
2449                        custom_id = Some(next_line_id);
2450                    }
2451                }
2452
2453                // ATX heading is "valid" for processing by heading rules if:
2454                // 1. Has space after # (CommonMark compliant): `# Heading`
2455                // 2. Is empty (just hashes): `#`
2456                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
2457                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
2458                //
2459                // Invalid patterns (hashtag-like) are skipped by most heading rules:
2460                // - `#tag` - single # with lowercase (social hashtag)
2461                // - `#123` - single # with number (GitHub issue ref)
2462                let is_valid = !spaces_after.is_empty()
2463                    || rest.is_empty()
2464                    || level > 1
2465                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());
2466
2467                lines[i].heading = Some(HeadingInfo {
2468                    level,
2469                    style: HeadingStyle::ATX,
2470                    marker: hashes.to_string(),
2471                    marker_column,
2472                    content_column,
2473                    text: clean_text,
2474                    custom_id,
2475                    raw_text,
2476                    has_closing_sequence: has_closing,
2477                    closing_sequence: closing_seq,
2478                    is_valid,
2479                });
2480            }
2481            // Check for Setext headings (need to look at next line)
2482            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2483                let next_line = content_lines[i + 1];
2484                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2485                    // Skip if next line is front matter delimiter
2486                    if front_matter_end > 0 && i < front_matter_end {
2487                        continue;
2488                    }
2489
2490                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2491                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2492                    {
2493                        continue;
2494                    }
2495
2496                    // Per CommonMark spec 4.3, setext heading content cannot be interpretable as:
2497                    // list item, ATX heading, block quote, thematic break, code fence, or HTML block
2498                    let content_line = line.trim();
2499
2500                    // Skip list items (-, *, +) and thematic breaks (---, ***, etc.)
2501                    if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
2502                        continue;
2503                    }
2504
2505                    // Skip underscore thematic breaks (___)
2506                    if content_line.starts_with('_') {
2507                        let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
2508                        if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
2509                            continue;
2510                        }
2511                    }
2512
2513                    // Skip numbered lists (1. Item, 2. Item, etc.)
2514                    if let Some(first_char) = content_line.chars().next()
2515                        && first_char.is_ascii_digit()
2516                    {
2517                        let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
2518                        if num_end < content_line.len() {
2519                            let next = content_line.chars().nth(num_end);
2520                            if next == Some('.') || next == Some(')') {
2521                                continue;
2522                            }
2523                        }
2524                    }
2525
2526                    // Skip ATX headings
2527                    if ATX_HEADING_REGEX.is_match(line) {
2528                        continue;
2529                    }
2530
2531                    // Skip blockquotes
2532                    if content_line.starts_with('>') {
2533                        continue;
2534                    }
2535
2536                    // Skip code fences
2537                    let trimmed_start = line.trim_start();
2538                    if trimmed_start.len() >= 3 {
2539                        let first_three: String = trimmed_start.chars().take(3).collect();
2540                        if first_three == "```" || first_three == "~~~" {
2541                            continue;
2542                        }
2543                    }
2544
2545                    // Skip HTML blocks
2546                    if content_line.starts_with('<') {
2547                        continue;
2548                    }
2549
2550                    let underline = next_line.trim();
2551
2552                    let level = if underline.starts_with('=') { 1 } else { 2 };
2553                    let style = if level == 1 {
2554                        HeadingStyle::Setext1
2555                    } else {
2556                        HeadingStyle::Setext2
2557                    };
2558
2559                    // Extract custom header ID if present
2560                    let raw_text = line.trim().to_string();
2561                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2562
2563                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2564                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2565                        let attr_line = content_lines[i + 2];
2566                        if !lines[i + 2].in_code_block
2567                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2568                            && let Some(attr_line_id) =
2569                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2570                        {
2571                            custom_id = Some(attr_line_id);
2572                        }
2573                    }
2574
2575                    lines[i].heading = Some(HeadingInfo {
2576                        level,
2577                        style,
2578                        marker: underline.to_string(),
2579                        marker_column: next_line.len() - next_line.trim_start().len(),
2580                        content_column: lines[i].indent,
2581                        text: clean_text,
2582                        custom_id,
2583                        raw_text,
2584                        has_closing_sequence: false,
2585                        closing_sequence: String::new(),
2586                        is_valid: true, // Setext headings are always valid
2587                    });
2588                }
2589            }
2590        }
2591    }
2592
2593    /// Detect HTML blocks in the content
2594    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2595        // HTML block elements that trigger block context
2596        // Includes HTML5 media, embedded content, and interactive elements
2597        const BLOCK_ELEMENTS: &[&str] = &[
2598            "address",
2599            "article",
2600            "aside",
2601            "audio",
2602            "blockquote",
2603            "canvas",
2604            "details",
2605            "dialog",
2606            "dd",
2607            "div",
2608            "dl",
2609            "dt",
2610            "embed",
2611            "fieldset",
2612            "figcaption",
2613            "figure",
2614            "footer",
2615            "form",
2616            "h1",
2617            "h2",
2618            "h3",
2619            "h4",
2620            "h5",
2621            "h6",
2622            "header",
2623            "hr",
2624            "iframe",
2625            "li",
2626            "main",
2627            "menu",
2628            "nav",
2629            "noscript",
2630            "object",
2631            "ol",
2632            "p",
2633            "picture",
2634            "pre",
2635            "script",
2636            "search",
2637            "section",
2638            "source",
2639            "style",
2640            "summary",
2641            "svg",
2642            "table",
2643            "tbody",
2644            "td",
2645            "template",
2646            "textarea",
2647            "tfoot",
2648            "th",
2649            "thead",
2650            "tr",
2651            "track",
2652            "ul",
2653            "video",
2654        ];
2655
2656        let mut i = 0;
2657        while i < lines.len() {
2658            // Skip if already in code block or front matter
2659            if lines[i].in_code_block || lines[i].in_front_matter {
2660                i += 1;
2661                continue;
2662            }
2663
2664            let trimmed = lines[i].content(content).trim_start();
2665
2666            // Check if line starts with an HTML tag
2667            if trimmed.starts_with('<') && trimmed.len() > 1 {
2668                // Extract tag name safely
2669                let after_bracket = &trimmed[1..];
2670                let is_closing = after_bracket.starts_with('/');
2671                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2672
2673                // Extract tag name (stop at space, >, /, or end of string)
2674                let tag_name = tag_start
2675                    .chars()
2676                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2677                    .collect::<String>()
2678                    .to_lowercase();
2679
2680                // Check if it's a block element
2681                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2682                    // Mark this line as in HTML block
2683                    lines[i].in_html_block = true;
2684
2685                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2686                    // This avoids complex nesting logic that might cause infinite loops
2687                    if !is_closing {
2688                        let closing_tag = format!("</{tag_name}>");
2689                        // style and script tags can contain blank lines (CSS/JS formatting)
2690                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2691                        let mut j = i + 1;
2692                        let mut found_closing_tag = false;
2693                        while j < lines.len() && j < i + 100 {
2694                            // Limit search to 100 lines
2695                            // Stop at blank lines (except for style/script tags)
2696                            if !allow_blank_lines && lines[j].is_blank {
2697                                break;
2698                            }
2699
2700                            lines[j].in_html_block = true;
2701
2702                            // Check if this line contains the closing tag
2703                            if lines[j].content(content).contains(&closing_tag) {
2704                                found_closing_tag = true;
2705                            }
2706
2707                            // After finding closing tag, continue marking lines as
2708                            // in_html_block until blank line (per CommonMark spec)
2709                            if found_closing_tag {
2710                                j += 1;
2711                                // Continue marking subsequent lines until blank
2712                                while j < lines.len() && j < i + 100 {
2713                                    if lines[j].is_blank {
2714                                        break;
2715                                    }
2716                                    lines[j].in_html_block = true;
2717                                    j += 1;
2718                                }
2719                                break;
2720                            }
2721                            j += 1;
2722                        }
2723                    }
2724                }
2725            }
2726
2727            i += 1;
2728        }
2729    }
2730
2731    /// Detect ESM import/export blocks in MDX files
2732    /// ESM blocks consist of contiguous import/export statements at the top of the file
2733    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2734        // Only process MDX files
2735        if !flavor.supports_esm_blocks() {
2736            return;
2737        }
2738
2739        let mut in_multiline_comment = false;
2740
2741        for line in lines.iter_mut() {
2742            // Skip blank lines and HTML comments
2743            if line.is_blank || line.in_html_comment {
2744                continue;
2745            }
2746
2747            let trimmed = line.content(content).trim_start();
2748
2749            // Handle continuation of multi-line JS comments
2750            if in_multiline_comment {
2751                if trimmed.contains("*/") {
2752                    in_multiline_comment = false;
2753                }
2754                continue;
2755            }
2756
2757            // Skip single-line JS comments (// and ///)
2758            if trimmed.starts_with("//") {
2759                continue;
2760            }
2761
2762            // Handle start of multi-line JS comment
2763            if trimmed.starts_with("/*") {
2764                if !trimmed.contains("*/") {
2765                    in_multiline_comment = true;
2766                }
2767                continue;
2768            }
2769
2770            // Check if line starts with import or export
2771            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2772                line.in_esm_block = true;
2773            } else {
2774                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2775                break;
2776            }
2777        }
2778    }
2779
2780    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2781    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2782        let mut code_spans = Vec::new();
2783
2784        // Quick check - if no backticks, no code spans
2785        if !content.contains('`') {
2786            return code_spans;
2787        }
2788
2789        // Use pulldown-cmark's streaming parser with byte offsets
2790        let parser = Parser::new(content).into_offset_iter();
2791
2792        for (event, range) in parser {
2793            if let Event::Code(_) = event {
2794                let start_pos = range.start;
2795                let end_pos = range.end;
2796
2797                // The range includes the backticks, extract the actual content
2798                let full_span = &content[start_pos..end_pos];
2799                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2800
2801                // Extract content between backticks, preserving spaces
2802                let content_start = start_pos + backtick_count;
2803                let content_end = end_pos - backtick_count;
2804                let span_content = if content_start < content_end {
2805                    content[content_start..content_end].to_string()
2806                } else {
2807                    String::new()
2808                };
2809
2810                // Use binary search to find line number - O(log n) instead of O(n)
2811                // Find the rightmost line whose byte_offset <= start_pos
2812                let line_idx = lines
2813                    .partition_point(|line| line.byte_offset <= start_pos)
2814                    .saturating_sub(1);
2815                let line_num = line_idx + 1;
2816                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2817
2818                // Find end column using binary search
2819                let end_line_idx = lines
2820                    .partition_point(|line| line.byte_offset <= end_pos)
2821                    .saturating_sub(1);
2822                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2823
2824                // Convert byte offsets to character positions for correct Unicode handling
2825                // This ensures consistency with warning.column which uses character positions
2826                let line_content = lines[line_idx].content(content);
2827                let col_start = if byte_col_start <= line_content.len() {
2828                    line_content[..byte_col_start].chars().count()
2829                } else {
2830                    line_content.chars().count()
2831                };
2832
2833                let end_line_content = lines[end_line_idx].content(content);
2834                let col_end = if byte_col_end <= end_line_content.len() {
2835                    end_line_content[..byte_col_end].chars().count()
2836                } else {
2837                    end_line_content.chars().count()
2838                };
2839
2840                code_spans.push(CodeSpan {
2841                    line: line_num,
2842                    end_line: end_line_idx + 1,
2843                    start_col: col_start,
2844                    end_col: col_end,
2845                    byte_offset: start_pos,
2846                    byte_end: end_pos,
2847                    backtick_count,
2848                    content: span_content,
2849                });
2850            }
2851        }
2852
2853        // Sort by position to ensure consistent ordering
2854        code_spans.sort_by_key(|span| span.byte_offset);
2855
2856        code_spans
2857    }
2858
2859    /// Parse all list blocks in the content (legacy line-by-line approach)
2860    ///
2861    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2862    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2863    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2864    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2865    ///   treated as list continuation (based on the list marker width)
2866    ///
2867    /// When a new list item is encountered, we check if list-breaking content was seen
2868    /// since the last item. If so, we start a new list block.
2869    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2870        // Minimum indentation for unordered list continuation per CommonMark spec
2871        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2872
2873        /// Initialize or reset the forward-scanning tracking state.
2874        /// This helper eliminates code duplication across three initialization sites.
2875        #[inline]
2876        fn reset_tracking_state(
2877            list_item: &ListItemInfo,
2878            has_list_breaking_content: &mut bool,
2879            min_continuation: &mut usize,
2880        ) {
2881            *has_list_breaking_content = false;
2882            let marker_width = if list_item.is_ordered {
2883                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2884            } else {
2885                list_item.marker.len()
2886            };
2887            *min_continuation = if list_item.is_ordered {
2888                marker_width
2889            } else {
2890                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2891            };
2892        }
2893
2894        // Pre-size based on lines that could be list items
2895        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2896        let mut current_block: Option<ListBlock> = None;
2897        let mut last_list_item_line = 0;
2898        let mut current_indent_level = 0;
2899        let mut last_marker_width = 0;
2900
2901        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2902        let mut has_list_breaking_content_since_last_item = false;
2903        let mut min_continuation_for_tracking = 0;
2904
2905        for (line_idx, line_info) in lines.iter().enumerate() {
2906            let line_num = line_idx + 1;
2907
2908            // Enhanced code block handling using Design #3's context analysis
2909            if line_info.in_code_block {
2910                if let Some(ref mut block) = current_block {
2911                    // Calculate minimum indentation for list continuation
2912                    let min_continuation_indent =
2913                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2914
2915                    // Analyze code block context using the three-tier classification
2916                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2917
2918                    match context {
2919                        CodeBlockContext::Indented => {
2920                            // Code block is properly indented - continues the list
2921                            block.end_line = line_num;
2922                            continue;
2923                        }
2924                        CodeBlockContext::Standalone => {
2925                            // Code block separates lists - end current block
2926                            let completed_block = current_block.take().unwrap();
2927                            list_blocks.push(completed_block);
2928                            continue;
2929                        }
2930                        CodeBlockContext::Adjacent => {
2931                            // Edge case - use conservative behavior (continue list)
2932                            block.end_line = line_num;
2933                            continue;
2934                        }
2935                    }
2936                } else {
2937                    // No current list block - skip code block lines
2938                    continue;
2939                }
2940            }
2941
2942            // Extract blockquote prefix if any
2943            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2944                caps.get(0).unwrap().as_str().to_string()
2945            } else {
2946                String::new()
2947            };
2948
2949            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2950            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2951            if let Some(ref block) = current_block
2952                && line_info.list_item.is_none()
2953                && !line_info.is_blank
2954                && !line_info.in_code_span_continuation
2955            {
2956                let line_content = line_info.content(content).trim();
2957
2958                // Check for structural separators that break lists
2959                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2960                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2961                // as they indicate improper indentation rather than lazy continuation.
2962                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2963
2964                // Check if blockquote context changes (different prefix than current block)
2965                // Lines within the SAME blockquote context don't break lists
2966                let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
2967
2968                let breaks_list = line_info.heading.is_some()
2969                    || line_content.starts_with("---")
2970                    || line_content.starts_with("***")
2971                    || line_content.starts_with("___")
2972                    || crate::utils::skip_context::is_table_line(line_content)
2973                    || blockquote_prefix_changes
2974                    || (line_info.indent > 0
2975                        && line_info.indent < min_continuation_for_tracking
2976                        && !is_lazy_continuation);
2977
2978                if breaks_list {
2979                    has_list_breaking_content_since_last_item = true;
2980                }
2981            }
2982
2983            // If this line is a code span continuation within an active list block,
2984            // extend the block's end_line to include this line (maintains list continuity)
2985            if line_info.in_code_span_continuation
2986                && line_info.list_item.is_none()
2987                && let Some(ref mut block) = current_block
2988            {
2989                block.end_line = line_num;
2990            }
2991
2992            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2993            // properly indented lines within the list). This ensures the workaround at line 2448
2994            // works correctly when there are multiple continuation lines before a nested list item.
2995            // Also include lazy continuation lines (indent=0) per CommonMark spec.
2996            // For blockquote lines, compute effective indent after stripping the prefix
2997            let effective_continuation_indent = if let Some(ref block) = current_block {
2998                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
2999                let line_content = line_info.content(content);
3000                let line_bq_level = line_content
3001                    .chars()
3002                    .take_while(|c| *c == '>' || c.is_whitespace())
3003                    .filter(|&c| c == '>')
3004                    .count();
3005                if line_bq_level > 0 && line_bq_level == block_bq_level {
3006                    // Compute indent after blockquote markers
3007                    let mut pos = 0;
3008                    let mut found_markers = 0;
3009                    for c in line_content.chars() {
3010                        pos += c.len_utf8();
3011                        if c == '>' {
3012                            found_markers += 1;
3013                            if found_markers == line_bq_level {
3014                                if line_content.get(pos..pos + 1) == Some(" ") {
3015                                    pos += 1;
3016                                }
3017                                break;
3018                            }
3019                        }
3020                    }
3021                    let after_bq = &line_content[pos..];
3022                    after_bq.len() - after_bq.trim_start().len()
3023                } else {
3024                    line_info.indent
3025                }
3026            } else {
3027                line_info.indent
3028            };
3029            let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3030                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3031                if block_bq_level > 0 {
3032                    if block.is_ordered { last_marker_width } else { 2 }
3033                } else {
3034                    min_continuation_for_tracking
3035                }
3036            } else {
3037                min_continuation_for_tracking
3038            };
3039            let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3040                || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
3041
3042            if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3043                eprintln!(
3044                    "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3045                    line_num,
3046                    effective_continuation_indent,
3047                    adjusted_min_continuation_for_tracking,
3048                    is_valid_continuation,
3049                    line_info.in_code_span_continuation,
3050                    line_info.in_code_block,
3051                    current_block.is_some()
3052                );
3053            }
3054
3055            if !line_info.in_code_span_continuation
3056                && line_info.list_item.is_none()
3057                && !line_info.is_blank
3058                && !line_info.in_code_block
3059                && is_valid_continuation
3060                && let Some(ref mut block) = current_block
3061            {
3062                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3063                    eprintln!(
3064                        "[DEBUG] Line {}: extending block.end_line from {} to {}",
3065                        line_num, block.end_line, line_num
3066                    );
3067                }
3068                block.end_line = line_num;
3069            }
3070
3071            // Check if this line is a list item
3072            if let Some(list_item) = &line_info.list_item {
3073                // Calculate nesting level based on indentation
3074                let item_indent = list_item.marker_column;
3075                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
3076
3077                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3078                    eprintln!(
3079                        "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3080                        line_num, list_item.marker, item_indent
3081                    );
3082                }
3083
3084                if let Some(ref mut block) = current_block {
3085                    // Check if this continues the current block
3086                    // For nested lists, we need to check if this is a nested item (higher nesting level)
3087                    // or a continuation at the same or lower level
3088                    let is_nested = nesting > block.nesting_level;
3089                    let same_type =
3090                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3091                    let same_context = block.blockquote_prefix == blockquote_prefix;
3092                    // Allow one blank line after last item, or lines immediately after block content
3093                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3094
3095                    // For unordered lists, also check marker consistency
3096                    let marker_compatible =
3097                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3098
3099                    // O(1) check: Use the tracked variable instead of O(n) nested loop
3100                    // This eliminates the quadratic bottleneck from issue #148
3101                    let has_non_list_content = has_list_breaking_content_since_last_item;
3102
3103                    // A list continues if:
3104                    // 1. It's a nested item (indented more than the parent), OR
3105                    // 2. It's the same type at the same level with reasonable distance
3106                    let mut continues_list = if is_nested {
3107                        // Nested items always continue the list if they're in the same context
3108                        same_context && reasonable_distance && !has_non_list_content
3109                    } else {
3110                        // Same-level items need to match type and markers
3111                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3112                    };
3113
3114                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3115                        eprintln!(
3116                            "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3117                            line_num,
3118                            continues_list,
3119                            is_nested,
3120                            same_type,
3121                            same_context,
3122                            reasonable_distance,
3123                            marker_compatible,
3124                            has_non_list_content,
3125                            last_list_item_line,
3126                            block.end_line
3127                        );
3128                    }
3129
3130                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
3131                    // This handles edge cases where content patterns might otherwise split lists incorrectly
3132                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
3133                        // Check if the previous line was a list item or a continuation of a list item
3134                        // (including lazy continuation lines)
3135                        if block.item_lines.contains(&(line_num - 1)) {
3136                            // They're consecutive list items - force them to be in the same list
3137                            continues_list = true;
3138                        } else {
3139                            // Previous line is a continuation line within this block
3140                            // (e.g., lazy continuation with indent=0)
3141                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
3142                            continues_list = true;
3143                        }
3144                    }
3145
3146                    if continues_list {
3147                        // Extend current block
3148                        block.end_line = line_num;
3149                        block.item_lines.push(line_num);
3150
3151                        // Update max marker width
3152                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3153                            list_item.marker.len() + 1
3154                        } else {
3155                            list_item.marker.len()
3156                        });
3157
3158                        // Update marker consistency for unordered lists
3159                        if !block.is_ordered
3160                            && block.marker.is_some()
3161                            && block.marker.as_ref() != Some(&list_item.marker)
3162                        {
3163                            // Mixed markers, clear the marker field
3164                            block.marker = None;
3165                        }
3166
3167                        // Reset tracked state for issue #148 optimization
3168                        reset_tracking_state(
3169                            list_item,
3170                            &mut has_list_breaking_content_since_last_item,
3171                            &mut min_continuation_for_tracking,
3172                        );
3173                    } else {
3174                        // End current block and start a new one
3175
3176                        list_blocks.push(block.clone());
3177
3178                        *block = ListBlock {
3179                            start_line: line_num,
3180                            end_line: line_num,
3181                            is_ordered: list_item.is_ordered,
3182                            marker: if list_item.is_ordered {
3183                                None
3184                            } else {
3185                                Some(list_item.marker.clone())
3186                            },
3187                            blockquote_prefix: blockquote_prefix.clone(),
3188                            item_lines: vec![line_num],
3189                            nesting_level: nesting,
3190                            max_marker_width: if list_item.is_ordered {
3191                                list_item.marker.len() + 1
3192                            } else {
3193                                list_item.marker.len()
3194                            },
3195                        };
3196
3197                        // Initialize tracked state for new block (issue #148 optimization)
3198                        reset_tracking_state(
3199                            list_item,
3200                            &mut has_list_breaking_content_since_last_item,
3201                            &mut min_continuation_for_tracking,
3202                        );
3203                    }
3204                } else {
3205                    // Start a new block
3206                    current_block = Some(ListBlock {
3207                        start_line: line_num,
3208                        end_line: line_num,
3209                        is_ordered: list_item.is_ordered,
3210                        marker: if list_item.is_ordered {
3211                            None
3212                        } else {
3213                            Some(list_item.marker.clone())
3214                        },
3215                        blockquote_prefix,
3216                        item_lines: vec![line_num],
3217                        nesting_level: nesting,
3218                        max_marker_width: list_item.marker.len(),
3219                    });
3220
3221                    // Initialize tracked state for new block (issue #148 optimization)
3222                    reset_tracking_state(
3223                        list_item,
3224                        &mut has_list_breaking_content_since_last_item,
3225                        &mut min_continuation_for_tracking,
3226                    );
3227                }
3228
3229                last_list_item_line = line_num;
3230                current_indent_level = item_indent;
3231                last_marker_width = if list_item.is_ordered {
3232                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
3233                } else {
3234                    list_item.marker.len()
3235                };
3236            } else if let Some(ref mut block) = current_block {
3237                // Not a list item - check if it continues the current block
3238                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3239                    eprintln!(
3240                        "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3241                        line_num, line_info.is_blank
3242                    );
3243                }
3244
3245                // For MD032 compatibility, we use a simple approach:
3246                // - Indented lines continue the list
3247                // - Blank lines followed by indented content continue the list
3248                // - Everything else ends the list
3249
3250                // Check if the last line in the list block ended with a backslash (hard line break)
3251                // This handles cases where list items use backslash for hard line breaks
3252                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3253                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3254                } else {
3255                    false
3256                };
3257
3258                // Calculate minimum indentation for list continuation
3259                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
3260                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
3261                let min_continuation_indent = if block.is_ordered {
3262                    current_indent_level + last_marker_width
3263                } else {
3264                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
3265                };
3266
3267                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3268                    // Indented line or backslash continuation continues the list
3269                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3270                        eprintln!(
3271                            "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3272                            line_num, line_info.indent, min_continuation_indent
3273                        );
3274                    }
3275                    block.end_line = line_num;
3276                } else if line_info.is_blank {
3277                    // Blank line - check if it's internal to the list or ending it
3278                    // We only include blank lines that are followed by more list content
3279                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3280                        eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3281                    }
3282                    let mut check_idx = line_idx + 1;
3283                    let mut found_continuation = false;
3284
3285                    // Skip additional blank lines
3286                    while check_idx < lines.len() && lines[check_idx].is_blank {
3287                        check_idx += 1;
3288                    }
3289
3290                    if check_idx < lines.len() {
3291                        let next_line = &lines[check_idx];
3292                        // For blockquote lines, compute indent AFTER stripping the blockquote prefix
3293                        let next_content = next_line.content(content);
3294                        // Use blockquote level (count of >) to compare, not the full prefix
3295                        // This avoids issues where the regex captures extra whitespace
3296                        let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3297                        let next_bq_level_for_indent = next_content
3298                            .chars()
3299                            .take_while(|c| *c == '>' || c.is_whitespace())
3300                            .filter(|&c| c == '>')
3301                            .count();
3302                        let effective_indent =
3303                            if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3304                                // For lines in the same blockquote context, compute indent after the blockquote marker(s)
3305                                // Find position after ">" and one space
3306                                let mut pos = 0;
3307                                let mut found_markers = 0;
3308                                for c in next_content.chars() {
3309                                    pos += c.len_utf8();
3310                                    if c == '>' {
3311                                        found_markers += 1;
3312                                        if found_markers == next_bq_level_for_indent {
3313                                            // Skip optional space after last >
3314                                            if next_content.get(pos..pos + 1) == Some(" ") {
3315                                                pos += 1;
3316                                            }
3317                                            break;
3318                                        }
3319                                    }
3320                                }
3321                                let after_blockquote_marker = &next_content[pos..];
3322                                after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3323                            } else {
3324                                next_line.indent
3325                            };
3326                        // Also adjust min_continuation_indent for blockquote lists
3327                        // The marker_column includes blockquote prefix, so subtract it
3328                        let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3329                            // For blockquote lists, the continuation is relative to blockquote content
3330                            // current_indent_level includes blockquote prefix (2 for "> "), so use just 2 for unordered
3331                            if block.is_ordered { last_marker_width } else { 2 }
3332                        } else {
3333                            min_continuation_indent
3334                        };
3335                        // Check if followed by indented content (list continuation)
3336                        if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3337                            eprintln!(
3338                                "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3339                                line_num,
3340                                check_idx + 1,
3341                                effective_indent,
3342                                adjusted_min_continuation,
3343                                next_line.list_item.is_some(),
3344                                next_line.in_code_block
3345                            );
3346                        }
3347                        if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3348                            found_continuation = true;
3349                        }
3350                        // Check if followed by another list item at the same level
3351                        else if !next_line.in_code_block
3352                            && next_line.list_item.is_some()
3353                            && let Some(item) = &next_line.list_item
3354                        {
3355                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3356                                .find(next_line.content(content))
3357                                .map_or(String::new(), |m| m.as_str().to_string());
3358                            if item.marker_column == current_indent_level
3359                                && item.is_ordered == block.is_ordered
3360                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3361                            {
3362                                // Check if there was meaningful content between the list items (unused now)
3363                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
3364                                // Pre-compute block's blockquote level for use in closures
3365                                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3366                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3367                                    if let Some(between_line) = lines.get(idx) {
3368                                        let between_content = between_line.content(content);
3369                                        let trimmed = between_content.trim();
3370                                        // Skip empty lines
3371                                        if trimmed.is_empty() {
3372                                            return false;
3373                                        }
3374                                        // Check for meaningful content
3375                                        let line_indent = between_content.len() - between_content.trim_start().len();
3376
3377                                        // Check if blockquote level changed (not just if line starts with ">")
3378                                        let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3379                                            .find(between_content)
3380                                            .map_or(String::new(), |m| m.as_str().to_string());
3381                                        let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3382                                        let blockquote_level_changed =
3383                                            trimmed.starts_with(">") && between_bq_level != block_bq_level;
3384
3385                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
3386                                        if trimmed.starts_with("```")
3387                                            || trimmed.starts_with("~~~")
3388                                            || trimmed.starts_with("---")
3389                                            || trimmed.starts_with("***")
3390                                            || trimmed.starts_with("___")
3391                                            || blockquote_level_changed
3392                                            || crate::utils::skip_context::is_table_line(trimmed)
3393                                            || between_line.heading.is_some()
3394                                        {
3395                                            return true; // These are structural separators - meaningful content that breaks lists
3396                                        }
3397
3398                                        // Only properly indented content continues the list
3399                                        line_indent >= min_continuation_indent
3400                                    } else {
3401                                        false
3402                                    }
3403                                });
3404
3405                                if block.is_ordered {
3406                                    // For ordered lists: don't continue if there are structural separators
3407                                    // Check if there are structural separators between the list items
3408                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3409                                        if let Some(between_line) = lines.get(idx) {
3410                                            let between_content = between_line.content(content);
3411                                            let trimmed = between_content.trim();
3412                                            if trimmed.is_empty() {
3413                                                return false;
3414                                            }
3415                                            // Check if blockquote level changed (not just if line starts with ">")
3416                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3417                                                .find(between_content)
3418                                                .map_or(String::new(), |m| m.as_str().to_string());
3419                                            let between_bq_level =
3420                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3421                                            let blockquote_level_changed =
3422                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3423                                            // Check for structural separators that break lists
3424                                            trimmed.starts_with("```")
3425                                                || trimmed.starts_with("~~~")
3426                                                || trimmed.starts_with("---")
3427                                                || trimmed.starts_with("***")
3428                                                || trimmed.starts_with("___")
3429                                                || blockquote_level_changed
3430                                                || crate::utils::skip_context::is_table_line(trimmed)
3431                                                || between_line.heading.is_some()
3432                                        } else {
3433                                            false
3434                                        }
3435                                    });
3436                                    found_continuation = !has_structural_separators;
3437                                } else {
3438                                    // For unordered lists: also check for structural separators
3439                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3440                                        if let Some(between_line) = lines.get(idx) {
3441                                            let between_content = between_line.content(content);
3442                                            let trimmed = between_content.trim();
3443                                            if trimmed.is_empty() {
3444                                                return false;
3445                                            }
3446                                            // Check if blockquote level changed (not just if line starts with ">")
3447                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3448                                                .find(between_content)
3449                                                .map_or(String::new(), |m| m.as_str().to_string());
3450                                            let between_bq_level =
3451                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3452                                            let blockquote_level_changed =
3453                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3454                                            // Check for structural separators that break lists
3455                                            trimmed.starts_with("```")
3456                                                || trimmed.starts_with("~~~")
3457                                                || trimmed.starts_with("---")
3458                                                || trimmed.starts_with("***")
3459                                                || trimmed.starts_with("___")
3460                                                || blockquote_level_changed
3461                                                || crate::utils::skip_context::is_table_line(trimmed)
3462                                                || between_line.heading.is_some()
3463                                        } else {
3464                                            false
3465                                        }
3466                                    });
3467                                    found_continuation = !has_structural_separators;
3468                                }
3469                            }
3470                        }
3471                    }
3472
3473                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3474                        eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3475                    }
3476                    if found_continuation {
3477                        // Include the blank line in the block
3478                        block.end_line = line_num;
3479                    } else {
3480                        // Blank line ends the list - don't include it
3481                        list_blocks.push(block.clone());
3482                        current_block = None;
3483                    }
3484                } else {
3485                    // Check for lazy continuation - non-indented line immediately after a list item
3486                    // But only if the line has sufficient indentation for the list type
3487                    let min_required_indent = if block.is_ordered {
3488                        current_indent_level + last_marker_width
3489                    } else {
3490                        current_indent_level + 2
3491                    };
3492
3493                    // For lazy continuation to apply, the line must either:
3494                    // 1. Have no indentation (true lazy continuation)
3495                    // 2. Have sufficient indentation for the list type
3496                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3497                    let line_content = line_info.content(content).trim();
3498
3499                    // Check for table-like patterns
3500                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3501
3502                    // Check if blockquote level changed (not just if line starts with ">")
3503                    // Lines within the same blockquote level are NOT structural separators
3504                    let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3505                    let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3506                    let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3507
3508                    let is_structural_separator = line_info.heading.is_some()
3509                        || line_content.starts_with("```")
3510                        || line_content.starts_with("~~~")
3511                        || line_content.starts_with("---")
3512                        || line_content.starts_with("***")
3513                        || line_content.starts_with("___")
3514                        || blockquote_level_changed
3515                        || looks_like_table;
3516
3517                    // Allow lazy continuation if we're still within the same list block
3518                    // (not just immediately after a list item)
3519                    let is_lazy_continuation = !is_structural_separator
3520                        && !line_info.is_blank
3521                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3522
3523                    if is_lazy_continuation {
3524                        // Additional check: if the line starts with uppercase and looks like a new sentence,
3525                        // it's probably not a continuation
3526                        // BUT: for blockquote lines with sufficient effective indent, always treat as continuation
3527                        let line_content_raw = line_info.content(content);
3528                        let block_bq_level_lazy = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3529                        let line_bq_level_lazy = line_content_raw
3530                            .chars()
3531                            .take_while(|c| *c == '>' || c.is_whitespace())
3532                            .filter(|&c| c == '>')
3533                            .count();
3534                        let has_proper_blockquote_indent =
3535                            if line_bq_level_lazy > 0 && line_bq_level_lazy == block_bq_level_lazy {
3536                                // Compute effective indent after blockquote markers
3537                                let mut pos = 0;
3538                                let mut found_markers = 0;
3539                                for c in line_content_raw.chars() {
3540                                    pos += c.len_utf8();
3541                                    if c == '>' {
3542                                        found_markers += 1;
3543                                        if found_markers == line_bq_level_lazy {
3544                                            if line_content_raw.get(pos..pos + 1) == Some(" ") {
3545                                                pos += 1;
3546                                            }
3547                                            break;
3548                                        }
3549                                    }
3550                                }
3551                                let after_bq = &line_content_raw[pos..];
3552                                let effective_indent_lazy = after_bq.len() - after_bq.trim_start().len();
3553                                let min_required_for_bq = if block.is_ordered { last_marker_width } else { 2 };
3554                                effective_indent_lazy >= min_required_for_bq
3555                            } else {
3556                                false
3557                            };
3558
3559                        // If it has proper blockquote indent, it's a continuation regardless of uppercase
3560                        if has_proper_blockquote_indent {
3561                            block.end_line = line_num;
3562                        } else {
3563                            let content_to_check = if !blockquote_prefix.is_empty() {
3564                                // Strip blockquote prefix to check the actual content
3565                                line_info
3566                                    .content(content)
3567                                    .strip_prefix(&blockquote_prefix)
3568                                    .unwrap_or(line_info.content(content))
3569                                    .trim()
3570                            } else {
3571                                line_info.content(content).trim()
3572                            };
3573
3574                            let starts_with_uppercase =
3575                                content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3576
3577                            // If it starts with uppercase and the previous line ended with punctuation,
3578                            // it's likely a new paragraph, not a continuation
3579                            if starts_with_uppercase && last_list_item_line > 0 {
3580                                // This looks like a new paragraph
3581                                list_blocks.push(block.clone());
3582                                current_block = None;
3583                            } else {
3584                                // This is a lazy continuation line
3585                                block.end_line = line_num;
3586                            }
3587                        }
3588                    } else {
3589                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3590                        list_blocks.push(block.clone());
3591                        current_block = None;
3592                    }
3593                }
3594            }
3595        }
3596
3597        // Don't forget the last block
3598        if let Some(block) = current_block {
3599            list_blocks.push(block);
3600        }
3601
3602        // Merge adjacent blocks that should be one
3603        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3604
3605        list_blocks
3606    }
3607
3608    /// Compute character frequency for fast content analysis
3609    fn compute_char_frequency(content: &str) -> CharFrequency {
3610        let mut frequency = CharFrequency::default();
3611
3612        for ch in content.chars() {
3613            match ch {
3614                '#' => frequency.hash_count += 1,
3615                '*' => frequency.asterisk_count += 1,
3616                '_' => frequency.underscore_count += 1,
3617                '-' => frequency.hyphen_count += 1,
3618                '+' => frequency.plus_count += 1,
3619                '>' => frequency.gt_count += 1,
3620                '|' => frequency.pipe_count += 1,
3621                '[' => frequency.bracket_count += 1,
3622                '`' => frequency.backtick_count += 1,
3623                '<' => frequency.lt_count += 1,
3624                '!' => frequency.exclamation_count += 1,
3625                '\n' => frequency.newline_count += 1,
3626                _ => {}
3627            }
3628        }
3629
3630        frequency
3631    }
3632
3633    /// Parse HTML tags in the content
3634    fn parse_html_tags(
3635        content: &str,
3636        lines: &[LineInfo],
3637        code_blocks: &[(usize, usize)],
3638        flavor: MarkdownFlavor,
3639    ) -> Vec<HtmlTag> {
3640        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3641            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3642
3643        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3644
3645        for cap in HTML_TAG_REGEX.captures_iter(content) {
3646            let full_match = cap.get(0).unwrap();
3647            let match_start = full_match.start();
3648            let match_end = full_match.end();
3649
3650            // Skip if in code block
3651            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3652                continue;
3653            }
3654
3655            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3656            let tag_name_original = cap.get(2).unwrap().as_str();
3657            let tag_name = tag_name_original.to_lowercase();
3658            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3659
3660            // Skip JSX components in MDX files (tags starting with uppercase letter)
3661            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3662            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3663                continue;
3664            }
3665
3666            // Find which line this tag is on
3667            let mut line_num = 1;
3668            let mut col_start = match_start;
3669            let mut col_end = match_end;
3670            for (idx, line_info) in lines.iter().enumerate() {
3671                if match_start >= line_info.byte_offset {
3672                    line_num = idx + 1;
3673                    col_start = match_start - line_info.byte_offset;
3674                    col_end = match_end - line_info.byte_offset;
3675                } else {
3676                    break;
3677                }
3678            }
3679
3680            html_tags.push(HtmlTag {
3681                line: line_num,
3682                start_col: col_start,
3683                end_col: col_end,
3684                byte_offset: match_start,
3685                byte_end: match_end,
3686                tag_name,
3687                is_closing,
3688                is_self_closing,
3689                raw_content: full_match.as_str().to_string(),
3690            });
3691        }
3692
3693        html_tags
3694    }
3695
3696    /// Parse table rows in the content
3697    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3698        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3699
3700        for (line_idx, line_info) in lines.iter().enumerate() {
3701            // Skip lines in code blocks or blank lines
3702            if line_info.in_code_block || line_info.is_blank {
3703                continue;
3704            }
3705
3706            let line = line_info.content(content);
3707            let line_num = line_idx + 1;
3708
3709            // Check if this line contains pipes (potential table row)
3710            if !line.contains('|') {
3711                continue;
3712            }
3713
3714            // Count columns by splitting on pipes
3715            let parts: Vec<&str> = line.split('|').collect();
3716            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3717
3718            // Check if this is a separator row
3719            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3720            let mut column_alignments = Vec::new();
3721
3722            if is_separator {
3723                for part in &parts[1..parts.len() - 1] {
3724                    // Skip first and last empty parts
3725                    let trimmed = part.trim();
3726                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3727                        "center".to_string()
3728                    } else if trimmed.ends_with(':') {
3729                        "right".to_string()
3730                    } else if trimmed.starts_with(':') {
3731                        "left".to_string()
3732                    } else {
3733                        "none".to_string()
3734                    };
3735                    column_alignments.push(alignment);
3736                }
3737            }
3738
3739            table_rows.push(TableRow {
3740                line: line_num,
3741                is_separator,
3742                column_count,
3743                column_alignments,
3744            });
3745        }
3746
3747        table_rows
3748    }
3749
3750    /// Parse bare URLs and emails in the content
3751    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3752        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3753
3754        // Check for bare URLs (not in angle brackets or markdown links)
3755        for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3756            let full_match = cap.get(0).unwrap();
3757            let match_start = full_match.start();
3758            let match_end = full_match.end();
3759
3760            // Skip if in code block
3761            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3762                continue;
3763            }
3764
3765            // Skip if already in angle brackets or markdown links
3766            let preceding_char = if match_start > 0 {
3767                content.chars().nth(match_start - 1)
3768            } else {
3769                None
3770            };
3771            let following_char = content.chars().nth(match_end);
3772
3773            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3774                continue;
3775            }
3776            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3777                continue;
3778            }
3779
3780            let url = full_match.as_str();
3781            let url_type = if url.starts_with("https://") {
3782                "https"
3783            } else if url.starts_with("http://") {
3784                "http"
3785            } else if url.starts_with("ftp://") {
3786                "ftp"
3787            } else {
3788                "other"
3789            };
3790
3791            // Find which line this URL is on
3792            let mut line_num = 1;
3793            let mut col_start = match_start;
3794            let mut col_end = match_end;
3795            for (idx, line_info) in lines.iter().enumerate() {
3796                if match_start >= line_info.byte_offset {
3797                    line_num = idx + 1;
3798                    col_start = match_start - line_info.byte_offset;
3799                    col_end = match_end - line_info.byte_offset;
3800                } else {
3801                    break;
3802                }
3803            }
3804
3805            bare_urls.push(BareUrl {
3806                line: line_num,
3807                start_col: col_start,
3808                end_col: col_end,
3809                byte_offset: match_start,
3810                byte_end: match_end,
3811                url: url.to_string(),
3812                url_type: url_type.to_string(),
3813            });
3814        }
3815
3816        // Check for bare email addresses
3817        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3818            let full_match = cap.get(0).unwrap();
3819            let match_start = full_match.start();
3820            let match_end = full_match.end();
3821
3822            // Skip if in code block
3823            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3824                continue;
3825            }
3826
3827            // Skip if already in angle brackets or markdown links
3828            let preceding_char = if match_start > 0 {
3829                content.chars().nth(match_start - 1)
3830            } else {
3831                None
3832            };
3833            let following_char = content.chars().nth(match_end);
3834
3835            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3836                continue;
3837            }
3838            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3839                continue;
3840            }
3841
3842            let email = full_match.as_str();
3843
3844            // Find which line this email is on
3845            let mut line_num = 1;
3846            let mut col_start = match_start;
3847            let mut col_end = match_end;
3848            for (idx, line_info) in lines.iter().enumerate() {
3849                if match_start >= line_info.byte_offset {
3850                    line_num = idx + 1;
3851                    col_start = match_start - line_info.byte_offset;
3852                    col_end = match_end - line_info.byte_offset;
3853                } else {
3854                    break;
3855                }
3856            }
3857
3858            bare_urls.push(BareUrl {
3859                line: line_num,
3860                start_col: col_start,
3861                end_col: col_end,
3862                byte_offset: match_start,
3863                byte_end: match_end,
3864                url: email.to_string(),
3865                url_type: "email".to_string(),
3866            });
3867        }
3868
3869        bare_urls
3870    }
3871
3872    /// Get an iterator over valid CommonMark headings
3873    ///
3874    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
3875    /// that should be flagged by MD018 but should not be processed by other heading rules.
3876    ///
3877    /// # Examples
3878    ///
3879    /// ```rust
3880    /// use rumdl_lib::lint_context::LintContext;
3881    /// use rumdl_lib::config::MarkdownFlavor;
3882    ///
3883    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
3884    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3885    ///
3886    /// for heading in ctx.valid_headings() {
3887    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
3888    /// }
3889    /// // Only prints valid headings, skips `#NoSpace`
3890    /// ```
3891    #[must_use]
3892    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
3893        ValidHeadingsIter::new(&self.lines)
3894    }
3895
3896    /// Check if the document contains any valid CommonMark headings
3897    ///
3898    /// Returns `true` if there is at least one heading with proper space after `#`.
3899    #[must_use]
3900    pub fn has_valid_headings(&self) -> bool {
3901        self.lines
3902            .iter()
3903            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3904    }
3905}
3906
3907/// Merge adjacent list blocks that should be treated as one
3908fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3909    if list_blocks.len() < 2 {
3910        return;
3911    }
3912
3913    let mut merger = ListBlockMerger::new(content, lines);
3914    *list_blocks = merger.merge(list_blocks);
3915}
3916
3917/// Helper struct to manage the complex logic of merging list blocks
3918struct ListBlockMerger<'a> {
3919    content: &'a str,
3920    lines: &'a [LineInfo],
3921}
3922
3923impl<'a> ListBlockMerger<'a> {
3924    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3925        Self { content, lines }
3926    }
3927
3928    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3929        let mut merged = Vec::with_capacity(list_blocks.len());
3930        let mut current = list_blocks[0].clone();
3931
3932        for next in list_blocks.iter().skip(1) {
3933            if self.should_merge_blocks(&current, next) {
3934                current = self.merge_two_blocks(current, next);
3935            } else {
3936                merged.push(current);
3937                current = next.clone();
3938            }
3939        }
3940
3941        merged.push(current);
3942        merged
3943    }
3944
3945    /// Determine if two adjacent list blocks should be merged
3946    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3947        // Basic compatibility checks
3948        if !self.blocks_are_compatible(current, next) {
3949            return false;
3950        }
3951
3952        // Check spacing and content between blocks
3953        let spacing = self.analyze_spacing_between(current, next);
3954        match spacing {
3955            BlockSpacing::Consecutive => true,
3956            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3957            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3958                self.can_merge_with_content_between(current, next)
3959            }
3960        }
3961    }
3962
3963    /// Check if blocks have compatible structure for merging
3964    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3965        current.is_ordered == next.is_ordered
3966            && current.blockquote_prefix == next.blockquote_prefix
3967            && current.nesting_level == next.nesting_level
3968    }
3969
3970    /// Analyze the spacing between two list blocks
3971    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3972        let gap = next.start_line - current.end_line;
3973
3974        match gap {
3975            1 => BlockSpacing::Consecutive,
3976            2 => BlockSpacing::SingleBlank,
3977            _ if gap > 2 => {
3978                if self.has_only_blank_lines_between(current, next) {
3979                    BlockSpacing::MultipleBlanks
3980                } else {
3981                    BlockSpacing::ContentBetween
3982                }
3983            }
3984            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3985        }
3986    }
3987
3988    /// Check if unordered lists can be merged with a single blank line between
3989    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3990        // Check if there are structural separators between the blocks
3991        // If has_meaningful_content_between returns true, it means there are structural separators
3992        if has_meaningful_content_between(self.content, current, next, self.lines) {
3993            return false; // Structural separators prevent merging
3994        }
3995
3996        // Only merge unordered lists with same marker across single blank
3997        !current.is_ordered && current.marker == next.marker
3998    }
3999
4000    /// Check if ordered lists can be merged when there's content between them
4001    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4002        // Do not merge lists if there are structural separators between them
4003        if has_meaningful_content_between(self.content, current, next, self.lines) {
4004            return false; // Structural separators prevent merging
4005        }
4006
4007        // Only consider merging ordered lists if there's no structural content between
4008        current.is_ordered && next.is_ordered
4009    }
4010
4011    /// Check if there are only blank lines between blocks
4012    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4013        for line_num in (current.end_line + 1)..next.start_line {
4014            if let Some(line_info) = self.lines.get(line_num - 1)
4015                && !line_info.content(self.content).trim().is_empty()
4016            {
4017                return false;
4018            }
4019        }
4020        true
4021    }
4022
4023    /// Merge two compatible list blocks into one
4024    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4025        current.end_line = next.end_line;
4026        current.item_lines.extend_from_slice(&next.item_lines);
4027
4028        // Update max marker width
4029        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4030
4031        // Handle marker consistency for unordered lists
4032        if !current.is_ordered && self.markers_differ(&current, next) {
4033            current.marker = None; // Mixed markers
4034        }
4035
4036        current
4037    }
4038
4039    /// Check if two blocks have different markers
4040    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4041        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4042    }
4043}
4044
/// How two list blocks are separated from one another.
#[derive(PartialEq, Debug)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
4053
4054/// Check if there's meaningful content (not just blank lines) between two list blocks
4055fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4056    // Check lines between current.end_line and next.start_line
4057    for line_num in (current.end_line + 1)..next.start_line {
4058        if let Some(line_info) = lines.get(line_num - 1) {
4059            // Convert to 0-indexed
4060            let trimmed = line_info.content(content).trim();
4061
4062            // Skip empty lines
4063            if trimmed.is_empty() {
4064                continue;
4065            }
4066
4067            // Check for structural separators that should separate lists (CommonMark compliant)
4068
4069            // Headings separate lists
4070            if line_info.heading.is_some() {
4071                return true; // Has meaningful content - headings separate lists
4072            }
4073
4074            // Horizontal rules separate lists (---, ***, ___)
4075            if is_horizontal_rule(trimmed) {
4076                return true; // Has meaningful content - horizontal rules separate lists
4077            }
4078
4079            // Tables separate lists
4080            if crate::utils::skip_context::is_table_line(trimmed) {
4081                return true; // Has meaningful content - tables separate lists
4082            }
4083
4084            // Blockquotes separate lists
4085            if trimmed.starts_with('>') {
4086                return true; // Has meaningful content - blockquotes separate lists
4087            }
4088
4089            // Code block fences separate lists (unless properly indented as list content)
4090            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4091                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4092
4093                // Check if this code block is properly indented as list continuation
4094                let min_continuation_indent = if current.is_ordered {
4095                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
4096                } else {
4097                    current.nesting_level + 2
4098                };
4099
4100                if line_indent < min_continuation_indent {
4101                    // This is a standalone code block that separates lists
4102                    return true; // Has meaningful content - standalone code blocks separate lists
4103                }
4104            }
4105
4106            // Check if this line has proper indentation for list continuation
4107            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4108
4109            // Calculate minimum indentation needed to be list continuation
4110            let min_indent = if current.is_ordered {
4111                current.nesting_level + current.max_marker_width
4112            } else {
4113                current.nesting_level + 2
4114            };
4115
4116            // If the line is not indented enough to be list continuation, it's meaningful content
4117            if line_indent < min_indent {
4118                return true; // Has meaningful content - content not indented as list continuation
4119            }
4120
4121            // If we reach here, the line is properly indented as list continuation
4122            // Continue checking other lines
4123        }
4124    }
4125
4126    // Only blank lines or properly indented list continuation content between blocks
4127    false
4128}
4129
/// Check if a line is a horizontal rule (---, ***, ___) per CommonMark spec.
///
/// CommonMark rules for thematic breaks (horizontal rules):
/// - May have 0-3 spaces of leading indentation (but NOT tabs)
/// - Must have 3+ of the same character (-, *, or _)
/// - May have spaces or tabs between and after the characters
/// - No other characters allowed
///
/// Takes the raw line (not trimmed) so the indentation can be checked.
/// Returns `true` only when both the indentation and the content qualify.
pub fn is_horizontal_rule_line(line: &str) -> bool {
    // CommonMark: HRs can have 0-3 spaces of leading indentation, not tabs.
    let after_spaces = line.trim_start_matches(' ');
    let leading_spaces = line.len() - after_spaces.len();

    // A tab anywhere in the indentation, including after 1-3 spaces, expands to
    // the next tab stop (column 4), which is too much indentation for a rule.
    if leading_spaces > 3 || after_spaces.starts_with('\t') {
        return false;
    }

    is_horizontal_rule_content(line.trim())
}

/// Check if trimmed content matches horizontal rule pattern.
/// Use `is_horizontal_rule_line` for full CommonMark compliance including indentation check.
///
/// `trimmed` must already have surrounding whitespace removed: the first
/// character has to be the rule character itself. Returns `true` when the
/// content holds at least three copies of that character and nothing else
/// except spaces and tabs.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three rule characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character fixes which rule character the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}

/// Backwards-compatible alias for `is_horizontal_rule_content`
pub fn is_horizontal_rule(trimmed: &str) -> bool {
    is_horizontal_rule_content(trimmed)
}
4175
/// Unit tests for `LintContext`: construction, offset/line mapping, and the
/// per-line metadata (list items, ESM blocks, blockquotes) it records.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_content() {
        // Empty input still yields a single line offset (0) but no line records.
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        // Line and column are both reported 1-based.
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // Byte offsets at which each of the four lines starts
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods (both take 1-based line numbers)
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Lines 4 and 5 ("   2) Nested ordered" and the blank line) are not asserted here.

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }

    #[test]
    fn test_blockquote_with_indented_content() {
        // Lines with `>` followed by heavily-indented content should be detected as blockquotes.
        // The content inside the blockquote may also be detected as a code block (which is correct),
        // but for MD046 purposes, we need to know the line is inside a blockquote.
        let content = r#"# Heading

>      -S socket-path
>                    More text
"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 3 (index 2) should be detected as blockquote
        assert!(
            ctx.lines.get(2).is_some_and(|l| l.blockquote.is_some()),
            "Line 3 should be a blockquote"
        );
        // Line 4 (index 3) should also be blockquote
        assert!(
            ctx.lines.get(3).is_some_and(|l| l.blockquote.is_some()),
            "Line 4 should be a blockquote"
        );

        // Verify blockquote content is correctly parsed:
        // the expected `content` excludes the `>` marker and the spaces that follow it.
        let bq3 = ctx.lines.get(2).unwrap().blockquote.as_ref().unwrap();
        assert_eq!(bq3.content, "-S socket-path");
        assert_eq!(bq3.nesting_level, 1);
        // 6 spaces after the `>` marker
        assert!(bq3.has_multiple_spaces_after_marker);

        let bq4 = ctx.lines.get(3).unwrap().blockquote.as_ref().unwrap();
        assert_eq!(bq4.content, "More text");
        assert_eq!(bq4.nesting_level, 1);
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs