rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use crate::utils::element_cache::ElementCache;
5use crate::utils::regex_cache::URL_SIMPLE_REGEX;
6use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
7use regex::Regex;
8use std::borrow::Cow;
9use std::path::PathBuf;
10use std::sync::LazyLock;
11
/// Runs a section of work and optionally reports how long it took.
///
/// `profile_section!(name, profile, code)` evaluates `code` and yields its value.
/// On non-WASM targets the section is timed with `std::time::Instant`, and when
/// `profile` is true the elapsed time is printed to stderr as
/// `[PROFILE] <name>: <duration>`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let start = std::time::Instant::now();
        let result = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}

/// WASM variant of `profile_section!`: performs no timing and simply yields `code`.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // Reference the otherwise-discarded arguments so that call sites which
        // bind a `profile` flag do not trigger `unused_variables` on wasm32.
        let _ = (&$name, &$profile);
        $code
    }};
}
29
// Link pattern covering both inline links `[text](url "title")` and
// reference links `[text][ref]`.
// Flags: `s` lets `.` match newlines; `x` enables verbose mode, so whitespace
// and the `#` comments inside the pattern are ignored by the regex engine.
// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
43
// Image pattern: same shape as the link pattern but requires a leading `!`,
// covering inline images `![alt](url "title")` and reference images `![alt][ref]`.
// Flags: `s` lets `.` match newlines; `x` enables verbose mode, so whitespace
// and the `#` comments inside the pattern are ignored by the regex engine.
// Capture groups: 1 = alt text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
57
// Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
// `(?m)` makes `^`/`$` match at line boundaries.
// Capture groups: 1 = reference ID, 2 = URL (a run of non-whitespace),
// 3 = double-quoted title, 4 = single-quoted title.
// NOTE(review): `$` does not match before `\r\n` by default — confirm how CRLF input is handled.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
61
// Bare URLs: no pattern is defined here; the shared `URL_SIMPLE_REGEX` imported from `utils::regex_cache` is used instead.
63
// Pattern for email addresses: local part, `@`, domain, and a final
// alphabetic label of at least two letters. The pattern is unanchored, so it
// matches an address anywhere inside the searched text.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
67
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix: optional leading whitespace, one or more
// consecutive `>` markers, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
70
/// Pre-computed information about a single line of the document.
///
/// The line text itself is not stored; only its byte position is. Use
/// [`LineInfo::content`] with the original source to obtain the text.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of bytes of leading whitespace (for substring extraction)
    pub indent: usize,
    /// Visual column width of leading whitespace (with proper tab expansion)
    /// Per CommonMark, tabs expand to the next column that is a multiple of 4.
    /// Use this for numeric comparisons like checking for indented code blocks (>= 4).
    pub visual_indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    /// (may describe a malformed heading; see `HeadingInfo::is_valid`)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
    /// Whether this line is a continuation of a multi-line code span from a previous line
    /// (set for every line after the first line of a code span that spans several lines)
    pub in_code_span_continuation: bool,
    /// Whether this line is a horizontal rule (---, ***, ___, etc.)
    /// Pre-computed for consistent detection across all rules
    pub is_horizontal_rule: bool,
    /// Whether this line is inside a math block ($$ ... $$)
    pub in_math_block: bool,
}
112
113impl LineInfo {
114    /// Get the line content as a string slice from the source document
115    pub fn content<'a>(&self, source: &'a str) -> &'a str {
116        &source[self.byte_offset..self.byte_offset + self.byte_len]
117    }
118}
119
/// Information about a list item, attached to the line on which the item
/// starts (see `LineInfo::list_item`).
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    /// (presumably `None` for unordered items — confirm against the parser)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
134
/// Heading style type.
///
/// Derives `Eq` in addition to `PartialEq`: the enum has no fields, so
/// equality is total, and `Eq` lets it be used wherever full equivalence is
/// required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
145
/// Parsed link information.
///
/// Text fields are `Cow<'a, str>`, so they can borrow from the source
/// document (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive end is not visible here — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
170
/// Information about a broken link reported by pulldown-cmark
/// (a reference that could not be resolved to a definition).
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document (half-open `start..end` range)
    pub span: std::ops::Range<usize>,
}
179
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix), e.g. `1` or `note`
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
192
/// Parsed image information.
///
/// Mirrors `ParsedLink` field for field, with `alt_text` in place of the link
/// text. Text fields are `Cow<'a, str>`, so they can borrow from the source
/// document or own their data.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive end is not visible here — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
217
/// Reference definition `[ref]: url "title"`.
///
/// All text is owned (`String`), unlike `ParsedLink`/`ParsedImage`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
    /// Byte offset where the title starts (if present, includes quote)
    pub title_byte_start: Option<usize>,
    /// Byte offset where the title ends (if present, includes quote)
    pub title_byte_end: Option<usize>,
}
238
/// Parsed code span information.
///
/// A span may cover several lines; in that case `end_line > line`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably relative to `end_line` for multi-line spans — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
259
/// Information about a heading, attached to its line via `LineInfo::heading`.
///
/// May describe a malformed heading; check `is_valid` (or iterate with
/// `ValidHeadingsIter`) when only CommonMark-compliant headings are wanted.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    /// (presumably empty when `has_closing_sequence` is false — confirm)
    pub closing_sequence: String,
    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
    /// False for malformed headings like `#NoSpace` that MD018 should flag
    pub is_valid: bool,
}
287
/// A valid heading from a filtered iteration
///
/// Only includes headings that are CommonMark-compliant (have space after #).
/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
///
/// Produced by `ValidHeadingsIter`; both references borrow from the same
/// slice of `LineInfo` the iterator walks.
#[derive(Debug, Clone)]
pub struct ValidHeading<'a> {
    /// The 1-indexed line number in the document
    pub line_num: usize,
    /// Reference to the heading information
    pub heading: &'a HeadingInfo,
    /// Reference to the full line info (for rules that need additional context)
    pub line_info: &'a LineInfo,
}
301
302/// Iterator over valid CommonMark headings in a document
303///
304/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
305/// but should not be processed by other heading rules.
306pub struct ValidHeadingsIter<'a> {
307    lines: &'a [LineInfo],
308    current_index: usize,
309}
310
311impl<'a> ValidHeadingsIter<'a> {
312    fn new(lines: &'a [LineInfo]) -> Self {
313        Self {
314            lines,
315            current_index: 0,
316        }
317    }
318}
319
320impl<'a> Iterator for ValidHeadingsIter<'a> {
321    type Item = ValidHeading<'a>;
322
323    fn next(&mut self) -> Option<Self::Item> {
324        while self.current_index < self.lines.len() {
325            let idx = self.current_index;
326            self.current_index += 1;
327
328            let line_info = &self.lines[idx];
329            if let Some(heading) = &line_info.heading
330                && heading.is_valid
331            {
332                return Some(ValidHeading {
333                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
334                    heading,
335                    line_info,
336                });
337            }
338        }
339        None
340    }
341}
342
/// Information about a blockquote line, attached via `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
363
/// Information about a list block (a run of list items treated as one list).
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    // NOTE(review): presumably 1-indexed like `start_line`/`end_line` — confirm.
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
384
385use std::sync::{Arc, OnceLock};
386
/// Map from line byte offset to list item data.
///
/// The tuple fields are, in order:
/// `(is_ordered, marker, marker_column, content_column, number)`,
/// matching the fields of `ListItemInfo`.
type ListItemMap = std::collections::HashMap<usize, (bool, String, usize, usize, Option<usize>)>;
389
/// Character frequency data for fast content analysis.
///
/// Each field counts occurrences of one character; the parenthesised notes
/// list the Markdown constructs that character can introduce.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
418
/// Pre-parsed HTML tag information (one entry per tag occurrence).
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    // NOTE(review): case normalization of the name is not visible here — confirm.
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
441
/// Pre-parsed emphasis span information (italic/bold runs delimited by `*` or `_`).
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
462
/// Pre-parsed table row information
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row.
    /// Each entry is one of the strings "left", "center", "right" or "none".
    // NOTE(review): presumably empty for non-separator rows — confirm against the parser.
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
475
/// Pre-parsed bare URL information (not in links).
///
/// Despite the name, email addresses are represented too (`url_type == "email"`).
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
494
/// Shared, pre-parsed view of one Markdown document.
///
/// Built once by `LintContext::new`. Most data is computed eagerly during
/// construction; the `*_cache` fields hold data behind `OnceLock`, some of
/// which is filled on first access rather than in `new`.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>, // Byte offset at which each line starts (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Inline code spans (seeded during construction)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Emphasis spans (seeded during construction)
    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
    has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
520
/// The pieces of a blockquote line, each borrowed from the original line.
///
/// Concatenating `indent`, `markers`, `spaces_after` and `content` in that
/// order reproduces the input line.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into its blockquote components without using a regex.
///
/// Leading spaces/tabs form the indent, the following run of consecutive `>`
/// characters forms the markers, the spaces/tabs after them form
/// `spaces_after`, and the rest of the line is the content.
///
/// Returns `None` when the first non-blank character is not `>` (including
/// empty and whitespace-only lines).
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    // Only spaces and tabs count as blank here; other whitespace is content.
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    let after_markers = after_indent.trim_start_matches('>');
    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        // No '>' directly after the indent: not a blockquote line.
        return None;
    }
    let markers = &after_indent[..marker_len];

    let content = after_markers.trim_start_matches(BLANKS);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
565
566impl<'a> LintContext<'a> {
    /// Build a `LintContext` for `content`, pre-computing the data shared by all rules.
    ///
    /// * `content` - the full Markdown document; the context borrows it for `'a`.
    /// * `flavor` - the Markdown flavor, which gates flavor-specific stages
    ///   (e.g. autodoc ranges are only detected for `MarkdownFlavor::MkDocs`).
    /// * `source_file` - optional path of the document, stored as-is.
    ///
    /// The stages run in a fixed order because later stages consume the results
    /// of earlier ones (for example, HTML/ESM block detection runs before
    /// heading detection, and code spans are parsed before links and images).
    ///
    /// On non-WASM targets, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable (to any value) makes each stage print its elapsed time to stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        // Profiling is enabled by the mere presence of the env var; it is always off on WASM.
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset of the start of every line: 0, plus the position after each '\n'.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information AND emphasis spans (without headings/blockquotes yet)
        // Emphasis spans are captured during the same pulldown-cmark parse as list detection
        let (mut lines, emphasis_spans) = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                // (line numbers are 1-indexed, hence the `- 1` when indexing `lines`)
                for line_num in (span.line + 1)..=span.end_line {
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Code spans were already parsed above, so the cache starts out filled.
            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
            list_blocks,
            char_frequency,
            // Left empty: filled on first call to the corresponding accessor.
            html_tags_cache: OnceLock::new(),
            // Emphasis spans came out of the basic line-info pass, so this cache starts out filled too.
            emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
            table_rows_cache: OnceLock::new(),
            bare_urls_cache: OnceLock::new(),
            has_mixed_list_nesting_cache: OnceLock::new(),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
725
726    /// Get code spans - computed lazily on first access
727    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
728        Arc::clone(
729            self.code_spans_cache
730                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
731        )
732    }
733
734    /// Get HTML comment ranges - pre-computed during LintContext construction
735    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
736        &self.html_comment_ranges
737    }
738
739    /// Get HTML tags - computed lazily on first access
740    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
741        Arc::clone(self.html_tags_cache.get_or_init(|| {
742            Arc::new(Self::parse_html_tags(
743                self.content,
744                &self.lines,
745                &self.code_blocks,
746                self.flavor,
747            ))
748        }))
749    }
750
751    /// Get emphasis spans - pre-computed during construction
752    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
753        Arc::clone(
754            self.emphasis_spans_cache
755                .get()
756                .expect("emphasis_spans_cache initialized during construction"),
757        )
758    }
759
760    /// Get table rows - computed lazily on first access
761    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
762        Arc::clone(
763            self.table_rows_cache
764                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
765        )
766    }
767
768    /// Get bare URLs - computed lazily on first access
769    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
770        Arc::clone(
771            self.bare_urls_cache
772                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
773        )
774    }
775
776    /// Check if document has mixed ordered/unordered list nesting.
777    /// Result is cached after first computation (document-level invariant).
778    /// This is used by MD007 for smart style auto-detection.
779    pub fn has_mixed_list_nesting(&self) -> bool {
780        *self
781            .has_mixed_list_nesting_cache
782            .get_or_init(|| self.compute_mixed_list_nesting())
783    }
784
785    /// Internal computation for mixed list nesting (only called once per LintContext).
786    fn compute_mixed_list_nesting(&self) -> bool {
787        // Track parent list items by their marker position and type
788        // Using marker_column instead of indent because it works correctly
789        // for blockquoted content where indent doesn't account for the prefix
790        // Stack stores: (marker_column, is_ordered)
791        let mut stack: Vec<(usize, bool)> = Vec::new();
792        let mut last_was_blank = false;
793
794        for line_info in &self.lines {
795            // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
796            if line_info.in_code_block
797                || line_info.in_front_matter
798                || line_info.in_mkdocstrings
799                || line_info.in_html_comment
800                || line_info.in_esm_block
801            {
802                continue;
803            }
804
805            // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
806            if line_info.is_blank {
807                last_was_blank = true;
808                continue;
809            }
810
811            if let Some(list_item) = &line_info.list_item {
812                // Normalize column 1 to column 0 (consistent with MD007 check function)
813                let current_pos = if list_item.marker_column == 1 {
814                    0
815                } else {
816                    list_item.marker_column
817                };
818
819                // If there was a blank line and this item is at root level, reset stack
820                if last_was_blank && current_pos == 0 {
821                    stack.clear();
822                }
823                last_was_blank = false;
824
825                // Pop items at same or greater position (they're siblings or deeper, not parents)
826                while let Some(&(pos, _)) = stack.last() {
827                    if pos >= current_pos {
828                        stack.pop();
829                    } else {
830                        break;
831                    }
832                }
833
834                // Check if immediate parent has different type - this is mixed nesting
835                if let Some(&(_, parent_is_ordered)) = stack.last()
836                    && parent_is_ordered != list_item.is_ordered
837                {
838                    return true; // Found mixed nesting - early exit
839                }
840
841                stack.push((current_pos, list_item.is_ordered));
842            } else {
843                // Non-list line (but not blank) - could be paragraph or other content
844                last_was_blank = false;
845            }
846        }
847
848        false
849    }
850
851    /// Map a byte offset to (line, column)
852    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
853        match self.line_offsets.binary_search(&offset) {
854            Ok(line) => (line + 1, 1),
855            Err(line) => {
856                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
857                (line, offset - line_start + 1)
858            }
859        }
860    }
861
862    /// Check if a position is within a code block or code span
863    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
864        // Check code blocks first
865        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
866            return true;
867        }
868
869        // Check inline code spans (lazy load if needed)
870        self.code_spans()
871            .iter()
872            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
873    }
874
875    /// Get line information by line number (1-indexed)
876    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
877        if line_num > 0 {
878            self.lines.get(line_num - 1)
879        } else {
880            None
881        }
882    }
883
884    /// Get byte offset for a line number (1-indexed)
885    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
886        self.line_info(line_num).map(|info| info.byte_offset)
887    }
888
889    /// Get URL for a reference link/image by its ID
890    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
891        let normalized_id = ref_id.to_lowercase();
892        self.reference_defs
893            .iter()
894            .find(|def| def.id == normalized_id)
895            .map(|def| def.url.as_str())
896    }
897
898    /// Check if a line is part of a list block
899    pub fn is_in_list_block(&self, line_num: usize) -> bool {
900        self.list_blocks
901            .iter()
902            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
903    }
904
905    /// Get the list block containing a specific line
906    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
907        self.list_blocks
908            .iter()
909            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
910    }
911
912    // Compatibility methods for DocumentStructure migration
913
914    /// Check if a line is within a code block
915    pub fn is_in_code_block(&self, line_num: usize) -> bool {
916        if line_num == 0 || line_num > self.lines.len() {
917            return false;
918        }
919        self.lines[line_num - 1].in_code_block
920    }
921
922    /// Check if a line is within front matter
923    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
924        if line_num == 0 || line_num > self.lines.len() {
925            return false;
926        }
927        self.lines[line_num - 1].in_front_matter
928    }
929
930    /// Check if a line is within an HTML block
931    pub fn is_in_html_block(&self, line_num: usize) -> bool {
932        if line_num == 0 || line_num > self.lines.len() {
933            return false;
934        }
935        self.lines[line_num - 1].in_html_block
936    }
937
938    /// Check if a line and column is within a code span
939    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
940        if line_num == 0 || line_num > self.lines.len() {
941            return false;
942        }
943
944        // Use the code spans cache to check
945        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
946        // Convert col to 0-indexed for comparison
947        let col_0indexed = if col > 0 { col - 1 } else { 0 };
948        let code_spans = self.code_spans();
949        code_spans.iter().any(|span| {
950            // Check if line is within the span's line range
951            if line_num < span.line || line_num > span.end_line {
952                return false;
953            }
954
955            if span.line == span.end_line {
956                // Single-line span: check column bounds
957                col_0indexed >= span.start_col && col_0indexed < span.end_col
958            } else if line_num == span.line {
959                // First line of multi-line span: anything after start_col is in span
960                col_0indexed >= span.start_col
961            } else if line_num == span.end_line {
962                // Last line of multi-line span: anything before end_col is in span
963                col_0indexed < span.end_col
964            } else {
965                // Middle line of multi-line span: entire line is in span
966                true
967            }
968        })
969    }
970
971    /// Check if a byte offset is within a code span
972    #[inline]
973    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
974        let code_spans = self.code_spans();
975        code_spans
976            .iter()
977            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
978    }
979
980    /// Check if a byte position is within a reference definition
981    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
982    #[inline]
983    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
984        self.reference_defs
985            .iter()
986            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
987    }
988
989    /// Check if a byte position is within an HTML comment
990    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
991    /// where k is the number of HTML comments (typically very small)
992    #[inline]
993    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
994        self.html_comment_ranges
995            .iter()
996            .any(|range| byte_pos >= range.start && byte_pos < range.end)
997    }
998
999    /// Check if a byte position is within an HTML tag (including multiline tags)
1000    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
1001    #[inline]
1002    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1003        self.html_tags()
1004            .iter()
1005            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1006    }
1007
1008    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
1009    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1010        self.jinja_ranges
1011            .iter()
1012            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1013    }
1014
1015    /// Check if a byte position is within a link reference definition title
1016    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1017        self.reference_defs.iter().any(|def| {
1018            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1019                byte_pos >= start && byte_pos < end
1020            } else {
1021                false
1022            }
1023        })
1024    }
1025
1026    /// Check if content has any instances of a specific character (fast)
1027    pub fn has_char(&self, ch: char) -> bool {
1028        match ch {
1029            '#' => self.char_frequency.hash_count > 0,
1030            '*' => self.char_frequency.asterisk_count > 0,
1031            '_' => self.char_frequency.underscore_count > 0,
1032            '-' => self.char_frequency.hyphen_count > 0,
1033            '+' => self.char_frequency.plus_count > 0,
1034            '>' => self.char_frequency.gt_count > 0,
1035            '|' => self.char_frequency.pipe_count > 0,
1036            '[' => self.char_frequency.bracket_count > 0,
1037            '`' => self.char_frequency.backtick_count > 0,
1038            '<' => self.char_frequency.lt_count > 0,
1039            '!' => self.char_frequency.exclamation_count > 0,
1040            '\n' => self.char_frequency.newline_count > 0,
1041            _ => self.content.contains(ch), // Fallback for other characters
1042        }
1043    }
1044
1045    /// Get count of a specific character (fast)
1046    pub fn char_count(&self, ch: char) -> usize {
1047        match ch {
1048            '#' => self.char_frequency.hash_count,
1049            '*' => self.char_frequency.asterisk_count,
1050            '_' => self.char_frequency.underscore_count,
1051            '-' => self.char_frequency.hyphen_count,
1052            '+' => self.char_frequency.plus_count,
1053            '>' => self.char_frequency.gt_count,
1054            '|' => self.char_frequency.pipe_count,
1055            '[' => self.char_frequency.bracket_count,
1056            '`' => self.char_frequency.backtick_count,
1057            '<' => self.char_frequency.lt_count,
1058            '!' => self.char_frequency.exclamation_count,
1059            '\n' => self.char_frequency.newline_count,
1060            _ => self.content.matches(ch).count(), // Fallback for other characters
1061        }
1062    }
1063
1064    /// Check if content likely contains headings (fast)
1065    pub fn likely_has_headings(&self) -> bool {
1066        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1067    }
1068
1069    /// Check if content likely contains lists (fast)
1070    pub fn likely_has_lists(&self) -> bool {
1071        self.char_frequency.asterisk_count > 0
1072            || self.char_frequency.hyphen_count > 0
1073            || self.char_frequency.plus_count > 0
1074    }
1075
1076    /// Check if content likely contains emphasis (fast)
1077    pub fn likely_has_emphasis(&self) -> bool {
1078        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1079    }
1080
1081    /// Check if content likely contains tables (fast)
1082    pub fn likely_has_tables(&self) -> bool {
1083        self.char_frequency.pipe_count > 2
1084    }
1085
1086    /// Check if content likely contains blockquotes (fast)
1087    pub fn likely_has_blockquotes(&self) -> bool {
1088        self.char_frequency.gt_count > 0
1089    }
1090
1091    /// Check if content likely contains code (fast)
1092    pub fn likely_has_code(&self) -> bool {
1093        self.char_frequency.backtick_count > 0
1094    }
1095
1096    /// Check if content likely contains links or images (fast)
1097    pub fn likely_has_links_or_images(&self) -> bool {
1098        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1099    }
1100
1101    /// Check if content likely contains HTML (fast)
1102    pub fn likely_has_html(&self) -> bool {
1103        self.char_frequency.lt_count > 0
1104    }
1105
1106    /// Get HTML tags on a specific line
1107    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1108        self.html_tags()
1109            .iter()
1110            .filter(|tag| tag.line == line_num)
1111            .cloned()
1112            .collect()
1113    }
1114
1115    /// Get emphasis spans on a specific line
1116    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1117        self.emphasis_spans()
1118            .iter()
1119            .filter(|span| span.line == line_num)
1120            .cloned()
1121            .collect()
1122    }
1123
1124    /// Get table rows on a specific line
1125    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1126        self.table_rows()
1127            .iter()
1128            .filter(|row| row.line == line_num)
1129            .cloned()
1130            .collect()
1131    }
1132
1133    /// Get bare URLs on a specific line
1134    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1135        self.bare_urls()
1136            .iter()
1137            .filter(|url| url.line == line_num)
1138            .cloned()
1139            .collect()
1140    }
1141
1142    /// Find the line index for a given byte offset using binary search.
1143    /// Returns (line_index, line_number, column) where:
1144    /// - line_index is the 0-based index in the lines array
1145    /// - line_number is the 1-based line number
1146    /// - column is the byte offset within that line
1147    #[inline]
1148    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1149        // Binary search to find the line containing this byte offset
1150        let idx = match lines.binary_search_by(|line| {
1151            if byte_offset < line.byte_offset {
1152                std::cmp::Ordering::Greater
1153            } else if byte_offset > line.byte_offset + line.byte_len {
1154                std::cmp::Ordering::Less
1155            } else {
1156                std::cmp::Ordering::Equal
1157            }
1158        }) {
1159            Ok(idx) => idx,
1160            Err(idx) => idx.saturating_sub(1),
1161        };
1162
1163        let line = &lines[idx];
1164        let line_num = idx + 1;
1165        let col = byte_offset.saturating_sub(line.byte_offset);
1166
1167        (idx, line_num, col)
1168    }
1169
1170    /// Check if a byte offset is within a code span using binary search
1171    #[inline]
1172    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1173        // Since spans are sorted by byte_offset, use partition_point for binary search
1174        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1175
1176        // Check the span that starts at or before our offset
1177        if idx > 0 {
1178            let span = &code_spans[idx - 1];
1179            if offset >= span.byte_offset && offset < span.byte_end {
1180                return true;
1181            }
1182        }
1183
1184        false
1185    }
1186
1187    /// Collect byte ranges of all links using pulldown-cmark
1188    /// This is used to skip heading detection for lines that fall within link syntax
1189    /// (e.g., multiline links like `[text](url\n#fragment)`)
1190    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1191        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1192
1193        let mut link_ranges = Vec::new();
1194        let mut options = Options::empty();
1195        options.insert(Options::ENABLE_WIKILINKS);
1196        options.insert(Options::ENABLE_FOOTNOTES);
1197
1198        let parser = Parser::new_ext(content, options).into_offset_iter();
1199        let mut link_stack: Vec<usize> = Vec::new();
1200
1201        for (event, range) in parser {
1202            match event {
1203                Event::Start(Tag::Link { .. }) => {
1204                    link_stack.push(range.start);
1205                }
1206                Event::End(TagEnd::Link) => {
1207                    if let Some(start_pos) = link_stack.pop() {
1208                        link_ranges.push((start_pos, range.end));
1209                    }
1210                }
1211                _ => {}
1212            }
1213        }
1214
1215        link_ranges
1216    }
1217
1218    /// Parse all links in the content
1219    fn parse_links(
1220        content: &'a str,
1221        lines: &[LineInfo],
1222        code_blocks: &[(usize, usize)],
1223        code_spans: &[CodeSpan],
1224        flavor: MarkdownFlavor,
1225        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1226    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1227        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1228        use std::collections::HashSet;
1229
1230        let mut links = Vec::with_capacity(content.len() / 500);
1231        let mut broken_links = Vec::new();
1232        let mut footnote_refs = Vec::new();
1233
1234        // Track byte positions of links found by pulldown-cmark
1235        let mut found_positions = HashSet::new();
1236
1237        // Use pulldown-cmark's streaming parser with BrokenLink callback
1238        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1239        // This automatically handles:
1240        // - Escaped links (won't generate events)
1241        // - Links in code blocks/spans (won't generate Link events)
1242        // - Images (generates Tag::Image instead)
1243        // - Reference resolution (dest_url is already resolved!)
1244        // - Broken references (callback is invoked)
1245        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1246        let mut options = Options::empty();
1247        options.insert(Options::ENABLE_WIKILINKS);
1248        options.insert(Options::ENABLE_FOOTNOTES);
1249
1250        let parser = Parser::new_with_broken_link_callback(
1251            content,
1252            options,
1253            Some(|link: BrokenLink<'_>| {
1254                broken_links.push(BrokenLinkInfo {
1255                    reference: link.reference.to_string(),
1256                    span: link.span.clone(),
1257                });
1258                None
1259            }),
1260        )
1261        .into_offset_iter();
1262
1263        let mut link_stack: Vec<(
1264            usize,
1265            usize,
1266            pulldown_cmark::CowStr<'a>,
1267            LinkType,
1268            pulldown_cmark::CowStr<'a>,
1269        )> = Vec::new();
1270        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1271
1272        for (event, range) in parser {
1273            match event {
1274                Event::Start(Tag::Link {
1275                    link_type,
1276                    dest_url,
1277                    id,
1278                    ..
1279                }) => {
1280                    // Link start - record position, URL, and reference ID
1281                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1282                    text_chunks.clear();
1283                }
1284                Event::Text(text) if !link_stack.is_empty() => {
1285                    // Track text content with its byte range
1286                    text_chunks.push((text.to_string(), range.start, range.end));
1287                }
1288                Event::Code(code) if !link_stack.is_empty() => {
1289                    // Include inline code in link text (with backticks)
1290                    let code_text = format!("`{code}`");
1291                    text_chunks.push((code_text, range.start, range.end));
1292                }
1293                Event::End(TagEnd::Link) => {
1294                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1295                        // Skip if in HTML comment
1296                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1297                            text_chunks.clear();
1298                            continue;
1299                        }
1300
1301                        // Find line and column information
1302                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303
1304                        // Skip if this link is on a MkDocs snippet line
1305                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1306                            text_chunks.clear();
1307                            continue;
1308                        }
1309
1310                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1311
1312                        let is_reference = matches!(
1313                            link_type,
1314                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1315                        );
1316
1317                        // Extract link text directly from source bytes to preserve escaping
1318                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1319                        let link_text = if start_pos < content.len() {
1320                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1321
1322                            // Find MATCHING ] by tracking bracket depth for nested brackets
1323                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1324                            // Brackets inside code spans (between backticks) should be ignored
1325                            let mut close_pos = None;
1326                            let mut depth = 0;
1327                            let mut in_code_span = false;
1328
1329                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1330                                // Count preceding backslashes
1331                                let mut backslash_count = 0;
1332                                let mut j = i;
1333                                while j > 0 && link_bytes[j - 1] == b'\\' {
1334                                    backslash_count += 1;
1335                                    j -= 1;
1336                                }
1337                                let is_escaped = backslash_count % 2 != 0;
1338
1339                                // Track code spans - backticks toggle in/out of code
1340                                if byte == b'`' && !is_escaped {
1341                                    in_code_span = !in_code_span;
1342                                }
1343
1344                                // Only count brackets when NOT in a code span
1345                                if !is_escaped && !in_code_span {
1346                                    if byte == b'[' {
1347                                        depth += 1;
1348                                    } else if byte == b']' {
1349                                        if depth == 0 {
1350                                            // Found the matching closing bracket
1351                                            close_pos = Some(i);
1352                                            break;
1353                                        } else {
1354                                            depth -= 1;
1355                                        }
1356                                    }
1357                                }
1358                            }
1359
1360                            if let Some(pos) = close_pos {
1361                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1362                            } else {
1363                                Cow::Borrowed("")
1364                            }
1365                        } else {
1366                            Cow::Borrowed("")
1367                        };
1368
1369                        // For reference links, use the actual reference ID from pulldown-cmark
1370                        let reference_id = if is_reference && !ref_id.is_empty() {
1371                            Some(Cow::Owned(ref_id.to_lowercase()))
1372                        } else if is_reference {
1373                            // For collapsed/shortcut references without explicit ID, use the link text
1374                            Some(Cow::Owned(link_text.to_lowercase()))
1375                        } else {
1376                            None
1377                        };
1378
1379                        // Track this position as found
1380                        found_positions.insert(start_pos);
1381
1382                        links.push(ParsedLink {
1383                            line: line_num,
1384                            start_col: col_start,
1385                            end_col: col_end,
1386                            byte_offset: start_pos,
1387                            byte_end: range.end,
1388                            text: link_text,
1389                            url: Cow::Owned(url.to_string()),
1390                            is_reference,
1391                            reference_id,
1392                            link_type,
1393                        });
1394
1395                        text_chunks.clear();
1396                    }
1397                }
1398                Event::FootnoteReference(footnote_id) => {
1399                    // Capture footnote references like [^1], [^note]
1400                    // Skip if in HTML comment
1401                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1402                        continue;
1403                    }
1404
1405                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1406                    footnote_refs.push(FootnoteRef {
1407                        id: footnote_id.to_string(),
1408                        line: line_num,
1409                        byte_offset: range.start,
1410                        byte_end: range.end,
1411                    });
1412                }
1413                _ => {}
1414            }
1415        }
1416
1417        // Also find undefined references using regex
1418        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1419        // because the reference is undefined
1420        for cap in LINK_PATTERN.captures_iter(content) {
1421            let full_match = cap.get(0).unwrap();
1422            let match_start = full_match.start();
1423            let match_end = full_match.end();
1424
1425            // Skip if this was already found by pulldown-cmark (it's a valid link)
1426            if found_positions.contains(&match_start) {
1427                continue;
1428            }
1429
1430            // Skip if escaped
1431            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1432                continue;
1433            }
1434
1435            // Skip if it's an image
1436            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1437                continue;
1438            }
1439
1440            // Skip if in code block
1441            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1442                continue;
1443            }
1444
1445            // Skip if in code span
1446            if Self::is_offset_in_code_span(code_spans, match_start) {
1447                continue;
1448            }
1449
1450            // Skip if in HTML comment
1451            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1452                continue;
1453            }
1454
1455            // Find line and column information
1456            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1457
1458            // Skip if this link is on a MkDocs snippet line
1459            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1460                continue;
1461            }
1462
1463            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1464
1465            let text = cap.get(1).map_or("", |m| m.as_str());
1466
1467            // Only process reference links (group 6)
1468            if let Some(ref_id) = cap.get(6) {
1469                let ref_id_str = ref_id.as_str();
1470                let normalized_ref = if ref_id_str.is_empty() {
1471                    Cow::Owned(text.to_lowercase()) // Implicit reference
1472                } else {
1473                    Cow::Owned(ref_id_str.to_lowercase())
1474                };
1475
1476                // This is an undefined reference (pulldown-cmark didn't parse it)
1477                links.push(ParsedLink {
1478                    line: line_num,
1479                    start_col: col_start,
1480                    end_col: col_end,
1481                    byte_offset: match_start,
1482                    byte_end: match_end,
1483                    text: Cow::Borrowed(text),
1484                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1485                    is_reference: true,
1486                    reference_id: Some(normalized_ref),
1487                    link_type: LinkType::Reference, // Undefined references are reference-style
1488                });
1489            }
1490        }
1491
1492        (links, broken_links, footnote_refs)
1493    }
1494
1495    /// Parse all images in the content
1496    fn parse_images(
1497        content: &'a str,
1498        lines: &[LineInfo],
1499        code_blocks: &[(usize, usize)],
1500        code_spans: &[CodeSpan],
1501        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1502    ) -> Vec<ParsedImage<'a>> {
1503        use crate::utils::skip_context::is_in_html_comment_ranges;
1504        use std::collections::HashSet;
1505
1506        // Pre-size based on a heuristic: images are less common than links
1507        let mut images = Vec::with_capacity(content.len() / 1000);
1508        let mut found_positions = HashSet::new();
1509
1510        // Use pulldown-cmark for parsing - more accurate and faster
1511        let parser = Parser::new(content).into_offset_iter();
1512        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1513            Vec::new();
1514        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1515
1516        for (event, range) in parser {
1517            match event {
1518                Event::Start(Tag::Image {
1519                    link_type,
1520                    dest_url,
1521                    id,
1522                    ..
1523                }) => {
1524                    image_stack.push((range.start, dest_url, link_type, id));
1525                    text_chunks.clear();
1526                }
1527                Event::Text(text) if !image_stack.is_empty() => {
1528                    text_chunks.push((text.to_string(), range.start, range.end));
1529                }
1530                Event::Code(code) if !image_stack.is_empty() => {
1531                    let code_text = format!("`{code}`");
1532                    text_chunks.push((code_text, range.start, range.end));
1533                }
1534                Event::End(TagEnd::Image) => {
1535                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1536                        // Skip if in code block
1537                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1538                            continue;
1539                        }
1540
1541                        // Skip if in code span
1542                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1543                            continue;
1544                        }
1545
1546                        // Skip if in HTML comment
1547                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1548                            continue;
1549                        }
1550
1551                        // Find line and column using binary search
1552                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1553                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1554
1555                        let is_reference = matches!(
1556                            link_type,
1557                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1558                        );
1559
1560                        // Extract alt text directly from source bytes to preserve escaping
1561                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1562                        let alt_text = if start_pos < content.len() {
1563                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1564
1565                            // Find MATCHING ] by tracking bracket depth for nested brackets
1566                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1567                            let mut close_pos = None;
1568                            let mut depth = 0;
1569
1570                            if image_bytes.len() > 2 {
1571                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1572                                    // Count preceding backslashes
1573                                    let mut backslash_count = 0;
1574                                    let mut j = i;
1575                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1576                                        backslash_count += 1;
1577                                        j -= 1;
1578                                    }
1579                                    let is_escaped = backslash_count % 2 != 0;
1580
1581                                    if !is_escaped {
1582                                        if byte == b'[' {
1583                                            depth += 1;
1584                                        } else if byte == b']' {
1585                                            if depth == 0 {
1586                                                // Found the matching closing bracket
1587                                                close_pos = Some(i);
1588                                                break;
1589                                            } else {
1590                                                depth -= 1;
1591                                            }
1592                                        }
1593                                    }
1594                                }
1595                            }
1596
1597                            if let Some(pos) = close_pos {
1598                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1599                            } else {
1600                                Cow::Borrowed("")
1601                            }
1602                        } else {
1603                            Cow::Borrowed("")
1604                        };
1605
1606                        let reference_id = if is_reference && !ref_id.is_empty() {
1607                            Some(Cow::Owned(ref_id.to_lowercase()))
1608                        } else if is_reference {
1609                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1610                        } else {
1611                            None
1612                        };
1613
1614                        found_positions.insert(start_pos);
1615                        images.push(ParsedImage {
1616                            line: line_num,
1617                            start_col: col_start,
1618                            end_col: col_end,
1619                            byte_offset: start_pos,
1620                            byte_end: range.end,
1621                            alt_text,
1622                            url: Cow::Owned(url.to_string()),
1623                            is_reference,
1624                            reference_id,
1625                            link_type,
1626                        });
1627                    }
1628                }
1629                _ => {}
1630            }
1631        }
1632
1633        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1634        for cap in IMAGE_PATTERN.captures_iter(content) {
1635            let full_match = cap.get(0).unwrap();
1636            let match_start = full_match.start();
1637            let match_end = full_match.end();
1638
1639            // Skip if already found by pulldown-cmark
1640            if found_positions.contains(&match_start) {
1641                continue;
1642            }
1643
1644            // Skip if the ! is escaped
1645            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1646                continue;
1647            }
1648
1649            // Skip if in code block, code span, or HTML comment
1650            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1651                || Self::is_offset_in_code_span(code_spans, match_start)
1652                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1653            {
1654                continue;
1655            }
1656
1657            // Only process reference images (undefined references not found by pulldown-cmark)
1658            if let Some(ref_id) = cap.get(6) {
1659                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1660                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1661                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1662                let ref_id_str = ref_id.as_str();
1663                let normalized_ref = if ref_id_str.is_empty() {
1664                    Cow::Owned(alt_text.to_lowercase())
1665                } else {
1666                    Cow::Owned(ref_id_str.to_lowercase())
1667                };
1668
1669                images.push(ParsedImage {
1670                    line: line_num,
1671                    start_col: col_start,
1672                    end_col: col_end,
1673                    byte_offset: match_start,
1674                    byte_end: match_end,
1675                    alt_text: Cow::Borrowed(alt_text),
1676                    url: Cow::Borrowed(""),
1677                    is_reference: true,
1678                    reference_id: Some(normalized_ref),
1679                    link_type: LinkType::Reference, // Undefined references are reference-style
1680                });
1681            }
1682        }
1683
1684        images
1685    }
1686
1687    /// Parse reference definitions
1688    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1689        // Pre-size based on lines count as reference definitions are line-based
1690        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1691
1692        for (line_idx, line_info) in lines.iter().enumerate() {
1693            // Skip lines in code blocks
1694            if line_info.in_code_block {
1695                continue;
1696            }
1697
1698            let line = line_info.content(content);
1699            let line_num = line_idx + 1;
1700
1701            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1702                let id = cap.get(1).unwrap().as_str().to_lowercase();
1703                let url = cap.get(2).unwrap().as_str().to_string();
1704                let title_match = cap.get(3).or_else(|| cap.get(4));
1705                let title = title_match.map(|m| m.as_str().to_string());
1706
1707                // Calculate byte positions
1708                // The match starts at the beginning of the line (0) and extends to the end
1709                let match_obj = cap.get(0).unwrap();
1710                let byte_offset = line_info.byte_offset + match_obj.start();
1711                let byte_end = line_info.byte_offset + match_obj.end();
1712
1713                // Calculate title byte positions (includes the quote character before content)
1714                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1715                    // The match is the content inside quotes, so we include the quote before
1716                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1717                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1718                    (Some(start), Some(end))
1719                } else {
1720                    (None, None)
1721                };
1722
1723                refs.push(ReferenceDef {
1724                    line: line_num,
1725                    id,
1726                    url,
1727                    title,
1728                    byte_offset,
1729                    byte_end,
1730                    title_byte_start,
1731                    title_byte_end,
1732                });
1733            }
1734        }
1735
1736        refs
1737    }
1738
1739    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1740    /// Handles nested blockquotes like `> > > content`
1741    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1742    #[inline]
1743    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1744        let trimmed_start = line.trim_start();
1745        if !trimmed_start.starts_with('>') {
1746            return None;
1747        }
1748
1749        // Track total prefix length to handle nested blockquotes
1750        let mut remaining = line;
1751        let mut total_prefix_len = 0;
1752
1753        loop {
1754            let trimmed = remaining.trim_start();
1755            if !trimmed.starts_with('>') {
1756                break;
1757            }
1758
1759            // Add leading whitespace + '>' to prefix
1760            let leading_ws_len = remaining.len() - trimmed.len();
1761            total_prefix_len += leading_ws_len + 1;
1762
1763            let after_gt = &trimmed[1..];
1764
1765            // Handle optional whitespace after '>' (space or tab)
1766            if let Some(stripped) = after_gt.strip_prefix(' ') {
1767                total_prefix_len += 1;
1768                remaining = stripped;
1769            } else if let Some(stripped) = after_gt.strip_prefix('\t') {
1770                total_prefix_len += 1;
1771                remaining = stripped;
1772            } else {
1773                remaining = after_gt;
1774            }
1775        }
1776
1777        Some((&line[..total_prefix_len], remaining))
1778    }
1779
1780    /// Detect list items using pulldown-cmark for CommonMark-compliant parsing.
1781    ///
1782    /// Returns a HashMap keyed by line byte offset, containing:
1783    /// `(is_ordered, marker, marker_column, content_column, number)`
1784    ///
1785    /// ## Why pulldown-cmark?
1786    /// Using pulldown-cmark instead of regex ensures we only detect actual list items,
1787    /// not lines that merely look like lists (e.g., continuation paragraphs, code blocks).
1788    /// This fixes issue #253 where continuation lines were falsely detected.
1789    ///
1790    /// ## Tab indentation quirk
1791    /// Pulldown-cmark reports nested list items at the newline character position
1792    /// when tab indentation is used. For example, in `"* Item\n\t- Nested"`,
1793    /// the nested item is reported at byte 7 (the `\n`), not byte 8 (the `\t`).
1794    /// We detect this and advance to the correct line.
1795    ///
1796    /// ## HashMap key strategy
1797    /// We use `entry().or_insert()` because pulldown-cmark may emit multiple events
1798    /// that resolve to the same line (after newline adjustment). The first event
1799    /// for each line is authoritative.
1800    /// Detect list items and emphasis spans in a single pulldown-cmark pass.
1801    /// Returns both list items (for LineInfo) and emphasis spans (for MD030).
1802    /// This avoids a separate parse for emphasis detection.
1803    fn detect_list_items_and_emphasis_with_pulldown(
1804        content: &str,
1805        line_offsets: &[usize],
1806        flavor: MarkdownFlavor,
1807        front_matter_end: usize,
1808        code_blocks: &[(usize, usize)],
1809    ) -> (ListItemMap, Vec<EmphasisSpan>) {
1810        use std::collections::HashMap;
1811
1812        let mut list_items = HashMap::new();
1813        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1814
1815        let mut options = Options::empty();
1816        options.insert(Options::ENABLE_TABLES);
1817        options.insert(Options::ENABLE_FOOTNOTES);
1818        options.insert(Options::ENABLE_STRIKETHROUGH);
1819        options.insert(Options::ENABLE_TASKLISTS);
1820        // Always enable GFM features for consistency with existing behavior
1821        options.insert(Options::ENABLE_GFM);
1822
1823        // Suppress unused variable warning
1824        let _ = flavor;
1825
1826        let parser = Parser::new_ext(content, options).into_offset_iter();
1827        let mut list_depth: usize = 0;
1828        let mut list_stack: Vec<bool> = Vec::new();
1829
1830        for (event, range) in parser {
1831            match event {
1832                // Capture emphasis spans (for MD030's emphasis detection)
1833                Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1834                    let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1835                        2
1836                    } else {
1837                        1
1838                    };
1839                    let match_start = range.start;
1840                    let match_end = range.end;
1841
1842                    // Skip if in code block
1843                    if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1844                        // Determine marker character by looking at the content at the start
1845                        let marker = content[match_start..].chars().next().unwrap_or('*');
1846                        if marker == '*' || marker == '_' {
1847                            // Extract content between markers
1848                            let content_start = match_start + marker_count;
1849                            let content_end = if match_end >= marker_count {
1850                                match_end - marker_count
1851                            } else {
1852                                match_end
1853                            };
1854                            let content_part = if content_start < content_end && content_end <= content.len() {
1855                                &content[content_start..content_end]
1856                            } else {
1857                                ""
1858                            };
1859
1860                            // Find which line this emphasis is on using line_offsets
1861                            let line_idx = match line_offsets.binary_search(&match_start) {
1862                                Ok(idx) => idx,
1863                                Err(idx) => idx.saturating_sub(1),
1864                            };
1865                            let line_num = line_idx + 1;
1866                            let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1867                            let col_start = match_start - line_start;
1868                            let col_end = match_end - line_start;
1869
1870                            emphasis_spans.push(EmphasisSpan {
1871                                line: line_num,
1872                                start_col: col_start,
1873                                end_col: col_end,
1874                                byte_offset: match_start,
1875                                byte_end: match_end,
1876                                marker,
1877                                marker_count,
1878                                content: content_part.to_string(),
1879                            });
1880                        }
1881                    }
1882                }
1883                Event::Start(Tag::List(start_number)) => {
1884                    list_depth += 1;
1885                    list_stack.push(start_number.is_some());
1886                }
1887                Event::End(TagEnd::List(_)) => {
1888                    list_depth = list_depth.saturating_sub(1);
1889                    list_stack.pop();
1890                }
1891                Event::Start(Tag::Item) if list_depth > 0 => {
1892                    // Get the ordered state for the CURRENT (innermost) list
1893                    let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1894                    // Find which line this byte offset corresponds to
1895                    let item_start = range.start;
1896
1897                    // Binary search to find the line number
1898                    let mut line_idx = match line_offsets.binary_search(&item_start) {
1899                        Ok(idx) => idx,
1900                        Err(idx) => idx.saturating_sub(1),
1901                    };
1902
1903                    // Pulldown-cmark reports nested list items at the newline before the item
1904                    // when using tab indentation (e.g., "* Item\n\t- Nested").
1905                    // Advance to the actual content line in this case.
1906                    if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1907                        line_idx += 1;
1908                    }
1909
1910                    // Skip list items in frontmatter (they are YAML/TOML syntax, not Markdown)
1911                    if front_matter_end > 0 && line_idx < front_matter_end {
1912                        continue;
1913                    }
1914
1915                    if line_idx < line_offsets.len() {
1916                        let line_start_byte = line_offsets[line_idx];
1917                        let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
1918                        let line = &content[line_start_byte..line_end.min(content.len())];
1919
1920                        // Strip trailing newline
1921                        let line = line
1922                            .strip_suffix('\n')
1923                            .or_else(|| line.strip_suffix("\r\n"))
1924                            .unwrap_or(line);
1925
1926                        // Strip blockquote prefix if present
1927                        let blockquote_parse = Self::parse_blockquote_prefix(line);
1928                        let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
1929                            (prefix.len(), content)
1930                        } else {
1931                            (0, line)
1932                        };
1933
1934                        // Parse the list marker from the actual line
1935                        if current_list_is_ordered {
1936                            if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1937                                Self::parse_ordered_list(line_to_parse)
1938                            {
1939                                let marker = format!("{number_str}{delimiter}");
1940                                let marker_column = blockquote_prefix_len + leading_spaces.len();
1941                                let content_column = marker_column + marker.len() + spacing.len();
1942                                let number = number_str.parse().ok();
1943
1944                                list_items.entry(line_start_byte).or_insert((
1945                                    true,
1946                                    marker,
1947                                    marker_column,
1948                                    content_column,
1949                                    number,
1950                                ));
1951                            }
1952                        } else if let Some((leading_spaces, marker, spacing, _content)) =
1953                            Self::parse_unordered_list(line_to_parse)
1954                        {
1955                            let marker_column = blockquote_prefix_len + leading_spaces.len();
1956                            let content_column = marker_column + 1 + spacing.len();
1957
1958                            list_items.entry(line_start_byte).or_insert((
1959                                false,
1960                                marker.to_string(),
1961                                marker_column,
1962                                content_column,
1963                                None,
1964                            ));
1965                        }
1966                    }
1967                }
1968                _ => {}
1969            }
1970        }
1971
1972        (list_items, emphasis_spans)
1973    }
1974
1975    /// Fast unordered list parser - replaces regex for 5-10x speedup
1976    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1977    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1978    #[inline]
1979    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1980        let bytes = line.as_bytes();
1981        let mut i = 0;
1982
1983        // Skip leading whitespace
1984        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1985            i += 1;
1986        }
1987
1988        // Check for marker
1989        if i >= bytes.len() {
1990            return None;
1991        }
1992        let marker = bytes[i] as char;
1993        if marker != '-' && marker != '*' && marker != '+' {
1994            return None;
1995        }
1996        let marker_pos = i;
1997        i += 1;
1998
1999        // Collect spacing after marker (space or tab only)
2000        let spacing_start = i;
2001        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2002            i += 1;
2003        }
2004
2005        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
2006    }
2007
2008    /// Fast ordered list parser - replaces regex for 5-10x speedup
2009    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
2010    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
2011    #[inline]
2012    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
2013        let bytes = line.as_bytes();
2014        let mut i = 0;
2015
2016        // Skip leading whitespace
2017        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2018            i += 1;
2019        }
2020
2021        // Collect digits
2022        let number_start = i;
2023        while i < bytes.len() && bytes[i].is_ascii_digit() {
2024            i += 1;
2025        }
2026        if i == number_start {
2027            return None; // No digits found
2028        }
2029
2030        // Check for delimiter
2031        if i >= bytes.len() {
2032            return None;
2033        }
2034        let delimiter = bytes[i] as char;
2035        if delimiter != '.' && delimiter != ')' {
2036            return None;
2037        }
2038        let delimiter_pos = i;
2039        i += 1;
2040
2041        // Collect spacing after delimiter (space or tab only)
2042        let spacing_start = i;
2043        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2044            i += 1;
2045        }
2046
2047        Some((
2048            &line[..number_start],
2049            &line[number_start..delimiter_pos],
2050            delimiter,
2051            &line[spacing_start..i],
2052            &line[i..],
2053        ))
2054    }
2055
2056    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
2057    /// Returns a Vec<bool> where index i indicates if line i is in a code block
2058    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
2059        let num_lines = line_offsets.len();
2060        let mut in_code_block = vec![false; num_lines];
2061
2062        // For each code block, mark all lines within it
2063        for &(start, end) in code_blocks {
2064            // Ensure we're at valid UTF-8 boundaries
2065            let safe_start = if start > 0 && !content.is_char_boundary(start) {
2066                let mut boundary = start;
2067                while boundary > 0 && !content.is_char_boundary(boundary) {
2068                    boundary -= 1;
2069                }
2070                boundary
2071            } else {
2072                start
2073            };
2074
2075            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
2076                let mut boundary = end;
2077                while boundary < content.len() && !content.is_char_boundary(boundary) {
2078                    boundary += 1;
2079                }
2080                boundary
2081            } else {
2082                end.min(content.len())
2083            };
2084
2085            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
2086            // That function now has proper list context awareness (see code_block_utils.rs)
2087            // and correctly distinguishes between:
2088            // - Fenced code blocks (``` or ~~~)
2089            // - Indented code blocks at document level (4 spaces + blank line before)
2090            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
2091            //
2092            // We no longer need to re-validate here. The original validation logic
2093            // was causing false positives by marking list continuation paragraphs as
2094            // code blocks when they have 4 spaces of indentation.
2095
2096            // Use binary search to find the first and last line indices
2097            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
2098            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
2099            //
2100            // Find the line that CONTAINS safe_start: the line with the largest
2101            // start offset that is <= safe_start. partition_point gives us the
2102            // first line that starts AFTER safe_start, so we subtract 1.
2103            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
2104            let first_line = first_line_after.saturating_sub(1);
2105            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
2106
2107            // Mark all lines in the range at once
2108            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
2109                *flag = true;
2110            }
2111        }
2112
2113        in_code_block
2114    }
2115
2116    /// Pre-compute which lines are inside math blocks ($$ ... $$) - O(n) single pass
2117    /// Returns a Vec<bool> where index i indicates if line i is in a math block
2118    fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
2119        let content_lines: Vec<&str> = content.lines().collect();
2120        let num_lines = content_lines.len();
2121        let mut in_math_block = vec![false; num_lines];
2122
2123        let mut inside_math = false;
2124
2125        for (i, line) in content_lines.iter().enumerate() {
2126            // Skip lines that are in code blocks - math delimiters inside code are literal
2127            if code_block_map.get(i).copied().unwrap_or(false) {
2128                continue;
2129            }
2130
2131            let trimmed = line.trim();
2132
2133            // Check for math block delimiter ($$)
2134            // A line with just $$ toggles the math block state
2135            if trimmed == "$$" {
2136                if inside_math {
2137                    // Closing delimiter - this line is still part of the math block
2138                    in_math_block[i] = true;
2139                    inside_math = false;
2140                } else {
2141                    // Opening delimiter - this line starts the math block
2142                    in_math_block[i] = true;
2143                    inside_math = true;
2144                }
2145            } else if inside_math {
2146                // Content inside math block
2147                in_math_block[i] = true;
2148            }
2149        }
2150
2151        in_math_block
2152    }
2153
2154    /// Pre-compute basic line information (without headings/blockquotes)
2155    /// Also returns emphasis spans detected during the pulldown-cmark parse
2156    fn compute_basic_line_info(
2157        content: &str,
2158        line_offsets: &[usize],
2159        code_blocks: &[(usize, usize)],
2160        flavor: MarkdownFlavor,
2161        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2162        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2163    ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2164        let content_lines: Vec<&str> = content.lines().collect();
2165        let mut lines = Vec::with_capacity(content_lines.len());
2166
2167        // Pre-compute which lines are in code blocks
2168        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2169
2170        // Pre-compute which lines are in math blocks ($$ ... $$)
2171        let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2172
2173        // Detect front matter boundaries FIRST, before any other parsing
2174        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
2175        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2176
2177        // Use pulldown-cmark to detect list items AND emphasis spans in a single pass
2178        // (context-aware, eliminates false positives)
2179        let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2180            content,
2181            line_offsets,
2182            flavor,
2183            front_matter_end,
2184            code_blocks,
2185        );
2186
2187        for (i, line) in content_lines.iter().enumerate() {
2188            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2189            let indent = line.len() - line.trim_start().len();
2190            // Compute visual indent with proper CommonMark tab expansion
2191            let visual_indent = ElementCache::calculate_indentation_width_default(line);
2192
2193            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
2194            let blockquote_parse = Self::parse_blockquote_prefix(line);
2195
2196            // For blank detection, consider blockquote context
2197            let is_blank = if let Some((_, content)) = blockquote_parse {
2198                // In blockquote context, check if content after prefix is blank
2199                content.trim().is_empty()
2200            } else {
2201                line.trim().is_empty()
2202            };
2203
2204            // Use pre-computed map for O(1) lookup instead of O(m) iteration
2205            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2206
2207            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
2208            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2209                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2210            // Check if the ENTIRE line is within an HTML comment (not just the line start)
2211            // This ensures content after `-->` on the same line is not incorrectly skipped
2212            let line_end_offset = byte_offset + line.len();
2213            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2214                html_comment_ranges,
2215                byte_offset,
2216                line_end_offset,
2217            );
2218            // Use pulldown-cmark's list detection for context-aware parsing
2219            // This eliminates false positives on continuation lines (issue #253)
2220            let list_item =
2221                list_item_map
2222                    .get(&byte_offset)
2223                    .map(
2224                        |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2225                            marker: marker.clone(),
2226                            is_ordered: *is_ordered,
2227                            number: *number,
2228                            marker_column: *marker_column,
2229                            content_column: *content_column,
2230                        },
2231                    );
2232
2233            // Detect horizontal rules (only outside code blocks and frontmatter)
2234            // Uses CommonMark-compliant check including leading indentation validation
2235            let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2236            let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2237
2238            // Get math block status for this line
2239            let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2240
2241            lines.push(LineInfo {
2242                byte_offset,
2243                byte_len: line.len(),
2244                indent,
2245                visual_indent,
2246                is_blank,
2247                in_code_block,
2248                in_front_matter,
2249                in_html_block: false, // Will be populated after line creation
2250                in_html_comment,
2251                list_item,
2252                heading: None,    // Will be populated in second pass for Setext headings
2253                blockquote: None, // Will be populated after line creation
2254                in_mkdocstrings,
2255                in_esm_block: false, // Will be populated after line creation for MDX files
2256                in_code_span_continuation: false, // Will be populated after code spans are parsed
2257                is_horizontal_rule: is_hr,
2258                in_math_block,
2259            });
2260        }
2261
2262        (lines, emphasis_spans)
2263    }
2264
2265    /// Detect headings and blockquotes (called after HTML block detection)
2266    fn detect_headings_and_blockquotes(
2267        content: &str,
2268        lines: &mut [LineInfo],
2269        flavor: MarkdownFlavor,
2270        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2271        link_byte_ranges: &[(usize, usize)],
2272    ) {
2273        // Regex for heading detection
2274        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
2275            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
2276        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
2277            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
2278
2279        let content_lines: Vec<&str> = content.lines().collect();
2280
2281        // Detect front matter boundaries to skip those lines
2282        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2283
2284        // Detect headings (including Setext which needs look-ahead) and blockquotes
2285        for i in 0..lines.len() {
2286            if lines[i].in_code_block {
2287                continue;
2288            }
2289
2290            // Skip lines in front matter
2291            if front_matter_end > 0 && i < front_matter_end {
2292                continue;
2293            }
2294
2295            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
2296            if lines[i].in_html_block {
2297                continue;
2298            }
2299
2300            let line = content_lines[i];
2301
2302            // Check for blockquotes (even on blank lines within blockquotes)
2303            if let Some(bq) = parse_blockquote_detailed(line) {
2304                let nesting_level = bq.markers.len(); // Each '>' is one level
2305                let marker_column = bq.indent.len();
2306
2307                // Build the prefix (indentation + markers + space)
2308                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
2309
2310                // Check for various blockquote issues
2311                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
2312                // Only flag multiple literal spaces, not tabs
2313                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
2314                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
2315
2316                // Check if needs MD028 fix (empty blockquote line without proper spacing)
2317                // MD028 flags empty blockquote lines that don't have a single space after the marker
2318                // Lines like "> " or ">> " are already correct and don't need fixing
2319                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
2320
2321                lines[i].blockquote = Some(BlockquoteInfo {
2322                    nesting_level,
2323                    indent: bq.indent.to_string(),
2324                    marker_column,
2325                    prefix,
2326                    content: bq.content.to_string(),
2327                    has_no_space_after_marker: has_no_space,
2328                    has_multiple_spaces_after_marker: has_multiple_spaces,
2329                    needs_md028_fix,
2330                });
2331            }
2332
2333            // Skip heading detection for blank lines
2334            if lines[i].is_blank {
2335                continue;
2336            }
2337
2338            // Check for ATX headings (but skip MkDocs snippet lines)
2339            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
2340            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
2341                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
2342                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
2343            } else {
2344                false
2345            };
2346
2347            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
2348                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
2349                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
2350                    continue;
2351                }
2352                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
2353                // This prevents false positives where `#fragment` is detected as a heading
2354                let line_offset = lines[i].byte_offset;
2355                if link_byte_ranges
2356                    .iter()
2357                    .any(|&(start, end)| line_offset > start && line_offset < end)
2358                {
2359                    continue;
2360                }
2361                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
2362                let hashes = caps.get(2).map_or("", |m| m.as_str());
2363                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
2364                let rest = caps.get(4).map_or("", |m| m.as_str());
2365
2366                let level = hashes.len() as u8;
2367                let marker_column = leading_spaces.len();
2368
2369                // Check for closing sequence, but handle custom IDs that might come after
2370                let (text, has_closing, closing_seq) = {
2371                    // First check if there's a custom ID at the end
2372                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
2373                        // Check if this looks like a valid custom ID (ends with })
2374                        if rest[id_start..].trim_end().ends_with('}') {
2375                            // Split off the custom ID
2376                            (&rest[..id_start], &rest[id_start..])
2377                        } else {
2378                            (rest, "")
2379                        }
2380                    } else {
2381                        (rest, "")
2382                    };
2383
2384                    // Now look for closing hashes in the part before the custom ID
2385                    let trimmed_rest = rest_without_id.trim_end();
2386                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
2387                        // Find the start of the hash sequence by walking backwards
2388                        // Use char_indices to get byte positions at char boundaries
2389                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();
2390
2391                        // Find which char index corresponds to last_hash_byte_pos
2392                        let last_hash_char_idx = char_positions
2393                            .iter()
2394                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);
2395
2396                        if let Some(mut char_idx) = last_hash_char_idx {
2397                            // Walk backwards to find start of hash sequence
2398                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
2399                                char_idx -= 1;
2400                            }
2401
2402                            // Get the byte position of the start of hashes
2403                            let start_of_hashes = char_positions[char_idx].0;
2404
2405                            // Check if there's at least one space before the closing hashes
2406                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();
2407
2408                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
2409                            let potential_closing = &trimmed_rest[start_of_hashes..];
2410                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');
2411
2412                            if is_all_hashes && has_space_before {
2413                                // This is a closing sequence
2414                                let closing_hashes = potential_closing.to_string();
2415                                // The text is everything before the closing hashes
2416                                // Don't include the custom ID here - it will be extracted later
2417                                let text_part = if !custom_id_part.is_empty() {
2418                                    // If we have a custom ID, append it back to get the full rest
2419                                    // This allows the extract_header_id function to handle it properly
2420                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
2421                                } else {
2422                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
2423                                };
2424                                (text_part, true, closing_hashes)
2425                            } else {
2426                                // Not a valid closing sequence, return the full content
2427                                (rest.to_string(), false, String::new())
2428                            }
2429                        } else {
2430                            // Couldn't find char boundary, return the full content
2431                            (rest.to_string(), false, String::new())
2432                        }
2433                    } else {
2434                        // No hashes found, return the full content
2435                        (rest.to_string(), false, String::new())
2436                    }
2437                };
2438
2439                let content_column = marker_column + hashes.len() + spaces_after.len();
2440
2441                // Extract custom header ID if present
2442                let raw_text = text.trim().to_string();
2443                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2444
2445                // If no custom ID was found on the header line, check the next line for standalone attr-list
2446                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2447                    let next_line = content_lines[i + 1];
2448                    if !lines[i + 1].in_code_block
2449                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2450                        && let Some(next_line_id) =
2451                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2452                    {
2453                        custom_id = Some(next_line_id);
2454                    }
2455                }
2456
2457                // ATX heading is "valid" for processing by heading rules if:
2458                // 1. Has space after # (CommonMark compliant): `# Heading`
2459                // 2. Is empty (just hashes): `#`
2460                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
2461                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
2462                //
2463                // Invalid patterns (hashtag-like) are skipped by most heading rules:
2464                // - `#tag` - single # with lowercase (social hashtag)
2465                // - `#123` - single # with number (GitHub issue ref)
2466                let is_valid = !spaces_after.is_empty()
2467                    || rest.is_empty()
2468                    || level > 1
2469                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());
2470
2471                lines[i].heading = Some(HeadingInfo {
2472                    level,
2473                    style: HeadingStyle::ATX,
2474                    marker: hashes.to_string(),
2475                    marker_column,
2476                    content_column,
2477                    text: clean_text,
2478                    custom_id,
2479                    raw_text,
2480                    has_closing_sequence: has_closing,
2481                    closing_sequence: closing_seq,
2482                    is_valid,
2483                });
2484            }
2485            // Check for Setext headings (need to look at next line)
2486            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2487                let next_line = content_lines[i + 1];
2488                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2489                    // Skip if next line is front matter delimiter
2490                    if front_matter_end > 0 && i < front_matter_end {
2491                        continue;
2492                    }
2493
2494                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2495                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2496                    {
2497                        continue;
2498                    }
2499
2500                    // Per CommonMark spec 4.3, setext heading content cannot be interpretable as:
2501                    // list item, ATX heading, block quote, thematic break, code fence, or HTML block
2502                    let content_line = line.trim();
2503
2504                    // Skip list items (-, *, +) and thematic breaks (---, ***, etc.)
2505                    if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
2506                        continue;
2507                    }
2508
2509                    // Skip underscore thematic breaks (___)
2510                    if content_line.starts_with('_') {
2511                        let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
2512                        if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
2513                            continue;
2514                        }
2515                    }
2516
2517                    // Skip numbered lists (1. Item, 2. Item, etc.)
2518                    if let Some(first_char) = content_line.chars().next()
2519                        && first_char.is_ascii_digit()
2520                    {
2521                        let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
2522                        if num_end < content_line.len() {
2523                            let next = content_line.chars().nth(num_end);
2524                            if next == Some('.') || next == Some(')') {
2525                                continue;
2526                            }
2527                        }
2528                    }
2529
2530                    // Skip ATX headings
2531                    if ATX_HEADING_REGEX.is_match(line) {
2532                        continue;
2533                    }
2534
2535                    // Skip blockquotes
2536                    if content_line.starts_with('>') {
2537                        continue;
2538                    }
2539
2540                    // Skip code fences
2541                    let trimmed_start = line.trim_start();
2542                    if trimmed_start.len() >= 3 {
2543                        let first_three: String = trimmed_start.chars().take(3).collect();
2544                        if first_three == "```" || first_three == "~~~" {
2545                            continue;
2546                        }
2547                    }
2548
2549                    // Skip HTML blocks
2550                    if content_line.starts_with('<') {
2551                        continue;
2552                    }
2553
2554                    let underline = next_line.trim();
2555
2556                    let level = if underline.starts_with('=') { 1 } else { 2 };
2557                    let style = if level == 1 {
2558                        HeadingStyle::Setext1
2559                    } else {
2560                        HeadingStyle::Setext2
2561                    };
2562
2563                    // Extract custom header ID if present
2564                    let raw_text = line.trim().to_string();
2565                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2566
2567                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2568                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2569                        let attr_line = content_lines[i + 2];
2570                        if !lines[i + 2].in_code_block
2571                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2572                            && let Some(attr_line_id) =
2573                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2574                        {
2575                            custom_id = Some(attr_line_id);
2576                        }
2577                    }
2578
2579                    lines[i].heading = Some(HeadingInfo {
2580                        level,
2581                        style,
2582                        marker: underline.to_string(),
2583                        marker_column: next_line.len() - next_line.trim_start().len(),
2584                        content_column: lines[i].indent,
2585                        text: clean_text,
2586                        custom_id,
2587                        raw_text,
2588                        has_closing_sequence: false,
2589                        closing_sequence: String::new(),
2590                        is_valid: true, // Setext headings are always valid
2591                    });
2592                }
2593            }
2594        }
2595    }
2596
2597    /// Detect HTML blocks in the content
2598    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2599        // HTML block elements that trigger block context
2600        // Includes HTML5 media, embedded content, and interactive elements
2601        const BLOCK_ELEMENTS: &[&str] = &[
2602            "address",
2603            "article",
2604            "aside",
2605            "audio",
2606            "blockquote",
2607            "canvas",
2608            "details",
2609            "dialog",
2610            "dd",
2611            "div",
2612            "dl",
2613            "dt",
2614            "embed",
2615            "fieldset",
2616            "figcaption",
2617            "figure",
2618            "footer",
2619            "form",
2620            "h1",
2621            "h2",
2622            "h3",
2623            "h4",
2624            "h5",
2625            "h6",
2626            "header",
2627            "hr",
2628            "iframe",
2629            "li",
2630            "main",
2631            "menu",
2632            "nav",
2633            "noscript",
2634            "object",
2635            "ol",
2636            "p",
2637            "picture",
2638            "pre",
2639            "script",
2640            "search",
2641            "section",
2642            "source",
2643            "style",
2644            "summary",
2645            "svg",
2646            "table",
2647            "tbody",
2648            "td",
2649            "template",
2650            "textarea",
2651            "tfoot",
2652            "th",
2653            "thead",
2654            "tr",
2655            "track",
2656            "ul",
2657            "video",
2658        ];
2659
2660        let mut i = 0;
2661        while i < lines.len() {
2662            // Skip if already in code block or front matter
2663            if lines[i].in_code_block || lines[i].in_front_matter {
2664                i += 1;
2665                continue;
2666            }
2667
2668            let trimmed = lines[i].content(content).trim_start();
2669
2670            // Check if line starts with an HTML tag
2671            if trimmed.starts_with('<') && trimmed.len() > 1 {
2672                // Extract tag name safely
2673                let after_bracket = &trimmed[1..];
2674                let is_closing = after_bracket.starts_with('/');
2675                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2676
2677                // Extract tag name (stop at space, >, /, or end of string)
2678                let tag_name = tag_start
2679                    .chars()
2680                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2681                    .collect::<String>()
2682                    .to_lowercase();
2683
2684                // Check if it's a block element
2685                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2686                    // Mark this line as in HTML block
2687                    lines[i].in_html_block = true;
2688
2689                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2690                    // This avoids complex nesting logic that might cause infinite loops
2691                    if !is_closing {
2692                        let closing_tag = format!("</{tag_name}>");
2693                        // style and script tags can contain blank lines (CSS/JS formatting)
2694                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2695                        let mut j = i + 1;
2696                        let mut found_closing_tag = false;
2697                        while j < lines.len() && j < i + 100 {
2698                            // Limit search to 100 lines
2699                            // Stop at blank lines (except for style/script tags)
2700                            if !allow_blank_lines && lines[j].is_blank {
2701                                break;
2702                            }
2703
2704                            lines[j].in_html_block = true;
2705
2706                            // Check if this line contains the closing tag
2707                            if lines[j].content(content).contains(&closing_tag) {
2708                                found_closing_tag = true;
2709                            }
2710
2711                            // After finding closing tag, continue marking lines as
2712                            // in_html_block until blank line (per CommonMark spec)
2713                            if found_closing_tag {
2714                                j += 1;
2715                                // Continue marking subsequent lines until blank
2716                                while j < lines.len() && j < i + 100 {
2717                                    if lines[j].is_blank {
2718                                        break;
2719                                    }
2720                                    lines[j].in_html_block = true;
2721                                    j += 1;
2722                                }
2723                                break;
2724                            }
2725                            j += 1;
2726                        }
2727                    }
2728                }
2729            }
2730
2731            i += 1;
2732        }
2733    }
2734
2735    /// Detect ESM import/export blocks in MDX files
2736    /// ESM blocks consist of contiguous import/export statements at the top of the file
2737    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2738        // Only process MDX files
2739        if !flavor.supports_esm_blocks() {
2740            return;
2741        }
2742
2743        let mut in_multiline_comment = false;
2744
2745        for line in lines.iter_mut() {
2746            // Skip blank lines and HTML comments
2747            if line.is_blank || line.in_html_comment {
2748                continue;
2749            }
2750
2751            let trimmed = line.content(content).trim_start();
2752
2753            // Handle continuation of multi-line JS comments
2754            if in_multiline_comment {
2755                if trimmed.contains("*/") {
2756                    in_multiline_comment = false;
2757                }
2758                continue;
2759            }
2760
2761            // Skip single-line JS comments (// and ///)
2762            if trimmed.starts_with("//") {
2763                continue;
2764            }
2765
2766            // Handle start of multi-line JS comment
2767            if trimmed.starts_with("/*") {
2768                if !trimmed.contains("*/") {
2769                    in_multiline_comment = true;
2770                }
2771                continue;
2772            }
2773
2774            // Check if line starts with import or export
2775            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2776                line.in_esm_block = true;
2777            } else {
2778                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2779                break;
2780            }
2781        }
2782    }
2783
2784    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2785    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2786        let mut code_spans = Vec::new();
2787
2788        // Quick check - if no backticks, no code spans
2789        if !content.contains('`') {
2790            return code_spans;
2791        }
2792
2793        // Use pulldown-cmark's streaming parser with byte offsets
2794        let parser = Parser::new(content).into_offset_iter();
2795
2796        for (event, range) in parser {
2797            if let Event::Code(_) = event {
2798                let start_pos = range.start;
2799                let end_pos = range.end;
2800
2801                // The range includes the backticks, extract the actual content
2802                let full_span = &content[start_pos..end_pos];
2803                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2804
2805                // Extract content between backticks, preserving spaces
2806                let content_start = start_pos + backtick_count;
2807                let content_end = end_pos - backtick_count;
2808                let span_content = if content_start < content_end {
2809                    content[content_start..content_end].to_string()
2810                } else {
2811                    String::new()
2812                };
2813
2814                // Use binary search to find line number - O(log n) instead of O(n)
2815                // Find the rightmost line whose byte_offset <= start_pos
2816                let line_idx = lines
2817                    .partition_point(|line| line.byte_offset <= start_pos)
2818                    .saturating_sub(1);
2819                let line_num = line_idx + 1;
2820                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2821
2822                // Find end column using binary search
2823                let end_line_idx = lines
2824                    .partition_point(|line| line.byte_offset <= end_pos)
2825                    .saturating_sub(1);
2826                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2827
2828                // Convert byte offsets to character positions for correct Unicode handling
2829                // This ensures consistency with warning.column which uses character positions
2830                let line_content = lines[line_idx].content(content);
2831                let col_start = if byte_col_start <= line_content.len() {
2832                    line_content[..byte_col_start].chars().count()
2833                } else {
2834                    line_content.chars().count()
2835                };
2836
2837                let end_line_content = lines[end_line_idx].content(content);
2838                let col_end = if byte_col_end <= end_line_content.len() {
2839                    end_line_content[..byte_col_end].chars().count()
2840                } else {
2841                    end_line_content.chars().count()
2842                };
2843
2844                code_spans.push(CodeSpan {
2845                    line: line_num,
2846                    end_line: end_line_idx + 1,
2847                    start_col: col_start,
2848                    end_col: col_end,
2849                    byte_offset: start_pos,
2850                    byte_end: end_pos,
2851                    backtick_count,
2852                    content: span_content,
2853                });
2854            }
2855        }
2856
2857        // Sort by position to ensure consistent ordering
2858        code_spans.sort_by_key(|span| span.byte_offset);
2859
2860        code_spans
2861    }
2862
2863    /// Parse all list blocks in the content (legacy line-by-line approach)
2864    ///
2865    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2866    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2867    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2868    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2869    ///   treated as list continuation (based on the list marker width)
2870    ///
2871    /// When a new list item is encountered, we check if list-breaking content was seen
2872    /// since the last item. If so, we start a new list block.
2873    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2874        // Minimum indentation for unordered list continuation per CommonMark spec
2875        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2876
2877        /// Initialize or reset the forward-scanning tracking state.
2878        /// This helper eliminates code duplication across three initialization sites.
2879        #[inline]
2880        fn reset_tracking_state(
2881            list_item: &ListItemInfo,
2882            has_list_breaking_content: &mut bool,
2883            min_continuation: &mut usize,
2884        ) {
2885            *has_list_breaking_content = false;
2886            let marker_width = if list_item.is_ordered {
2887                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2888            } else {
2889                list_item.marker.len()
2890            };
2891            *min_continuation = if list_item.is_ordered {
2892                marker_width
2893            } else {
2894                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2895            };
2896        }
2897
2898        // Pre-size based on lines that could be list items
2899        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2900        let mut current_block: Option<ListBlock> = None;
2901        let mut last_list_item_line = 0;
2902        let mut current_indent_level = 0;
2903        let mut last_marker_width = 0;
2904
2905        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2906        let mut has_list_breaking_content_since_last_item = false;
2907        let mut min_continuation_for_tracking = 0;
2908
2909        for (line_idx, line_info) in lines.iter().enumerate() {
2910            let line_num = line_idx + 1;
2911
2912            // Enhanced code block handling using Design #3's context analysis
2913            if line_info.in_code_block {
2914                if let Some(ref mut block) = current_block {
2915                    // Calculate minimum indentation for list continuation
2916                    let min_continuation_indent =
2917                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2918
2919                    // Analyze code block context using the three-tier classification
2920                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2921
2922                    match context {
2923                        CodeBlockContext::Indented => {
2924                            // Code block is properly indented - continues the list
2925                            block.end_line = line_num;
2926                            continue;
2927                        }
2928                        CodeBlockContext::Standalone => {
2929                            // Code block separates lists - end current block
2930                            let completed_block = current_block.take().unwrap();
2931                            list_blocks.push(completed_block);
2932                            continue;
2933                        }
2934                        CodeBlockContext::Adjacent => {
2935                            // Edge case - use conservative behavior (continue list)
2936                            block.end_line = line_num;
2937                            continue;
2938                        }
2939                    }
2940                } else {
2941                    // No current list block - skip code block lines
2942                    continue;
2943                }
2944            }
2945
2946            // Extract blockquote prefix if any
2947            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2948                caps.get(0).unwrap().as_str().to_string()
2949            } else {
2950                String::new()
2951            };
2952
2953            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2954            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2955            if let Some(ref block) = current_block
2956                && line_info.list_item.is_none()
2957                && !line_info.is_blank
2958                && !line_info.in_code_span_continuation
2959            {
2960                let line_content = line_info.content(content).trim();
2961
2962                // Check for structural separators that break lists
2963                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2964                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2965                // as they indicate improper indentation rather than lazy continuation.
2966                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2967
2968                // Check if blockquote context changes (different prefix than current block)
2969                // Lines within the SAME blockquote context don't break lists
2970                let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
2971
2972                let breaks_list = line_info.heading.is_some()
2973                    || line_content.starts_with("---")
2974                    || line_content.starts_with("***")
2975                    || line_content.starts_with("___")
2976                    || crate::utils::skip_context::is_table_line(line_content)
2977                    || blockquote_prefix_changes
2978                    || (line_info.indent > 0
2979                        && line_info.indent < min_continuation_for_tracking
2980                        && !is_lazy_continuation);
2981
2982                if breaks_list {
2983                    has_list_breaking_content_since_last_item = true;
2984                }
2985            }
2986
2987            // If this line is a code span continuation within an active list block,
2988            // extend the block's end_line to include this line (maintains list continuity)
2989            if line_info.in_code_span_continuation
2990                && line_info.list_item.is_none()
2991                && let Some(ref mut block) = current_block
2992            {
2993                block.end_line = line_num;
2994            }
2995
2996            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2997            // properly indented lines within the list). This ensures the workaround at line 2448
2998            // works correctly when there are multiple continuation lines before a nested list item.
2999            // Also include lazy continuation lines (indent=0) per CommonMark spec.
3000            // For blockquote lines, compute effective indent after stripping the prefix
3001            let effective_continuation_indent = if let Some(ref block) = current_block {
3002                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3003                let line_content = line_info.content(content);
3004                let line_bq_level = line_content
3005                    .chars()
3006                    .take_while(|c| *c == '>' || c.is_whitespace())
3007                    .filter(|&c| c == '>')
3008                    .count();
3009                if line_bq_level > 0 && line_bq_level == block_bq_level {
3010                    // Compute indent after blockquote markers
3011                    let mut pos = 0;
3012                    let mut found_markers = 0;
3013                    for c in line_content.chars() {
3014                        pos += c.len_utf8();
3015                        if c == '>' {
3016                            found_markers += 1;
3017                            if found_markers == line_bq_level {
3018                                if line_content.get(pos..pos + 1) == Some(" ") {
3019                                    pos += 1;
3020                                }
3021                                break;
3022                            }
3023                        }
3024                    }
3025                    let after_bq = &line_content[pos..];
3026                    after_bq.len() - after_bq.trim_start().len()
3027                } else {
3028                    line_info.indent
3029                }
3030            } else {
3031                line_info.indent
3032            };
3033            let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3034                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3035                if block_bq_level > 0 {
3036                    if block.is_ordered { last_marker_width } else { 2 }
3037                } else {
3038                    min_continuation_for_tracking
3039                }
3040            } else {
3041                min_continuation_for_tracking
3042            };
3043            let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3044                || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
3045
3046            if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3047                eprintln!(
3048                    "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3049                    line_num,
3050                    effective_continuation_indent,
3051                    adjusted_min_continuation_for_tracking,
3052                    is_valid_continuation,
3053                    line_info.in_code_span_continuation,
3054                    line_info.in_code_block,
3055                    current_block.is_some()
3056                );
3057            }
3058
3059            if !line_info.in_code_span_continuation
3060                && line_info.list_item.is_none()
3061                && !line_info.is_blank
3062                && !line_info.in_code_block
3063                && is_valid_continuation
3064                && let Some(ref mut block) = current_block
3065            {
3066                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3067                    eprintln!(
3068                        "[DEBUG] Line {}: extending block.end_line from {} to {}",
3069                        line_num, block.end_line, line_num
3070                    );
3071                }
3072                block.end_line = line_num;
3073            }
3074
3075            // Check if this line is a list item
3076            if let Some(list_item) = &line_info.list_item {
3077                // Calculate nesting level based on indentation
3078                let item_indent = list_item.marker_column;
3079                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
3080
3081                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3082                    eprintln!(
3083                        "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3084                        line_num, list_item.marker, item_indent
3085                    );
3086                }
3087
3088                if let Some(ref mut block) = current_block {
3089                    // Check if this continues the current block
3090                    // For nested lists, we need to check if this is a nested item (higher nesting level)
3091                    // or a continuation at the same or lower level
3092                    let is_nested = nesting > block.nesting_level;
3093                    let same_type =
3094                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3095                    let same_context = block.blockquote_prefix == blockquote_prefix;
3096                    // Allow one blank line after last item, or lines immediately after block content
3097                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3098
3099                    // For unordered lists, also check marker consistency
3100                    let marker_compatible =
3101                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3102
3103                    // O(1) check: Use the tracked variable instead of O(n) nested loop
3104                    // This eliminates the quadratic bottleneck from issue #148
3105                    let has_non_list_content = has_list_breaking_content_since_last_item;
3106
3107                    // A list continues if:
3108                    // 1. It's a nested item (indented more than the parent), OR
3109                    // 2. It's the same type at the same level with reasonable distance
3110                    let mut continues_list = if is_nested {
3111                        // Nested items always continue the list if they're in the same context
3112                        same_context && reasonable_distance && !has_non_list_content
3113                    } else {
3114                        // Same-level items need to match type and markers
3115                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3116                    };
3117
3118                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3119                        eprintln!(
3120                            "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3121                            line_num,
3122                            continues_list,
3123                            is_nested,
3124                            same_type,
3125                            same_context,
3126                            reasonable_distance,
3127                            marker_compatible,
3128                            has_non_list_content,
3129                            last_list_item_line,
3130                            block.end_line
3131                        );
3132                    }
3133
3134                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
3135                    // This handles edge cases where content patterns might otherwise split lists incorrectly
3136                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
3137                        // Check if the previous line was a list item or a continuation of a list item
3138                        // (including lazy continuation lines)
3139                        if block.item_lines.contains(&(line_num - 1)) {
3140                            // They're consecutive list items - force them to be in the same list
3141                            continues_list = true;
3142                        } else {
3143                            // Previous line is a continuation line within this block
3144                            // (e.g., lazy continuation with indent=0)
3145                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
3146                            continues_list = true;
3147                        }
3148                    }
3149
3150                    if continues_list {
3151                        // Extend current block
3152                        block.end_line = line_num;
3153                        block.item_lines.push(line_num);
3154
3155                        // Update max marker width
3156                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3157                            list_item.marker.len() + 1
3158                        } else {
3159                            list_item.marker.len()
3160                        });
3161
3162                        // Update marker consistency for unordered lists
3163                        if !block.is_ordered
3164                            && block.marker.is_some()
3165                            && block.marker.as_ref() != Some(&list_item.marker)
3166                        {
3167                            // Mixed markers, clear the marker field
3168                            block.marker = None;
3169                        }
3170
3171                        // Reset tracked state for issue #148 optimization
3172                        reset_tracking_state(
3173                            list_item,
3174                            &mut has_list_breaking_content_since_last_item,
3175                            &mut min_continuation_for_tracking,
3176                        );
3177                    } else {
3178                        // End current block and start a new one
3179
3180                        list_blocks.push(block.clone());
3181
3182                        *block = ListBlock {
3183                            start_line: line_num,
3184                            end_line: line_num,
3185                            is_ordered: list_item.is_ordered,
3186                            marker: if list_item.is_ordered {
3187                                None
3188                            } else {
3189                                Some(list_item.marker.clone())
3190                            },
3191                            blockquote_prefix: blockquote_prefix.clone(),
3192                            item_lines: vec![line_num],
3193                            nesting_level: nesting,
3194                            max_marker_width: if list_item.is_ordered {
3195                                list_item.marker.len() + 1
3196                            } else {
3197                                list_item.marker.len()
3198                            },
3199                        };
3200
3201                        // Initialize tracked state for new block (issue #148 optimization)
3202                        reset_tracking_state(
3203                            list_item,
3204                            &mut has_list_breaking_content_since_last_item,
3205                            &mut min_continuation_for_tracking,
3206                        );
3207                    }
3208                } else {
3209                    // Start a new block
3210                    current_block = Some(ListBlock {
3211                        start_line: line_num,
3212                        end_line: line_num,
3213                        is_ordered: list_item.is_ordered,
3214                        marker: if list_item.is_ordered {
3215                            None
3216                        } else {
3217                            Some(list_item.marker.clone())
3218                        },
3219                        blockquote_prefix,
3220                        item_lines: vec![line_num],
3221                        nesting_level: nesting,
3222                        max_marker_width: list_item.marker.len(),
3223                    });
3224
3225                    // Initialize tracked state for new block (issue #148 optimization)
3226                    reset_tracking_state(
3227                        list_item,
3228                        &mut has_list_breaking_content_since_last_item,
3229                        &mut min_continuation_for_tracking,
3230                    );
3231                }
3232
3233                last_list_item_line = line_num;
3234                current_indent_level = item_indent;
3235                last_marker_width = if list_item.is_ordered {
3236                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
3237                } else {
3238                    list_item.marker.len()
3239                };
3240            } else if let Some(ref mut block) = current_block {
3241                // Not a list item - check if it continues the current block
3242                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3243                    eprintln!(
3244                        "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3245                        line_num, line_info.is_blank
3246                    );
3247                }
3248
3249                // For MD032 compatibility, we use a simple approach:
3250                // - Indented lines continue the list
3251                // - Blank lines followed by indented content continue the list
3252                // - Everything else ends the list
3253
3254                // Check if the last line in the list block ended with a backslash (hard line break)
3255                // This handles cases where list items use backslash for hard line breaks
3256                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3257                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3258                } else {
3259                    false
3260                };
3261
3262                // Calculate minimum indentation for list continuation
3263                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
3264                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
3265                let min_continuation_indent = if block.is_ordered {
3266                    current_indent_level + last_marker_width
3267                } else {
3268                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
3269                };
3270
3271                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3272                    // Indented line or backslash continuation continues the list
3273                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3274                        eprintln!(
3275                            "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3276                            line_num, line_info.indent, min_continuation_indent
3277                        );
3278                    }
3279                    block.end_line = line_num;
3280                } else if line_info.is_blank {
3281                    // Blank line - check if it's internal to the list or ending it
3282                    // We only include blank lines that are followed by more list content
3283                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3284                        eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3285                    }
3286                    let mut check_idx = line_idx + 1;
3287                    let mut found_continuation = false;
3288
3289                    // Skip additional blank lines
3290                    while check_idx < lines.len() && lines[check_idx].is_blank {
3291                        check_idx += 1;
3292                    }
3293
3294                    if check_idx < lines.len() {
3295                        let next_line = &lines[check_idx];
3296                        // For blockquote lines, compute indent AFTER stripping the blockquote prefix
3297                        let next_content = next_line.content(content);
3298                        // Use blockquote level (count of >) to compare, not the full prefix
3299                        // This avoids issues where the regex captures extra whitespace
3300                        let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3301                        let next_bq_level_for_indent = next_content
3302                            .chars()
3303                            .take_while(|c| *c == '>' || c.is_whitespace())
3304                            .filter(|&c| c == '>')
3305                            .count();
3306                        let effective_indent =
3307                            if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3308                                // For lines in the same blockquote context, compute indent after the blockquote marker(s)
3309                                // Find position after ">" and one space
3310                                let mut pos = 0;
3311                                let mut found_markers = 0;
3312                                for c in next_content.chars() {
3313                                    pos += c.len_utf8();
3314                                    if c == '>' {
3315                                        found_markers += 1;
3316                                        if found_markers == next_bq_level_for_indent {
3317                                            // Skip optional space after last >
3318                                            if next_content.get(pos..pos + 1) == Some(" ") {
3319                                                pos += 1;
3320                                            }
3321                                            break;
3322                                        }
3323                                    }
3324                                }
3325                                let after_blockquote_marker = &next_content[pos..];
3326                                after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3327                            } else {
3328                                next_line.indent
3329                            };
3330                        // Also adjust min_continuation_indent for blockquote lists
3331                        // The marker_column includes blockquote prefix, so subtract it
3332                        let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3333                            // For blockquote lists, the continuation is relative to blockquote content
3334                            // current_indent_level includes blockquote prefix (2 for "> "), so use just 2 for unordered
3335                            if block.is_ordered { last_marker_width } else { 2 }
3336                        } else {
3337                            min_continuation_indent
3338                        };
3339                        // Check if followed by indented content (list continuation)
3340                        if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3341                            eprintln!(
3342                                "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3343                                line_num,
3344                                check_idx + 1,
3345                                effective_indent,
3346                                adjusted_min_continuation,
3347                                next_line.list_item.is_some(),
3348                                next_line.in_code_block
3349                            );
3350                        }
3351                        if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3352                            found_continuation = true;
3353                        }
3354                        // Check if followed by another list item at the same level
3355                        else if !next_line.in_code_block
3356                            && next_line.list_item.is_some()
3357                            && let Some(item) = &next_line.list_item
3358                        {
3359                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3360                                .find(next_line.content(content))
3361                                .map_or(String::new(), |m| m.as_str().to_string());
3362                            if item.marker_column == current_indent_level
3363                                && item.is_ordered == block.is_ordered
3364                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3365                            {
3366                                // Check if there was meaningful content between the list items (unused now)
3367                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
3368                                // Pre-compute block's blockquote level for use in closures
3369                                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3370                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3371                                    if let Some(between_line) = lines.get(idx) {
3372                                        let between_content = between_line.content(content);
3373                                        let trimmed = between_content.trim();
3374                                        // Skip empty lines
3375                                        if trimmed.is_empty() {
3376                                            return false;
3377                                        }
3378                                        // Check for meaningful content
3379                                        let line_indent = between_content.len() - between_content.trim_start().len();
3380
3381                                        // Check if blockquote level changed (not just if line starts with ">")
3382                                        let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3383                                            .find(between_content)
3384                                            .map_or(String::new(), |m| m.as_str().to_string());
3385                                        let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3386                                        let blockquote_level_changed =
3387                                            trimmed.starts_with(">") && between_bq_level != block_bq_level;
3388
3389                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
3390                                        if trimmed.starts_with("```")
3391                                            || trimmed.starts_with("~~~")
3392                                            || trimmed.starts_with("---")
3393                                            || trimmed.starts_with("***")
3394                                            || trimmed.starts_with("___")
3395                                            || blockquote_level_changed
3396                                            || crate::utils::skip_context::is_table_line(trimmed)
3397                                            || between_line.heading.is_some()
3398                                        {
3399                                            return true; // These are structural separators - meaningful content that breaks lists
3400                                        }
3401
3402                                        // Only properly indented content continues the list
3403                                        line_indent >= min_continuation_indent
3404                                    } else {
3405                                        false
3406                                    }
3407                                });
3408
3409                                if block.is_ordered {
3410                                    // For ordered lists: don't continue if there are structural separators
3411                                    // Check if there are structural separators between the list items
3412                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3413                                        if let Some(between_line) = lines.get(idx) {
3414                                            let between_content = between_line.content(content);
3415                                            let trimmed = between_content.trim();
3416                                            if trimmed.is_empty() {
3417                                                return false;
3418                                            }
3419                                            // Check if blockquote level changed (not just if line starts with ">")
3420                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3421                                                .find(between_content)
3422                                                .map_or(String::new(), |m| m.as_str().to_string());
3423                                            let between_bq_level =
3424                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3425                                            let blockquote_level_changed =
3426                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3427                                            // Check for structural separators that break lists
3428                                            trimmed.starts_with("```")
3429                                                || trimmed.starts_with("~~~")
3430                                                || trimmed.starts_with("---")
3431                                                || trimmed.starts_with("***")
3432                                                || trimmed.starts_with("___")
3433                                                || blockquote_level_changed
3434                                                || crate::utils::skip_context::is_table_line(trimmed)
3435                                                || between_line.heading.is_some()
3436                                        } else {
3437                                            false
3438                                        }
3439                                    });
3440                                    found_continuation = !has_structural_separators;
3441                                } else {
3442                                    // For unordered lists: also check for structural separators
3443                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3444                                        if let Some(between_line) = lines.get(idx) {
3445                                            let between_content = between_line.content(content);
3446                                            let trimmed = between_content.trim();
3447                                            if trimmed.is_empty() {
3448                                                return false;
3449                                            }
3450                                            // Check if blockquote level changed (not just if line starts with ">")
3451                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3452                                                .find(between_content)
3453                                                .map_or(String::new(), |m| m.as_str().to_string());
3454                                            let between_bq_level =
3455                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3456                                            let blockquote_level_changed =
3457                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3458                                            // Check for structural separators that break lists
3459                                            trimmed.starts_with("```")
3460                                                || trimmed.starts_with("~~~")
3461                                                || trimmed.starts_with("---")
3462                                                || trimmed.starts_with("***")
3463                                                || trimmed.starts_with("___")
3464                                                || blockquote_level_changed
3465                                                || crate::utils::skip_context::is_table_line(trimmed)
3466                                                || between_line.heading.is_some()
3467                                        } else {
3468                                            false
3469                                        }
3470                                    });
3471                                    found_continuation = !has_structural_separators;
3472                                }
3473                            }
3474                        }
3475                    }
3476
3477                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3478                        eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3479                    }
3480                    if found_continuation {
3481                        // Include the blank line in the block
3482                        block.end_line = line_num;
3483                    } else {
3484                        // Blank line ends the list - don't include it
3485                        list_blocks.push(block.clone());
3486                        current_block = None;
3487                    }
3488                } else {
3489                    // Check for lazy continuation - non-indented line immediately after a list item
3490                    // But only if the line has sufficient indentation for the list type
3491                    let min_required_indent = if block.is_ordered {
3492                        current_indent_level + last_marker_width
3493                    } else {
3494                        current_indent_level + 2
3495                    };
3496
3497                    // For lazy continuation to apply, the line must either:
3498                    // 1. Have no indentation (true lazy continuation)
3499                    // 2. Have sufficient indentation for the list type
3500                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3501                    let line_content = line_info.content(content).trim();
3502
3503                    // Check for table-like patterns
3504                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3505
3506                    // Check if blockquote level changed (not just if line starts with ">")
3507                    // Lines within the same blockquote level are NOT structural separators
3508                    let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3509                    let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3510                    let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3511
3512                    let is_structural_separator = line_info.heading.is_some()
3513                        || line_content.starts_with("```")
3514                        || line_content.starts_with("~~~")
3515                        || line_content.starts_with("---")
3516                        || line_content.starts_with("***")
3517                        || line_content.starts_with("___")
3518                        || blockquote_level_changed
3519                        || looks_like_table;
3520
3521                    // Allow lazy continuation if we're still within the same list block
3522                    // (not just immediately after a list item)
3523                    let is_lazy_continuation = !is_structural_separator
3524                        && !line_info.is_blank
3525                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3526
3527                    if is_lazy_continuation {
3528                        // Additional check: if the line starts with uppercase and looks like a new sentence,
3529                        // it's probably not a continuation
3530                        // BUT: for blockquote lines with sufficient effective indent, always treat as continuation
3531                        let line_content_raw = line_info.content(content);
3532                        let block_bq_level_lazy = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3533                        let line_bq_level_lazy = line_content_raw
3534                            .chars()
3535                            .take_while(|c| *c == '>' || c.is_whitespace())
3536                            .filter(|&c| c == '>')
3537                            .count();
3538                        let has_proper_blockquote_indent =
3539                            if line_bq_level_lazy > 0 && line_bq_level_lazy == block_bq_level_lazy {
3540                                // Compute effective indent after blockquote markers
3541                                let mut pos = 0;
3542                                let mut found_markers = 0;
3543                                for c in line_content_raw.chars() {
3544                                    pos += c.len_utf8();
3545                                    if c == '>' {
3546                                        found_markers += 1;
3547                                        if found_markers == line_bq_level_lazy {
3548                                            if line_content_raw.get(pos..pos + 1) == Some(" ") {
3549                                                pos += 1;
3550                                            }
3551                                            break;
3552                                        }
3553                                    }
3554                                }
3555                                let after_bq = &line_content_raw[pos..];
3556                                let effective_indent_lazy = after_bq.len() - after_bq.trim_start().len();
3557                                let min_required_for_bq = if block.is_ordered { last_marker_width } else { 2 };
3558                                effective_indent_lazy >= min_required_for_bq
3559                            } else {
3560                                false
3561                            };
3562
3563                        // If it has proper blockquote indent, it's a continuation regardless of uppercase
3564                        if has_proper_blockquote_indent {
3565                            block.end_line = line_num;
3566                        } else {
3567                            let content_to_check = if !blockquote_prefix.is_empty() {
3568                                // Strip blockquote prefix to check the actual content
3569                                line_info
3570                                    .content(content)
3571                                    .strip_prefix(&blockquote_prefix)
3572                                    .unwrap_or(line_info.content(content))
3573                                    .trim()
3574                            } else {
3575                                line_info.content(content).trim()
3576                            };
3577
3578                            let starts_with_uppercase =
3579                                content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3580
3581                            // If it starts with uppercase and the previous line ended with punctuation,
3582                            // it's likely a new paragraph, not a continuation
3583                            if starts_with_uppercase && last_list_item_line > 0 {
3584                                // This looks like a new paragraph
3585                                list_blocks.push(block.clone());
3586                                current_block = None;
3587                            } else {
3588                                // This is a lazy continuation line
3589                                block.end_line = line_num;
3590                            }
3591                        }
3592                    } else {
3593                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3594                        list_blocks.push(block.clone());
3595                        current_block = None;
3596                    }
3597                }
3598            }
3599        }
3600
3601        // Don't forget the last block
3602        if let Some(block) = current_block {
3603            list_blocks.push(block);
3604        }
3605
3606        // Merge adjacent blocks that should be one
3607        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3608
3609        list_blocks
3610    }
3611
3612    /// Compute character frequency for fast content analysis
3613    fn compute_char_frequency(content: &str) -> CharFrequency {
3614        let mut frequency = CharFrequency::default();
3615
3616        for ch in content.chars() {
3617            match ch {
3618                '#' => frequency.hash_count += 1,
3619                '*' => frequency.asterisk_count += 1,
3620                '_' => frequency.underscore_count += 1,
3621                '-' => frequency.hyphen_count += 1,
3622                '+' => frequency.plus_count += 1,
3623                '>' => frequency.gt_count += 1,
3624                '|' => frequency.pipe_count += 1,
3625                '[' => frequency.bracket_count += 1,
3626                '`' => frequency.backtick_count += 1,
3627                '<' => frequency.lt_count += 1,
3628                '!' => frequency.exclamation_count += 1,
3629                '\n' => frequency.newline_count += 1,
3630                _ => {}
3631            }
3632        }
3633
3634        frequency
3635    }
3636
3637    /// Parse HTML tags in the content
3638    fn parse_html_tags(
3639        content: &str,
3640        lines: &[LineInfo],
3641        code_blocks: &[(usize, usize)],
3642        flavor: MarkdownFlavor,
3643    ) -> Vec<HtmlTag> {
3644        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3645            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3646
3647        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3648
3649        for cap in HTML_TAG_REGEX.captures_iter(content) {
3650            let full_match = cap.get(0).unwrap();
3651            let match_start = full_match.start();
3652            let match_end = full_match.end();
3653
3654            // Skip if in code block
3655            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3656                continue;
3657            }
3658
3659            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3660            let tag_name_original = cap.get(2).unwrap().as_str();
3661            let tag_name = tag_name_original.to_lowercase();
3662            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3663
3664            // Skip JSX components in MDX files (tags starting with uppercase letter)
3665            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3666            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3667                continue;
3668            }
3669
3670            // Find which line this tag is on
3671            let mut line_num = 1;
3672            let mut col_start = match_start;
3673            let mut col_end = match_end;
3674            for (idx, line_info) in lines.iter().enumerate() {
3675                if match_start >= line_info.byte_offset {
3676                    line_num = idx + 1;
3677                    col_start = match_start - line_info.byte_offset;
3678                    col_end = match_end - line_info.byte_offset;
3679                } else {
3680                    break;
3681                }
3682            }
3683
3684            html_tags.push(HtmlTag {
3685                line: line_num,
3686                start_col: col_start,
3687                end_col: col_end,
3688                byte_offset: match_start,
3689                byte_end: match_end,
3690                tag_name,
3691                is_closing,
3692                is_self_closing,
3693                raw_content: full_match.as_str().to_string(),
3694            });
3695        }
3696
3697        html_tags
3698    }
3699
3700    /// Parse table rows in the content
3701    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3702        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3703
3704        for (line_idx, line_info) in lines.iter().enumerate() {
3705            // Skip lines in code blocks or blank lines
3706            if line_info.in_code_block || line_info.is_blank {
3707                continue;
3708            }
3709
3710            let line = line_info.content(content);
3711            let line_num = line_idx + 1;
3712
3713            // Check if this line contains pipes (potential table row)
3714            if !line.contains('|') {
3715                continue;
3716            }
3717
3718            // Count columns by splitting on pipes
3719            let parts: Vec<&str> = line.split('|').collect();
3720            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3721
3722            // Check if this is a separator row
3723            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3724            let mut column_alignments = Vec::new();
3725
3726            if is_separator {
3727                for part in &parts[1..parts.len() - 1] {
3728                    // Skip first and last empty parts
3729                    let trimmed = part.trim();
3730                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3731                        "center".to_string()
3732                    } else if trimmed.ends_with(':') {
3733                        "right".to_string()
3734                    } else if trimmed.starts_with(':') {
3735                        "left".to_string()
3736                    } else {
3737                        "none".to_string()
3738                    };
3739                    column_alignments.push(alignment);
3740                }
3741            }
3742
3743            table_rows.push(TableRow {
3744                line: line_num,
3745                is_separator,
3746                column_count,
3747                column_alignments,
3748            });
3749        }
3750
3751        table_rows
3752    }
3753
3754    /// Parse bare URLs and emails in the content
3755    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3756        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3757
3758        // Check for bare URLs (not in angle brackets or markdown links)
3759        for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3760            let full_match = cap.get(0).unwrap();
3761            let match_start = full_match.start();
3762            let match_end = full_match.end();
3763
3764            // Skip if in code block
3765            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3766                continue;
3767            }
3768
3769            // Skip if already in angle brackets or markdown links
3770            let preceding_char = if match_start > 0 {
3771                content.chars().nth(match_start - 1)
3772            } else {
3773                None
3774            };
3775            let following_char = content.chars().nth(match_end);
3776
3777            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3778                continue;
3779            }
3780            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3781                continue;
3782            }
3783
3784            let url = full_match.as_str();
3785            let url_type = if url.starts_with("https://") {
3786                "https"
3787            } else if url.starts_with("http://") {
3788                "http"
3789            } else if url.starts_with("ftp://") {
3790                "ftp"
3791            } else {
3792                "other"
3793            };
3794
3795            // Find which line this URL is on
3796            let mut line_num = 1;
3797            let mut col_start = match_start;
3798            let mut col_end = match_end;
3799            for (idx, line_info) in lines.iter().enumerate() {
3800                if match_start >= line_info.byte_offset {
3801                    line_num = idx + 1;
3802                    col_start = match_start - line_info.byte_offset;
3803                    col_end = match_end - line_info.byte_offset;
3804                } else {
3805                    break;
3806                }
3807            }
3808
3809            bare_urls.push(BareUrl {
3810                line: line_num,
3811                start_col: col_start,
3812                end_col: col_end,
3813                byte_offset: match_start,
3814                byte_end: match_end,
3815                url: url.to_string(),
3816                url_type: url_type.to_string(),
3817            });
3818        }
3819
3820        // Check for bare email addresses
3821        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3822            let full_match = cap.get(0).unwrap();
3823            let match_start = full_match.start();
3824            let match_end = full_match.end();
3825
3826            // Skip if in code block
3827            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3828                continue;
3829            }
3830
3831            // Skip if already in angle brackets or markdown links
3832            let preceding_char = if match_start > 0 {
3833                content.chars().nth(match_start - 1)
3834            } else {
3835                None
3836            };
3837            let following_char = content.chars().nth(match_end);
3838
3839            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3840                continue;
3841            }
3842            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3843                continue;
3844            }
3845
3846            let email = full_match.as_str();
3847
3848            // Find which line this email is on
3849            let mut line_num = 1;
3850            let mut col_start = match_start;
3851            let mut col_end = match_end;
3852            for (idx, line_info) in lines.iter().enumerate() {
3853                if match_start >= line_info.byte_offset {
3854                    line_num = idx + 1;
3855                    col_start = match_start - line_info.byte_offset;
3856                    col_end = match_end - line_info.byte_offset;
3857                } else {
3858                    break;
3859                }
3860            }
3861
3862            bare_urls.push(BareUrl {
3863                line: line_num,
3864                start_col: col_start,
3865                end_col: col_end,
3866                byte_offset: match_start,
3867                byte_end: match_end,
3868                url: email.to_string(),
3869                url_type: "email".to_string(),
3870            });
3871        }
3872
3873        bare_urls
3874    }
3875
    /// Get an iterator over valid CommonMark headings
    ///
    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
    /// that should be flagged by MD018 but should not be processed by other heading rules.
    ///
    /// The returned [`ValidHeadingsIter`] is built from a borrow of this context's line
    /// table, so it cannot outlive the context.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use rumdl_lib::lint_context::LintContext;
    /// use rumdl_lib::config::MarkdownFlavor;
    ///
    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
    ///
    /// for heading in ctx.valid_headings() {
    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
    /// }
    /// // Only prints valid headings, skips `#NoSpace`
    /// ```
    #[must_use]
    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
        // All filtering is delegated to the iterator type; this only hands over the lines
        ValidHeadingsIter::new(&self.lines)
    }
3899
3900    /// Check if the document contains any valid CommonMark headings
3901    ///
3902    /// Returns `true` if there is at least one heading with proper space after `#`.
3903    #[must_use]
3904    pub fn has_valid_headings(&self) -> bool {
3905        self.lines
3906            .iter()
3907            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3908    }
3909}
3910
3911/// Merge adjacent list blocks that should be treated as one
3912fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3913    if list_blocks.len() < 2 {
3914        return;
3915    }
3916
3917    let mut merger = ListBlockMerger::new(content, lines);
3918    *list_blocks = merger.merge(list_blocks);
3919}
3920
/// Helper struct to manage the complex logic of merging list blocks
///
/// Holds borrowed views of the document so the merge predicates can inspect the lines
/// that lie between two blocks.
struct ListBlockMerger<'a> {
    /// Full document text; handed to `LineInfo::content` to obtain a line's text
    content: &'a str,
    /// Per-line metadata, looked up with `line_number - 1` by the merge helpers
    lines: &'a [LineInfo],
}
3926
3927impl<'a> ListBlockMerger<'a> {
3928    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3929        Self { content, lines }
3930    }
3931
3932    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3933        let mut merged = Vec::with_capacity(list_blocks.len());
3934        let mut current = list_blocks[0].clone();
3935
3936        for next in list_blocks.iter().skip(1) {
3937            if self.should_merge_blocks(&current, next) {
3938                current = self.merge_two_blocks(current, next);
3939            } else {
3940                merged.push(current);
3941                current = next.clone();
3942            }
3943        }
3944
3945        merged.push(current);
3946        merged
3947    }
3948
3949    /// Determine if two adjacent list blocks should be merged
3950    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3951        // Basic compatibility checks
3952        if !self.blocks_are_compatible(current, next) {
3953            return false;
3954        }
3955
3956        // Check spacing and content between blocks
3957        let spacing = self.analyze_spacing_between(current, next);
3958        match spacing {
3959            BlockSpacing::Consecutive => true,
3960            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3961            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3962                self.can_merge_with_content_between(current, next)
3963            }
3964        }
3965    }
3966
3967    /// Check if blocks have compatible structure for merging
3968    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3969        current.is_ordered == next.is_ordered
3970            && current.blockquote_prefix == next.blockquote_prefix
3971            && current.nesting_level == next.nesting_level
3972    }
3973
3974    /// Analyze the spacing between two list blocks
3975    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3976        let gap = next.start_line - current.end_line;
3977
3978        match gap {
3979            1 => BlockSpacing::Consecutive,
3980            2 => BlockSpacing::SingleBlank,
3981            _ if gap > 2 => {
3982                if self.has_only_blank_lines_between(current, next) {
3983                    BlockSpacing::MultipleBlanks
3984                } else {
3985                    BlockSpacing::ContentBetween
3986                }
3987            }
3988            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3989        }
3990    }
3991
3992    /// Check if unordered lists can be merged with a single blank line between
3993    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3994        // Check if there are structural separators between the blocks
3995        // If has_meaningful_content_between returns true, it means there are structural separators
3996        if has_meaningful_content_between(self.content, current, next, self.lines) {
3997            return false; // Structural separators prevent merging
3998        }
3999
4000        // Only merge unordered lists with same marker across single blank
4001        !current.is_ordered && current.marker == next.marker
4002    }
4003
4004    /// Check if ordered lists can be merged when there's content between them
4005    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4006        // Do not merge lists if there are structural separators between them
4007        if has_meaningful_content_between(self.content, current, next, self.lines) {
4008            return false; // Structural separators prevent merging
4009        }
4010
4011        // Only consider merging ordered lists if there's no structural content between
4012        current.is_ordered && next.is_ordered
4013    }
4014
4015    /// Check if there are only blank lines between blocks
4016    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4017        for line_num in (current.end_line + 1)..next.start_line {
4018            if let Some(line_info) = self.lines.get(line_num - 1)
4019                && !line_info.content(self.content).trim().is_empty()
4020            {
4021                return false;
4022            }
4023        }
4024        true
4025    }
4026
4027    /// Merge two compatible list blocks into one
4028    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4029        current.end_line = next.end_line;
4030        current.item_lines.extend_from_slice(&next.item_lines);
4031
4032        // Update max marker width
4033        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4034
4035        // Handle marker consistency for unordered lists
4036        if !current.is_ordered && self.markers_differ(&current, next) {
4037            current.marker = None; // Mixed markers
4038        }
4039
4040        current
4041    }
4042
4043    /// Check if two blocks have different markers
4044    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4045        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4046    }
4047}
4048
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the line-number gap
/// between the end of one block and the start of the next.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks (line gap of 1; also the fallback for a gap of 0)
    Consecutive,
    /// One blank line between blocks (line gap of exactly 2)
    SingleBlank,
    /// Multiple blank lines but no content (line gap above 2, every line between is blank)
    MultipleBlanks,
    /// Content exists between blocks (line gap above 2, at least one non-blank line between)
    ContentBetween,
}
4057
4058/// Check if there's meaningful content (not just blank lines) between two list blocks
4059fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4060    // Check lines between current.end_line and next.start_line
4061    for line_num in (current.end_line + 1)..next.start_line {
4062        if let Some(line_info) = lines.get(line_num - 1) {
4063            // Convert to 0-indexed
4064            let trimmed = line_info.content(content).trim();
4065
4066            // Skip empty lines
4067            if trimmed.is_empty() {
4068                continue;
4069            }
4070
4071            // Check for structural separators that should separate lists (CommonMark compliant)
4072
4073            // Headings separate lists
4074            if line_info.heading.is_some() {
4075                return true; // Has meaningful content - headings separate lists
4076            }
4077
4078            // Horizontal rules separate lists (---, ***, ___)
4079            if is_horizontal_rule(trimmed) {
4080                return true; // Has meaningful content - horizontal rules separate lists
4081            }
4082
4083            // Tables separate lists
4084            if crate::utils::skip_context::is_table_line(trimmed) {
4085                return true; // Has meaningful content - tables separate lists
4086            }
4087
4088            // Blockquotes separate lists
4089            if trimmed.starts_with('>') {
4090                return true; // Has meaningful content - blockquotes separate lists
4091            }
4092
4093            // Code block fences separate lists (unless properly indented as list content)
4094            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4095                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4096
4097                // Check if this code block is properly indented as list continuation
4098                let min_continuation_indent = if current.is_ordered {
4099                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
4100                } else {
4101                    current.nesting_level + 2
4102                };
4103
4104                if line_indent < min_continuation_indent {
4105                    // This is a standalone code block that separates lists
4106                    return true; // Has meaningful content - standalone code blocks separate lists
4107                }
4108            }
4109
4110            // Check if this line has proper indentation for list continuation
4111            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4112
4113            // Calculate minimum indentation needed to be list continuation
4114            let min_indent = if current.is_ordered {
4115                current.nesting_level + current.max_marker_width
4116            } else {
4117                current.nesting_level + 2
4118            };
4119
4120            // If the line is not indented enough to be list continuation, it's meaningful content
4121            if line_indent < min_indent {
4122                return true; // Has meaningful content - content not indented as list continuation
4123            }
4124
4125            // If we reach here, the line is properly indented as list continuation
4126            // Continue checking other lines
4127        }
4128    }
4129
4130    // Only blank lines or properly indented list continuation content between blocks
4131    false
4132}
4133
/// Check if a line is a horizontal rule (---, ***, ___) per CommonMark spec.
///
/// CommonMark rules for thematic breaks (horizontal rules):
/// - May have 0-3 spaces of leading indentation (but NOT tabs)
/// - Must have 3+ of the same character (-, *, or _)
/// - May have spaces or tabs between characters
/// - No other characters allowed
///
/// A tab in the leading indentation disqualifies the line even when it
/// follows one to three spaces (e.g. `" \t---"`): the tab advances to the
/// next tab stop, which puts the content at column 4 or beyond.
pub fn is_horizontal_rule_line(line: &str) -> bool {
    // CommonMark: HRs can have 0-3 spaces of leading indentation, not tabs
    let after_spaces = line.trim_start_matches(' ');
    let leading_spaces = line.len() - after_spaces.len();

    // Checking the first character *after* the spaces catches both a leading
    // tab and a tab that follows a few spaces.
    if leading_spaces > 3 || after_spaces.starts_with('\t') {
        return false;
    }

    is_horizontal_rule_content(line.trim())
}

/// Check if trimmed content matches horizontal rule pattern.
///
/// Returns `true` when `trimmed` starts with `-`, `*`, or `_`, contains at
/// least three occurrences of that character, and otherwise contains only
/// spaces and tabs.
///
/// Use `is_horizontal_rule_line` for full CommonMark compliance including indentation check.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // Three marker characters need at least three bytes.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character fixes which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Walk the characters once without collecting them into a buffer.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}

/// Backwards-compatible alias for `is_horizontal_rule_content`
pub fn is_horizontal_rule(trimmed: &str) -> bool {
    is_horizontal_rule_content(trimmed)
}
4179
/// Unit tests for `LintContext` construction: line offsets, per-line info, list item detection, and MDX ESM block flags
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: a single line offset at 0, offset 0 maps to (1, 1), and
    /// `lines` has no entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Input without a newline: one line offset, and line/column results are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Multi-line input: each line offset is the byte position where that line starts.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // "# Title\n" is 8 bytes, the blank line adds 1, "Second line\n" adds 12.
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    /// Per-line metadata (content, byte offset, indent, blank flag) and the
    /// 1-based `line_to_byte_offset` / `line_info` helpers.
    /// Only the first three of the seven lines are inspected individually.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List item metadata for unordered and ordered markers, plus a plain
    /// paragraph line that must not be detected as a list item.
    /// Lines 4 and 5 of the fixture are not asserted on.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets at and just past each character of single-character lines,
    /// including the offset one past the final character.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    /// With the MDX flavor, the leading `import`/`export` lines are flagged as
    /// ESM blocks while the blank, heading, and text lines after them are not.
    #[test]
    fn test_mdx_esm_blocks() {
        // `r##` delimiters are needed because the fixture contains `"#`
        // (in `color="#fcb32c"`), which would terminate an `r#"…"#` string.
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import`/`export` lines must not be flagged when the flavor is
    /// `Standard` rather than `MDX`.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs