rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use crate::utils::element_cache::ElementCache;
5use crate::utils::regex_cache::URL_SIMPLE_REGEX;
6use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
7use regex::Regex;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::path::PathBuf;
11use std::sync::LazyLock;
12
/// Runs `$code`, returning its value, and reports how long it took.
///
/// The elapsed time is printed to stderr (prefixed with `[PROFILE]` and the
/// section `$name`) only when `$profile` evaluates to `true`. This variant is
/// compiled for every target except `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        value
    }};
}

/// `wasm32` variant of `profile_section!`: evaluates `$code` and nothing else
/// (no timing, no output; `$name` and `$profile` are not used).
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
30
// Comprehensive link pattern that captures both inline links `[text](url "title")`
// and full reference links `[text][ref]`.
//
// Flags: `(?s)` makes `.` match newlines (so an escaped character in the link text
// may be a newline); `(?x)` enables verbose mode, which is why the pattern below can
// be laid out over several lines with trailing `#` comments.
//
// Capture groups:
//   1 - link text
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID (reference-style links only)
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
44
// Image pattern: the same shape as the link pattern, but requiring a leading `!`.
// Matches inline images `![alt](url "title")` and reference images `![alt][ref]`.
//
// Flags: `(?s)` makes `.` match newlines; `(?x)` enables verbose mode so the
// pattern can span several lines and carry inline `#` comments.
//
// Capture groups:
//   1 - alt text
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID (reference-style images only)
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
58
// Reference definition pattern, e.g. `[label]: https://example.com "Title"`.
//
// `(?m)` makes `^`/`$` match at line boundaries, so the whole definition must sit
// on one line, preceded by at most three spaces.
//
// Capture groups:
//   1 - label between the brackets (non-empty)
//   2 - destination (a run of non-whitespace characters)
//   3 - double-quoted title, 4 - single-quoted title (both optional)
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
62
// Bare URLs have no pattern of their own here: matching uses the centralized
// URL_SIMPLE_REGEX imported from regex_cache.
64
// Pattern for email addresses appearing as plain text.
//
// Shape: a local part of ASCII letters, digits and `._%+-`, an `@`, a domain of
// ASCII letters, digits, `.` and `-`, then a final `.` followed by at least two
// ASCII letters. The pattern is unanchored, so it finds addresses anywhere in the
// searched text; it is a pragmatic approximation, not a full address grammar.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
68
// Pattern for a blockquote prefix (noted as being used by parse_list_blocks, which
// is outside this view).
//
// Anchored at the start of the input; group 1 captures the entire prefix: optional
// leading whitespace, one or more consecutive `>` markers, and any whitespace that
// follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
71
72/// Pre-computed information about a line
73#[derive(Debug, Clone)]
74pub struct LineInfo {
75    /// Byte offset where this line starts in the document
76    pub byte_offset: usize,
77    /// Length of the line in bytes (without newline)
78    pub byte_len: usize,
79    /// Number of bytes of leading whitespace (for substring extraction)
80    pub indent: usize,
81    /// Visual column width of leading whitespace (with proper tab expansion)
82    /// Per CommonMark, tabs expand to the next column that is a multiple of 4.
83    /// Use this for numeric comparisons like checking for indented code blocks (>= 4).
84    pub visual_indent: usize,
85    /// Whether the line is blank (empty or only whitespace)
86    pub is_blank: bool,
87    /// Whether this line is inside a code block
88    pub in_code_block: bool,
89    /// Whether this line is inside front matter
90    pub in_front_matter: bool,
91    /// Whether this line is inside an HTML block
92    pub in_html_block: bool,
93    /// Whether this line is inside an HTML comment
94    pub in_html_comment: bool,
95    /// List item information if this line starts a list item
96    pub list_item: Option<ListItemInfo>,
97    /// Heading information if this line is a heading
98    pub heading: Option<HeadingInfo>,
99    /// Blockquote information if this line is a blockquote
100    pub blockquote: Option<BlockquoteInfo>,
101    /// Whether this line is inside a mkdocstrings autodoc block
102    pub in_mkdocstrings: bool,
103    /// Whether this line is part of an ESM import/export block (MDX only)
104    pub in_esm_block: bool,
105    /// Whether this line is a continuation of a multi-line code span from a previous line
106    pub in_code_span_continuation: bool,
107    /// Whether this line is a horizontal rule (---, ***, ___, etc.)
108    /// Pre-computed for consistent detection across all rules
109    pub is_horizontal_rule: bool,
110    /// Whether this line is inside a math block ($$ ... $$)
111    pub in_math_block: bool,
112}
113
114impl LineInfo {
115    /// Get the line content as a string slice from the source document
116    pub fn content<'a>(&self, source: &'a str) -> &'a str {
117        &source[self.byte_offset..self.byte_offset + self.byte_len]
118    }
119}
120
/// Describes the list marker found at the start of a list item line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// Marker text: `*`, `-`, `+`, or a number followed by `.` or `)`.
    pub marker: String,
    /// `true` for ordered items, `false` for unordered ones.
    pub is_ordered: bool,
    /// Item number, for ordered lists.
    pub number: Option<usize>,
    /// 0-based column at which the marker begins.
    pub marker_column: usize,
    /// Column at which the item's content (after the marker) begins.
    pub content_column: usize,
}
135
/// The syntax used to write a heading.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq`: the enum has no fields,
/// so equality is total and values can be used as map/set keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (`# Heading`)
    ATX,
    /// Setext style heading underlined with `=`
    Setext1,
    /// Setext style heading underlined with `-`
    Setext2,
}
146
147/// Parsed link information
148#[derive(Debug, Clone)]
149pub struct ParsedLink<'a> {
150    /// Line number (1-indexed)
151    pub line: usize,
152    /// Start column (0-indexed) in the line
153    pub start_col: usize,
154    /// End column (0-indexed) in the line
155    pub end_col: usize,
156    /// Byte offset in document
157    pub byte_offset: usize,
158    /// End byte offset in document
159    pub byte_end: usize,
160    /// Link text
161    pub text: Cow<'a, str>,
162    /// Link URL or reference
163    pub url: Cow<'a, str>,
164    /// Whether this is a reference link [text][ref] vs inline [text](url)
165    pub is_reference: bool,
166    /// Reference ID for reference links
167    pub reference_id: Option<Cow<'a, str>>,
168    /// Link type from pulldown-cmark
169    pub link_type: LinkType,
170}
171
/// A reference that pulldown-cmark reported as broken (unresolvable).
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// Reference text that could not be resolved.
    pub reference: String,
    /// Byte range the broken link occupies in the source document.
    pub span: std::ops::Range<usize>,
}
180
/// A footnote reference found in the document (e.g., `[^1]`, `[^note]`).
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// Footnote ID, with the `^` prefix removed.
    pub id: String,
    /// 1-indexed line number.
    pub line: usize,
    /// Byte offset of the reference's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the reference's end within the document.
    pub byte_end: usize,
}
193
194/// Parsed image information
195#[derive(Debug, Clone)]
196pub struct ParsedImage<'a> {
197    /// Line number (1-indexed)
198    pub line: usize,
199    /// Start column (0-indexed) in the line
200    pub start_col: usize,
201    /// End column (0-indexed) in the line
202    pub end_col: usize,
203    /// Byte offset in document
204    pub byte_offset: usize,
205    /// End byte offset in document
206    pub byte_end: usize,
207    /// Alt text
208    pub alt_text: Cow<'a, str>,
209    /// Image URL or reference
210    pub url: Cow<'a, str>,
211    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
212    pub is_reference: bool,
213    /// Reference ID for reference images
214    pub reference_id: Option<Cow<'a, str>>,
215    /// Link type from pulldown-cmark
216    pub link_type: LinkType,
217}
218
/// A reference definition of the form `[ref]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-indexed line number.
    pub line: usize,
    /// Reference ID (normalized to lowercase).
    pub id: String,
    /// Destination URL.
    pub url: String,
    /// Title, when one was given.
    pub title: Option<String>,
    /// Byte offset at which the definition begins.
    pub byte_offset: usize,
    /// Byte offset at which the definition ends.
    pub byte_end: usize,
    /// Byte offset at which the title begins (quote included), when a title is present.
    pub title_byte_start: Option<usize>,
    /// Byte offset at which the title ends (quote included), when a title is present.
    pub title_byte_end: Option<usize>,
}
239
/// An inline code span found in the document; it may cover several lines.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-indexed line on which the span starts.
    pub line: usize,
    /// 1-indexed line on which the span ends.
    pub end_line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset of the span's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span's end within the document.
    pub byte_end: usize,
    /// How many backticks delimit the span (1, 2, 3, etc.).
    pub backtick_count: usize,
    /// Text between the backtick delimiters.
    pub content: String,
}
260
/// A math span found in the document: inline `$...$` or display `$$...$$`.
#[derive(Debug, Clone)]
pub struct MathSpan {
    /// 1-indexed line on which the span starts.
    pub line: usize,
    /// 1-indexed line on which the span ends.
    pub end_line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset of the span's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span's end within the document.
    pub byte_end: usize,
    /// `true` for display math (`$$...$$`), `false` for inline math (`$...$`).
    pub is_display: bool,
    /// Text between the math delimiters.
    pub content: String,
}
281
282/// Information about a heading
283#[derive(Debug, Clone)]
284pub struct HeadingInfo {
285    /// Heading level (1-6 for ATX, 1-2 for Setext)
286    pub level: u8,
287    /// Style of heading
288    pub style: HeadingStyle,
289    /// The heading marker (# characters or underline)
290    pub marker: String,
291    /// Column where the marker starts (0-based)
292    pub marker_column: usize,
293    /// Column where heading text starts
294    pub content_column: usize,
295    /// The heading text (without markers and without custom ID syntax)
296    pub text: String,
297    /// Custom header ID if present (e.g., from {#custom-id} syntax)
298    pub custom_id: Option<String>,
299    /// Original heading text including custom ID syntax
300    pub raw_text: String,
301    /// Whether it has a closing sequence (for ATX)
302    pub has_closing_sequence: bool,
303    /// The closing sequence if present
304    pub closing_sequence: String,
305    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
306    /// False for malformed headings like `#NoSpace` that MD018 should flag
307    pub is_valid: bool,
308}
309
310/// A valid heading from a filtered iteration
311///
312/// Only includes headings that are CommonMark-compliant (have space after #).
313/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
314#[derive(Debug, Clone)]
315pub struct ValidHeading<'a> {
316    /// The 1-indexed line number in the document
317    pub line_num: usize,
318    /// Reference to the heading information
319    pub heading: &'a HeadingInfo,
320    /// Reference to the full line info (for rules that need additional context)
321    pub line_info: &'a LineInfo,
322}
323
324/// Iterator over valid CommonMark headings in a document
325///
326/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
327/// but should not be processed by other heading rules.
328pub struct ValidHeadingsIter<'a> {
329    lines: &'a [LineInfo],
330    current_index: usize,
331}
332
333impl<'a> ValidHeadingsIter<'a> {
334    fn new(lines: &'a [LineInfo]) -> Self {
335        Self {
336            lines,
337            current_index: 0,
338        }
339    }
340}
341
342impl<'a> Iterator for ValidHeadingsIter<'a> {
343    type Item = ValidHeading<'a>;
344
345    fn next(&mut self) -> Option<Self::Item> {
346        while self.current_index < self.lines.len() {
347            let idx = self.current_index;
348            self.current_index += 1;
349
350            let line_info = &self.lines[idx];
351            if let Some(heading) = &line_info.heading
352                && heading.is_valid
353            {
354                return Some(ValidHeading {
355                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
356                    heading,
357                    line_info,
358                });
359            }
360        }
361        None
362    }
363}
364
/// Everything recorded about a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth: 1 for `>`, 2 for `>>`, and so on.
    pub nesting_level: usize,
    /// Indentation preceding the blockquote marker.
    pub indent: String,
    /// 0-based column at which the first `>` appears.
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.).
    pub prefix: String,
    /// Text following the blockquote marker(s).
    pub content: String,
    /// `true` when no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// `true` when more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// `true` when this is an empty blockquote line that needs the MD028 fix.
    pub needs_md028_fix: bool,
}
385
/// A contiguous list in the document and the item lines it contains.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// 1-indexed line on which the list starts.
    pub start_line: usize,
    /// 1-indexed line on which the list ends.
    pub end_line: usize,
    /// `true` for an ordered list, `false` for an unordered one.
    pub is_ordered: bool,
    /// The consistent marker for unordered lists, if there is one.
    pub marker: Option<String>,
    /// Blockquote prefix of the list; empty when the list is not in a blockquote.
    pub blockquote_prefix: String,
    /// Lines within this block that are list items.
    pub item_lines: Vec<usize>,
    /// Nesting depth; 0 means a top-level list.
    pub nesting_level: usize,
    /// Widest marker seen in this block (e.g., 3 for "1. ", 4 for "10. ").
    pub max_marker_width: usize,
}
406
407use std::sync::{Arc, OnceLock};
408
/// Lookup table keyed by a line's byte offset. Each value is the list item data
/// for that line, as the tuple
/// `(is_ordered, marker, marker_column, content_column, number)`.
type ListItemMap = HashMap<usize, (bool, String, usize, usize, Option<usize>)>;
411
/// Counts of Markdown-significant characters in a document, used for fast
/// content analysis. The parenthesized notes name the constructs each
/// character is associated with.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Number of `#` characters (headings).
    pub hash_count: usize,
    /// Number of `*` characters (emphasis, lists, horizontal rules).
    pub asterisk_count: usize,
    /// Number of `_` characters (emphasis, horizontal rules).
    pub underscore_count: usize,
    /// Number of `-` characters (lists, horizontal rules, setext headings).
    pub hyphen_count: usize,
    /// Number of `+` characters (lists).
    pub plus_count: usize,
    /// Number of `>` characters (blockquotes).
    pub gt_count: usize,
    /// Number of `|` characters (tables).
    pub pipe_count: usize,
    /// Number of `[` characters (links, images).
    pub bracket_count: usize,
    /// Number of backtick characters (code spans, code blocks).
    pub backtick_count: usize,
    /// Number of `<` characters (HTML tags, autolinks).
    pub lt_count: usize,
    /// Number of `!` characters (images).
    pub exclamation_count: usize,
    /// Number of newline characters.
    pub newline_count: usize,
}
440
/// An HTML tag found in the document, parsed ahead of rule execution.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset of the tag's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the tag's end within the document.
    pub byte_end: usize,
    /// Name of the tag (e.g., "div", "img", "br").
    pub tag_name: String,
    /// `true` for a closing tag (`</tag>`).
    pub is_closing: bool,
    /// `true` for a self-closing tag (`<tag />`).
    pub is_self_closing: bool,
    /// The raw tag content.
    pub raw_content: String,
}
463
/// An emphasis span found in the document, parsed ahead of rule execution.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset of the span's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the span's end within the document.
    pub byte_end: usize,
    /// Emphasis character: `'*'` or `'_'`.
    pub marker: char,
    /// How many markers delimit the span (1 for italic, 2 for bold, 3+ for bold+italic).
    pub marker_count: usize,
    /// Text inside the emphasis.
    pub content: String,
}
484
/// A table row found in the document, parsed ahead of rule execution.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// 1-indexed line number.
    pub line: usize,
    /// `true` for a separator row (only `|`, `-`, `:`, and spaces).
    pub is_separator: bool,
    /// How many columns (pipe-separated cells) the row has.
    pub column_count: usize,
    /// Alignments taken from the separator row; each entry is one of
    /// "left", "center", "right", "none".
    pub column_alignments: Vec<String>,
}
497
/// A bare URL (one that is not part of a link), parsed ahead of rule execution.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset of the URL's start within the document.
    pub byte_offset: usize,
    /// Byte offset of the URL's end within the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL: "http", "https", "ftp", or "email".
    pub url_type: String,
}
516
517pub struct LintContext<'a> {
518    pub content: &'a str,
519    pub line_offsets: Vec<usize>,
520    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
521    pub lines: Vec<LineInfo>,             // Pre-computed line information
522    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
523    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
524    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
525    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
526    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
527    reference_defs_map: HashMap<String, usize>, // O(1) lookup by lowercase ID -> index in reference_defs
528    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Lazy-loaded inline code spans
529    math_spans_cache: OnceLock<Arc<Vec<MathSpan>>>, // Lazy-loaded math spans ($...$ and $$...$$)
530    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
531    pub char_frequency: CharFrequency,    // Character frequency analysis
532    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
533    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
534    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
535    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
536    has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
537    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
538    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
539    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
540    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
541    pub flavor: MarkdownFlavor,           // Markdown flavor being used
542    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
543}
544
/// The four consecutive pieces of a blockquote line, each borrowed from the
/// original line so that concatenating them reproduces it.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces/tabs immediately after the markers.
    spaces_after: &'a str,
    /// Everything that remains on the line.
    content: &'a str,
}

/// Splits `line` into blockquote components by scanning its bytes directly.
///
/// Returns `None` when the first character after any leading spaces/tabs is not
/// `>` (which includes empty and whitespace-only lines).
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    /// Length of the leading run of spaces and tabs in `bytes`.
    fn blank_run(bytes: &[u8]) -> usize {
        bytes.iter().take_while(|&&b| b == b' ' || b == b'\t').count()
    }

    let bytes = line.as_bytes();

    let indent_end = blank_run(bytes);

    let marker_len = bytes[indent_end..].iter().take_while(|&&b| b == b'>').count();
    if marker_len == 0 {
        // No '>' where one is required: not a blockquote line.
        return None;
    }
    let markers_end = indent_end + marker_len;

    let spaces_end = markers_end + blank_run(&bytes[markers_end..]);

    // Every split point directly follows ASCII bytes (or is 0), so each one is
    // a valid UTF-8 character boundary.
    let (head, content) = line.split_at(spaces_end);
    let (head, spaces_after) = head.split_at(markers_end);
    let (indent, markers) = head.split_at(indent_end);

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
589
590impl<'a> LintContext<'a> {
591    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
592        #[cfg(not(target_arch = "wasm32"))]
593        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
594        #[cfg(target_arch = "wasm32")]
595        let profile = false;
596
597        let line_offsets = profile_section!("Line offsets", profile, {
598            let mut offsets = vec![0];
599            for (i, c) in content.char_indices() {
600                if c == '\n' {
601                    offsets.push(i + 1);
602                }
603            }
604            offsets
605        });
606
607        // Detect code blocks once and cache them
608        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
609
610        // Pre-compute HTML comment ranges ONCE for all operations
611        let html_comment_ranges = profile_section!(
612            "HTML comment ranges",
613            profile,
614            crate::utils::skip_context::compute_html_comment_ranges(content)
615        );
616
617        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
618        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
619            if flavor == MarkdownFlavor::MkDocs {
620                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
621            } else {
622                Vec::new()
623            }
624        });
625
626        // Pre-compute line information AND emphasis spans (without headings/blockquotes yet)
627        // Emphasis spans are captured during the same pulldown-cmark parse as list detection
628        let (mut lines, emphasis_spans) = profile_section!(
629            "Basic line info",
630            profile,
631            Self::compute_basic_line_info(
632                content,
633                &line_offsets,
634                &code_blocks,
635                flavor,
636                &html_comment_ranges,
637                &autodoc_ranges,
638            )
639        );
640
641        // Detect HTML blocks BEFORE heading detection
642        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
643
644        // Detect ESM import/export blocks in MDX files BEFORE heading detection
645        profile_section!(
646            "ESM blocks",
647            profile,
648            Self::detect_esm_blocks(content, &mut lines, flavor)
649        );
650
651        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
652        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));
653
654        // Now detect headings and blockquotes
655        profile_section!(
656            "Headings & blockquotes",
657            profile,
658            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
659        );
660
661        // Parse code spans early so we can exclude them from link/image parsing
662        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
663
664        // Mark lines that are continuations of multi-line code spans
665        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
666        for span in &code_spans {
667            if span.end_line > span.line {
668                // Mark lines after the first line as continuations
669                for line_num in (span.line + 1)..=span.end_line {
670                    if let Some(line_info) = lines.get_mut(line_num - 1) {
671                        line_info.in_code_span_continuation = true;
672                    }
673                }
674            }
675        }
676
677        // Parse links, images, references, and list blocks
678        let (links, broken_links, footnote_refs) = profile_section!(
679            "Links",
680            profile,
681            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
682        );
683
684        let images = profile_section!(
685            "Images",
686            profile,
687            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
688        );
689
690        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
691
692        // Build O(1) lookup map for reference definitions by lowercase ID
693        let reference_defs_map: HashMap<String, usize> = reference_defs
694            .iter()
695            .enumerate()
696            .map(|(idx, def)| (def.id.to_lowercase(), idx))
697            .collect();
698
699        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
700
701        // Compute character frequency for fast content analysis
702        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
703
704        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
705        let table_blocks = profile_section!(
706            "Table blocks",
707            profile,
708            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
709                content,
710                &code_blocks,
711                &code_spans,
712                &html_comment_ranges,
713            )
714        );
715
716        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
717        let line_index = profile_section!(
718            "Line index",
719            profile,
720            crate::utils::range_utils::LineIndex::new(content)
721        );
722
723        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
724        let jinja_ranges = profile_section!(
725            "Jinja ranges",
726            profile,
727            crate::utils::jinja_utils::find_jinja_ranges(content)
728        );
729
730        Self {
731            content,
732            line_offsets,
733            code_blocks,
734            lines,
735            links,
736            images,
737            broken_links,
738            footnote_refs,
739            reference_defs,
740            reference_defs_map,
741            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
742            math_spans_cache: OnceLock::new(), // Lazy-loaded on first access
743            list_blocks,
744            char_frequency,
745            html_tags_cache: OnceLock::new(),
746            emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
747            table_rows_cache: OnceLock::new(),
748            bare_urls_cache: OnceLock::new(),
749            has_mixed_list_nesting_cache: OnceLock::new(),
750            html_comment_ranges,
751            table_blocks,
752            line_index,
753            jinja_ranges,
754            flavor,
755            source_file,
756        }
757    }
758
759    /// Get code spans - computed lazily on first access
760    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
761        Arc::clone(
762            self.code_spans_cache
763                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
764        )
765    }
766
767    /// Get math spans - computed lazily on first access
768    pub fn math_spans(&self) -> Arc<Vec<MathSpan>> {
769        Arc::clone(
770            self.math_spans_cache
771                .get_or_init(|| Arc::new(Self::parse_math_spans(self.content, &self.lines))),
772        )
773    }
774
775    /// Check if a byte position is within a math span (inline $...$ or display $$...$$)
776    pub fn is_in_math_span(&self, byte_pos: usize) -> bool {
777        let math_spans = self.math_spans();
778        math_spans
779            .iter()
780            .any(|span| byte_pos >= span.byte_offset && byte_pos < span.byte_end)
781    }
782
783    /// Get HTML comment ranges - pre-computed during LintContext construction
784    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
785        &self.html_comment_ranges
786    }
787
788    /// Get HTML tags - computed lazily on first access
789    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
790        Arc::clone(self.html_tags_cache.get_or_init(|| {
791            Arc::new(Self::parse_html_tags(
792                self.content,
793                &self.lines,
794                &self.code_blocks,
795                self.flavor,
796            ))
797        }))
798    }
799
800    /// Get emphasis spans - pre-computed during construction
801    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
802        Arc::clone(
803            self.emphasis_spans_cache
804                .get()
805                .expect("emphasis_spans_cache initialized during construction"),
806        )
807    }
808
809    /// Get table rows - computed lazily on first access
810    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
811        Arc::clone(
812            self.table_rows_cache
813                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
814        )
815    }
816
817    /// Get bare URLs - computed lazily on first access
818    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
819        Arc::clone(
820            self.bare_urls_cache
821                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
822        )
823    }
824
825    /// Check if document has mixed ordered/unordered list nesting.
826    /// Result is cached after first computation (document-level invariant).
827    /// This is used by MD007 for smart style auto-detection.
828    pub fn has_mixed_list_nesting(&self) -> bool {
829        *self
830            .has_mixed_list_nesting_cache
831            .get_or_init(|| self.compute_mixed_list_nesting())
832    }
833
834    /// Internal computation for mixed list nesting (only called once per LintContext).
835    fn compute_mixed_list_nesting(&self) -> bool {
836        // Track parent list items by their marker position and type
837        // Using marker_column instead of indent because it works correctly
838        // for blockquoted content where indent doesn't account for the prefix
839        // Stack stores: (marker_column, is_ordered)
840        let mut stack: Vec<(usize, bool)> = Vec::new();
841        let mut last_was_blank = false;
842
843        for line_info in &self.lines {
844            // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
845            if line_info.in_code_block
846                || line_info.in_front_matter
847                || line_info.in_mkdocstrings
848                || line_info.in_html_comment
849                || line_info.in_esm_block
850            {
851                continue;
852            }
853
854            // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
855            if line_info.is_blank {
856                last_was_blank = true;
857                continue;
858            }
859
860            if let Some(list_item) = &line_info.list_item {
861                // Normalize column 1 to column 0 (consistent with MD007 check function)
862                let current_pos = if list_item.marker_column == 1 {
863                    0
864                } else {
865                    list_item.marker_column
866                };
867
868                // If there was a blank line and this item is at root level, reset stack
869                if last_was_blank && current_pos == 0 {
870                    stack.clear();
871                }
872                last_was_blank = false;
873
874                // Pop items at same or greater position (they're siblings or deeper, not parents)
875                while let Some(&(pos, _)) = stack.last() {
876                    if pos >= current_pos {
877                        stack.pop();
878                    } else {
879                        break;
880                    }
881                }
882
883                // Check if immediate parent has different type - this is mixed nesting
884                if let Some(&(_, parent_is_ordered)) = stack.last()
885                    && parent_is_ordered != list_item.is_ordered
886                {
887                    return true; // Found mixed nesting - early exit
888                }
889
890                stack.push((current_pos, list_item.is_ordered));
891            } else {
892                // Non-list line (but not blank) - could be paragraph or other content
893                last_was_blank = false;
894            }
895        }
896
897        false
898    }
899
900    /// Map a byte offset to (line, column)
901    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
902        match self.line_offsets.binary_search(&offset) {
903            Ok(line) => (line + 1, 1),
904            Err(line) => {
905                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
906                (line, offset - line_start + 1)
907            }
908        }
909    }
910
911    /// Check if a position is within a code block or code span
912    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
913        // Check code blocks first
914        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
915            return true;
916        }
917
918        // Check inline code spans (lazy load if needed)
919        self.code_spans()
920            .iter()
921            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
922    }
923
924    /// Get line information by line number (1-indexed)
925    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
926        if line_num > 0 {
927            self.lines.get(line_num - 1)
928        } else {
929            None
930        }
931    }
932
933    /// Get byte offset for a line number (1-indexed)
934    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
935        self.line_info(line_num).map(|info| info.byte_offset)
936    }
937
938    /// Get URL for a reference link/image by its ID (O(1) lookup via HashMap)
939    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
940        let normalized_id = ref_id.to_lowercase();
941        self.reference_defs_map
942            .get(&normalized_id)
943            .map(|&idx| self.reference_defs[idx].url.as_str())
944    }
945
946    /// Get a reference definition by its ID (O(1) lookup via HashMap)
947    pub fn get_reference_def(&self, ref_id: &str) -> Option<&ReferenceDef> {
948        let normalized_id = ref_id.to_lowercase();
949        self.reference_defs_map
950            .get(&normalized_id)
951            .map(|&idx| &self.reference_defs[idx])
952    }
953
954    /// Check if a reference definition exists by ID (O(1) lookup via HashMap)
955    pub fn has_reference_def(&self, ref_id: &str) -> bool {
956        let normalized_id = ref_id.to_lowercase();
957        self.reference_defs_map.contains_key(&normalized_id)
958    }
959
960    /// Check if a line is part of a list block
961    pub fn is_in_list_block(&self, line_num: usize) -> bool {
962        self.list_blocks
963            .iter()
964            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
965    }
966
967    /// Get the list block containing a specific line
968    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
969        self.list_blocks
970            .iter()
971            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
972    }
973
974    // Compatibility methods for DocumentStructure migration
975
976    /// Check if a line is within a code block
977    pub fn is_in_code_block(&self, line_num: usize) -> bool {
978        if line_num == 0 || line_num > self.lines.len() {
979            return false;
980        }
981        self.lines[line_num - 1].in_code_block
982    }
983
984    /// Check if a line is within front matter
985    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
986        if line_num == 0 || line_num > self.lines.len() {
987            return false;
988        }
989        self.lines[line_num - 1].in_front_matter
990    }
991
992    /// Check if a line is within an HTML block
993    pub fn is_in_html_block(&self, line_num: usize) -> bool {
994        if line_num == 0 || line_num > self.lines.len() {
995            return false;
996        }
997        self.lines[line_num - 1].in_html_block
998    }
999
1000    /// Check if a line and column is within a code span
1001    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1002        if line_num == 0 || line_num > self.lines.len() {
1003            return false;
1004        }
1005
1006        // Use the code spans cache to check
1007        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
1008        // Convert col to 0-indexed for comparison
1009        let col_0indexed = if col > 0 { col - 1 } else { 0 };
1010        let code_spans = self.code_spans();
1011        code_spans.iter().any(|span| {
1012            // Check if line is within the span's line range
1013            if line_num < span.line || line_num > span.end_line {
1014                return false;
1015            }
1016
1017            if span.line == span.end_line {
1018                // Single-line span: check column bounds
1019                col_0indexed >= span.start_col && col_0indexed < span.end_col
1020            } else if line_num == span.line {
1021                // First line of multi-line span: anything after start_col is in span
1022                col_0indexed >= span.start_col
1023            } else if line_num == span.end_line {
1024                // Last line of multi-line span: anything before end_col is in span
1025                col_0indexed < span.end_col
1026            } else {
1027                // Middle line of multi-line span: entire line is in span
1028                true
1029            }
1030        })
1031    }
1032
1033    /// Check if a byte offset is within a code span
1034    #[inline]
1035    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
1036        let code_spans = self.code_spans();
1037        code_spans
1038            .iter()
1039            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
1040    }
1041
1042    /// Check if a byte position is within a reference definition
1043    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
1044    #[inline]
1045    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
1046        self.reference_defs
1047            .iter()
1048            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
1049    }
1050
1051    /// Check if a byte position is within an HTML comment
1052    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
1053    /// where k is the number of HTML comments (typically very small)
1054    #[inline]
1055    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
1056        self.html_comment_ranges
1057            .iter()
1058            .any(|range| byte_pos >= range.start && byte_pos < range.end)
1059    }
1060
1061    /// Check if a byte position is within an HTML tag (including multiline tags)
1062    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
1063    #[inline]
1064    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1065        self.html_tags()
1066            .iter()
1067            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1068    }
1069
1070    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
1071    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1072        self.jinja_ranges
1073            .iter()
1074            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1075    }
1076
1077    /// Check if a byte position is within a link reference definition title
1078    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1079        self.reference_defs.iter().any(|def| {
1080            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1081                byte_pos >= start && byte_pos < end
1082            } else {
1083                false
1084            }
1085        })
1086    }
1087
1088    /// Check if content has any instances of a specific character (fast)
1089    pub fn has_char(&self, ch: char) -> bool {
1090        match ch {
1091            '#' => self.char_frequency.hash_count > 0,
1092            '*' => self.char_frequency.asterisk_count > 0,
1093            '_' => self.char_frequency.underscore_count > 0,
1094            '-' => self.char_frequency.hyphen_count > 0,
1095            '+' => self.char_frequency.plus_count > 0,
1096            '>' => self.char_frequency.gt_count > 0,
1097            '|' => self.char_frequency.pipe_count > 0,
1098            '[' => self.char_frequency.bracket_count > 0,
1099            '`' => self.char_frequency.backtick_count > 0,
1100            '<' => self.char_frequency.lt_count > 0,
1101            '!' => self.char_frequency.exclamation_count > 0,
1102            '\n' => self.char_frequency.newline_count > 0,
1103            _ => self.content.contains(ch), // Fallback for other characters
1104        }
1105    }
1106
1107    /// Get count of a specific character (fast)
1108    pub fn char_count(&self, ch: char) -> usize {
1109        match ch {
1110            '#' => self.char_frequency.hash_count,
1111            '*' => self.char_frequency.asterisk_count,
1112            '_' => self.char_frequency.underscore_count,
1113            '-' => self.char_frequency.hyphen_count,
1114            '+' => self.char_frequency.plus_count,
1115            '>' => self.char_frequency.gt_count,
1116            '|' => self.char_frequency.pipe_count,
1117            '[' => self.char_frequency.bracket_count,
1118            '`' => self.char_frequency.backtick_count,
1119            '<' => self.char_frequency.lt_count,
1120            '!' => self.char_frequency.exclamation_count,
1121            '\n' => self.char_frequency.newline_count,
1122            _ => self.content.matches(ch).count(), // Fallback for other characters
1123        }
1124    }
1125
1126    /// Check if content likely contains headings (fast)
1127    pub fn likely_has_headings(&self) -> bool {
1128        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1129    }
1130
1131    /// Check if content likely contains lists (fast)
1132    pub fn likely_has_lists(&self) -> bool {
1133        self.char_frequency.asterisk_count > 0
1134            || self.char_frequency.hyphen_count > 0
1135            || self.char_frequency.plus_count > 0
1136    }
1137
1138    /// Check if content likely contains emphasis (fast)
1139    pub fn likely_has_emphasis(&self) -> bool {
1140        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1141    }
1142
1143    /// Check if content likely contains tables (fast)
1144    pub fn likely_has_tables(&self) -> bool {
1145        self.char_frequency.pipe_count > 2
1146    }
1147
1148    /// Check if content likely contains blockquotes (fast)
1149    pub fn likely_has_blockquotes(&self) -> bool {
1150        self.char_frequency.gt_count > 0
1151    }
1152
1153    /// Check if content likely contains code (fast)
1154    pub fn likely_has_code(&self) -> bool {
1155        self.char_frequency.backtick_count > 0
1156    }
1157
1158    /// Check if content likely contains links or images (fast)
1159    pub fn likely_has_links_or_images(&self) -> bool {
1160        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1161    }
1162
1163    /// Check if content likely contains HTML (fast)
1164    pub fn likely_has_html(&self) -> bool {
1165        self.char_frequency.lt_count > 0
1166    }
1167
1168    /// Get the blockquote prefix for inserting a blank line at the given line index.
1169    /// Returns the prefix without trailing content (e.g., ">" or ">>").
1170    /// This is needed because blank lines inside blockquotes must preserve the blockquote structure.
1171    /// Returns an empty string if the line is not inside a blockquote.
1172    pub fn blockquote_prefix_for_blank_line(&self, line_idx: usize) -> String {
1173        if let Some(line_info) = self.lines.get(line_idx)
1174            && let Some(ref bq) = line_info.blockquote
1175        {
1176            bq.prefix.trim_end().to_string()
1177        } else {
1178            String::new()
1179        }
1180    }
1181
1182    /// Get HTML tags on a specific line
1183    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1184        self.html_tags()
1185            .iter()
1186            .filter(|tag| tag.line == line_num)
1187            .cloned()
1188            .collect()
1189    }
1190
1191    /// Get emphasis spans on a specific line
1192    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1193        self.emphasis_spans()
1194            .iter()
1195            .filter(|span| span.line == line_num)
1196            .cloned()
1197            .collect()
1198    }
1199
1200    /// Get table rows on a specific line
1201    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1202        self.table_rows()
1203            .iter()
1204            .filter(|row| row.line == line_num)
1205            .cloned()
1206            .collect()
1207    }
1208
1209    /// Get bare URLs on a specific line
1210    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1211        self.bare_urls()
1212            .iter()
1213            .filter(|url| url.line == line_num)
1214            .cloned()
1215            .collect()
1216    }
1217
1218    /// Find the line index for a given byte offset using binary search.
1219    /// Returns (line_index, line_number, column) where:
1220    /// - line_index is the 0-based index in the lines array
1221    /// - line_number is the 1-based line number
1222    /// - column is the byte offset within that line
1223    #[inline]
1224    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1225        // Binary search to find the line containing this byte offset
1226        let idx = match lines.binary_search_by(|line| {
1227            if byte_offset < line.byte_offset {
1228                std::cmp::Ordering::Greater
1229            } else if byte_offset > line.byte_offset + line.byte_len {
1230                std::cmp::Ordering::Less
1231            } else {
1232                std::cmp::Ordering::Equal
1233            }
1234        }) {
1235            Ok(idx) => idx,
1236            Err(idx) => idx.saturating_sub(1),
1237        };
1238
1239        let line = &lines[idx];
1240        let line_num = idx + 1;
1241        let col = byte_offset.saturating_sub(line.byte_offset);
1242
1243        (idx, line_num, col)
1244    }
1245
1246    /// Check if a byte offset is within a code span using binary search
1247    #[inline]
1248    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1249        // Since spans are sorted by byte_offset, use partition_point for binary search
1250        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1251
1252        // Check the span that starts at or before our offset
1253        if idx > 0 {
1254            let span = &code_spans[idx - 1];
1255            if offset >= span.byte_offset && offset < span.byte_end {
1256                return true;
1257            }
1258        }
1259
1260        false
1261    }
1262
1263    /// Collect byte ranges of all links using pulldown-cmark
1264    /// This is used to skip heading detection for lines that fall within link syntax
1265    /// (e.g., multiline links like `[text](url\n#fragment)`)
1266    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1267        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1268
1269        let mut link_ranges = Vec::new();
1270        let mut options = Options::empty();
1271        options.insert(Options::ENABLE_WIKILINKS);
1272        options.insert(Options::ENABLE_FOOTNOTES);
1273
1274        let parser = Parser::new_ext(content, options).into_offset_iter();
1275        let mut link_stack: Vec<usize> = Vec::new();
1276
1277        for (event, range) in parser {
1278            match event {
1279                Event::Start(Tag::Link { .. }) => {
1280                    link_stack.push(range.start);
1281                }
1282                Event::End(TagEnd::Link) => {
1283                    if let Some(start_pos) = link_stack.pop() {
1284                        link_ranges.push((start_pos, range.end));
1285                    }
1286                }
1287                _ => {}
1288            }
1289        }
1290
1291        link_ranges
1292    }
1293
1294    /// Parse all links in the content
1295    fn parse_links(
1296        content: &'a str,
1297        lines: &[LineInfo],
1298        code_blocks: &[(usize, usize)],
1299        code_spans: &[CodeSpan],
1300        flavor: MarkdownFlavor,
1301        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1302    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1303        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1304        use std::collections::HashSet;
1305
1306        let mut links = Vec::with_capacity(content.len() / 500);
1307        let mut broken_links = Vec::new();
1308        let mut footnote_refs = Vec::new();
1309
1310        // Track byte positions of links found by pulldown-cmark
1311        let mut found_positions = HashSet::new();
1312
1313        // Use pulldown-cmark's streaming parser with BrokenLink callback
1314        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1315        // This automatically handles:
1316        // - Escaped links (won't generate events)
1317        // - Links in code blocks/spans (won't generate Link events)
1318        // - Images (generates Tag::Image instead)
1319        // - Reference resolution (dest_url is already resolved!)
1320        // - Broken references (callback is invoked)
1321        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1322        let mut options = Options::empty();
1323        options.insert(Options::ENABLE_WIKILINKS);
1324        options.insert(Options::ENABLE_FOOTNOTES);
1325
1326        let parser = Parser::new_with_broken_link_callback(
1327            content,
1328            options,
1329            Some(|link: BrokenLink<'_>| {
1330                broken_links.push(BrokenLinkInfo {
1331                    reference: link.reference.to_string(),
1332                    span: link.span.clone(),
1333                });
1334                None
1335            }),
1336        )
1337        .into_offset_iter();
1338
1339        let mut link_stack: Vec<(
1340            usize,
1341            usize,
1342            pulldown_cmark::CowStr<'a>,
1343            LinkType,
1344            pulldown_cmark::CowStr<'a>,
1345        )> = Vec::new();
1346        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1347
1348        for (event, range) in parser {
1349            match event {
1350                Event::Start(Tag::Link {
1351                    link_type,
1352                    dest_url,
1353                    id,
1354                    ..
1355                }) => {
1356                    // Link start - record position, URL, and reference ID
1357                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1358                    text_chunks.clear();
1359                }
1360                Event::Text(text) if !link_stack.is_empty() => {
1361                    // Track text content with its byte range
1362                    text_chunks.push((text.to_string(), range.start, range.end));
1363                }
1364                Event::Code(code) if !link_stack.is_empty() => {
1365                    // Include inline code in link text (with backticks)
1366                    let code_text = format!("`{code}`");
1367                    text_chunks.push((code_text, range.start, range.end));
1368                }
1369                Event::End(TagEnd::Link) => {
1370                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1371                        // Skip if in HTML comment
1372                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1373                            text_chunks.clear();
1374                            continue;
1375                        }
1376
1377                        // Find line and column information
1378                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1379
1380                        // Skip if this link is on a MkDocs snippet line
1381                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1382                            text_chunks.clear();
1383                            continue;
1384                        }
1385
1386                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1387
1388                        let is_reference = matches!(
1389                            link_type,
1390                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1391                        );
1392
1393                        // Extract link text directly from source bytes to preserve escaping
1394                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1395                        let link_text = if start_pos < content.len() {
1396                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1397
1398                            // Find MATCHING ] by tracking bracket depth for nested brackets
1399                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1400                            // Brackets inside code spans (between backticks) should be ignored
1401                            let mut close_pos = None;
1402                            let mut depth = 0;
1403                            let mut in_code_span = false;
1404
1405                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1406                                // Count preceding backslashes
1407                                let mut backslash_count = 0;
1408                                let mut j = i;
1409                                while j > 0 && link_bytes[j - 1] == b'\\' {
1410                                    backslash_count += 1;
1411                                    j -= 1;
1412                                }
1413                                let is_escaped = backslash_count % 2 != 0;
1414
1415                                // Track code spans - backticks toggle in/out of code
1416                                if byte == b'`' && !is_escaped {
1417                                    in_code_span = !in_code_span;
1418                                }
1419
1420                                // Only count brackets when NOT in a code span
1421                                if !is_escaped && !in_code_span {
1422                                    if byte == b'[' {
1423                                        depth += 1;
1424                                    } else if byte == b']' {
1425                                        if depth == 0 {
1426                                            // Found the matching closing bracket
1427                                            close_pos = Some(i);
1428                                            break;
1429                                        } else {
1430                                            depth -= 1;
1431                                        }
1432                                    }
1433                                }
1434                            }
1435
1436                            if let Some(pos) = close_pos {
1437                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1438                            } else {
1439                                Cow::Borrowed("")
1440                            }
1441                        } else {
1442                            Cow::Borrowed("")
1443                        };
1444
1445                        // For reference links, use the actual reference ID from pulldown-cmark
1446                        let reference_id = if is_reference && !ref_id.is_empty() {
1447                            Some(Cow::Owned(ref_id.to_lowercase()))
1448                        } else if is_reference {
1449                            // For collapsed/shortcut references without explicit ID, use the link text
1450                            Some(Cow::Owned(link_text.to_lowercase()))
1451                        } else {
1452                            None
1453                        };
1454
1455                        // Track this position as found
1456                        found_positions.insert(start_pos);
1457
1458                        links.push(ParsedLink {
1459                            line: line_num,
1460                            start_col: col_start,
1461                            end_col: col_end,
1462                            byte_offset: start_pos,
1463                            byte_end: range.end,
1464                            text: link_text,
1465                            url: Cow::Owned(url.to_string()),
1466                            is_reference,
1467                            reference_id,
1468                            link_type,
1469                        });
1470
1471                        text_chunks.clear();
1472                    }
1473                }
1474                Event::FootnoteReference(footnote_id) => {
1475                    // Capture footnote references like [^1], [^note]
1476                    // Skip if in HTML comment
1477                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1478                        continue;
1479                    }
1480
1481                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1482                    footnote_refs.push(FootnoteRef {
1483                        id: footnote_id.to_string(),
1484                        line: line_num,
1485                        byte_offset: range.start,
1486                        byte_end: range.end,
1487                    });
1488                }
1489                _ => {}
1490            }
1491        }
1492
1493        // Also find undefined references using regex
1494        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1495        // because the reference is undefined
1496        for cap in LINK_PATTERN.captures_iter(content) {
1497            let full_match = cap.get(0).unwrap();
1498            let match_start = full_match.start();
1499            let match_end = full_match.end();
1500
1501            // Skip if this was already found by pulldown-cmark (it's a valid link)
1502            if found_positions.contains(&match_start) {
1503                continue;
1504            }
1505
1506            // Skip if escaped
1507            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1508                continue;
1509            }
1510
1511            // Skip if it's an image
1512            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1513                continue;
1514            }
1515
1516            // Skip if in code block
1517            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1518                continue;
1519            }
1520
1521            // Skip if in code span
1522            if Self::is_offset_in_code_span(code_spans, match_start) {
1523                continue;
1524            }
1525
1526            // Skip if in HTML comment
1527            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1528                continue;
1529            }
1530
1531            // Find line and column information
1532            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1533
1534            // Skip if this link is on a MkDocs snippet line
1535            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1536                continue;
1537            }
1538
1539            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1540
1541            let text = cap.get(1).map_or("", |m| m.as_str());
1542
1543            // Only process reference links (group 6)
1544            if let Some(ref_id) = cap.get(6) {
1545                let ref_id_str = ref_id.as_str();
1546                let normalized_ref = if ref_id_str.is_empty() {
1547                    Cow::Owned(text.to_lowercase()) // Implicit reference
1548                } else {
1549                    Cow::Owned(ref_id_str.to_lowercase())
1550                };
1551
1552                // This is an undefined reference (pulldown-cmark didn't parse it)
1553                links.push(ParsedLink {
1554                    line: line_num,
1555                    start_col: col_start,
1556                    end_col: col_end,
1557                    byte_offset: match_start,
1558                    byte_end: match_end,
1559                    text: Cow::Borrowed(text),
1560                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1561                    is_reference: true,
1562                    reference_id: Some(normalized_ref),
1563                    link_type: LinkType::Reference, // Undefined references are reference-style
1564                });
1565            }
1566        }
1567
1568        (links, broken_links, footnote_refs)
1569    }
1570
1571    /// Parse all images in the content
1572    fn parse_images(
1573        content: &'a str,
1574        lines: &[LineInfo],
1575        code_blocks: &[(usize, usize)],
1576        code_spans: &[CodeSpan],
1577        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1578    ) -> Vec<ParsedImage<'a>> {
1579        use crate::utils::skip_context::is_in_html_comment_ranges;
1580        use std::collections::HashSet;
1581
1582        // Pre-size based on a heuristic: images are less common than links
1583        let mut images = Vec::with_capacity(content.len() / 1000);
1584        let mut found_positions = HashSet::new();
1585
1586        // Use pulldown-cmark for parsing - more accurate and faster
1587        let parser = Parser::new(content).into_offset_iter();
1588        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1589            Vec::new();
1590        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1591
1592        for (event, range) in parser {
1593            match event {
1594                Event::Start(Tag::Image {
1595                    link_type,
1596                    dest_url,
1597                    id,
1598                    ..
1599                }) => {
1600                    image_stack.push((range.start, dest_url, link_type, id));
1601                    text_chunks.clear();
1602                }
1603                Event::Text(text) if !image_stack.is_empty() => {
1604                    text_chunks.push((text.to_string(), range.start, range.end));
1605                }
1606                Event::Code(code) if !image_stack.is_empty() => {
1607                    let code_text = format!("`{code}`");
1608                    text_chunks.push((code_text, range.start, range.end));
1609                }
1610                Event::End(TagEnd::Image) => {
1611                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1612                        // Skip if in code block
1613                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1614                            continue;
1615                        }
1616
1617                        // Skip if in code span
1618                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1619                            continue;
1620                        }
1621
1622                        // Skip if in HTML comment
1623                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1624                            continue;
1625                        }
1626
1627                        // Find line and column using binary search
1628                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1629                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1630
1631                        let is_reference = matches!(
1632                            link_type,
1633                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1634                        );
1635
1636                        // Extract alt text directly from source bytes to preserve escaping
1637                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1638                        let alt_text = if start_pos < content.len() {
1639                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1640
1641                            // Find MATCHING ] by tracking bracket depth for nested brackets
1642                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1643                            let mut close_pos = None;
1644                            let mut depth = 0;
1645
1646                            if image_bytes.len() > 2 {
1647                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1648                                    // Count preceding backslashes
1649                                    let mut backslash_count = 0;
1650                                    let mut j = i;
1651                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1652                                        backslash_count += 1;
1653                                        j -= 1;
1654                                    }
1655                                    let is_escaped = backslash_count % 2 != 0;
1656
1657                                    if !is_escaped {
1658                                        if byte == b'[' {
1659                                            depth += 1;
1660                                        } else if byte == b']' {
1661                                            if depth == 0 {
1662                                                // Found the matching closing bracket
1663                                                close_pos = Some(i);
1664                                                break;
1665                                            } else {
1666                                                depth -= 1;
1667                                            }
1668                                        }
1669                                    }
1670                                }
1671                            }
1672
1673                            if let Some(pos) = close_pos {
1674                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1675                            } else {
1676                                Cow::Borrowed("")
1677                            }
1678                        } else {
1679                            Cow::Borrowed("")
1680                        };
1681
1682                        let reference_id = if is_reference && !ref_id.is_empty() {
1683                            Some(Cow::Owned(ref_id.to_lowercase()))
1684                        } else if is_reference {
1685                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1686                        } else {
1687                            None
1688                        };
1689
1690                        found_positions.insert(start_pos);
1691                        images.push(ParsedImage {
1692                            line: line_num,
1693                            start_col: col_start,
1694                            end_col: col_end,
1695                            byte_offset: start_pos,
1696                            byte_end: range.end,
1697                            alt_text,
1698                            url: Cow::Owned(url.to_string()),
1699                            is_reference,
1700                            reference_id,
1701                            link_type,
1702                        });
1703                    }
1704                }
1705                _ => {}
1706            }
1707        }
1708
1709        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1710        for cap in IMAGE_PATTERN.captures_iter(content) {
1711            let full_match = cap.get(0).unwrap();
1712            let match_start = full_match.start();
1713            let match_end = full_match.end();
1714
1715            // Skip if already found by pulldown-cmark
1716            if found_positions.contains(&match_start) {
1717                continue;
1718            }
1719
1720            // Skip if the ! is escaped
1721            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1722                continue;
1723            }
1724
1725            // Skip if in code block, code span, or HTML comment
1726            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1727                || Self::is_offset_in_code_span(code_spans, match_start)
1728                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1729            {
1730                continue;
1731            }
1732
1733            // Only process reference images (undefined references not found by pulldown-cmark)
1734            if let Some(ref_id) = cap.get(6) {
1735                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1736                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1737                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1738                let ref_id_str = ref_id.as_str();
1739                let normalized_ref = if ref_id_str.is_empty() {
1740                    Cow::Owned(alt_text.to_lowercase())
1741                } else {
1742                    Cow::Owned(ref_id_str.to_lowercase())
1743                };
1744
1745                images.push(ParsedImage {
1746                    line: line_num,
1747                    start_col: col_start,
1748                    end_col: col_end,
1749                    byte_offset: match_start,
1750                    byte_end: match_end,
1751                    alt_text: Cow::Borrowed(alt_text),
1752                    url: Cow::Borrowed(""),
1753                    is_reference: true,
1754                    reference_id: Some(normalized_ref),
1755                    link_type: LinkType::Reference, // Undefined references are reference-style
1756                });
1757            }
1758        }
1759
1760        images
1761    }
1762
1763    /// Parse reference definitions
1764    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1765        // Pre-size based on lines count as reference definitions are line-based
1766        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1767
1768        for (line_idx, line_info) in lines.iter().enumerate() {
1769            // Skip lines in code blocks
1770            if line_info.in_code_block {
1771                continue;
1772            }
1773
1774            let line = line_info.content(content);
1775            let line_num = line_idx + 1;
1776
1777            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1778                let id_raw = cap.get(1).unwrap().as_str();
1779
1780                // Skip footnote definitions - they use [^id]: syntax and are semantically
1781                // different from reference link definitions
1782                if id_raw.starts_with('^') {
1783                    continue;
1784                }
1785
1786                let id = id_raw.to_lowercase();
1787                let url = cap.get(2).unwrap().as_str().to_string();
1788                let title_match = cap.get(3).or_else(|| cap.get(4));
1789                let title = title_match.map(|m| m.as_str().to_string());
1790
1791                // Calculate byte positions
1792                // The match starts at the beginning of the line (0) and extends to the end
1793                let match_obj = cap.get(0).unwrap();
1794                let byte_offset = line_info.byte_offset + match_obj.start();
1795                let byte_end = line_info.byte_offset + match_obj.end();
1796
1797                // Calculate title byte positions (includes the quote character before content)
1798                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1799                    // The match is the content inside quotes, so we include the quote before
1800                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1801                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1802                    (Some(start), Some(end))
1803                } else {
1804                    (None, None)
1805                };
1806
1807                refs.push(ReferenceDef {
1808                    line: line_num,
1809                    id,
1810                    url,
1811                    title,
1812                    byte_offset,
1813                    byte_end,
1814                    title_byte_start,
1815                    title_byte_end,
1816                });
1817            }
1818        }
1819
1820        refs
1821    }
1822
1823    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1824    /// Handles nested blockquotes like `> > > content`
1825    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1826    #[inline]
1827    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1828        let trimmed_start = line.trim_start();
1829        if !trimmed_start.starts_with('>') {
1830            return None;
1831        }
1832
1833        // Track total prefix length to handle nested blockquotes
1834        let mut remaining = line;
1835        let mut total_prefix_len = 0;
1836
1837        loop {
1838            let trimmed = remaining.trim_start();
1839            if !trimmed.starts_with('>') {
1840                break;
1841            }
1842
1843            // Add leading whitespace + '>' to prefix
1844            let leading_ws_len = remaining.len() - trimmed.len();
1845            total_prefix_len += leading_ws_len + 1;
1846
1847            let after_gt = &trimmed[1..];
1848
1849            // Handle optional whitespace after '>' (space or tab)
1850            if let Some(stripped) = after_gt.strip_prefix(' ') {
1851                total_prefix_len += 1;
1852                remaining = stripped;
1853            } else if let Some(stripped) = after_gt.strip_prefix('\t') {
1854                total_prefix_len += 1;
1855                remaining = stripped;
1856            } else {
1857                remaining = after_gt;
1858            }
1859        }
1860
1861        Some((&line[..total_prefix_len], remaining))
1862    }
1863
1864    /// Detect list items using pulldown-cmark for CommonMark-compliant parsing.
1865    ///
1866    /// Returns a HashMap keyed by line byte offset, containing:
1867    /// `(is_ordered, marker, marker_column, content_column, number)`
1868    ///
1869    /// ## Why pulldown-cmark?
1870    /// Using pulldown-cmark instead of regex ensures we only detect actual list items,
1871    /// not lines that merely look like lists (e.g., continuation paragraphs, code blocks).
1872    /// This fixes issue #253 where continuation lines were falsely detected.
1873    ///
1874    /// ## Tab indentation quirk
1875    /// Pulldown-cmark reports nested list items at the newline character position
1876    /// when tab indentation is used. For example, in `"* Item\n\t- Nested"`,
1877    /// the nested item is reported at byte 7 (the `\n`), not byte 8 (the `\t`).
1878    /// We detect this and advance to the correct line.
1879    ///
1880    /// ## HashMap key strategy
1881    /// We use `entry().or_insert()` because pulldown-cmark may emit multiple events
1882    /// that resolve to the same line (after newline adjustment). The first event
1883    /// for each line is authoritative.
1884    /// Detect list items and emphasis spans in a single pulldown-cmark pass.
1885    /// Returns both list items (for LineInfo) and emphasis spans (for MD030).
1886    /// This avoids a separate parse for emphasis detection.
1887    fn detect_list_items_and_emphasis_with_pulldown(
1888        content: &str,
1889        line_offsets: &[usize],
1890        flavor: MarkdownFlavor,
1891        front_matter_end: usize,
1892        code_blocks: &[(usize, usize)],
1893    ) -> (ListItemMap, Vec<EmphasisSpan>) {
1894        use std::collections::HashMap;
1895
1896        let mut list_items = HashMap::new();
1897        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1898
1899        let mut options = Options::empty();
1900        options.insert(Options::ENABLE_TABLES);
1901        options.insert(Options::ENABLE_FOOTNOTES);
1902        options.insert(Options::ENABLE_STRIKETHROUGH);
1903        options.insert(Options::ENABLE_TASKLISTS);
1904        // Always enable GFM features for consistency with existing behavior
1905        options.insert(Options::ENABLE_GFM);
1906
1907        // Suppress unused variable warning
1908        let _ = flavor;
1909
1910        let parser = Parser::new_ext(content, options).into_offset_iter();
1911        let mut list_depth: usize = 0;
1912        let mut list_stack: Vec<bool> = Vec::new();
1913
1914        for (event, range) in parser {
1915            match event {
1916                // Capture emphasis spans (for MD030's emphasis detection)
1917                Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1918                    let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1919                        2
1920                    } else {
1921                        1
1922                    };
1923                    let match_start = range.start;
1924                    let match_end = range.end;
1925
1926                    // Skip if in code block
1927                    if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1928                        // Determine marker character by looking at the content at the start
1929                        let marker = content[match_start..].chars().next().unwrap_or('*');
1930                        if marker == '*' || marker == '_' {
1931                            // Extract content between markers
1932                            let content_start = match_start + marker_count;
1933                            let content_end = if match_end >= marker_count {
1934                                match_end - marker_count
1935                            } else {
1936                                match_end
1937                            };
1938                            let content_part = if content_start < content_end && content_end <= content.len() {
1939                                &content[content_start..content_end]
1940                            } else {
1941                                ""
1942                            };
1943
1944                            // Find which line this emphasis is on using line_offsets
1945                            let line_idx = match line_offsets.binary_search(&match_start) {
1946                                Ok(idx) => idx,
1947                                Err(idx) => idx.saturating_sub(1),
1948                            };
1949                            let line_num = line_idx + 1;
1950                            let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1951                            let col_start = match_start - line_start;
1952                            let col_end = match_end - line_start;
1953
1954                            emphasis_spans.push(EmphasisSpan {
1955                                line: line_num,
1956                                start_col: col_start,
1957                                end_col: col_end,
1958                                byte_offset: match_start,
1959                                byte_end: match_end,
1960                                marker,
1961                                marker_count,
1962                                content: content_part.to_string(),
1963                            });
1964                        }
1965                    }
1966                }
1967                Event::Start(Tag::List(start_number)) => {
1968                    list_depth += 1;
1969                    list_stack.push(start_number.is_some());
1970                }
1971                Event::End(TagEnd::List(_)) => {
1972                    list_depth = list_depth.saturating_sub(1);
1973                    list_stack.pop();
1974                }
1975                Event::Start(Tag::Item) if list_depth > 0 => {
1976                    // Get the ordered state for the CURRENT (innermost) list
1977                    let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1978                    // Find which line this byte offset corresponds to
1979                    let item_start = range.start;
1980
1981                    // Binary search to find the line number
1982                    let mut line_idx = match line_offsets.binary_search(&item_start) {
1983                        Ok(idx) => idx,
1984                        Err(idx) => idx.saturating_sub(1),
1985                    };
1986
1987                    // Pulldown-cmark reports nested list items at the newline before the item
1988                    // when using tab indentation (e.g., "* Item\n\t- Nested").
1989                    // Advance to the actual content line in this case.
1990                    if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1991                        line_idx += 1;
1992                    }
1993
1994                    // Skip list items in frontmatter (they are YAML/TOML syntax, not Markdown)
1995                    if front_matter_end > 0 && line_idx < front_matter_end {
1996                        continue;
1997                    }
1998
1999                    if line_idx < line_offsets.len() {
2000                        let line_start_byte = line_offsets[line_idx];
2001                        let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
2002                        let line = &content[line_start_byte..line_end.min(content.len())];
2003
2004                        // Strip trailing newline
2005                        let line = line
2006                            .strip_suffix('\n')
2007                            .or_else(|| line.strip_suffix("\r\n"))
2008                            .unwrap_or(line);
2009
2010                        // Strip blockquote prefix if present
2011                        let blockquote_parse = Self::parse_blockquote_prefix(line);
2012                        let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
2013                            (prefix.len(), content)
2014                        } else {
2015                            (0, line)
2016                        };
2017
2018                        // Parse the list marker from the actual line
2019                        if current_list_is_ordered {
2020                            if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
2021                                Self::parse_ordered_list(line_to_parse)
2022                            {
2023                                let marker = format!("{number_str}{delimiter}");
2024                                let marker_column = blockquote_prefix_len + leading_spaces.len();
2025                                let content_column = marker_column + marker.len() + spacing.len();
2026                                let number = number_str.parse().ok();
2027
2028                                list_items.entry(line_start_byte).or_insert((
2029                                    true,
2030                                    marker,
2031                                    marker_column,
2032                                    content_column,
2033                                    number,
2034                                ));
2035                            }
2036                        } else if let Some((leading_spaces, marker, spacing, _content)) =
2037                            Self::parse_unordered_list(line_to_parse)
2038                        {
2039                            let marker_column = blockquote_prefix_len + leading_spaces.len();
2040                            let content_column = marker_column + 1 + spacing.len();
2041
2042                            list_items.entry(line_start_byte).or_insert((
2043                                false,
2044                                marker.to_string(),
2045                                marker_column,
2046                                content_column,
2047                                None,
2048                            ));
2049                        }
2050                    }
2051                }
2052                _ => {}
2053            }
2054        }
2055
2056        (list_items, emphasis_spans)
2057    }
2058
2059    /// Fast unordered list parser - replaces regex for 5-10x speedup
2060    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
2061    /// Returns: Some((leading_ws, marker, spacing, content)) or None
2062    #[inline]
2063    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
2064        let bytes = line.as_bytes();
2065        let mut i = 0;
2066
2067        // Skip leading whitespace
2068        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2069            i += 1;
2070        }
2071
2072        // Check for marker
2073        if i >= bytes.len() {
2074            return None;
2075        }
2076        let marker = bytes[i] as char;
2077        if marker != '-' && marker != '*' && marker != '+' {
2078            return None;
2079        }
2080        let marker_pos = i;
2081        i += 1;
2082
2083        // Collect spacing after marker (space or tab only)
2084        let spacing_start = i;
2085        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2086            i += 1;
2087        }
2088
2089        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
2090    }
2091
2092    /// Fast ordered list parser - replaces regex for 5-10x speedup
2093    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
2094    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
2095    #[inline]
2096    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
2097        let bytes = line.as_bytes();
2098        let mut i = 0;
2099
2100        // Skip leading whitespace
2101        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2102            i += 1;
2103        }
2104
2105        // Collect digits
2106        let number_start = i;
2107        while i < bytes.len() && bytes[i].is_ascii_digit() {
2108            i += 1;
2109        }
2110        if i == number_start {
2111            return None; // No digits found
2112        }
2113
2114        // Check for delimiter
2115        if i >= bytes.len() {
2116            return None;
2117        }
2118        let delimiter = bytes[i] as char;
2119        if delimiter != '.' && delimiter != ')' {
2120            return None;
2121        }
2122        let delimiter_pos = i;
2123        i += 1;
2124
2125        // Collect spacing after delimiter (space or tab only)
2126        let spacing_start = i;
2127        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2128            i += 1;
2129        }
2130
2131        Some((
2132            &line[..number_start],
2133            &line[number_start..delimiter_pos],
2134            delimiter,
2135            &line[spacing_start..i],
2136            &line[i..],
2137        ))
2138    }
2139
2140    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
2141    /// Returns a Vec<bool> where index i indicates if line i is in a code block
2142    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
2143        let num_lines = line_offsets.len();
2144        let mut in_code_block = vec![false; num_lines];
2145
2146        // For each code block, mark all lines within it
2147        for &(start, end) in code_blocks {
2148            // Ensure we're at valid UTF-8 boundaries
2149            let safe_start = if start > 0 && !content.is_char_boundary(start) {
2150                let mut boundary = start;
2151                while boundary > 0 && !content.is_char_boundary(boundary) {
2152                    boundary -= 1;
2153                }
2154                boundary
2155            } else {
2156                start
2157            };
2158
2159            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
2160                let mut boundary = end;
2161                while boundary < content.len() && !content.is_char_boundary(boundary) {
2162                    boundary += 1;
2163                }
2164                boundary
2165            } else {
2166                end.min(content.len())
2167            };
2168
2169            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
2170            // That function now has proper list context awareness (see code_block_utils.rs)
2171            // and correctly distinguishes between:
2172            // - Fenced code blocks (``` or ~~~)
2173            // - Indented code blocks at document level (4 spaces + blank line before)
2174            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
2175            //
2176            // We no longer need to re-validate here. The original validation logic
2177            // was causing false positives by marking list continuation paragraphs as
2178            // code blocks when they have 4 spaces of indentation.
2179
2180            // Use binary search to find the first and last line indices
2181            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
2182            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
2183            //
2184            // Find the line that CONTAINS safe_start: the line with the largest
2185            // start offset that is <= safe_start. partition_point gives us the
2186            // first line that starts AFTER safe_start, so we subtract 1.
2187            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
2188            let first_line = first_line_after.saturating_sub(1);
2189            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
2190
2191            // Mark all lines in the range at once
2192            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
2193                *flag = true;
2194            }
2195        }
2196
2197        in_code_block
2198    }
2199
2200    /// Pre-compute which lines are inside math blocks ($$ ... $$) - O(n) single pass
2201    /// Returns a Vec<bool> where index i indicates if line i is in a math block
2202    fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
2203        let content_lines: Vec<&str> = content.lines().collect();
2204        let num_lines = content_lines.len();
2205        let mut in_math_block = vec![false; num_lines];
2206
2207        let mut inside_math = false;
2208
2209        for (i, line) in content_lines.iter().enumerate() {
2210            // Skip lines that are in code blocks - math delimiters inside code are literal
2211            if code_block_map.get(i).copied().unwrap_or(false) {
2212                continue;
2213            }
2214
2215            let trimmed = line.trim();
2216
2217            // Check for math block delimiter ($$)
2218            // A line with just $$ toggles the math block state
2219            if trimmed == "$$" {
2220                if inside_math {
2221                    // Closing delimiter - this line is still part of the math block
2222                    in_math_block[i] = true;
2223                    inside_math = false;
2224                } else {
2225                    // Opening delimiter - this line starts the math block
2226                    in_math_block[i] = true;
2227                    inside_math = true;
2228                }
2229            } else if inside_math {
2230                // Content inside math block
2231                in_math_block[i] = true;
2232            }
2233        }
2234
2235        in_math_block
2236    }
2237
2238    /// Pre-compute basic line information (without headings/blockquotes)
2239    /// Also returns emphasis spans detected during the pulldown-cmark parse
2240    fn compute_basic_line_info(
2241        content: &str,
2242        line_offsets: &[usize],
2243        code_blocks: &[(usize, usize)],
2244        flavor: MarkdownFlavor,
2245        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2246        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2247    ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2248        let content_lines: Vec<&str> = content.lines().collect();
2249        let mut lines = Vec::with_capacity(content_lines.len());
2250
2251        // Pre-compute which lines are in code blocks
2252        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2253
2254        // Pre-compute which lines are in math blocks ($$ ... $$)
2255        let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2256
2257        // Detect front matter boundaries FIRST, before any other parsing
2258        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
2259        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2260
2261        // Use pulldown-cmark to detect list items AND emphasis spans in a single pass
2262        // (context-aware, eliminates false positives)
2263        let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2264            content,
2265            line_offsets,
2266            flavor,
2267            front_matter_end,
2268            code_blocks,
2269        );
2270
2271        for (i, line) in content_lines.iter().enumerate() {
2272            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2273            let indent = line.len() - line.trim_start().len();
2274            // Compute visual indent with proper CommonMark tab expansion
2275            let visual_indent = ElementCache::calculate_indentation_width_default(line);
2276
2277            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
2278            let blockquote_parse = Self::parse_blockquote_prefix(line);
2279
2280            // For blank detection, consider blockquote context
2281            let is_blank = if let Some((_, content)) = blockquote_parse {
2282                // In blockquote context, check if content after prefix is blank
2283                content.trim().is_empty()
2284            } else {
2285                line.trim().is_empty()
2286            };
2287
2288            // Use pre-computed map for O(1) lookup instead of O(m) iteration
2289            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2290
2291            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
2292            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2293                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2294            // Check if the ENTIRE line is within an HTML comment (not just the line start)
2295            // This ensures content after `-->` on the same line is not incorrectly skipped
2296            let line_end_offset = byte_offset + line.len();
2297            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2298                html_comment_ranges,
2299                byte_offset,
2300                line_end_offset,
2301            );
2302            // Use pulldown-cmark's list detection for context-aware parsing
2303            // This eliminates false positives on continuation lines (issue #253)
2304            let list_item =
2305                list_item_map
2306                    .get(&byte_offset)
2307                    .map(
2308                        |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2309                            marker: marker.clone(),
2310                            is_ordered: *is_ordered,
2311                            number: *number,
2312                            marker_column: *marker_column,
2313                            content_column: *content_column,
2314                        },
2315                    );
2316
2317            // Detect horizontal rules (only outside code blocks and frontmatter)
2318            // Uses CommonMark-compliant check including leading indentation validation
2319            let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2320            let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2321
2322            // Get math block status for this line
2323            let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2324
2325            lines.push(LineInfo {
2326                byte_offset,
2327                byte_len: line.len(),
2328                indent,
2329                visual_indent,
2330                is_blank,
2331                in_code_block,
2332                in_front_matter,
2333                in_html_block: false, // Will be populated after line creation
2334                in_html_comment,
2335                list_item,
2336                heading: None,    // Will be populated in second pass for Setext headings
2337                blockquote: None, // Will be populated after line creation
2338                in_mkdocstrings,
2339                in_esm_block: false, // Will be populated after line creation for MDX files
2340                in_code_span_continuation: false, // Will be populated after code spans are parsed
2341                is_horizontal_rule: is_hr,
2342                in_math_block,
2343            });
2344        }
2345
2346        (lines, emphasis_spans)
2347    }
2348
    /// Detect headings and blockquotes (called after HTML block detection)
    ///
    /// Fills in `blockquote` and `heading` on each entry of `lines`, and sets
    /// `is_horizontal_rule` when a blockquote's content is a horizontal rule.
    ///
    /// Blockquote detection runs on every line outside front matter. Heading
    /// detection is skipped for lines in code blocks, front matter, HTML blocks
    /// (`in_html_block`, which is why this runs after HTML block detection) and
    /// blank lines.
    ///
    /// * `content` - full document text; split with `str::lines` and indexed in
    ///   parallel with `lines`.
    /// * `lines` - per-line records to update in place.
    /// * `flavor` - only consulted to exclude MkDocs snippet marker lines from ATX detection.
    /// * `html_comment_ranges` - pre-computed ranges; lines starting inside one get no heading.
    /// * `link_byte_ranges` - `(start, end)` byte ranges of links; a line that starts
    ///   strictly inside one is not treated as an ATX heading.
    fn detect_headings_and_blockquotes(
        content: &str,
        lines: &mut [LineInfo],
        flavor: MarkdownFlavor,
        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
        link_byte_ranges: &[(usize, usize)],
    ) {
        // Regex for heading detection
        // Groups: 1 = leading whitespace, 2 = 1-6 hashes, 3 = whitespace after hashes, 4 = rest of line
        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
        // A line consisting only of `=` or only of `-`, with optional surrounding whitespace
        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());

        let content_lines: Vec<&str> = content.lines().collect();

        // Detect front matter boundaries to skip those lines
        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);

        // Detect headings (including Setext which needs look-ahead) and blockquotes
        for i in 0..lines.len() {
            let line = content_lines[i];

            // Detect blockquotes FIRST, before any skip conditions.
            // A line can be both a blockquote AND contain a code block inside it.
            // We need to know about the blockquote marker regardless of code block status.
            // Skip only frontmatter lines - those are never blockquotes.
            if !(front_matter_end > 0 && i < front_matter_end)
                && let Some(bq) = parse_blockquote_detailed(line)
            {
                let nesting_level = bq.markers.len();
                let marker_column = bq.indent.len();
                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
                // Marker immediately followed by content (no separating whitespace)
                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
                // Only literal space characters are counted here
                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
                // Bare marker with nothing after it at all
                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();

                lines[i].blockquote = Some(BlockquoteInfo {
                    nesting_level,
                    indent: bq.indent.to_string(),
                    marker_column,
                    prefix,
                    content: bq.content.to_string(),
                    has_no_space_after_marker: has_no_space,
                    has_multiple_spaces_after_marker: has_multiple_spaces,
                    needs_md028_fix,
                });

                // Update is_horizontal_rule for blockquote content
                // The original detection doesn't strip blockquote prefix, so we need to check here
                if !lines[i].in_code_block && is_horizontal_rule_content(bq.content.trim()) {
                    lines[i].is_horizontal_rule = true;
                }
            }

            // Now apply skip conditions for heading detection
            if lines[i].in_code_block {
                continue;
            }

            // Skip lines in front matter
            if front_matter_end > 0 && i < front_matter_end {
                continue;
            }

            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
            if lines[i].in_html_block {
                continue;
            }

            // Skip heading detection for blank lines
            if lines[i].is_blank {
                continue;
            }

            // Check for ATX headings (but skip MkDocs snippet lines)
            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
            } else {
                false
            };

            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
                    continue;
                }
                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
                // This prevents false positives where `#fragment` is detected as a heading
                // Both bounds are exclusive: a line starting exactly at a link's start is still checked.
                let line_offset = lines[i].byte_offset;
                if link_byte_ranges
                    .iter()
                    .any(|&(start, end)| line_offset > start && line_offset < end)
                {
                    continue;
                }
                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
                let hashes = caps.get(2).map_or("", |m| m.as_str());
                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
                let rest = caps.get(4).map_or("", |m| m.as_str());

                // The regex limits `hashes` to 1-6 characters, so this cast cannot truncate
                let level = hashes.len() as u8;
                let marker_column = leading_spaces.len();

                // Check for closing sequence, but handle custom IDs that might come after
                // Yields (heading text, whether a closing sequence exists, the closing hashes)
                let (text, has_closing, closing_seq) = {
                    // First check if there's a custom ID at the end
                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
                        // Check if this looks like a valid custom ID (ends with })
                        if rest[id_start..].trim_end().ends_with('}') {
                            // Split off the custom ID
                            (&rest[..id_start], &rest[id_start..])
                        } else {
                            (rest, "")
                        }
                    } else {
                        (rest, "")
                    };

                    // Now look for closing hashes in the part before the custom ID
                    let trimmed_rest = rest_without_id.trim_end();
                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
                        // Find the start of the hash sequence by walking backwards
                        // Use char_indices to get byte positions at char boundaries
                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();

                        // Find which char index corresponds to last_hash_byte_pos
                        let last_hash_char_idx = char_positions
                            .iter()
                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);

                        if let Some(mut char_idx) = last_hash_char_idx {
                            // Walk backwards to find start of hash sequence
                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
                                char_idx -= 1;
                            }

                            // Get the byte position of the start of hashes
                            let start_of_hashes = char_positions[char_idx].0;

                            // Check if there's at least one space before the closing hashes
                            // (a hash run at the very start of `rest` also qualifies)
                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();

                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
                            let potential_closing = &trimmed_rest[start_of_hashes..];
                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');

                            if is_all_hashes && has_space_before {
                                // This is a closing sequence
                                let closing_hashes = potential_closing.to_string();
                                // The text is everything before the closing hashes
                                // Don't include the custom ID here - it will be extracted later
                                let text_part = if !custom_id_part.is_empty() {
                                    // If we have a custom ID, append it back to get the full rest
                                    // This allows the extract_header_id function to handle it properly
                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
                                } else {
                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
                                };
                                (text_part, true, closing_hashes)
                            } else {
                                // Not a valid closing sequence, return the full content
                                (rest.to_string(), false, String::new())
                            }
                        } else {
                            // Couldn't find char boundary, return the full content
                            (rest.to_string(), false, String::new())
                        }
                    } else {
                        // No hashes found, return the full content
                        (rest.to_string(), false, String::new())
                    }
                };

                // Byte column where the heading text starts
                let content_column = marker_column + hashes.len() + spaces_after.len();

                // Extract custom header ID if present
                let raw_text = text.trim().to_string();
                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

                // If no custom ID was found on the header line, check the next line for standalone attr-list
                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
                    let next_line = content_lines[i + 1];
                    if !lines[i + 1].in_code_block
                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
                        && let Some(next_line_id) =
                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
                    {
                        custom_id = Some(next_line_id);
                    }
                }

                // ATX heading is "valid" for processing by heading rules if:
                // 1. Has space after # (CommonMark compliant): `# Heading`
                // 2. Is empty (just hashes): `#`
                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
                //
                // Invalid patterns (hashtag-like) are skipped by most heading rules:
                // - `#tag` - single # with lowercase (social hashtag)
                // - `#123` - single # with number (GitHub issue ref)
                let is_valid = !spaces_after.is_empty()
                    || rest.is_empty()
                    || level > 1
                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());

                lines[i].heading = Some(HeadingInfo {
                    level,
                    style: HeadingStyle::ATX,
                    marker: hashes.to_string(),
                    marker_column,
                    content_column,
                    text: clean_text,
                    custom_id,
                    raw_text,
                    has_closing_sequence: has_closing,
                    closing_sequence: closing_seq,
                    is_valid,
                });
            }
            // Check for Setext headings (need to look at next line)
            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
                let next_line = content_lines[i + 1];
                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
                    // Skip if next line is front matter delimiter
                    // NOTE(review): this tests `i`, not `i + 1`, and the same condition already
                    // triggered a `continue` earlier in this iteration, so this branch looks
                    // unreachable - confirm whether `i + 1` was intended.
                    if front_matter_end > 0 && i < front_matter_end {
                        continue;
                    }

                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
                    {
                        continue;
                    }

                    // Per CommonMark spec 4.3, setext heading content cannot be interpretable as:
                    // list item, ATX heading, block quote, thematic break, code fence, or HTML block
                    let content_line = line.trim();

                    // Skip list items (-, *, +) and thematic breaks (---, ***, etc.)
                    // This is a prefix test only, so any line starting with one of these characters is skipped.
                    if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
                        continue;
                    }

                    // Skip underscore thematic breaks (___)
                    if content_line.starts_with('_') {
                        let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
                        if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
                            continue;
                        }
                    }

                    // Skip numbered lists (1. Item, 2. Item, etc.)
                    if let Some(first_char) = content_line.chars().next()
                        && first_char.is_ascii_digit()
                    {
                        // Leading digits are ASCII, so this char count is also a byte count
                        let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
                        if num_end < content_line.len() {
                            let next = content_line.chars().nth(num_end);
                            if next == Some('.') || next == Some(')') {
                                continue;
                            }
                        }
                    }

                    // Skip ATX headings
                    if ATX_HEADING_REGEX.is_match(line) {
                        continue;
                    }

                    // Skip blockquotes
                    if content_line.starts_with('>') {
                        continue;
                    }

                    // Skip code fences
                    let trimmed_start = line.trim_start();
                    if trimmed_start.len() >= 3 {
                        let first_three: String = trimmed_start.chars().take(3).collect();
                        if first_three == "```" || first_three == "~~~" {
                            continue;
                        }
                    }

                    // Skip HTML blocks
                    if content_line.starts_with('<') {
                        continue;
                    }

                    let underline = next_line.trim();

                    // `=` underline means level 1; otherwise the underline is `-`, level 2
                    let level = if underline.starts_with('=') { 1 } else { 2 };
                    let style = if level == 1 {
                        HeadingStyle::Setext1
                    } else {
                        HeadingStyle::Setext2
                    };

                    // Extract custom header ID if present
                    let raw_text = line.trim().to_string();
                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
                        let attr_line = content_lines[i + 2];
                        if !lines[i + 2].in_code_block
                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
                            && let Some(attr_line_id) =
                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
                        {
                            custom_id = Some(attr_line_id);
                        }
                    }

                    // For Setext headings the marker is the underline, and its column is the
                    // underline's own leading whitespace width on the following line.
                    lines[i].heading = Some(HeadingInfo {
                        level,
                        style,
                        marker: underline.to_string(),
                        marker_column: next_line.len() - next_line.trim_start().len(),
                        content_column: lines[i].indent,
                        text: clean_text,
                        custom_id,
                        raw_text,
                        has_closing_sequence: false,
                        closing_sequence: String::new(),
                        is_valid: true, // Setext headings are always valid
                    });
                }
            }
        }
    }
2682
2683    /// Detect HTML blocks in the content
2684    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2685        // HTML block elements that trigger block context
2686        // Includes HTML5 media, embedded content, and interactive elements
2687        const BLOCK_ELEMENTS: &[&str] = &[
2688            "address",
2689            "article",
2690            "aside",
2691            "audio",
2692            "blockquote",
2693            "canvas",
2694            "details",
2695            "dialog",
2696            "dd",
2697            "div",
2698            "dl",
2699            "dt",
2700            "embed",
2701            "fieldset",
2702            "figcaption",
2703            "figure",
2704            "footer",
2705            "form",
2706            "h1",
2707            "h2",
2708            "h3",
2709            "h4",
2710            "h5",
2711            "h6",
2712            "header",
2713            "hr",
2714            "iframe",
2715            "li",
2716            "main",
2717            "menu",
2718            "nav",
2719            "noscript",
2720            "object",
2721            "ol",
2722            "p",
2723            "picture",
2724            "pre",
2725            "script",
2726            "search",
2727            "section",
2728            "source",
2729            "style",
2730            "summary",
2731            "svg",
2732            "table",
2733            "tbody",
2734            "td",
2735            "template",
2736            "textarea",
2737            "tfoot",
2738            "th",
2739            "thead",
2740            "tr",
2741            "track",
2742            "ul",
2743            "video",
2744        ];
2745
2746        let mut i = 0;
2747        while i < lines.len() {
2748            // Skip if already in code block or front matter
2749            if lines[i].in_code_block || lines[i].in_front_matter {
2750                i += 1;
2751                continue;
2752            }
2753
2754            let trimmed = lines[i].content(content).trim_start();
2755
2756            // Check if line starts with an HTML tag
2757            if trimmed.starts_with('<') && trimmed.len() > 1 {
2758                // Extract tag name safely
2759                let after_bracket = &trimmed[1..];
2760                let is_closing = after_bracket.starts_with('/');
2761                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2762
2763                // Extract tag name (stop at space, >, /, or end of string)
2764                let tag_name = tag_start
2765                    .chars()
2766                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2767                    .collect::<String>()
2768                    .to_lowercase();
2769
2770                // Check if it's a block element
2771                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2772                    // Mark this line as in HTML block
2773                    lines[i].in_html_block = true;
2774
2775                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2776                    // This avoids complex nesting logic that might cause infinite loops
2777                    if !is_closing {
2778                        let closing_tag = format!("</{tag_name}>");
2779                        // style and script tags can contain blank lines (CSS/JS formatting)
2780                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2781                        let mut j = i + 1;
2782                        let mut found_closing_tag = false;
2783                        while j < lines.len() && j < i + 100 {
2784                            // Limit search to 100 lines
2785                            // Stop at blank lines (except for style/script tags)
2786                            if !allow_blank_lines && lines[j].is_blank {
2787                                break;
2788                            }
2789
2790                            lines[j].in_html_block = true;
2791
2792                            // Check if this line contains the closing tag
2793                            if lines[j].content(content).contains(&closing_tag) {
2794                                found_closing_tag = true;
2795                            }
2796
2797                            // After finding closing tag, continue marking lines as
2798                            // in_html_block until blank line (per CommonMark spec)
2799                            if found_closing_tag {
2800                                j += 1;
2801                                // Continue marking subsequent lines until blank
2802                                while j < lines.len() && j < i + 100 {
2803                                    if lines[j].is_blank {
2804                                        break;
2805                                    }
2806                                    lines[j].in_html_block = true;
2807                                    j += 1;
2808                                }
2809                                break;
2810                            }
2811                            j += 1;
2812                        }
2813                    }
2814                }
2815            }
2816
2817            i += 1;
2818        }
2819    }
2820
2821    /// Detect ESM import/export blocks in MDX files
2822    /// ESM blocks consist of contiguous import/export statements at the top of the file
2823    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2824        // Only process MDX files
2825        if !flavor.supports_esm_blocks() {
2826            return;
2827        }
2828
2829        let mut in_multiline_comment = false;
2830
2831        for line in lines.iter_mut() {
2832            // Skip blank lines and HTML comments
2833            if line.is_blank || line.in_html_comment {
2834                continue;
2835            }
2836
2837            let trimmed = line.content(content).trim_start();
2838
2839            // Handle continuation of multi-line JS comments
2840            if in_multiline_comment {
2841                if trimmed.contains("*/") {
2842                    in_multiline_comment = false;
2843                }
2844                continue;
2845            }
2846
2847            // Skip single-line JS comments (// and ///)
2848            if trimmed.starts_with("//") {
2849                continue;
2850            }
2851
2852            // Handle start of multi-line JS comment
2853            if trimmed.starts_with("/*") {
2854                if !trimmed.contains("*/") {
2855                    in_multiline_comment = true;
2856                }
2857                continue;
2858            }
2859
2860            // Check if line starts with import or export
2861            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2862                line.in_esm_block = true;
2863            } else {
2864                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2865                break;
2866            }
2867        }
2868    }
2869
2870    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2871    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2872        let mut code_spans = Vec::new();
2873
2874        // Quick check - if no backticks, no code spans
2875        if !content.contains('`') {
2876            return code_spans;
2877        }
2878
2879        // Use pulldown-cmark's streaming parser with byte offsets
2880        let parser = Parser::new(content).into_offset_iter();
2881
2882        for (event, range) in parser {
2883            if let Event::Code(_) = event {
2884                let start_pos = range.start;
2885                let end_pos = range.end;
2886
2887                // The range includes the backticks, extract the actual content
2888                let full_span = &content[start_pos..end_pos];
2889                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2890
2891                // Extract content between backticks, preserving spaces
2892                let content_start = start_pos + backtick_count;
2893                let content_end = end_pos - backtick_count;
2894                let span_content = if content_start < content_end {
2895                    content[content_start..content_end].to_string()
2896                } else {
2897                    String::new()
2898                };
2899
2900                // Use binary search to find line number - O(log n) instead of O(n)
2901                // Find the rightmost line whose byte_offset <= start_pos
2902                let line_idx = lines
2903                    .partition_point(|line| line.byte_offset <= start_pos)
2904                    .saturating_sub(1);
2905                let line_num = line_idx + 1;
2906                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2907
2908                // Find end column using binary search
2909                let end_line_idx = lines
2910                    .partition_point(|line| line.byte_offset <= end_pos)
2911                    .saturating_sub(1);
2912                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2913
2914                // Convert byte offsets to character positions for correct Unicode handling
2915                // This ensures consistency with warning.column which uses character positions
2916                let line_content = lines[line_idx].content(content);
2917                let col_start = if byte_col_start <= line_content.len() {
2918                    line_content[..byte_col_start].chars().count()
2919                } else {
2920                    line_content.chars().count()
2921                };
2922
2923                let end_line_content = lines[end_line_idx].content(content);
2924                let col_end = if byte_col_end <= end_line_content.len() {
2925                    end_line_content[..byte_col_end].chars().count()
2926                } else {
2927                    end_line_content.chars().count()
2928                };
2929
2930                code_spans.push(CodeSpan {
2931                    line: line_num,
2932                    end_line: end_line_idx + 1,
2933                    start_col: col_start,
2934                    end_col: col_end,
2935                    byte_offset: start_pos,
2936                    byte_end: end_pos,
2937                    backtick_count,
2938                    content: span_content,
2939                });
2940            }
2941        }
2942
2943        // Sort by position to ensure consistent ordering
2944        code_spans.sort_by_key(|span| span.byte_offset);
2945
2946        code_spans
2947    }
2948
2949    /// Parse all math spans (inline $...$ and display $$...$$) using pulldown-cmark
2950    fn parse_math_spans(content: &str, lines: &[LineInfo]) -> Vec<MathSpan> {
2951        let mut math_spans = Vec::new();
2952
2953        // Quick check - if no $ signs, no math spans
2954        if !content.contains('$') {
2955            return math_spans;
2956        }
2957
2958        // Use pulldown-cmark with ENABLE_MATH option
2959        let mut options = Options::empty();
2960        options.insert(Options::ENABLE_MATH);
2961        let parser = Parser::new_ext(content, options).into_offset_iter();
2962
2963        for (event, range) in parser {
2964            let (is_display, math_content) = match &event {
2965                Event::InlineMath(text) => (false, text.as_ref()),
2966                Event::DisplayMath(text) => (true, text.as_ref()),
2967                _ => continue,
2968            };
2969
2970            let start_pos = range.start;
2971            let end_pos = range.end;
2972
2973            // Use binary search to find line number - O(log n) instead of O(n)
2974            let line_idx = lines
2975                .partition_point(|line| line.byte_offset <= start_pos)
2976                .saturating_sub(1);
2977            let line_num = line_idx + 1;
2978            let byte_col_start = start_pos - lines[line_idx].byte_offset;
2979
2980            // Find end column using binary search
2981            let end_line_idx = lines
2982                .partition_point(|line| line.byte_offset <= end_pos)
2983                .saturating_sub(1);
2984            let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2985
2986            // Convert byte offsets to character positions for correct Unicode handling
2987            let line_content = lines[line_idx].content(content);
2988            let col_start = if byte_col_start <= line_content.len() {
2989                line_content[..byte_col_start].chars().count()
2990            } else {
2991                line_content.chars().count()
2992            };
2993
2994            let end_line_content = lines[end_line_idx].content(content);
2995            let col_end = if byte_col_end <= end_line_content.len() {
2996                end_line_content[..byte_col_end].chars().count()
2997            } else {
2998                end_line_content.chars().count()
2999            };
3000
3001            math_spans.push(MathSpan {
3002                line: line_num,
3003                end_line: end_line_idx + 1,
3004                start_col: col_start,
3005                end_col: col_end,
3006                byte_offset: start_pos,
3007                byte_end: end_pos,
3008                is_display,
3009                content: math_content.to_string(),
3010            });
3011        }
3012
3013        // Sort by position to ensure consistent ordering
3014        math_spans.sort_by_key(|span| span.byte_offset);
3015
3016        math_spans
3017    }
3018
3019    /// Parse all list blocks in the content (legacy line-by-line approach)
3020    ///
3021    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
3022    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
3023    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
3024    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
3025    ///   treated as list continuation (based on the list marker width)
3026    ///
3027    /// When a new list item is encountered, we check if list-breaking content was seen
3028    /// since the last item. If so, we start a new list block.
3029    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
3030        // Minimum indentation for unordered list continuation per CommonMark spec
3031        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
3032
3033        /// Initialize or reset the forward-scanning tracking state.
3034        /// This helper eliminates code duplication across three initialization sites.
3035        #[inline]
3036        fn reset_tracking_state(
3037            list_item: &ListItemInfo,
3038            has_list_breaking_content: &mut bool,
3039            min_continuation: &mut usize,
3040        ) {
3041            *has_list_breaking_content = false;
3042            let marker_width = if list_item.is_ordered {
3043                list_item.marker.len() + 1 // Ordered markers need space after period/paren
3044            } else {
3045                list_item.marker.len()
3046            };
3047            *min_continuation = if list_item.is_ordered {
3048                marker_width
3049            } else {
3050                UNORDERED_LIST_MIN_CONTINUATION_INDENT
3051            };
3052        }
3053
3054        // Pre-size based on lines that could be list items
3055        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
3056        let mut current_block: Option<ListBlock> = None;
3057        let mut last_list_item_line = 0;
3058        let mut current_indent_level = 0;
3059        let mut last_marker_width = 0;
3060
3061        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
3062        let mut has_list_breaking_content_since_last_item = false;
3063        let mut min_continuation_for_tracking = 0;
3064
3065        for (line_idx, line_info) in lines.iter().enumerate() {
3066            let line_num = line_idx + 1;
3067
3068            // Enhanced code block handling using Design #3's context analysis
3069            if line_info.in_code_block {
3070                if let Some(ref mut block) = current_block {
3071                    // Calculate minimum indentation for list continuation
3072                    let min_continuation_indent =
3073                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
3074
3075                    // Analyze code block context using the three-tier classification
3076                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
3077
3078                    match context {
3079                        CodeBlockContext::Indented => {
3080                            // Code block is properly indented - continues the list
3081                            block.end_line = line_num;
3082                            continue;
3083                        }
3084                        CodeBlockContext::Standalone => {
3085                            // Code block separates lists - end current block
3086                            let completed_block = current_block.take().unwrap();
3087                            list_blocks.push(completed_block);
3088                            continue;
3089                        }
3090                        CodeBlockContext::Adjacent => {
3091                            // Edge case - use conservative behavior (continue list)
3092                            block.end_line = line_num;
3093                            continue;
3094                        }
3095                    }
3096                } else {
3097                    // No current list block - skip code block lines
3098                    continue;
3099                }
3100            }
3101
3102            // Extract blockquote prefix if any
3103            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
3104                caps.get(0).unwrap().as_str().to_string()
3105            } else {
3106                String::new()
3107            };
3108
3109            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
3110            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
3111            if let Some(ref block) = current_block
3112                && line_info.list_item.is_none()
3113                && !line_info.is_blank
3114                && !line_info.in_code_span_continuation
3115            {
3116                let line_content = line_info.content(content).trim();
3117
3118                // Check for structural separators that break lists
3119                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
3120                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
3121                // as they indicate improper indentation rather than lazy continuation.
3122                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
3123
3124                // Check if blockquote context changes (different prefix than current block)
3125                // Lines within the SAME blockquote context don't break lists
3126                let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
3127
3128                let breaks_list = line_info.heading.is_some()
3129                    || line_content.starts_with("---")
3130                    || line_content.starts_with("***")
3131                    || line_content.starts_with("___")
3132                    || crate::utils::skip_context::is_table_line(line_content)
3133                    || blockquote_prefix_changes
3134                    || (line_info.indent > 0
3135                        && line_info.indent < min_continuation_for_tracking
3136                        && !is_lazy_continuation);
3137
3138                if breaks_list {
3139                    has_list_breaking_content_since_last_item = true;
3140                }
3141            }
3142
3143            // If this line is a code span continuation within an active list block,
3144            // extend the block's end_line to include this line (maintains list continuity)
3145            if line_info.in_code_span_continuation
3146                && line_info.list_item.is_none()
3147                && let Some(ref mut block) = current_block
3148            {
3149                block.end_line = line_num;
3150            }
3151
3152            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
3153            // properly indented lines within the list). This ensures the workaround at line 2448
3154            // works correctly when there are multiple continuation lines before a nested list item.
3155            // Also include lazy continuation lines (indent=0) per CommonMark spec.
3156            // For blockquote lines, compute effective indent after stripping the prefix
3157            let effective_continuation_indent = if let Some(ref block) = current_block {
3158                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3159                let line_content = line_info.content(content);
3160                let line_bq_level = line_content
3161                    .chars()
3162                    .take_while(|c| *c == '>' || c.is_whitespace())
3163                    .filter(|&c| c == '>')
3164                    .count();
3165                if line_bq_level > 0 && line_bq_level == block_bq_level {
3166                    // Compute indent after blockquote markers
3167                    let mut pos = 0;
3168                    let mut found_markers = 0;
3169                    for c in line_content.chars() {
3170                        pos += c.len_utf8();
3171                        if c == '>' {
3172                            found_markers += 1;
3173                            if found_markers == line_bq_level {
3174                                if line_content.get(pos..pos + 1) == Some(" ") {
3175                                    pos += 1;
3176                                }
3177                                break;
3178                            }
3179                        }
3180                    }
3181                    let after_bq = &line_content[pos..];
3182                    after_bq.len() - after_bq.trim_start().len()
3183                } else {
3184                    line_info.indent
3185                }
3186            } else {
3187                line_info.indent
3188            };
3189            let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3190                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3191                if block_bq_level > 0 {
3192                    if block.is_ordered { last_marker_width } else { 2 }
3193                } else {
3194                    min_continuation_for_tracking
3195                }
3196            } else {
3197                min_continuation_for_tracking
3198            };
3199            let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3200                || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
3201
3202            if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3203                eprintln!(
3204                    "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3205                    line_num,
3206                    effective_continuation_indent,
3207                    adjusted_min_continuation_for_tracking,
3208                    is_valid_continuation,
3209                    line_info.in_code_span_continuation,
3210                    line_info.in_code_block,
3211                    current_block.is_some()
3212                );
3213            }
3214
3215            if !line_info.in_code_span_continuation
3216                && line_info.list_item.is_none()
3217                && !line_info.is_blank
3218                && !line_info.in_code_block
3219                && is_valid_continuation
3220                && let Some(ref mut block) = current_block
3221            {
3222                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3223                    eprintln!(
3224                        "[DEBUG] Line {}: extending block.end_line from {} to {}",
3225                        line_num, block.end_line, line_num
3226                    );
3227                }
3228                block.end_line = line_num;
3229            }
3230
3231            // Check if this line is a list item
3232            if let Some(list_item) = &line_info.list_item {
3233                // Calculate nesting level based on indentation
3234                let item_indent = list_item.marker_column;
3235                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
3236
3237                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3238                    eprintln!(
3239                        "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3240                        line_num, list_item.marker, item_indent
3241                    );
3242                }
3243
3244                if let Some(ref mut block) = current_block {
3245                    // Check if this continues the current block
3246                    // For nested lists, we need to check if this is a nested item (higher nesting level)
3247                    // or a continuation at the same or lower level
3248                    let is_nested = nesting > block.nesting_level;
3249                    let same_type =
3250                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3251                    let same_context = block.blockquote_prefix == blockquote_prefix;
3252                    // Allow one blank line after last item, or lines immediately after block content
3253                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3254
3255                    // For unordered lists, also check marker consistency
3256                    let marker_compatible =
3257                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3258
3259                    // O(1) check: Use the tracked variable instead of O(n) nested loop
3260                    // This eliminates the quadratic bottleneck from issue #148
3261                    let has_non_list_content = has_list_breaking_content_since_last_item;
3262
3263                    // A list continues if:
3264                    // 1. It's a nested item (indented more than the parent), OR
3265                    // 2. It's the same type at the same level with reasonable distance
3266                    let mut continues_list = if is_nested {
3267                        // Nested items always continue the list if they're in the same context
3268                        same_context && reasonable_distance && !has_non_list_content
3269                    } else {
3270                        // Same-level items need to match type and markers
3271                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3272                    };
3273
3274                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3275                        eprintln!(
3276                            "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3277                            line_num,
3278                            continues_list,
3279                            is_nested,
3280                            same_type,
3281                            same_context,
3282                            reasonable_distance,
3283                            marker_compatible,
3284                            has_non_list_content,
3285                            last_list_item_line,
3286                            block.end_line
3287                        );
3288                    }
3289
3290                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
3291                    // This handles edge cases where content patterns might otherwise split lists incorrectly
3292                    // Apply for: nested items (different types OK), OR same-level same-type items
3293                    if !continues_list
3294                        && (is_nested || same_type)
3295                        && reasonable_distance
3296                        && line_num > 0
3297                        && block.end_line == line_num - 1
3298                    {
3299                        // Check if the previous line was a list item or a continuation of a list item
3300                        // (including lazy continuation lines)
3301                        if block.item_lines.contains(&(line_num - 1)) {
3302                            // They're consecutive list items - force them to be in the same list
3303                            continues_list = true;
3304                        } else {
3305                            // Previous line is a continuation line within this block
3306                            // (e.g., lazy continuation with indent=0)
3307                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
3308                            continues_list = true;
3309                        }
3310                    }
3311
3312                    if continues_list {
3313                        // Extend current block
3314                        block.end_line = line_num;
3315                        block.item_lines.push(line_num);
3316
3317                        // Update max marker width
3318                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3319                            list_item.marker.len() + 1
3320                        } else {
3321                            list_item.marker.len()
3322                        });
3323
3324                        // Update marker consistency for unordered lists
3325                        if !block.is_ordered
3326                            && block.marker.is_some()
3327                            && block.marker.as_ref() != Some(&list_item.marker)
3328                        {
3329                            // Mixed markers, clear the marker field
3330                            block.marker = None;
3331                        }
3332
3333                        // Reset tracked state for issue #148 optimization
3334                        reset_tracking_state(
3335                            list_item,
3336                            &mut has_list_breaking_content_since_last_item,
3337                            &mut min_continuation_for_tracking,
3338                        );
3339                    } else {
3340                        // End current block and start a new one
3341                        // When a different list type starts AT THE SAME LEVEL (not nested),
3342                        // trim back lazy continuation lines (they become part of the gap, not the list)
3343                        // For nested items, different types are fine - they're sub-lists
3344                        if !same_type
3345                            && !is_nested
3346                            && let Some(&last_item) = block.item_lines.last()
3347                        {
3348                            block.end_line = last_item;
3349                        }
3350
3351                        list_blocks.push(block.clone());
3352
3353                        *block = ListBlock {
3354                            start_line: line_num,
3355                            end_line: line_num,
3356                            is_ordered: list_item.is_ordered,
3357                            marker: if list_item.is_ordered {
3358                                None
3359                            } else {
3360                                Some(list_item.marker.clone())
3361                            },
3362                            blockquote_prefix: blockquote_prefix.clone(),
3363                            item_lines: vec![line_num],
3364                            nesting_level: nesting,
3365                            max_marker_width: if list_item.is_ordered {
3366                                list_item.marker.len() + 1
3367                            } else {
3368                                list_item.marker.len()
3369                            },
3370                        };
3371
3372                        // Initialize tracked state for new block (issue #148 optimization)
3373                        reset_tracking_state(
3374                            list_item,
3375                            &mut has_list_breaking_content_since_last_item,
3376                            &mut min_continuation_for_tracking,
3377                        );
3378                    }
3379                } else {
3380                    // Start a new block
3381                    current_block = Some(ListBlock {
3382                        start_line: line_num,
3383                        end_line: line_num,
3384                        is_ordered: list_item.is_ordered,
3385                        marker: if list_item.is_ordered {
3386                            None
3387                        } else {
3388                            Some(list_item.marker.clone())
3389                        },
3390                        blockquote_prefix,
3391                        item_lines: vec![line_num],
3392                        nesting_level: nesting,
3393                        max_marker_width: list_item.marker.len(),
3394                    });
3395
3396                    // Initialize tracked state for new block (issue #148 optimization)
3397                    reset_tracking_state(
3398                        list_item,
3399                        &mut has_list_breaking_content_since_last_item,
3400                        &mut min_continuation_for_tracking,
3401                    );
3402                }
3403
3404                last_list_item_line = line_num;
3405                current_indent_level = item_indent;
3406                last_marker_width = if list_item.is_ordered {
3407                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
3408                } else {
3409                    list_item.marker.len()
3410                };
3411            } else if let Some(ref mut block) = current_block {
3412                // Not a list item - check if it continues the current block
3413                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3414                    eprintln!(
3415                        "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3416                        line_num, line_info.is_blank
3417                    );
3418                }
3419
3420                // For MD032 compatibility, we use a simple approach:
3421                // - Indented lines continue the list
3422                // - Blank lines followed by indented content continue the list
3423                // - Everything else ends the list
3424
3425                // Check if the last line in the list block ended with a backslash (hard line break)
3426                // This handles cases where list items use backslash for hard line breaks
3427                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3428                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3429                } else {
3430                    false
3431                };
3432
3433                // Calculate minimum indentation for list continuation
3434                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
3435                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
3436                let min_continuation_indent = if block.is_ordered {
3437                    current_indent_level + last_marker_width
3438                } else {
3439                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
3440                };
3441
3442                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3443                    // Indented line or backslash continuation continues the list
3444                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3445                        eprintln!(
3446                            "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3447                            line_num, line_info.indent, min_continuation_indent
3448                        );
3449                    }
3450                    block.end_line = line_num;
3451                } else if line_info.is_blank {
3452                    // Blank line - check if it's internal to the list or ending it
3453                    // We only include blank lines that are followed by more list content
3454                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3455                        eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3456                    }
3457                    let mut check_idx = line_idx + 1;
3458                    let mut found_continuation = false;
3459
3460                    // Skip additional blank lines
3461                    while check_idx < lines.len() && lines[check_idx].is_blank {
3462                        check_idx += 1;
3463                    }
3464
3465                    if check_idx < lines.len() {
3466                        let next_line = &lines[check_idx];
3467                        // For blockquote lines, compute indent AFTER stripping the blockquote prefix
3468                        let next_content = next_line.content(content);
3469                        // Use blockquote level (count of >) to compare, not the full prefix
3470                        // This avoids issues where the regex captures extra whitespace
3471                        let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3472                        let next_bq_level_for_indent = next_content
3473                            .chars()
3474                            .take_while(|c| *c == '>' || c.is_whitespace())
3475                            .filter(|&c| c == '>')
3476                            .count();
3477                        let effective_indent =
3478                            if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3479                                // For lines in the same blockquote context, compute indent after the blockquote marker(s)
3480                                // Find position after ">" and one space
3481                                let mut pos = 0;
3482                                let mut found_markers = 0;
3483                                for c in next_content.chars() {
3484                                    pos += c.len_utf8();
3485                                    if c == '>' {
3486                                        found_markers += 1;
3487                                        if found_markers == next_bq_level_for_indent {
3488                                            // Skip optional space after last >
3489                                            if next_content.get(pos..pos + 1) == Some(" ") {
3490                                                pos += 1;
3491                                            }
3492                                            break;
3493                                        }
3494                                    }
3495                                }
3496                                let after_blockquote_marker = &next_content[pos..];
3497                                after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3498                            } else {
3499                                next_line.indent
3500                            };
3501                        // Also adjust min_continuation_indent for blockquote lists
3502                        // The marker_column includes blockquote prefix, so subtract it
3503                        let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3504                            // For blockquote lists, the continuation is relative to blockquote content
3505                            // current_indent_level includes blockquote prefix (2 for "> "), so use just 2 for unordered
3506                            if block.is_ordered { last_marker_width } else { 2 }
3507                        } else {
3508                            min_continuation_indent
3509                        };
3510                        // Check if followed by indented content (list continuation)
3511                        if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3512                            eprintln!(
3513                                "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3514                                line_num,
3515                                check_idx + 1,
3516                                effective_indent,
3517                                adjusted_min_continuation,
3518                                next_line.list_item.is_some(),
3519                                next_line.in_code_block
3520                            );
3521                        }
3522                        if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3523                            found_continuation = true;
3524                        }
3525                        // Check if followed by another list item at the same level
3526                        else if !next_line.in_code_block
3527                            && next_line.list_item.is_some()
3528                            && let Some(item) = &next_line.list_item
3529                        {
3530                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3531                                .find(next_line.content(content))
3532                                .map_or(String::new(), |m| m.as_str().to_string());
3533                            if item.marker_column == current_indent_level
3534                                && item.is_ordered == block.is_ordered
3535                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3536                            {
3537                                // Check if there was meaningful content between the list items (unused now)
3538                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
3539                                // Pre-compute block's blockquote level for use in closures
3540                                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3541                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3542                                    if let Some(between_line) = lines.get(idx) {
3543                                        let between_content = between_line.content(content);
3544                                        let trimmed = between_content.trim();
3545                                        // Skip empty lines
3546                                        if trimmed.is_empty() {
3547                                            return false;
3548                                        }
3549                                        // Check for meaningful content
3550                                        let line_indent = between_content.len() - between_content.trim_start().len();
3551
3552                                        // Check if blockquote level changed (not just if line starts with ">")
3553                                        let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3554                                            .find(between_content)
3555                                            .map_or(String::new(), |m| m.as_str().to_string());
3556                                        let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3557                                        let blockquote_level_changed =
3558                                            trimmed.starts_with(">") && between_bq_level != block_bq_level;
3559
3560                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
3561                                        if trimmed.starts_with("```")
3562                                            || trimmed.starts_with("~~~")
3563                                            || trimmed.starts_with("---")
3564                                            || trimmed.starts_with("***")
3565                                            || trimmed.starts_with("___")
3566                                            || blockquote_level_changed
3567                                            || crate::utils::skip_context::is_table_line(trimmed)
3568                                            || between_line.heading.is_some()
3569                                        {
3570                                            return true; // These are structural separators - meaningful content that breaks lists
3571                                        }
3572
3573                                        // Only properly indented content continues the list
3574                                        line_indent >= min_continuation_indent
3575                                    } else {
3576                                        false
3577                                    }
3578                                });
3579
3580                                if block.is_ordered {
3581                                    // For ordered lists: don't continue if there are structural separators
3582                                    // Check if there are structural separators between the list items
3583                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3584                                        if let Some(between_line) = lines.get(idx) {
3585                                            let between_content = between_line.content(content);
3586                                            let trimmed = between_content.trim();
3587                                            if trimmed.is_empty() {
3588                                                return false;
3589                                            }
3590                                            // Check if blockquote level changed (not just if line starts with ">")
3591                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3592                                                .find(between_content)
3593                                                .map_or(String::new(), |m| m.as_str().to_string());
3594                                            let between_bq_level =
3595                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3596                                            let blockquote_level_changed =
3597                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3598                                            // Check for structural separators that break lists
3599                                            trimmed.starts_with("```")
3600                                                || trimmed.starts_with("~~~")
3601                                                || trimmed.starts_with("---")
3602                                                || trimmed.starts_with("***")
3603                                                || trimmed.starts_with("___")
3604                                                || blockquote_level_changed
3605                                                || crate::utils::skip_context::is_table_line(trimmed)
3606                                                || between_line.heading.is_some()
3607                                        } else {
3608                                            false
3609                                        }
3610                                    });
3611                                    found_continuation = !has_structural_separators;
3612                                } else {
3613                                    // For unordered lists: also check for structural separators
3614                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3615                                        if let Some(between_line) = lines.get(idx) {
3616                                            let between_content = between_line.content(content);
3617                                            let trimmed = between_content.trim();
3618                                            if trimmed.is_empty() {
3619                                                return false;
3620                                            }
3621                                            // Check if blockquote level changed (not just if line starts with ">")
3622                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3623                                                .find(between_content)
3624                                                .map_or(String::new(), |m| m.as_str().to_string());
3625                                            let between_bq_level =
3626                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3627                                            let blockquote_level_changed =
3628                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3629                                            // Check for structural separators that break lists
3630                                            trimmed.starts_with("```")
3631                                                || trimmed.starts_with("~~~")
3632                                                || trimmed.starts_with("---")
3633                                                || trimmed.starts_with("***")
3634                                                || trimmed.starts_with("___")
3635                                                || blockquote_level_changed
3636                                                || crate::utils::skip_context::is_table_line(trimmed)
3637                                                || between_line.heading.is_some()
3638                                        } else {
3639                                            false
3640                                        }
3641                                    });
3642                                    found_continuation = !has_structural_separators;
3643                                }
3644                            }
3645                        }
3646                    }
3647
3648                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3649                        eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3650                    }
3651                    if found_continuation {
3652                        // Include the blank line in the block
3653                        block.end_line = line_num;
3654                    } else {
3655                        // Blank line ends the list - don't include it
3656                        list_blocks.push(block.clone());
3657                        current_block = None;
3658                    }
3659                } else {
3660                    // Check for lazy continuation - non-indented line immediately after a list item
3661                    // But only if the line has sufficient indentation for the list type
3662                    let min_required_indent = if block.is_ordered {
3663                        current_indent_level + last_marker_width
3664                    } else {
3665                        current_indent_level + 2
3666                    };
3667
3668                    // For lazy continuation to apply, the line must either:
3669                    // 1. Have no indentation (true lazy continuation)
3670                    // 2. Have sufficient indentation for the list type
3671                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3672                    let line_content = line_info.content(content).trim();
3673
3674                    // Check for table-like patterns
3675                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3676
3677                    // Check if blockquote level changed (not just if line starts with ">")
3678                    // Lines within the same blockquote level are NOT structural separators
3679                    let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3680                    let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3681                    let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3682
3683                    let is_structural_separator = line_info.heading.is_some()
3684                        || line_content.starts_with("```")
3685                        || line_content.starts_with("~~~")
3686                        || line_content.starts_with("---")
3687                        || line_content.starts_with("***")
3688                        || line_content.starts_with("___")
3689                        || blockquote_level_changed
3690                        || looks_like_table;
3691
3692                    // Allow lazy continuation if we're still within the same list block
3693                    // (not just immediately after a list item)
3694                    // Also treat code span continuations as valid continuations regardless of indent
3695                    let is_lazy_continuation = !is_structural_separator
3696                        && !line_info.is_blank
3697                        && (line_info.indent == 0
3698                            || line_info.indent >= min_required_indent
3699                            || line_info.in_code_span_continuation);
3700
3701                    if is_lazy_continuation {
3702                        // Per CommonMark, lazy continuation continues until a blank line
3703                        // or structural element, regardless of uppercase at line start
3704                        block.end_line = line_num;
3705                    } else {
3706                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3707                        list_blocks.push(block.clone());
3708                        current_block = None;
3709                    }
3710                }
3711            }
3712        }
3713
3714        // Don't forget the last block
3715        if let Some(block) = current_block {
3716            list_blocks.push(block);
3717        }
3718
3719        // Merge adjacent blocks that should be one
3720        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3721
3722        list_blocks
3723    }
3724
3725    /// Compute character frequency for fast content analysis
3726    fn compute_char_frequency(content: &str) -> CharFrequency {
3727        let mut frequency = CharFrequency::default();
3728
3729        for ch in content.chars() {
3730            match ch {
3731                '#' => frequency.hash_count += 1,
3732                '*' => frequency.asterisk_count += 1,
3733                '_' => frequency.underscore_count += 1,
3734                '-' => frequency.hyphen_count += 1,
3735                '+' => frequency.plus_count += 1,
3736                '>' => frequency.gt_count += 1,
3737                '|' => frequency.pipe_count += 1,
3738                '[' => frequency.bracket_count += 1,
3739                '`' => frequency.backtick_count += 1,
3740                '<' => frequency.lt_count += 1,
3741                '!' => frequency.exclamation_count += 1,
3742                '\n' => frequency.newline_count += 1,
3743                _ => {}
3744            }
3745        }
3746
3747        frequency
3748    }
3749
3750    /// Parse HTML tags in the content
3751    fn parse_html_tags(
3752        content: &str,
3753        lines: &[LineInfo],
3754        code_blocks: &[(usize, usize)],
3755        flavor: MarkdownFlavor,
3756    ) -> Vec<HtmlTag> {
3757        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3758            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3759
3760        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3761
3762        for cap in HTML_TAG_REGEX.captures_iter(content) {
3763            let full_match = cap.get(0).unwrap();
3764            let match_start = full_match.start();
3765            let match_end = full_match.end();
3766
3767            // Skip if in code block
3768            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3769                continue;
3770            }
3771
3772            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3773            let tag_name_original = cap.get(2).unwrap().as_str();
3774            let tag_name = tag_name_original.to_lowercase();
3775            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3776
3777            // Skip JSX components in MDX files (tags starting with uppercase letter)
3778            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3779            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3780                continue;
3781            }
3782
3783            // Find which line this tag is on
3784            let mut line_num = 1;
3785            let mut col_start = match_start;
3786            let mut col_end = match_end;
3787            for (idx, line_info) in lines.iter().enumerate() {
3788                if match_start >= line_info.byte_offset {
3789                    line_num = idx + 1;
3790                    col_start = match_start - line_info.byte_offset;
3791                    col_end = match_end - line_info.byte_offset;
3792                } else {
3793                    break;
3794                }
3795            }
3796
3797            html_tags.push(HtmlTag {
3798                line: line_num,
3799                start_col: col_start,
3800                end_col: col_end,
3801                byte_offset: match_start,
3802                byte_end: match_end,
3803                tag_name,
3804                is_closing,
3805                is_self_closing,
3806                raw_content: full_match.as_str().to_string(),
3807            });
3808        }
3809
3810        html_tags
3811    }
3812
3813    /// Parse table rows in the content
3814    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3815        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3816
3817        for (line_idx, line_info) in lines.iter().enumerate() {
3818            // Skip lines in code blocks or blank lines
3819            if line_info.in_code_block || line_info.is_blank {
3820                continue;
3821            }
3822
3823            let line = line_info.content(content);
3824            let line_num = line_idx + 1;
3825
3826            // Check if this line contains pipes (potential table row)
3827            if !line.contains('|') {
3828                continue;
3829            }
3830
3831            // Count columns by splitting on pipes
3832            let parts: Vec<&str> = line.split('|').collect();
3833            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3834
3835            // Check if this is a separator row
3836            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3837            let mut column_alignments = Vec::new();
3838
3839            if is_separator {
3840                for part in &parts[1..parts.len() - 1] {
3841                    // Skip first and last empty parts
3842                    let trimmed = part.trim();
3843                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3844                        "center".to_string()
3845                    } else if trimmed.ends_with(':') {
3846                        "right".to_string()
3847                    } else if trimmed.starts_with(':') {
3848                        "left".to_string()
3849                    } else {
3850                        "none".to_string()
3851                    };
3852                    column_alignments.push(alignment);
3853                }
3854            }
3855
3856            table_rows.push(TableRow {
3857                line: line_num,
3858                is_separator,
3859                column_count,
3860                column_alignments,
3861            });
3862        }
3863
3864        table_rows
3865    }
3866
3867    /// Parse bare URLs and emails in the content
3868    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3869        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3870
3871        // Check for bare URLs (not in angle brackets or markdown links)
3872        for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3873            let full_match = cap.get(0).unwrap();
3874            let match_start = full_match.start();
3875            let match_end = full_match.end();
3876
3877            // Skip if in code block
3878            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3879                continue;
3880            }
3881
3882            // Skip if already in angle brackets or markdown links
3883            let preceding_char = if match_start > 0 {
3884                content.chars().nth(match_start - 1)
3885            } else {
3886                None
3887            };
3888            let following_char = content.chars().nth(match_end);
3889
3890            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3891                continue;
3892            }
3893            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3894                continue;
3895            }
3896
3897            let url = full_match.as_str();
3898            let url_type = if url.starts_with("https://") {
3899                "https"
3900            } else if url.starts_with("http://") {
3901                "http"
3902            } else if url.starts_with("ftp://") {
3903                "ftp"
3904            } else {
3905                "other"
3906            };
3907
3908            // Find which line this URL is on
3909            let mut line_num = 1;
3910            let mut col_start = match_start;
3911            let mut col_end = match_end;
3912            for (idx, line_info) in lines.iter().enumerate() {
3913                if match_start >= line_info.byte_offset {
3914                    line_num = idx + 1;
3915                    col_start = match_start - line_info.byte_offset;
3916                    col_end = match_end - line_info.byte_offset;
3917                } else {
3918                    break;
3919                }
3920            }
3921
3922            bare_urls.push(BareUrl {
3923                line: line_num,
3924                start_col: col_start,
3925                end_col: col_end,
3926                byte_offset: match_start,
3927                byte_end: match_end,
3928                url: url.to_string(),
3929                url_type: url_type.to_string(),
3930            });
3931        }
3932
3933        // Check for bare email addresses
3934        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3935            let full_match = cap.get(0).unwrap();
3936            let match_start = full_match.start();
3937            let match_end = full_match.end();
3938
3939            // Skip if in code block
3940            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3941                continue;
3942            }
3943
3944            // Skip if already in angle brackets or markdown links
3945            let preceding_char = if match_start > 0 {
3946                content.chars().nth(match_start - 1)
3947            } else {
3948                None
3949            };
3950            let following_char = content.chars().nth(match_end);
3951
3952            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3953                continue;
3954            }
3955            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3956                continue;
3957            }
3958
3959            let email = full_match.as_str();
3960
3961            // Find which line this email is on
3962            let mut line_num = 1;
3963            let mut col_start = match_start;
3964            let mut col_end = match_end;
3965            for (idx, line_info) in lines.iter().enumerate() {
3966                if match_start >= line_info.byte_offset {
3967                    line_num = idx + 1;
3968                    col_start = match_start - line_info.byte_offset;
3969                    col_end = match_end - line_info.byte_offset;
3970                } else {
3971                    break;
3972                }
3973            }
3974
3975            bare_urls.push(BareUrl {
3976                line: line_num,
3977                start_col: col_start,
3978                end_col: col_end,
3979                byte_offset: match_start,
3980                byte_end: match_end,
3981                url: email.to_string(),
3982                url_type: "email".to_string(),
3983            });
3984        }
3985
3986        bare_urls
3987    }
3988
3989    /// Get an iterator over valid CommonMark headings
3990    ///
3991    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
3992    /// that should be flagged by MD018 but should not be processed by other heading rules.
3993    ///
3994    /// # Examples
3995    ///
3996    /// ```rust
3997    /// use rumdl_lib::lint_context::LintContext;
3998    /// use rumdl_lib::config::MarkdownFlavor;
3999    ///
4000    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
4001    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4002    ///
4003    /// for heading in ctx.valid_headings() {
4004    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
4005    /// }
4006    /// // Only prints valid headings, skips `#NoSpace`
4007    /// ```
4008    #[must_use]
4009    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
4010        ValidHeadingsIter::new(&self.lines)
4011    }
4012
4013    /// Check if the document contains any valid CommonMark headings
4014    ///
4015    /// Returns `true` if there is at least one heading with proper space after `#`.
4016    #[must_use]
4017    pub fn has_valid_headings(&self) -> bool {
4018        self.lines
4019            .iter()
4020            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
4021    }
4022}
4023
4024/// Merge adjacent list blocks that should be treated as one
4025fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
4026    if list_blocks.len() < 2 {
4027        return;
4028    }
4029
4030    let mut merger = ListBlockMerger::new(content, lines);
4031    *list_blocks = merger.merge(list_blocks);
4032}
4033
4034/// Helper struct to manage the complex logic of merging list blocks
4035struct ListBlockMerger<'a> {
4036    content: &'a str,
4037    lines: &'a [LineInfo],
4038}
4039
4040impl<'a> ListBlockMerger<'a> {
4041    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
4042        Self { content, lines }
4043    }
4044
4045    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
4046        let mut merged = Vec::with_capacity(list_blocks.len());
4047        let mut current = list_blocks[0].clone();
4048
4049        for next in list_blocks.iter().skip(1) {
4050            if self.should_merge_blocks(&current, next) {
4051                current = self.merge_two_blocks(current, next);
4052            } else {
4053                merged.push(current);
4054                current = next.clone();
4055            }
4056        }
4057
4058        merged.push(current);
4059        merged
4060    }
4061
4062    /// Determine if two adjacent list blocks should be merged
4063    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
4064        // Basic compatibility checks
4065        if !self.blocks_are_compatible(current, next) {
4066            return false;
4067        }
4068
4069        // Check spacing and content between blocks
4070        let spacing = self.analyze_spacing_between(current, next);
4071        match spacing {
4072            BlockSpacing::Consecutive => true,
4073            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
4074            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
4075                self.can_merge_with_content_between(current, next)
4076            }
4077        }
4078    }
4079
4080    /// Check if blocks have compatible structure for merging
4081    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
4082        current.is_ordered == next.is_ordered
4083            && current.blockquote_prefix == next.blockquote_prefix
4084            && current.nesting_level == next.nesting_level
4085    }
4086
4087    /// Analyze the spacing between two list blocks
4088    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
4089        let gap = next.start_line - current.end_line;
4090
4091        match gap {
4092            1 => BlockSpacing::Consecutive,
4093            2 => BlockSpacing::SingleBlank,
4094            _ if gap > 2 => {
4095                if self.has_only_blank_lines_between(current, next) {
4096                    BlockSpacing::MultipleBlanks
4097                } else {
4098                    BlockSpacing::ContentBetween
4099                }
4100            }
4101            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
4102        }
4103    }
4104
4105    /// Check if unordered lists can be merged with a single blank line between
4106    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4107        // Check if there are structural separators between the blocks
4108        // If has_meaningful_content_between returns true, it means there are structural separators
4109        if has_meaningful_content_between(self.content, current, next, self.lines) {
4110            return false; // Structural separators prevent merging
4111        }
4112
4113        // Only merge unordered lists with same marker across single blank
4114        !current.is_ordered && current.marker == next.marker
4115    }
4116
4117    /// Check if ordered lists can be merged when there's content between them
4118    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4119        // Do not merge lists if there are structural separators between them
4120        if has_meaningful_content_between(self.content, current, next, self.lines) {
4121            return false; // Structural separators prevent merging
4122        }
4123
4124        // Only consider merging ordered lists if there's no structural content between
4125        current.is_ordered && next.is_ordered
4126    }
4127
4128    /// Check if there are only blank lines between blocks
4129    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4130        for line_num in (current.end_line + 1)..next.start_line {
4131            if let Some(line_info) = self.lines.get(line_num - 1)
4132                && !line_info.content(self.content).trim().is_empty()
4133            {
4134                return false;
4135            }
4136        }
4137        true
4138    }
4139
4140    /// Merge two compatible list blocks into one
4141    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4142        current.end_line = next.end_line;
4143        current.item_lines.extend_from_slice(&next.item_lines);
4144
4145        // Update max marker width
4146        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4147
4148        // Handle marker consistency for unordered lists
4149        if !current.is_ordered && self.markers_differ(&current, next) {
4150            current.marker = None; // Mixed markers
4151        }
4152
4153        current
4154    }
4155
4156    /// Check if two blocks have different markers
4157    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4158        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4159    }
4160}
4161
/// Types of spacing between list blocks
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
4170
4171/// Check if there's meaningful content (not just blank lines) between two list blocks
4172fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4173    // Check lines between current.end_line and next.start_line
4174    for line_num in (current.end_line + 1)..next.start_line {
4175        if let Some(line_info) = lines.get(line_num - 1) {
4176            // Convert to 0-indexed
4177            let trimmed = line_info.content(content).trim();
4178
4179            // Skip empty lines
4180            if trimmed.is_empty() {
4181                continue;
4182            }
4183
4184            // Check for structural separators that should separate lists (CommonMark compliant)
4185
4186            // Headings separate lists
4187            if line_info.heading.is_some() {
4188                return true; // Has meaningful content - headings separate lists
4189            }
4190
4191            // Horizontal rules separate lists (---, ***, ___)
4192            if is_horizontal_rule(trimmed) {
4193                return true; // Has meaningful content - horizontal rules separate lists
4194            }
4195
4196            // Tables separate lists
4197            if crate::utils::skip_context::is_table_line(trimmed) {
4198                return true; // Has meaningful content - tables separate lists
4199            }
4200
4201            // Blockquotes separate lists
4202            if trimmed.starts_with('>') {
4203                return true; // Has meaningful content - blockquotes separate lists
4204            }
4205
4206            // Code block fences separate lists (unless properly indented as list content)
4207            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4208                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4209
4210                // Check if this code block is properly indented as list continuation
4211                let min_continuation_indent = if current.is_ordered {
4212                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
4213                } else {
4214                    current.nesting_level + 2
4215                };
4216
4217                if line_indent < min_continuation_indent {
4218                    // This is a standalone code block that separates lists
4219                    return true; // Has meaningful content - standalone code blocks separate lists
4220                }
4221            }
4222
4223            // Check if this line has proper indentation for list continuation
4224            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4225
4226            // Calculate minimum indentation needed to be list continuation
4227            let min_indent = if current.is_ordered {
4228                current.nesting_level + current.max_marker_width
4229            } else {
4230                current.nesting_level + 2
4231            };
4232
4233            // If the line is not indented enough to be list continuation, it's meaningful content
4234            if line_indent < min_indent {
4235                return true; // Has meaningful content - content not indented as list continuation
4236            }
4237
4238            // If we reach here, the line is properly indented as list continuation
4239            // Continue checking other lines
4240        }
4241    }
4242
4243    // Only blank lines or properly indented list continuation content between blocks
4244    false
4245}
4246
4247/// Check if a line is a horizontal rule (---, ***, ___) per CommonMark spec.
4248/// CommonMark rules for thematic breaks (horizontal rules):
4249/// - May have 0-3 spaces of leading indentation (but NOT tabs)
4250/// - Must have 3+ of the same character (-, *, or _)
4251/// - May have spaces between characters
4252/// - No other characters allowed
4253pub fn is_horizontal_rule_line(line: &str) -> bool {
4254    // CommonMark: HRs can have 0-3 spaces of leading indentation, not tabs
4255    let leading_spaces = line.len() - line.trim_start_matches(' ').len();
4256    if leading_spaces > 3 || line.starts_with('\t') {
4257        return false;
4258    }
4259
4260    is_horizontal_rule_content(line.trim())
4261}
4262
/// Check if trimmed content matches horizontal rule pattern.
/// Use `is_horizontal_rule_line` for full CommonMark compliance including indentation check.
///
/// The input must start with `-`, `*` or `_`, contain at least three of that
/// same character, and otherwise consist only of spaces and tabs.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // Fewer than three bytes can never hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character fixes which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }
    marker_count >= 3
}
4287
4288/// Backwards-compatible alias for `is_horizontal_rule_content`
4289pub fn is_horizontal_rule(trimmed: &str) -> bool {
4290    is_horizontal_rule_content(trimmed)
4291}
4292
/// Unit tests for `LintContext` construction and its lookup helpers.
4294#[cfg(test)]
4295mod tests {
4296    use super::*;
4297
4298    #[test]
4299    fn test_empty_content() {
4300        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
4301        assert_eq!(ctx.content, "");
4302        assert_eq!(ctx.line_offsets, vec![0]);
4303        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
4304        assert_eq!(ctx.lines.len(), 0);
4305    }
4306
4307    #[test]
4308    fn test_single_line() {
4309        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
4310        assert_eq!(ctx.content, "# Hello");
4311        assert_eq!(ctx.line_offsets, vec![0]);
4312        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
4313        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
4314    }
4315
4316    #[test]
4317    fn test_multi_line() {
4318        let content = "# Title\n\nSecond line\nThird line";
4319        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4320        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
4321        // Test offset to line/col
4322        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
4323        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
4324        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
4325        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
4326        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
4327    }
4328
4329    #[test]
4330    fn test_line_info() {
4331        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
4332        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4333
4334        // Test line info
4335        assert_eq!(ctx.lines.len(), 7);
4336
4337        // Line 1: "# Title"
4338        let line1 = &ctx.lines[0];
4339        assert_eq!(line1.content(ctx.content), "# Title");
4340        assert_eq!(line1.byte_offset, 0);
4341        assert_eq!(line1.indent, 0);
4342        assert!(!line1.is_blank);
4343        assert!(!line1.in_code_block);
4344        assert!(line1.list_item.is_none());
4345
4346        // Line 2: "    indented"
4347        let line2 = &ctx.lines[1];
4348        assert_eq!(line2.content(ctx.content), "    indented");
4349        assert_eq!(line2.byte_offset, 8);
4350        assert_eq!(line2.indent, 4);
4351        assert!(!line2.is_blank);
4352
4353        // Line 3: "" (blank)
4354        let line3 = &ctx.lines[2];
4355        assert_eq!(line3.content(ctx.content), "");
4356        assert!(line3.is_blank);
4357
4358        // Test helper methods
4359        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
4360        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
4361        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
4362        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
4363    }
4364
4365    #[test]
4366    fn test_list_item_detection() {
4367        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
4368        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4369
4370        // Line 1: "- Unordered item"
4371        let line1 = &ctx.lines[0];
4372        assert!(line1.list_item.is_some());
4373        let list1 = line1.list_item.as_ref().unwrap();
4374        assert_eq!(list1.marker, "-");
4375        assert!(!list1.is_ordered);
4376        assert_eq!(list1.marker_column, 0);
4377        assert_eq!(list1.content_column, 2);
4378
4379        // Line 2: "  * Nested item"
4380        let line2 = &ctx.lines[1];
4381        assert!(line2.list_item.is_some());
4382        let list2 = line2.list_item.as_ref().unwrap();
4383        assert_eq!(list2.marker, "*");
4384        assert_eq!(list2.marker_column, 2);
4385
4386        // Line 3: "1. Ordered item"
4387        let line3 = &ctx.lines[2];
4388        assert!(line3.list_item.is_some());
4389        let list3 = line3.list_item.as_ref().unwrap();
4390        assert_eq!(list3.marker, "1.");
4391        assert!(list3.is_ordered);
4392        assert_eq!(list3.number, Some(1));
4393
4394        // Line 6: "Not a list"
4395        let line6 = &ctx.lines[5];
4396        assert!(line6.list_item.is_none());
4397    }
4398
4399    #[test]
4400    fn test_offset_to_line_col_edge_cases() {
4401        let content = "a\nb\nc";
4402        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4403        // line_offsets: [0, 2, 4]
4404        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
4405        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
4406        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
4407        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
4408        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
4409        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
4410    }
4411
4412    #[test]
4413    fn test_mdx_esm_blocks() {
4414        let content = r##"import {Chart} from './snowfall.js'
4415export const year = 2023
4416
4417# Last year's snowfall
4418
4419In {year}, the snowfall was above average.
4420It was followed by a warm spring which caused
4421flood conditions in many of the nearby rivers.
4422
4423<Chart color="#fcb32c" year={year} />
4424"##;
4425
4426        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);
4427
4428        // Check that lines 1 and 2 are marked as ESM blocks
4429        assert_eq!(ctx.lines.len(), 10);
4430        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
4431        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
4432        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
4433        assert!(
4434            !ctx.lines[3].in_esm_block,
4435            "Line 4 (heading) should NOT be in_esm_block"
4436        );
4437        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
4438        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
4439    }
4440
4441    #[test]
4442    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
4443        let content = r#"import {Chart} from './snowfall.js'
4444export const year = 2023
4445
4446# Last year's snowfall
4447"#;
4448
4449        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4450
4451        // ESM blocks should NOT be detected in Standard flavor
4452        assert!(
4453            !ctx.lines[0].in_esm_block,
4454            "Line 1 should NOT be in_esm_block in Standard flavor"
4455        );
4456        assert!(
4457            !ctx.lines[1].in_esm_block,
4458            "Line 2 should NOT be in_esm_block in Standard flavor"
4459        );
4460    }
4461
4462    #[test]
4463    fn test_blockquote_with_indented_content() {
4464        // Lines with `>` followed by heavily-indented content should be detected as blockquotes.
4465        // The content inside the blockquote may also be detected as a code block (which is correct),
4466        // but for MD046 purposes, we need to know the line is inside a blockquote.
4467        let content = r#"# Heading
4468
4469>      -S socket-path
4470>                    More text
4471"#;
4472        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4473
4474        // Line 3 (index 2) should be detected as blockquote
4475        assert!(
4476            ctx.lines.get(2).is_some_and(|l| l.blockquote.is_some()),
4477            "Line 3 should be a blockquote"
4478        );
4479        // Line 4 (index 3) should also be blockquote
4480        assert!(
4481            ctx.lines.get(3).is_some_and(|l| l.blockquote.is_some()),
4482            "Line 4 should be a blockquote"
4483        );
4484
4485        // Verify blockquote content is correctly parsed
4486        // Note: spaces_after includes the spaces between `>` and content
4487        let bq3 = ctx.lines.get(2).unwrap().blockquote.as_ref().unwrap();
4488        assert_eq!(bq3.content, "-S socket-path");
4489        assert_eq!(bq3.nesting_level, 1);
4490        // 6 spaces after the `>` marker
4491        assert!(bq3.has_multiple_spaces_after_marker);
4492
4493        let bq4 = ctx.lines.get(3).unwrap().blockquote.as_ref().unwrap();
4494        assert_eq!(bq4.content, "More text");
4495        assert_eq!(bq4.nesting_level, 1);
4496    }
4497
4498    #[test]
4499    fn test_footnote_definitions_not_parsed_as_reference_defs() {
4500        // Footnote definitions use [^id]: syntax and should NOT be parsed as reference definitions
4501        let content = r#"# Title
4502
4503A footnote[^1].
4504
4505[^1]: This is the footnote content.
4506
4507[^note]: Another footnote with [link](https://example.com).
4508
4509[regular]: ./path.md "A real reference definition"
4510"#;
4511        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4512
4513        // Should only have one reference definition (the regular one)
4514        assert_eq!(
4515            ctx.reference_defs.len(),
4516            1,
4517            "Footnotes should not be parsed as reference definitions"
4518        );
4519
4520        // The only reference def should be the regular one
4521        assert_eq!(ctx.reference_defs[0].id, "regular");
4522        assert_eq!(ctx.reference_defs[0].url, "./path.md");
4523        assert_eq!(
4524            ctx.reference_defs[0].title,
4525            Some("A real reference definition".to_string())
4526        );
4527    }
4528
4529    #[test]
4530    fn test_footnote_with_inline_link_not_misidentified() {
4531        // Regression test for issue #286: footnote containing an inline link
4532        // was incorrectly parsed as a reference definition with URL "[link](url)"
4533        let content = r#"# Title
4534
4535A footnote[^1].
4536
4537[^1]: [link](https://www.google.com).
4538"#;
4539        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4540
4541        // Should have no reference definitions
4542        assert!(
4543            ctx.reference_defs.is_empty(),
4544            "Footnote with inline link should not create a reference definition"
4545        );
4546    }
4547
4548    #[test]
4549    fn test_various_footnote_formats_excluded() {
4550        // Test various footnote ID formats are all excluded
4551        let content = r#"[^1]: Numeric footnote
4552[^note]: Named footnote
4553[^a]: Single char footnote
4554[^long-footnote-name]: Long named footnote
4555[^123abc]: Mixed alphanumeric
4556
4557[ref1]: ./file1.md
4558[ref2]: ./file2.md
4559"#;
4560        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4561
4562        // Should only have the two regular reference definitions
4563        assert_eq!(
4564            ctx.reference_defs.len(),
4565            2,
4566            "Only regular reference definitions should be parsed"
4567        );
4568
4569        let ids: Vec<&str> = ctx.reference_defs.iter().map(|r| r.id.as_str()).collect();
4570        assert!(ids.contains(&"ref1"));
4571        assert!(ids.contains(&"ref2"));
4572        assert!(!ids.iter().any(|id| id.starts_with('^')));
4573    }
4574
4575    // =========================================================================
4576    // Tests for has_char and char_count methods
4577    // =========================================================================
4578
4579    #[test]
4580    fn test_has_char_tracked_characters() {
4581        // Test all 12 tracked characters
4582        let content = "# Heading\n* list item\n_emphasis_ and -hyphen-\n+ plus\n> quote\n| table |\n[link]\n`code`\n<html>\n!image";
4583        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4584
4585        // All tracked characters should be detected
4586        assert!(ctx.has_char('#'), "Should detect hash");
4587        assert!(ctx.has_char('*'), "Should detect asterisk");
4588        assert!(ctx.has_char('_'), "Should detect underscore");
4589        assert!(ctx.has_char('-'), "Should detect hyphen");
4590        assert!(ctx.has_char('+'), "Should detect plus");
4591        assert!(ctx.has_char('>'), "Should detect gt");
4592        assert!(ctx.has_char('|'), "Should detect pipe");
4593        assert!(ctx.has_char('['), "Should detect bracket");
4594        assert!(ctx.has_char('`'), "Should detect backtick");
4595        assert!(ctx.has_char('<'), "Should detect lt");
4596        assert!(ctx.has_char('!'), "Should detect exclamation");
4597        assert!(ctx.has_char('\n'), "Should detect newline");
4598    }
4599
4600    #[test]
4601    fn test_has_char_absent_characters() {
4602        let content = "Simple text without special chars";
4603        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4604
4605        // None of the tracked characters should be present
4606        assert!(!ctx.has_char('#'), "Should not detect hash");
4607        assert!(!ctx.has_char('*'), "Should not detect asterisk");
4608        assert!(!ctx.has_char('_'), "Should not detect underscore");
4609        assert!(!ctx.has_char('-'), "Should not detect hyphen");
4610        assert!(!ctx.has_char('+'), "Should not detect plus");
4611        assert!(!ctx.has_char('>'), "Should not detect gt");
4612        assert!(!ctx.has_char('|'), "Should not detect pipe");
4613        assert!(!ctx.has_char('['), "Should not detect bracket");
4614        assert!(!ctx.has_char('`'), "Should not detect backtick");
4615        assert!(!ctx.has_char('<'), "Should not detect lt");
4616        assert!(!ctx.has_char('!'), "Should not detect exclamation");
4617        // Note: single line content has no newlines
4618        assert!(!ctx.has_char('\n'), "Should not detect newline in single line");
4619    }
4620
4621    #[test]
4622    fn test_has_char_fallback_for_untracked() {
4623        let content = "Text with @mention and $dollar and %percent";
4624        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4625
4626        // Untracked characters should fall back to content.contains()
4627        assert!(ctx.has_char('@'), "Should detect @ via fallback");
4628        assert!(ctx.has_char('$'), "Should detect $ via fallback");
4629        assert!(ctx.has_char('%'), "Should detect % via fallback");
4630        assert!(!ctx.has_char('^'), "Should not detect absent ^ via fallback");
4631    }
4632
4633    #[test]
4634    fn test_char_count_tracked_characters() {
4635        let content = "## Heading ##\n***bold***\n__emphasis__\n---\n+++\n>> nested\n|| table ||\n[[link]]\n``code``\n<<html>>\n!!";
4636        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4637
4638        // Count each tracked character
4639        assert_eq!(ctx.char_count('#'), 4, "Should count 4 hashes");
4640        assert_eq!(ctx.char_count('*'), 6, "Should count 6 asterisks");
4641        assert_eq!(ctx.char_count('_'), 4, "Should count 4 underscores");
4642        assert_eq!(ctx.char_count('-'), 3, "Should count 3 hyphens");
4643        assert_eq!(ctx.char_count('+'), 3, "Should count 3 pluses");
4644        assert_eq!(ctx.char_count('>'), 4, "Should count 4 gt (2 nested + 2 in <<html>>)");
4645        assert_eq!(ctx.char_count('|'), 4, "Should count 4 pipes");
4646        assert_eq!(ctx.char_count('['), 2, "Should count 2 brackets");
4647        assert_eq!(ctx.char_count('`'), 4, "Should count 4 backticks");
4648        assert_eq!(ctx.char_count('<'), 2, "Should count 2 lt");
4649        assert_eq!(ctx.char_count('!'), 2, "Should count 2 exclamations");
4650        assert_eq!(ctx.char_count('\n'), 10, "Should count 10 newlines");
4651    }
4652
4653    #[test]
4654    fn test_char_count_zero_for_absent() {
4655        let content = "Plain text";
4656        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4657
4658        assert_eq!(ctx.char_count('#'), 0);
4659        assert_eq!(ctx.char_count('*'), 0);
4660        assert_eq!(ctx.char_count('_'), 0);
4661        assert_eq!(ctx.char_count('\n'), 0);
4662    }
4663
4664    #[test]
4665    fn test_char_count_fallback_for_untracked() {
4666        let content = "@@@ $$ %%%";
4667        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4668
4669        assert_eq!(ctx.char_count('@'), 3, "Should count 3 @ via fallback");
4670        assert_eq!(ctx.char_count('$'), 2, "Should count 2 $ via fallback");
4671        assert_eq!(ctx.char_count('%'), 3, "Should count 3 % via fallback");
4672        assert_eq!(ctx.char_count('^'), 0, "Should count 0 for absent char");
4673    }
4674
4675    #[test]
4676    fn test_char_count_empty_content() {
4677        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
4678
4679        assert_eq!(ctx.char_count('#'), 0);
4680        assert_eq!(ctx.char_count('*'), 0);
4681        assert_eq!(ctx.char_count('@'), 0);
4682        assert!(!ctx.has_char('#'));
4683        assert!(!ctx.has_char('@'));
4684    }
4685
4686    // =========================================================================
4687    // Tests for is_in_html_tag method
4688    // =========================================================================
4689
4690    #[test]
4691    fn test_is_in_html_tag_simple() {
4692        let content = "<div>content</div>";
4693        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4694
4695        // Inside opening tag
4696        assert!(ctx.is_in_html_tag(0), "Position 0 (<) should be in tag");
4697        assert!(ctx.is_in_html_tag(1), "Position 1 (d) should be in tag");
4698        assert!(ctx.is_in_html_tag(4), "Position 4 (>) should be in tag");
4699
4700        // Outside tag (in content)
4701        assert!(!ctx.is_in_html_tag(5), "Position 5 (c) should not be in tag");
4702        assert!(!ctx.is_in_html_tag(10), "Position 10 (t) should not be in tag");
4703
4704        // Inside closing tag
4705        assert!(ctx.is_in_html_tag(12), "Position 12 (<) should be in tag");
4706        assert!(ctx.is_in_html_tag(17), "Position 17 (>) should be in tag");
4707    }
4708
4709    #[test]
4710    fn test_is_in_html_tag_self_closing() {
4711        let content = "Text <br/> more text";
4712        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4713
4714        // Before tag
4715        assert!(!ctx.is_in_html_tag(0), "Position 0 should not be in tag");
4716        assert!(!ctx.is_in_html_tag(4), "Position 4 (space) should not be in tag");
4717
4718        // Inside self-closing tag
4719        assert!(ctx.is_in_html_tag(5), "Position 5 (<) should be in tag");
4720        assert!(ctx.is_in_html_tag(8), "Position 8 (/) should be in tag");
4721        assert!(ctx.is_in_html_tag(9), "Position 9 (>) should be in tag");
4722
4723        // After tag
4724        assert!(!ctx.is_in_html_tag(10), "Position 10 (space) should not be in tag");
4725    }
4726
4727    #[test]
4728    fn test_is_in_html_tag_with_attributes() {
4729        let content = r#"<a href="url" class="link">text</a>"#;
4730        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4731
4732        // All positions inside opening tag with attributes
4733        assert!(ctx.is_in_html_tag(0), "Start of tag");
4734        assert!(ctx.is_in_html_tag(10), "Inside href attribute");
4735        assert!(ctx.is_in_html_tag(20), "Inside class attribute");
4736        assert!(ctx.is_in_html_tag(26), "End of opening tag");
4737
4738        // Content between tags
4739        assert!(!ctx.is_in_html_tag(27), "Start of content");
4740        assert!(!ctx.is_in_html_tag(30), "End of content");
4741
4742        // Closing tag
4743        assert!(ctx.is_in_html_tag(31), "Start of closing tag");
4744    }
4745
4746    #[test]
4747    fn test_is_in_html_tag_multiline() {
4748        let content = "<div\n  class=\"test\"\n>\ncontent\n</div>";
4749        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4750
4751        // Opening tag spans multiple lines
4752        assert!(ctx.is_in_html_tag(0), "Start of multiline tag");
4753        assert!(ctx.is_in_html_tag(5), "After first newline in tag");
4754        assert!(ctx.is_in_html_tag(15), "Inside attribute");
4755
4756        // After closing > of opening tag
4757        let closing_bracket_pos = content.find(">\n").unwrap();
4758        assert!(!ctx.is_in_html_tag(closing_bracket_pos + 2), "Content after tag");
4759    }
4760
4761    #[test]
4762    fn test_is_in_html_tag_no_tags() {
4763        let content = "Plain text without any HTML";
4764        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4765
4766        // No position should be in an HTML tag
4767        for i in 0..content.len() {
4768            assert!(!ctx.is_in_html_tag(i), "Position {i} should not be in tag");
4769        }
4770    }
4771
4772    // =========================================================================
4773    // Tests for is_in_jinja_range method
4774    // =========================================================================
4775
4776    #[test]
4777    fn test_is_in_jinja_range_expression() {
4778        let content = "Hello {{ name }}!";
4779        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4780
4781        // Before Jinja
4782        assert!(!ctx.is_in_jinja_range(0), "H should not be in Jinja");
4783        assert!(!ctx.is_in_jinja_range(5), "Space before Jinja should not be in Jinja");
4784
4785        // Inside Jinja expression (positions 6-15 for "{{ name }}")
4786        assert!(ctx.is_in_jinja_range(6), "First brace should be in Jinja");
4787        assert!(ctx.is_in_jinja_range(7), "Second brace should be in Jinja");
4788        assert!(ctx.is_in_jinja_range(10), "name should be in Jinja");
4789        assert!(ctx.is_in_jinja_range(14), "Closing brace should be in Jinja");
4790        assert!(ctx.is_in_jinja_range(15), "Second closing brace should be in Jinja");
4791
4792        // After Jinja
4793        assert!(!ctx.is_in_jinja_range(16), "! should not be in Jinja");
4794    }
4795
/// Probes `is_in_jinja_range` around a `{% ... %}` statement pair with plain text in between.
///
/// Byte layout of the input:
/// - `0..=17`  `{% if condition %}` (the word `condition` spans `6..=14`)
/// - `18..=24` `content`
/// - `25..=35` `{% endif %}` (the word `endif` spans `28..=32`)
#[test]
fn test_is_in_jinja_range_statement() {
    let content = "{% if condition %}content{% endif %}";
    let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

    // Inside opening statement
    assert!(ctx.is_in_jinja_range(0), "Start of Jinja statement");
    // Offset 6 is the first byte of `condition`; offset 5 is only the space before it,
    // so probing 6 makes the assertion check what its message says.
    assert!(ctx.is_in_jinja_range(6), "condition should be in Jinja");
    assert!(ctx.is_in_jinja_range(17), "End of opening statement");

    // Content between
    assert!(!ctx.is_in_jinja_range(18), "content should not be in Jinja");

    // Inside closing statement
    assert!(ctx.is_in_jinja_range(25), "Start of endif");
    assert!(ctx.is_in_jinja_range(32), "endif should be in Jinja");
}
4813
/// Two `{{ ... }}` expressions on one line are tracked independently, and the text
/// between them is not treated as Jinja.
#[test]
fn test_is_in_jinja_range_multiple() {
    let text = "{{ a }} and {{ b }}";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);
    let in_jinja = |offset: usize| lint_ctx.is_in_jinja_range(offset);

    // First expression: start, middle, last byte.
    assert!(in_jinja(0));
    assert!(in_jinja(3));
    assert!(in_jinja(6));

    // The word "and" and the space after it sit between the expressions.
    assert!(!in_jinja(8));
    assert!(!in_jinja(11));

    // Second expression: start, middle, last byte.
    assert!(in_jinja(12));
    assert!(in_jinja(15));
    assert!(in_jinja(18));
}
4833
/// Text containing only lone `{` / `}` characters must not be reported as Jinja.
///
/// The sibling tests exercise the paired delimiters `{{ ... }}` and `{% ... %}`;
/// this one covers the negative case.
#[test]
fn test_is_in_jinja_range_no_jinja() {
    // The input now really contains single braces, matching what the text describes.
    // Previously the sentence mentioned braces but had none, so the single-brace
    // case was never exercised.
    let content = "Plain text with {single} braces but not Jinja";
    let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

    // No position should be in Jinja
    for i in 0..content.len() {
        assert!(!ctx.is_in_jinja_range(i), "Position {i} should not be in Jinja");
    }
}
4844
4845    // =========================================================================
4846    // Tests for is_in_link_title method
4847    // =========================================================================
4848
/// A reference definition with a double-quoted title: `is_in_link_title` is true for
/// offsets in `[title_byte_start, title_byte_end)` and false for the URL and the end offset.
#[test]
fn test_is_in_link_title_with_title() {
    let text = "[ref]: https://example.com \"Title text\"\n\nSome content.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // Exactly one definition, and it must carry title offsets.
    assert_eq!(lint_ctx.reference_defs.len(), 1);
    let definition = &lint_ctx.reference_defs[0];
    let (maybe_start, maybe_end) = (definition.title_byte_start, definition.title_byte_end);
    assert!(maybe_start.is_some());
    assert!(maybe_end.is_some());
    let (title_start, title_end) = (maybe_start.unwrap(), maybe_end.unwrap());

    // Offset 10 lies inside the URL.
    assert!(!lint_ctx.is_in_link_title(10), "URL should not be in title");

    // First, middle and last offsets of the title range.
    assert!(
        lint_ctx.is_in_link_title(title_start),
        "Title start should be in title"
    );
    assert!(
        lint_ctx.is_in_link_title(title_start + 5),
        "Middle of title should be in title"
    );
    assert!(
        lint_ctx.is_in_link_title(title_end - 1),
        "End of title should be in title"
    );

    // The end offset itself is excluded.
    assert!(
        !lint_ctx.is_in_link_title(title_end),
        "After title end should not be in title"
    );
}
4882
/// A reference definition that has no title yields no title offsets, and no byte of
/// the document is reported as being inside a link title.
#[test]
fn test_is_in_link_title_without_title() {
    let text = "[ref]: https://example.com\n\nSome content.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // One definition is found, with both title offsets absent.
    assert_eq!(lint_ctx.reference_defs.len(), 1);
    let definition = &lint_ctx.reference_defs[0];
    assert!(definition.title_byte_start.is_none());
    assert!(definition.title_byte_end.is_none());

    // Sweep every byte offset of the input.
    (0..text.len()).for_each(|i| {
        assert!(!lint_ctx.is_in_link_title(i), "Position {i} should not be in title");
    });
}
4899
/// Three consecutive reference definitions, of which the first and third have titles.
#[test]
fn test_is_in_link_title_multiple_refs() {
    let text = concat!(
        "[ref1]: /url1 \"Title One\"\n",
        "[ref2]: /url2\n",
        "[ref3]: /url3 \"Title Three\"\n",
    );
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // All three definitions are collected.
    assert_eq!(lint_ctx.reference_defs.len(), 3);

    // Titled definition.
    let first = lint_ctx.reference_defs.iter().find(|def| def.id == "ref1").unwrap();
    assert!(first.title_byte_start.is_some());

    // Untitled definition.
    let second = lint_ctx.reference_defs.iter().find(|def| def.id == "ref2").unwrap();
    assert!(second.title_byte_start.is_none());

    // Titled definition.
    let third = lint_ctx.reference_defs.iter().find(|def| def.id == "ref3").unwrap();
    assert!(third.title_byte_start.is_some());

    // Probe just inside the first title and a few bytes past its end.
    if let Some((start, end)) = first.title_byte_start.zip(first.title_byte_end) {
        assert!(lint_ctx.is_in_link_title(start + 1));
        assert!(!lint_ctx.is_in_link_title(end + 5));
    }

    // Probe just inside the third title.
    if let Some((start, _)) = third.title_byte_start.zip(third.title_byte_end) {
        assert!(lint_ctx.is_in_link_title(start + 1));
    }
}
4934
/// Reference definition whose title is wrapped in single quotes.
///
/// The title-range assertions only run when the parser reported both title offsets.
#[test]
fn test_is_in_link_title_single_quotes() {
    let text = "[ref]: /url 'Single quoted title'\n";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    assert_eq!(lint_ctx.reference_defs.len(), 1);
    let definition = &lint_ctx.reference_defs[0];

    if let Some((start, end)) = definition.title_byte_start.zip(definition.title_byte_end) {
        // Start and an interior offset are inside; the end offset is excluded.
        assert!(lint_ctx.is_in_link_title(start));
        assert!(lint_ctx.is_in_link_title(start + 5));
        assert!(!lint_ctx.is_in_link_title(end));
    }
}
4949
/// Reference definition with a parenthesized title.
///
/// The reference-definition parser may or may not recognize this form, so the test
/// only requires `is_in_link_title` to agree with whatever was parsed:
/// - if the first definition has both title offsets, the range is checked;
/// - otherwise (no definition, or no title offsets) no byte may be in a title.
#[test]
fn test_is_in_link_title_parentheses() {
    let text = "[ref]: /url (Parenthesized title)\n";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // Title range of the first definition, if there is one and it carries a title.
    let parsed_title = lint_ctx
        .reference_defs
        .iter()
        .next()
        .and_then(|def| def.title_byte_start.zip(def.title_byte_end));

    match parsed_title {
        Some((start, end)) => {
            assert!(lint_ctx.is_in_link_title(start));
            assert!(lint_ctx.is_in_link_title(start + 5));
            assert!(!lint_ctx.is_in_link_title(end));
        }
        None => {
            // Nothing was parsed as a title, so no position should be in one.
            for offset in 0..text.len() {
                assert!(!lint_ctx.is_in_link_title(offset));
            }
        }
    }
}
4978
/// Without any reference definitions, no byte offset is inside a link title.
#[test]
fn test_is_in_link_title_no_refs() {
    let text = "Just plain text without any reference definitions.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    assert!(lint_ctx.reference_defs.is_empty());

    // Sweep every byte offset of the input.
    (0..text.len()).for_each(|offset| {
        assert!(!lint_ctx.is_in_link_title(offset));
    });
}
4990
4991    // =========================================================================
4992    // Math span tests (Issue #289)
4993    // =========================================================================
4994
/// A single `$...$` run is reported as one inline math span whose content is the
/// text between the dollar signs.
#[test]
fn test_math_spans_inline() {
    let text = "Text with inline math $[f](x)$ in it.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1, "Should detect one inline math span");

    let only = &spans[0];
    assert!(!only.is_display, "Should be inline math, not display");
    assert_eq!(only.content, "[f](x)", "Content should be extracted correctly");
}
5007
/// A `$$...$$` run on one line is reported as a single display math span that keeps
/// the link-like text inside it.
#[test]
fn test_math_spans_display_single_line() {
    let text = "$$X(\\zeta) = \\mathcal Z [x](\\zeta)$$";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1, "Should detect one display math span");

    let only = &spans[0];
    assert!(only.is_display, "Should be display math");

    let keeps_link_like_text = only.content.contains("[x](\\zeta)");
    assert!(keeps_link_like_text, "Content should contain the link-like pattern");
}
5023
/// A `$$` block spread over several lines is reported as one display math span.
#[test]
fn test_math_spans_display_multiline() {
    let text = "Before\n\n$$\n[x](\\zeta) = \\sum_k x(k)\n$$\n\nAfter";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1, "Should detect one display math span");

    let only = &spans[0];
    assert!(only.is_display, "Should be display math");
}
5035
/// `is_in_math_span` is true for offsets inside a `$...$` run and false before and after it.
#[test]
fn test_is_in_math_span() {
    let text = "Text $[f](x)$ more text";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // Offset of the opening `$`, and the offset just past the closing `$`.
    let first_dollar = text.find('$').unwrap();
    let past_last_dollar = text.rfind('$').map(|idx| idx + 1).unwrap();

    // Two offsets strictly inside the span.
    for inner in [first_dollar + 1, first_dollar + 3] {
        assert!(
            lint_ctx.is_in_math_span(inner),
            "Position inside math span should return true"
        );
    }

    // Offsets on either side of the span.
    assert!(
        !lint_ctx.is_in_math_span(0),
        "Position before math span should return false"
    );
    assert!(
        !lint_ctx.is_in_math_span(past_last_dollar + 1),
        "Position after math span should return false"
    );
}
5061
/// Inline math and an inline code span on the same line are collected separately,
/// each with its own content.
#[test]
fn test_math_spans_mixed_with_code() {
    let text = "Math $[f](x)$ and code `[g](y)` mixed";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let math = lint_ctx.math_spans();
    let code = lint_ctx.code_spans();

    assert_eq!(math.len(), 1, "Should have one math span");
    assert_eq!(code.len(), 1, "Should have one code span");

    // The math span holds the text between the dollar signs,
    assert_eq!(math[0].content, "[f](x)");
    // and the code span holds the text between the backticks.
    assert_eq!(code[0].content, "[g](y)");
}
5078
/// Prose without dollar signs produces no math spans.
#[test]
fn test_math_spans_no_math() {
    let text = "Regular text without any math at all.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert!(spans.is_empty(), "Should have no math spans");
}
5087
/// Two inline spans and one display span on the same line are all detected and
/// classified by their `is_display` flag.
#[test]
fn test_math_spans_multiple() {
    let text = "First $a$ and second $b$ and display $$c$$";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 3, "Should detect three math spans");

    // Split the spans by kind and compare the group sizes.
    let (display, inline): (Vec<_>, Vec<_>) = spans.iter().partition(|span| span.is_display);

    assert_eq!(inline.len(), 2, "Should have two inline math spans");
    assert_eq!(display.len(), 1, "Should have one display math span");
}
5103
/// Checks `is_in_math_span` at the exact edges of a span: inclusive at `byte_offset`,
/// exclusive at `byte_end`.
#[test]
fn test_is_in_math_span_boundary_positions() {
    // Input:        $[f](x)$
    // Byte offsets: 01234567
    let text = "$[f](x)$";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1, "Should have one math span");

    let only = &spans[0];
    let (first, past_end) = (only.byte_offset, only.byte_end);

    // The first offset of the span is inside it.
    assert!(lint_ctx.is_in_math_span(first), "Start position should be in span");

    // So is the offset right after it.
    assert!(
        lint_ctx.is_in_math_span(first + 1),
        "Position after start should be in span"
    );

    // The last offset covered by the span is `byte_end - 1`.
    assert!(
        lint_ctx.is_in_math_span(past_end - 1),
        "Position at end-1 should be in span"
    );

    // `byte_end` itself is outside the span.
    assert!(
        !lint_ctx.is_in_math_span(past_end),
        "Position at byte_end should NOT be in span (exclusive)"
    );
}
5140
/// A math span that opens the document starts at byte offset 0.
#[test]
fn test_math_spans_at_document_start() {
    let text = "$x$ text";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1);

    let only = &spans[0];
    assert_eq!(only.byte_offset, 0, "Math should start at byte 0");
}
5150
/// A math span that closes the document ends at the document length.
#[test]
fn test_math_spans_at_document_end() {
    let text = "text $x$";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();
    assert_eq!(spans.len(), 1);

    let only = &spans[0];
    assert_eq!(only.byte_end, text.len(), "Math should end at document end");
}
5160
/// Two inline math runs with nothing between them: at least one span is detected and
/// every byte of the input falls inside some math span.
#[test]
fn test_math_spans_consecutive() {
    let text = "$a$$b$";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // The exact number of spans is not pinned, only that something was found.
    let spans = lint_ctx.math_spans();
    assert!(!spans.is_empty(), "Should detect at least one math span");

    // Sweep every byte offset of the input.
    (0..text.len()).for_each(|i| {
        assert!(lint_ctx.is_in_math_span(i), "Position {i} should be in a math span");
    });
}
5175
/// A lone `$` used as a currency sign must not yield a math span containing the amount.
#[test]
fn test_math_spans_currency_not_math() {
    // Only one `$` is present, so there is no closing delimiter.
    let text = "Price is $100";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    let spans = lint_ctx.math_spans();

    // Holds trivially when there are no spans at all.
    let amount_is_not_math = spans.iter().all(|span| !span.content.contains("100"));
    assert!(
        amount_is_not_math,
        "Unbalanced $ should not create math span containing 100"
    );
}
5190
5191    // =========================================================================
5192    // Tests for O(1) reference definition lookups via HashMap
5193    // =========================================================================
5194
/// `get_reference_url` resolves ids regardless of letter case and returns `None`
/// for ids that were never defined.
#[test]
fn test_reference_lookup_o1_basic() {
    let text = concat!(
        "[ref1]: /url1\n",
        "[REF2]: /url2 \"Title\"\n",
        "[Ref3]: /url3\n",
        "\n",
        "Use [link][ref1] and [link][REF2].",
    );
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // All three definitions are collected.
    assert_eq!(lint_ctx.reference_defs.len(), 3);

    // (queried id, expected URL); ids are queried in several casings.
    let lookups = [
        ("ref1", Some("/url1")),
        ("REF1", Some("/url1")),
        ("Ref1", Some("/url1")),
        ("ref2", Some("/url2")),
        ("REF2", Some("/url2")),
        ("ref3", Some("/url3")),
        ("nonexistent", None),
    ];

    for (id, expected_url) in lookups {
        assert_eq!(lint_ctx.get_reference_url(id), expected_url);
    }
}
5216
/// `get_reference_def` returns the full definition (URL and title), ignores letter
/// case in the id, and yields `None` for unknown ids.
#[test]
fn test_reference_lookup_o1_get_reference_def() {
    let text = "[myref]: https://example.com \"My Title\"\n";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // Lookup with the id as written.
    let exact = lint_ctx.get_reference_def("myref").expect("Should find myref");
    assert_eq!(exact.url, "https://example.com");
    assert_eq!(exact.title.as_deref(), Some("My Title"));

    // Lookup with the id upper-cased.
    let upper = lint_ctx.get_reference_def("MYREF").expect("Should find MYREF");
    assert_eq!(upper.url, "https://example.com");

    // Unknown id.
    let missing = lint_ctx.get_reference_def("nonexistent");
    assert!(missing.is_none());
}
5235
/// `has_reference_def` matches ids regardless of letter case and rejects unknown ids.
#[test]
fn test_reference_lookup_o1_has_reference_def() {
    let text = concat!("[foo]: /foo\n", "[BAR]: /bar\n");
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);
    let has = |id: &str| lint_ctx.has_reference_def(id);

    // Defined as "foo".
    assert!(has("foo"));
    assert!(has("FOO"));

    // Defined as "BAR".
    assert!(has("bar"));
    assert!(has("Bar"));

    // Never defined.
    assert!(!has("baz"));
}
5250
/// With no reference definitions in the document, every lookup helper reports "not found".
#[test]
fn test_reference_lookup_o1_empty_content() {
    let text = "No references here.";
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);
    let id = "anything";

    assert!(lint_ctx.reference_defs.is_empty());
    assert_eq!(lint_ctx.get_reference_url(id), None);
    assert!(lint_ctx.get_reference_def(id).is_none());
    assert!(!lint_ctx.has_reference_def(id));
}
5261
/// Ids containing dashes, underscores and dots can be looked up verbatim.
#[test]
fn test_reference_lookup_o1_special_characters_in_id() {
    let text = concat!(
        "[ref-with-dash]: /url1\n",
        "[ref_with_underscore]: /url2\n",
        "[ref.with.dots]: /url3\n",
    );
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // (id, expected URL)
    let lookups = [
        ("ref-with-dash", Some("/url1")),
        ("ref_with_underscore", Some("/url2")),
        ("ref.with.dots", Some("/url3")),
    ];

    for (id, expected_url) in lookups {
        assert_eq!(lint_ctx.get_reference_url(id), expected_url);
    }
}
5274
/// Non-ASCII ids can be looked up, including via an upper-cased form of an accented id.
#[test]
fn test_reference_lookup_o1_unicode_id() {
    let text = concat!("[日本語]: /japanese\n", "[émoji]: /emoji\n");
    let lint_ctx = LintContext::new(text, MarkdownFlavor::Standard, None);

    // (id, expected URL); the last entry queries the accented id in upper case.
    let lookups = [
        ("日本語", Some("/japanese")),
        ("émoji", Some("/emoji")),
        ("ÉMOJI", Some("/emoji")),
    ];

    for (id, expected_url) in lookups {
        assert_eq!(lint_ctx.get_reference_url(id), expected_url);
    }
}
5286}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs